# Logistic Regression
This document will look at implementing the Logistic Regression Model on a subset of the Titanic Dataset.

In [1]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing Data

In [2]:
# Load the Titanic subset from the project's data folder.
# A forward slash is used as the separator: a literal backslash is only a path
# separator on Windows, whereas "/" is accepted on every platform.
raw_titanic_df = pd.read_csv('data/partial_titanic.csv')
# len() counts rows directly; counting the non-null values of one arbitrary
# column would under-report if that column ever contained missing data.
print("Number of Observations: " + str(len(raw_titanic_df)))
raw_titanic_df.head(5)

Number of Observations: 891


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Selecting Features & Preprocessing
I will select some features that do not require a lot of engineering to be compatible with the model.
To Note:
- EDA is not performed here; this dataset has already been explored in other documents.
- A simple feature selection is used to demonstrate the Logistic Regression classifier.
- Pclass and Sex (gender) will be one-hot-encoded
- All features will be scaled

In [3]:
# Keep only the model inputs plus the target, as an independent copy of the raw frame
feature_and_target_cols = ["Pclass", "Sex", "Age", "Fare", "Survived"]
sub_titanic_df = raw_titanic_df.loc[:, feature_and_target_cols].copy()

# Number of missing values in each selected column
nan_counts = sub_titanic_df.isna().sum()
print(nan_counts)

# Non-null entries of the second column ("Sex") are used as the length figure
rows_before = sub_titanic_df.iloc[:, 1].count()
print("Length before NaN drop: " + str(rows_before))

Pclass        0
Sex           0
Age         177
Fare          0
Survived      0
dtype: int64
Length before NaN drop: 891


In [4]:
# Age is the only column that contains NaN's, so discard the rows where it is missing
sub_titanic_df = sub_titanic_df.dropna(subset=["Age"])

# Non-null entries of the second column ("Sex") are used as the length figure
rows_after = sub_titanic_df.iloc[:, 1].count()
print("Length After NaN drop: " + str(rows_after))

Length After NaN drop: 714


In [5]:
# Separate the model inputs from the target:
# X takes every column except the last, y takes the last column ("Survived").
# NOTE(review): the printed labels call X the "Dependant Variables"; the features
# are normally termed the independent variables. Labels are left unchanged here.
last_col = sub_titanic_df.shape[1] - 1
X = sub_titanic_df.iloc[:, :last_col]
y = sub_titanic_df.iloc[:, last_col]

# Preview the first five rows of each
x_preview = X.head(5)
print("X Dependant Variables: ")
print(str(x_preview) + "\n")

y_preview = y.head(5)
print("y Target Predictor (Survived): ")
print(y_preview)

X Dependant Variables: 
   Pclass     Sex   Age     Fare
0       3    male  22.0   7.2500
1       1  female  38.0  71.2833
2       3  female  26.0   7.9250
3       1  female  35.0  53.1000
4       3    male  35.0   8.0500

y Target Predictor (Survived): 
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64


In [6]:
# Encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Pclass and Sex are one-hot-encoded; the remaining columns (Age, Fare) are
# passed through unchanged and are placed after the encoded columns.
#
# The columns are selected by name rather than by position [0, 1]: X is rebound
# to a NumPy array below, so re-running this cell with positional indices would
# silently one-hot-encode the first two *already encoded* columns. With names,
# a re-run on the array raises an error instead.
#
# sparse_threshold=0 guarantees a dense result. Without it the transformer may
# return a SciPy sparse matrix when the stacked output is sparse enough, and
# np.array() on a sparse matrix does not yield a 2-D numeric array.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), ["Pclass", "Sex"])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [7]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# A fixed seed makes the shuffle -- and therefore the confusion matrix and
# accuracy reported further down -- reproducible under Restart & Run All.
# Without it every run produces a different 75% / 25% split.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=SPLIT_SEED
)

In [8]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training data only, so no statistics from the test set
# are used ...
sc = StandardScaler().fit(X_train)

# ... then apply that same fitted scaler to both the Train and Test sets
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [13]:
# Show the first five processed training rows (encoded + scaled) ...
train_rows_preview = X_train[:5]
print("X_train Dependant Variables One-Hot-Encoded & Feature Scaled: ")
print(str(train_rows_preview) + "\n")

# ... followed by the first five training labels
train_labels_preview = y_train[:5]
print("y Target Predictor (Survived): ")
print(train_labels_preview)

X_train Dependant Variables One-Hot-Encoded & Feature Scaled: 
[[-0.59822071 -0.56655772  1.01317084  1.28906615 -1.28906615 -0.51357623
  -0.52149257]
 [-0.59822071  1.76504522 -0.98700038  1.28906615 -1.28906615 -0.30929987
  -0.17728793]
 [-0.59822071  1.76504522 -0.98700038 -0.77575538  0.77575538  0.84826618
  -0.4224748 ]
 [ 1.67162383 -0.56655772 -0.98700038 -0.77575538  0.77575538  1.18872678
  -0.0260114 ]
 [-0.59822071 -0.56655772  1.01317084  1.28906615 -1.28906615 -1.39877379
  -0.38011782]]

y Target Predictor (Survived): 
141    1
880    1
288    1
515    0
852    0
Name: Survived, dtype: int64


## Building Logistic Regression Model

In [10]:
# Training the Logistic Regression model on the Training set
from sklearn.linear_model import LogisticRegression

# Default-parameter classifier; fit() returns the estimator itself, so the
# construction and the training call are chained into one statement.
classifier = LogisticRegression().fit(X_train, y_train)

# Class predictions for the held-out Test set
y_pred = classifier.predict(X_test)

In [11]:
# Evaluate the test-set predictions
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of the true labels against the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# Accuracy score of the same predictions
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Model Accuracy: " + str(test_accuracy))

[[99 12]
 [21 47]]
Model Accuracy: 0.8156424581005587
