### Import the libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import warnings
# Blanket filter: every warning category is silenced from here on, including
# pandas and scikit-learn warnings raised by later cells.
warnings.simplefilter(action='ignore')

### Load the dataset

In [2]:
# Load only the target and the two numeric predictors used in this notebook.
df = pd.read_csv(
    r'data/train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)

In [3]:
# Peek at the first five rows.
df.head(n=5)

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [4]:
# Number of missing entries per column.
df.isna().sum(axis=0)

Survived      0
Age         177
Fare          0
dtype: int64

### Train Test Split

In [5]:
# Separate the target ('Survived') from the predictor columns.
y = df['Survived']
X = df.drop(columns=['Survived'])

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Shapes of the four pieces produced by the split, in (X_train, X_test, y_train, y_test) order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((712, 2), (179, 2), (712,), (179,))

### Logistic Regression without Missing Indicator

#### Missing values imputation using SimpleImputer

In [8]:
# Imputer with scikit-learn's default settings (per the library docs, NaN
# entries are replaced by the column mean learned during fit).
si = SimpleImputer()

In [9]:
# Learn the fill values on the training split only, then apply the same
# fitted imputer to both splits so no test-set statistics leak into training.
X_train_si = si.fit(X_train).transform(X_train)
X_test_si = si.transform(X_test)

In [10]:
# The imputer hands back plain arrays rather than DataFrames; show the types.
tuple(map(type, (X_train_si, X_test_si)))

(numpy.ndarray, numpy.ndarray)

#### Fit the model

In [11]:
# Logistic-regression classifier with the library's default hyperparameters.
lr = LogisticRegression()

In [12]:
# Train on the imputed training features.
lr.fit(X=X_train_si, y=y_train)

#### Prediction on test data

In [13]:
# Predicted class labels for the held-out rows.
y_pred = lr.predict(X=X_test_si)

#### Accuracy Score

In [14]:
# Fraction of held-out rows classified correctly (baseline: imputation only).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6480446927374302

### Logistic Regression using Missing Indicator

#### Flagging missing values using MissingIndicator

In [15]:
# Transformer that emits boolean flags marking which entries are missing.
# Default settings: per the scikit-learn docs, only features that contained
# missing values during fit get an indicator column.
mi = MissingIndicator()

In [16]:
# Fit the indicator on the training split, then build the missing-value mask
# for both splits (evaluated left to right: fit/transform train, then transform test).
X_train_mi, X_test_mi = mi.fit_transform(X_train), mi.transform(X_test)

#### Creating columns indicating missing values

In [17]:
# Attach the missing-Age flag as a new 'Age_NA' column.
#
# DataFrame.assign returns a new frame, so X_train / X_test are rebound rather
# than written to in place. In-place column assignment on frames produced by
# train_test_split can trigger pandas' SettingWithCopyWarning, which the
# notebook's blanket warning filter would otherwise hide.
#
# ravel() flattens the indicator matrix into the 1-D column pandas expects.
# The matrix is expected to be (n_samples, 1) here because Age is the only
# feature with missing values; anything wider fails loudly on length mismatch.
X_train = X_train.assign(Age_NA=X_train_mi.ravel())
X_test = X_test.assign(Age_NA=X_test_mi.ravel())

In [18]:
# First five training rows, now including the 'Age_NA' flag.
X_train.head(n=5)

Unnamed: 0,Age,Fare,Age_NA
331,45.5,28.5,False
733,23.0,13.0,False
382,32.0,7.925,False
704,26.0,7.8542,False
813,6.0,31.275,False


In [19]:
# First five test rows, now including the 'Age_NA' flag.
X_test.head(n=5)

Unnamed: 0,Age,Fare,Age_NA
709,,15.2458,True
439,31.0,10.5,False
840,20.0,7.925,False
720,6.0,33.0,False
39,14.0,11.2417,False


#### Impute the remaining missing values with SimpleImputer

In [20]:
# Fresh imputer with default settings (per the scikit-learn docs, NaN entries
# are replaced by the column mean); rebinds `si` from the earlier section.
si = SimpleImputer()

In [21]:
# Impute the frames that now carry the 'Age_NA' flag: fit on train, reuse on test.
# Evaluated left to right, so the imputer is fitted before the test transform.
X_train_si, X_test_si = si.fit_transform(X_train), si.transform(X_test)

#### Fit the model

In [22]:
# New logistic-regression classifier (default hyperparameters) for the
# features that include the manual missing-value flag.
lr = LogisticRegression()

In [23]:
# Train on the imputed features plus the 'Age_NA' flag.
lr.fit(X=X_train_si, y=y_train)

#### Prediction on test data

In [24]:
# Predicted class labels for the held-out rows.
y_pred = lr.predict(X=X_test_si)

#### Accuracy Score

In [25]:
# Fraction of held-out rows classified correctly (imputation + manual flag).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6368715083798883

### Missing indicator via SimpleImputer(add_indicator=True)

In [26]:
# Imputer that also appends missing-value indicator columns to its output
# (add_indicator=True), so imputation and flagging happen in one transformer.
si = SimpleImputer(add_indicator=True)

In [27]:
# X_train / X_test still carry the hand-built 'Age_NA' flag from the previous
# section. Passing them straight to SimpleImputer(add_indicator=True) would
# produce a second, duplicate Age indicator, so this section would not show
# what add_indicator does on its own. Start from the original predictors
# instead; errors='ignore' keeps the cell runnable if the flag was never added.
X_train_raw = X_train.drop(columns=['Age_NA'], errors='ignore')
X_test_raw = X_test.drop(columns=['Age_NA'], errors='ignore')

# Fit on the training split only, then apply the fitted imputer to the test split.
X_train_si = si.fit_transform(X_train_raw)
X_test_si = si.transform(X_test_raw)

#### Fit the model

In [28]:
# New logistic-regression classifier (default hyperparameters) for the
# add_indicator feature matrix.
lr = LogisticRegression()

In [29]:
# Train on the imputer output (imputed values plus generated indicator columns).
lr.fit(X=X_train_si, y=y_train)

#### Prediction on test data

In [30]:
# Predicted class labels for the held-out rows.
y_pred = lr.predict(X=X_test_si)

#### Accuracy Score

In [31]:
# Fraction of held-out rows classified correctly (SimpleImputer with add_indicator).
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6368715083798883