In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import MissingIndicator, SimpleImputer

In [2]:
df = pd.read_csv('train.csv', usecols = ['Age', 'Fare', 'Survived'])

In [3]:
df.sample(5)

Unnamed: 0,Survived,Age,Fare
681,1,27.0,76.7292
124,0,54.0,77.2875
855,1,18.0,9.35
2,1,26.0,7.925
366,1,60.0,75.25


In [4]:
X =  df.drop(columns = ['Survived'])
Y = df['Survived']

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state =2 )

In [6]:
X_train.sample(3)

Unnamed: 0,Age,Fare
32,,7.75
736,48.0,34.375
276,45.0,7.75


In [7]:
 si = SimpleImputer()

In [8]:
X_train_trf = si.fit_transform(X_train)
X_test_trf = si.fit_transform(X_test)

In [9]:
X_train_trf[:3]

array([[40.    , 27.7208],
       [ 4.    , 16.7   ],
       [47.    ,  9.    ]])

## Call Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
clf = LogisticRegression()
clf.fit(X_train_trf, Y_train)
y_pred = clf.predict( X_test_trf)

In [12]:
clf = LogisticRegression()
clf.fit(X_train_trf, Y_train)
y_pred = clf.predict( X_test_trf)

In [13]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, y_pred)

0.6145251396648045

## Define Missing Indicator

In [14]:
mi = MissingIndicator()

In [15]:
mi.fit(X_train)

In [16]:
MissingIndicator()

In [17]:
mi.features_

array([0], dtype=int64)

## Transform: Train Missing

In [18]:
X_train_missing = mi.transform(X_train)

In [19]:
X_train_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

## Transform: Test Missing

In [20]:
X_test_missing = mi.transform(X_test)

In [21]:
X_test_missing

array([[False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [ True],
       [False],
       [ True],
       [False],
       [False],
       [ True],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [False],
       [

In [25]:
X_train['Age_NA'] = X_train_missing
X_test['Age_NA'] = X_test_missing
X_train, X_test

(      Age      Fare  Age_NA
 30   40.0   27.7208   False
 10    4.0   16.7000   False
 873  47.0    9.0000   False
 182   9.0   31.3875   False
 876  20.0    9.8458   False
 ..    ...       ...     ...
 534  30.0    8.6625   False
 584   NaN    8.7125    True
 493  71.0   49.5042   False
 527   NaN  221.7792    True
 168   NaN   25.9250    True
 
 [712 rows x 3 columns],
       Age     Fare  Age_NA
 707  42.0  26.2875   False
 37   21.0   8.0500   False
 615  24.0  65.0000   False
 169  28.0  56.4958   False
 68   17.0   7.9250   False
 ..    ...      ...     ...
 89   24.0   8.0500   False
 80   22.0   9.0000   False
 846   NaN  69.5500    True
 870  26.0   7.8958   False
 251  29.0  10.4625   False
 
 [179 rows x 3 columns])


## Checking accuracy

In [23]:
si = SimpleImputer()
X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

In [24]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(X_train_trf2, Y_train)
y_pred = clf.predict(X_test_trf2)
accuracy_score(Y_test, y_pred)

0.6312849162011173