In [298]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import MissingIndicator
from sklearn.impute import SimpleImputer

In [299]:
df = pd.read_csv('train.csv', usecols = ['Age', 'Fare', 'Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [300]:
x = df.drop(columns=['Survived'])
y = df['Survived']

In [301]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=2, test_size= 0.2)

x_train.head()

Unnamed: 0,Age,Fare
30,40.0,27.7208
10,4.0,16.7
873,47.0,9.0
182,9.0,31.3875
876,20.0,9.8458


In [302]:
si = SimpleImputer()
x_train_trf = si.fit_transform(x_train)            # Here, we replacing the value by mean
x_test_trf = si.transform(x_test)                  # Here, we replacing the value by mean

In [303]:
x_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [304]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train_trf, y_train)
y_pred = clf.predict(x_test_trf)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6145251396648045

In [305]:
mi = MissingIndicator()

mi.fit(x_train)

In [306]:
mi.features_

array([0], dtype=int64)

In [307]:
x_train_missing = mi.transform(x_train)
x_test_missing = mi.transform(x_test)

In [308]:
# x_train_missing
# x_test_missing

In [309]:
x_train['Age_NA'] = x_train_missing

In [310]:
x_test['Fare_NA'] = x_test_missing

In [311]:
x_train.head()

Unnamed: 0,Age,Fare,Age_NA
30,40.0,27.7208,False
10,4.0,16.7,False
873,47.0,9.0,False
182,9.0,31.3875,False
876,20.0,9.8458,False


In [312]:
si = SimpleImputer()

x_train_trf2 = si.fit_transform(x_train)
# x_test_trf2 = si.transform(x_test)

In [313]:
x_train_trf2

array([[ 40.        ,  27.7208    ,   0.        ],
       [  4.        ,  16.7       ,   0.        ],
       [ 47.        ,   9.        ,   0.        ],
       ...,
       [ 71.        ,  49.5042    ,   0.        ],
       [ 29.78590426, 221.7792    ,   1.        ],
       [ 29.78590426,  25.925     ,   1.        ]])

In [314]:
si2 = SimpleImputer()

x_test_trf2 = si2.fit_transform(x_test)

In [315]:
x_test_trf2

array([[ 42.    ,  26.2875,   0.    ],
       [ 21.    ,   8.05  ,   0.    ],
       [ 24.    ,  65.    ,   0.    ],
       [ 28.    ,  56.4958,   0.    ],
       [ 17.    ,   7.925 ,   0.    ],
       [ 30.    ,   7.8958,   0.    ],
       [ 80.    ,  30.    ,   0.    ],
       [ 25.    ,   7.25  ,   0.    ],
       [ 50.    , 133.65  ,   0.    ],
       [ 25.    ,  26.    ,   0.    ],
       [ 35.    ,  26.    ,   0.    ],
       [ 35.    ,  90.    ,   0.    ],
       [ 55.    ,  16.    ,   0.    ],
       [ 29.3728,  56.4958,   1.    ],
       [ 29.3728,  56.4958,   1.    ],
       [ 19.    ,   7.8542,   0.    ],
       [ 29.3728,  15.2458,   1.    ],
       [ 49.    ,   0.    ,   0.    ],
       [ 18.    ,  14.4542,   0.    ],
       [ 65.    ,   7.75  ,   0.    ],
       [ 18.    , 108.9   ,   0.    ],
       [ 29.3728,  22.3583,   1.    ],
       [ 16.    ,  18.    ,   0.    ],
       [ 21.    ,   9.825 ,   0.    ],
       [ 19.    ,   6.75  ,   0.    ],
       [  1.    ,  11.133

In [316]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train_trf2, y_train)
y_pred = clf.predict(x_test_trf2)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6312849162011173

# Alternate code for same accuracy


In [318]:
si = SimpleImputer(add_indicator=True)

In [319]:
x_train = si.fit_transform(x_train)
x_train

array([[ 40.        ,  27.7208    ,   0.        ,   0.        ],
       [  4.        ,  16.7       ,   0.        ,   0.        ],
       [ 47.        ,   9.        ,   0.        ,   0.        ],
       ...,
       [ 71.        ,  49.5042    ,   0.        ,   0.        ],
       [ 29.78590426, 221.7792    ,   1.        ,   1.        ],
       [ 29.78590426,  25.925     ,   1.        ,   1.        ]])

In [325]:
si2 = SimpleImputer(add_indicator=True)
x_test = si2.fit_transform(x_test)
x_test

array([[ 42.    ,  26.2875,   0.    ,   0.    ],
       [ 21.    ,   8.05  ,   0.    ,   0.    ],
       [ 24.    ,  65.    ,   0.    ,   0.    ],
       [ 28.    ,  56.4958,   0.    ,   0.    ],
       [ 17.    ,   7.925 ,   0.    ,   0.    ],
       [ 30.    ,   7.8958,   0.    ,   0.    ],
       [ 80.    ,  30.    ,   0.    ,   0.    ],
       [ 25.    ,   7.25  ,   0.    ,   0.    ],
       [ 50.    , 133.65  ,   0.    ,   0.    ],
       [ 25.    ,  26.    ,   0.    ,   0.    ],
       [ 35.    ,  26.    ,   0.    ,   0.    ],
       [ 35.    ,  90.    ,   0.    ,   0.    ],
       [ 55.    ,  16.    ,   0.    ,   0.    ],
       [ 29.3728,  56.4958,   1.    ,   1.    ],
       [ 29.3728,  56.4958,   1.    ,   1.    ],
       [ 19.    ,   7.8542,   0.    ,   0.    ],
       [ 29.3728,  15.2458,   1.    ,   1.    ],
       [ 49.    ,   0.    ,   0.    ,   0.    ],
       [ 18.    ,  14.4542,   0.    ,   0.    ],
       [ 65.    ,   7.75  ,   0.    ,   0.    ],
       [ 18.    , 10

In [327]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6312849162011173