# **<ins style="color:aqua">Handling Missing Data</ins>**
## **<ins style="color:green">Missing Indicator</ins>**
- For each feature that contains missing values, add a new boolean column that is `True` where the value was missing and `False` otherwise.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read only the three columns this notebook works with, then preview
# a handful of random rows.
df = pd.read_csv(
    "../data/csvData/train.csv",
    usecols=['Age', 'Fare', 'Survived'],
)
df.sample(n=7)

Unnamed: 0,Survived,Age,Fare
792,0,,69.55
224,1,38.0,90.0
17,1,,13.0
642,0,2.0,27.9
775,0,18.0,7.75
849,1,,89.1042
758,0,34.0,8.05


In [3]:
# Target is the 'Survived' column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns=['Survived'])
(X.shape, y.shape)

((891, 2), (891,))

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.sample(n=7)

Unnamed: 0,Age,Fare
336,29.0,66.6
51,21.0,7.8
823,27.0,12.475
684,60.0,39.0
648,,7.55
767,30.5,7.75
720,6.0,33.0


### **Without Using Missing Indicator**

In [5]:
# Baseline: fill missing values with the column mean (SimpleImputer's
# default strategy). The statistics are learned on the training split
# only and then reused for the test split.
si = SimpleImputer(strategy='mean')
X_train_trf = si.fit_transform(X_train)
X_test_trf = si.transform(X_test)

In [6]:
# Baseline model: logistic regression trained on the mean-imputed
# features, with no missing-value flag column.
# (LogisticRegression is imported in the notebook's top import cell.)
clf = LogisticRegression()
clf.fit(X_train_trf, y_train)
y_pred = clf.predict(X_test_trf)

In [7]:
# Baseline accuracy on the held-out split, expressed as a percentage.
# (accuracy_score is imported in the notebook's top import cell.)
accuracy_score(y_test, y_pred)*100

61.452513966480446

### **Using Missing Indicator — Method 1: the `MissingIndicator` transformer**

In [8]:
# Learn from the training split which features contain missing values.
# 'missing-only' (the default) means transform() emits an indicator
# column only for those features.
mi = MissingIndicator(features='missing-only')
mi.fit(X_train)

In [9]:
# Build the boolean missing-value masks for both splits with the
# indicator fitted on the training data.
X_train_miss, X_test_miss = (
    mi.transform(split) for split in (X_train, X_test)
)

In [10]:
# Attach the missing-value mask to each split as a new 'Age_NA' column.
# NOTE: this modifies X_train / X_test in place; later cells rely on the
# added column being present.
for frame, missing_mask in ((X_train, X_train_miss), (X_test, X_test_miss)):
    frame['Age_NA'] = missing_mask

In [11]:
# Spot-check the training split: Age_NA should be True where Age is NaN.
X_train.sample(n=7)

Unnamed: 0,Age,Fare,Age_NA
530,2.0,26.0,False
729,25.0,7.925,False
57,28.5,7.2292,False
850,4.0,31.275,False
738,,7.8958,True
259,50.0,26.0,False
537,30.0,106.425,False


In [12]:
# Spot-check the test split: Age_NA should be True where Age is NaN.
X_test.sample(n=7)

Unnamed: 0,Age,Fare,Age_NA
631,51.0,7.0542,False
172,1.0,11.1333,False
732,,0.0,True
486,35.0,90.0,False
717,27.0,10.5,False
870,26.0,7.8958,False
520,30.0,93.5,False


In [13]:
# Same mean imputation as the baseline, now applied to the frames that
# also carry the Age_NA flag column.
si = SimpleImputer(strategy='mean')
X_train_trf2 = si.fit_transform(X_train)
X_test_trf2 = si.transform(X_test)

In [14]:
# Method 1 model: logistic regression on the imputed features plus the
# Age_NA flag. LogisticRegression is already in scope, so the repeated
# import has been removed.
clf = LogisticRegression()
clf.fit(X_train_trf2, y_train)
y_pred = clf.predict(X_test_trf2)

In [15]:
# Method 1 accuracy on the held-out split, expressed as a percentage.
# accuracy_score is already in scope, so the repeated import has been
# removed.
accuracy_score(y_test, y_pred)*100

63.128491620111724

### **Using Missing Indicator — Method 2: `SimpleImputer(add_indicator=True)`**

In [16]:
# Re-create the split from the untouched X / y: Method 1 added an Age_NA
# column to X_train / X_test in place, and this method needs the original
# two-column frames. Same test_size and random_state as the first split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)
X_train.sample(n=7)

Unnamed: 0,Age,Fare
662,47.0,25.5875
170,61.0,33.5
298,,30.5
646,19.0,7.8958
678,43.0,46.9
543,32.0,26.0
623,21.0,7.8542


In [17]:
# One-step alternative: add_indicator=True makes the imputer append the
# missing-value indicator column(s) to its imputed output.
# NOTE: X_train / X_test are rebound to the imputer's output here, so
# re-running this cell without re-running the split above feeds it
# already-transformed data.
si = SimpleImputer(strategy='mean', add_indicator=True)
X_train = si.fit_transform(X_train)
X_test = si.transform(X_test)

In [18]:
# Method 2 model: train and evaluate on the arrays produced by
# SimpleImputer(add_indicator=True) in the previous cell (X_train / X_test).
# Using X_train_trf2 / X_test_trf2 here would silently re-score Method 1
# instead of this method.
# LogisticRegression and accuracy_score are already in scope, so the
# repeated imports have been removed.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Method 2 accuracy on the held-out split, expressed as a percentage.
accuracy_score(y_test, y_pred)*100

63.128491620111724