# Importing libraries

In [82]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Importing dataset

In [83]:
# Load only the columns this walkthrough uses: two predictors (Age, Fare) and the target (Survived)
columns_needed = ['Age', 'Fare', 'Survived']
df = pd.read_csv("train.csv", usecols=columns_needed)

# Preview the first rows as a sanity check on the load
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


# Handling Missing Values

In [84]:
# Pull out the target column first, then keep every remaining column as a predictor
y = df['Survived']
X = df.drop('Survived', axis=1)

In [85]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [86]:
# Imputer with scikit-learn's default settings (fills NaNs with the column mean)
si = SimpleImputer()

# Learn the fill values from the training split only, so nothing from the test split leaks in
si.fit(X_train)

# Apply the same learned fill values to both splits
X_train_trf = si.transform(X_train)
X_test_trf = si.transform(X_test)

In [87]:
# Inspect the imputed training features (columns: Age, Fare); rows that had no Age now share one fill value
X_train_trf

array([[ 40.        ,  27.7208    ],
       [  4.        ,  16.7       ],
       [ 47.        ,   9.        ],
       ...,
       [ 71.        ,  49.5042    ],
       [ 29.78590426, 221.7792    ],
       [ 29.78590426,  25.925     ]])

In [88]:
# Inspect the imputed test features; the fill value comes from the imputer fitted on the training split
X_test_trf

array([[ 42.        ,  26.2875    ],
       [ 21.        ,   8.05      ],
       [ 24.        ,  65.        ],
       [ 28.        ,  56.4958    ],
       [ 17.        ,   7.925     ],
       [ 30.        ,   7.8958    ],
       [ 80.        ,  30.        ],
       [ 25.        ,   7.25      ],
       [ 50.        , 133.65      ],
       [ 25.        ,  26.        ],
       [ 35.        ,  26.        ],
       [ 35.        ,  90.        ],
       [ 55.        ,  16.        ],
       [ 29.78590426,  56.4958    ],
       [ 29.78590426,  56.4958    ],
       [ 19.        ,   7.8542    ],
       [ 29.78590426,  15.2458    ],
       [ 49.        ,   0.        ],
       [ 18.        ,  14.4542    ],
       [ 65.        ,   7.75      ],
       [ 18.        , 108.9       ],
       [ 29.78590426,  22.3583    ],
       [ 16.        ,  18.        ],
       [ 21.        ,   9.825     ],
       [ 19.        ,   6.75      ],
       [  1.        ,  11.1333    ],
       [ 50.        ,  10.5       ],
 

## Accuracy without Missing Indicator technique

In [89]:
# Baseline: logistic regression on the mean-imputed Age and Fare columns only,
# i.e. without any feature telling the model which ages were originally missing.
# LogisticRegression and accuracy_score come from the imports cell at the top of the notebook.
clf = LogisticRegression()
clf.fit(X_train_trf, y_train)

# Score on the held-out split; accuracy_score returns a fraction, so scale it to a percentage
y_pred = clf.predict(X_test_trf)
accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy without using missing indicator technique :',accuracy)

Accuracy without using missing indicator technique : 61.452513966480446


## Initializing Missing Indicator technique

In [90]:
# Fit a MissingIndicator on the training split so it learns which columns contain missing values
mi = MissingIndicator().fit(X_train)

# Echo the fitted estimator as the cell output
mi

MissingIndicator()

In [91]:
# Build the boolean missing-value masks for both splits with the indicator fitted on the training data
X_train_missing, X_test_missing = (
    mi.transform(split) for split in (X_train, X_test)
)

In [92]:
# Attach the missing-value mask to each split as an extra 'Age_NA' feature.
# Gotcha: this mutates X_train / X_test in place, so any earlier cell re-run after
# this one will see three columns instead of two.
X_train['Age_NA'], X_test['Age_NA'] = X_train_missing, X_test_missing

In [93]:
# Start from a fresh imputer because the feature matrix now has a third column (Age_NA)
si = SimpleImputer()

# Fit on the training split, then impute both splits; the left-hand element is evaluated first,
# so the imputer is already fitted when the test split is transformed
X_train_trf2, X_test_trf2 = si.fit_transform(X_train), si.transform(X_test)

In [94]:
# Inspect the training matrix again; the third column is the Age_NA flag (1.0 on rows whose Age was filled in)
X_train_trf2

array([[ 40.        ,  27.7208    ,   0.        ],
       [  4.        ,  16.7       ,   0.        ],
       [ 47.        ,   9.        ,   0.        ],
       ...,
       [ 71.        ,  49.5042    ,   0.        ],
       [ 29.78590426, 221.7792    ,   1.        ],
       [ 29.78590426,  25.925     ,   1.        ]])

In [95]:
# Inspect the test matrix with the extra Age_NA flag column appended after Age and Fare
X_test_trf2

array([[ 42.        ,  26.2875    ,   0.        ],
       [ 21.        ,   8.05      ,   0.        ],
       [ 24.        ,  65.        ,   0.        ],
       [ 28.        ,  56.4958    ,   0.        ],
       [ 17.        ,   7.925     ,   0.        ],
       [ 30.        ,   7.8958    ,   0.        ],
       [ 80.        ,  30.        ,   0.        ],
       [ 25.        ,   7.25      ,   0.        ],
       [ 50.        , 133.65      ,   0.        ],
       [ 25.        ,  26.        ,   0.        ],
       [ 35.        ,  26.        ,   0.        ],
       [ 35.        ,  90.        ,   0.        ],
       [ 55.        ,  16.        ,   0.        ],
       [ 29.78590426,  56.4958    ,   1.        ],
       [ 29.78590426,  56.4958    ,   1.        ],
       [ 19.        ,   7.8542    ,   0.        ],
       [ 29.78590426,  15.2458    ,   1.        ],
       [ 49.        ,   0.        ,   0.        ],
       [ 18.        ,  14.4542    ,   0.        ],
       [ 65.        ,   7.75   

## Accuracy after incorporating the Missing Indicator technique

In [96]:
# Retrain the same kind of model, this time on the matrix that includes the Age_NA flag
clf = LogisticRegression().fit(X_train_trf2, y_train)

# Evaluate on the held-out split and report accuracy as a percentage
y_pred = clf.predict(X_test_trf2)
accuracy2 = 100 * accuracy_score(y_test, y_pred)
print('Accuracy after using missing indicator technique :',accuracy2)

Accuracy after using missing indicator technique : 63.128491620111724


Notice that the accuracy we obtained from logistic regression without the missing indicator technique was about 61.45%. After incorporating the missing indicator technique, the accuracy increased to about 63.13%. This demonstrates that using the missing indicator technique can sometimes yield better results.