# Importing libraries

In [52]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Loading Dataset

In [53]:
# Load only the target column and the three numeric features used below.
df = pd.read_csv(
    "train.csv",
    usecols=['Survived', 'Pclass', 'Age', 'Fare'],
)

# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


# Handling Missing Values

In [54]:
# Share of missing entries per column, expressed as a percentage
# (mean of the boolean NaN mask, scaled by 100).
100 * df.isna().mean()

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [55]:
# Separate the target ('Survived') from the input features:
# 'y' holds the labels, 'X' holds every remaining column.

y = df['Survived']
X = df.drop('Survived', axis=1)

In [56]:
# Split the data into a training subset (X_train, y_train) used to fit the model and a
# held-out test subset (X_test, y_test) used to estimate how well it generalizes to unseen data.
# 20% of the rows go to the test subset; random_state fixes the shuffle so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

In [57]:
# KNN imputer: each missing value is filled from the 3 nearest rows,
# with closer neighbours contributing more ('distance' weighting).
knn = KNNImputer(
    n_neighbors=3,
    weights='distance',
)

# Learn the neighbours from the training split only and impute it ...
Xtrain_trf1 = knn.fit_transform(X_train)

# ... then impute the test split with that already-fitted imputer (no refit on test data).
Xtest_trf1 = knn.transform(X_test)

In [58]:
# Logistic regression classifier trained on the KNN-imputed features.
trf1 = LogisticRegression()

# Fit on the imputed training features (Xtrain_trf1) and their labels (y_train).
trf1.fit(Xtrain_trf1, y_train)

# Predict labels for the imputed test features (Xtest_trf1).
y_pred = trf1.predict(Xtest_trf1)

# Report test accuracy as a percentage.
# accuracy_score expects (y_true, y_pred) in that order; accuracy itself is symmetric,
# but keeping the documented order avoids silent errors if the metric is ever swapped
# for an asymmetric one (precision, recall, ...).
print('Accuracy achieved with KNN imputer :', accuracy_score(y_test, y_pred)*100)

Accuracy achieved with KNN imputer : 71.50837988826815


In [59]:
# Mean imputer: every missing value is replaced by its column mean.
# 'mean' is SimpleImputer's default strategy; it is spelled out here for clarity.
SimpImputer = SimpleImputer(strategy='mean')

# Compute the column means on the training split only and impute it ...
Xtrain_trf2 = SimpImputer.fit_transform(X_train)

# ... then fill the test split with those same training-set means.
Xtest_trf2 = SimpImputer.transform(X_test)

In [60]:
# Logistic regression classifier trained on the mean-imputed features.
trf2 = LogisticRegression()

# Fit on the imputed training features (Xtrain_trf2) and their labels (y_train).
trf2.fit(Xtrain_trf2, y_train)

# Predict labels for the imputed test features (Xtest_trf2).
y_pred = trf2.predict(Xtest_trf2)

# Report test accuracy as a percentage.
# accuracy_score expects (y_true, y_pred) in that order; accuracy itself is symmetric,
# but keeping the documented order avoids silent errors if the metric is ever swapped
# for an asymmetric one (precision, recall, ...).
print('Accuracy achieved with mean imputer :', accuracy_score(y_test, y_pred)*100)

Accuracy achieved with mean imputer : 69.27374301675978


### Conclusion

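Imputing missing values with the mean gives an accuracy of about 69.3%, whereas the KNN imputer gives an accuracy of approximately 71.5%. Therefore, on this train/test split, the KNN imputer delivers the better result. Because the comparison rests on a single split (`random_state=2`), the gap should be confirmed with cross-validation before treating it as a general conclusion.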