In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

In [2]:
from sklearn.impute import KNNImputer, SimpleImputer

In [4]:
# Only these four columns are used in this notebook, so let the CSV parser
# skip every other column instead of loading and then discarding them.
selected_columns = ['Age', 'Pclass', 'Fare', 'Survived']
# `usecols` returns columns in file order; re-select to pin the order listed above.
df = pd.read_csv('train.csv', usecols=selected_columns)[selected_columns]
df.head()

Unnamed: 0,Age,Pclass,Fare,Survived
0,22.0,3,7.25,0
1,38.0,1,71.2833,1
2,26.0,3,7.925,1
3,35.0,1,53.1,1
4,35.0,3,8.05,0


## Check the percentage of missing values in each column

In [6]:
# Share of missing entries per column, expressed as a percentage.
df.isna().mean().mul(100)

Age         19.86532
Pclass       0.00000
Fare         0.00000
Survived     0.00000
dtype: float64

In [7]:
# Separate the target column from the feature matrix.
y = df['Survived']
x = df.drop(columns=['Survived'])

In [8]:
# Preview the first five rows of the feature matrix.
x.head(n=5)

Unnamed: 0,Age,Pclass,Fare
0,22.0,3,7.25
1,38.0,1,71.2833
2,26.0,3,7.925
3,35.0,1,53.1
4,35.0,3,8.05


### Now split the data into train and test sets

In [15]:
# Use 70% of the rows for training; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=2,
)

### Let's use the KNN imputer to fill the missing values

##### weights : {‘uniform’, ‘distance’}
Weight function used in prediction. Possible values:

- ‘uniform’ : uniform weights. All points in each neighborhood are weighted equally.

- ‘distance’ : weight points by the inverse of their distance. In this case, closer neighbors of a query point will have a greater influence than neighbors which are further away.

In [16]:
# Impute each missing value from its 5 nearest rows, weighting neighbours
# by the inverse of their distance.
knn = KNNImputer(weights='distance', n_neighbors=5)

# Fit on the training split only, then apply the same fitted imputer to
# both splits so nothing is learned from the test rows.
knn.fit(x_train)
X_train_trf = knn.transform(x_train)
X_test_trf = knn.transform(x_test)

### Let's apply Logistic Regression 

In [17]:
# Train a logistic regression on the KNN-imputed training features
# (`fit` returns the estimator itself, so `lr` is the fitted model).
lr = LogisticRegression().fit(X_train_trf, y_train)

# Score the model on the KNN-imputed test features.
y_pred = lr.predict(X_test_trf)
accuracy_score(y_test, y_pred)

0.7388059701492538

### Let's compare the working of the KNN imputer with the simple imputer, which uses the mean to impute missing values

In [18]:
# Baseline imputer: replace each missing value with the column mean
# ('mean' is SimpleImputer's default strategy, spelled out here for clarity).
si = SimpleImputer(strategy='mean')

# Learn the column means from the training split and reuse them on the test split.
X_train_trf1 = si.fit_transform(x_train)
X_test_trf1 = si.transform(x_test)

In [19]:
# Same classifier as before, this time trained on the mean-imputed features.
# NOTE: this rebinds `lr` and `y_pred`, replacing the KNN-imputed results.
lr = LogisticRegression()
lr.fit(X=X_train_trf1, y=y_train)

# Accuracy on the mean-imputed test features.
y_pred = lr.predict(X=X_test_trf1)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7238805970149254

## Here we can see that there is a slight difference between the KNN imputer and the simple imputer, but we mostly prefer KNN because it prevents the model from getting biased and also preserves the distribution