# KNN Imputer
1. **In KNN imputer we can fill the missing values using `neighbour`**

# Load packages

In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [3]:
df=pd.read_csv("train.csv",usecols=['Survived','Age','Fare','Pclass'])
df.head(2)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833


# Null Value

In [4]:
df.isnull().sum()

Survived      0
Pclass        0
Age         177
Fare          0
dtype: int64

# Now fill values using KNN imputer

# Train test split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x_train,x_test,y_train,y_test=train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [7]:
from sklearn.impute import KNNImputer

In [18]:
impute=KNNImputer(n_neighbors=3,weights='distance')
x_train_trf=impute.fit_transform(x_train)
x_test_trf=impute.fit_transform(x_test)

In [19]:
pd.DataFrame(x_train_trf)

Unnamed: 0,0,1,2
0,1.0,45.5,28.5000
1,2.0,23.0,13.0000
2,3.0,32.0,7.9250
3,3.0,26.0,7.8542
4,3.0,6.0,31.2750
...,...,...,...
707,3.0,21.0,7.6500
708,1.0,37.0,31.0000
709,3.0,41.0,14.1083
710,1.0,14.0,120.0000


# Conclussion
1. See that all the missing value should be impute.

# Train the model

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
model=LogisticRegression()
model.fit(x_train_trf,y_train)

In [22]:
pre=model.predict(x_test_trf)

# Accuracy

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
accuracy_score(y_test,pre)

0.7597765363128491

# now select the best `neighbour`

In [16]:
for i in range(1,10):
    impute=KNNImputer(n_neighbors=i,weights='uniform')
    x_train_trf=impute.fit_transform(x_train)
    x_test_trf=impute.fit_transform(x_test)
    
    model=LogisticRegression()
    model.fit(x_train_trf,y_train)
    
    pre=model.predict(x_test_trf)
    
    score=accuracy_score(y_test,pre)
    print(score)

0.7430167597765364
0.7541899441340782
0.7430167597765364
0.7486033519553073
0.7486033519553073
0.7486033519553073
0.7486033519553073
0.7486033519553073
0.7486033519553073


# Now wee see that we can get 75% accuracy at selecting 2 neighbour

# Now try `distance` parameter

In [17]:
for i in range(1,10):
    impute=KNNImputer(n_neighbors=i,weights='distance')
    x_train_trf=impute.fit_transform(x_train)
    x_test_trf=impute.fit_transform(x_test)
    
    model=LogisticRegression()
    model.fit(x_train_trf,y_train)
    
    pre=model.predict(x_test_trf)
    
    score=accuracy_score(y_test,pre)
    print(score)

0.7430167597765364
0.7541899441340782
0.7597765363128491
0.7541899441340782
0.7541899441340782
0.7541899441340782
0.7541899441340782
0.7541899441340782
0.7541899441340782


**we can get 75.9 at 3 neigbbour with `distance` parameter