# KNN Imputer

## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Loading Titanic Dataset

In [4]:
# Load only the four columns this notebook works with from the Titanic CSV.
df = pd.read_csv(
    '/content/train.csv',
    usecols=['Age', 'Pclass', 'Fare', 'Survived'],
)

In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


## Finding Missing Values

In [6]:
# Number of missing entries in each column.
df.isna().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Age,177
Fare,0


In [7]:
# Share of missing entries in each column, expressed as a percentage.
df.isna().mean().mul(100)

Unnamed: 0,0
Survived,0.0
Pclass,0.0
Age,19.86532
Fare,0.0


## Separating X and y variables

In [8]:
# Features are every column except the target; the target is 'Survived'.
y = df['Survived']
X = df.drop('Survived', axis=1)

## Splitting into train and test sets

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,Pclass,Age,Fare
331,1,45.5,28.5
733,2,23.0,13.0
382,3,32.0,7.925
704,3,26.0,7.8542
813,3,6.0,31.275


## Implementing KNN Imputer

In [11]:
# Fit the imputer on the training split only, then apply that same fitted
# imputer to both splits so the test rows never influence the fit.
knn = KNNImputer()
knn.fit(X_train)
X_train_trf = knn.transform(X_train)
X_test_trf = knn.transform(X_test)

## Converting the numpy array to the DataFrame

In [12]:
# The imputer returns a plain NumPy array; wrap it back into a DataFrame with
# the original column names. The frame gets a fresh 0..n-1 index rather than
# X_train's original row labels.
pd.DataFrame(data=X_train_trf, columns=X_train.columns)

Unnamed: 0,Pclass,Age,Fare
0,1.0,45.5,28.5000
1,2.0,23.0,13.0000
2,3.0,32.0,7.9250
3,3.0,26.0,7.8542
4,3.0,6.0,31.2750
...,...,...,...
707,3.0,21.0,7.6500
708,1.0,39.0,31.0000
709,3.0,41.0,14.1083
710,1.0,14.0,120.0000


## Implementing the Logistic Regression

In [13]:
# Train on the imputed training features, predict on the imputed test
# features, and report test accuracy as the cell output.
lr = LogisticRegression().fit(X_train_trf, y_train)
y_pred = lr.predict(X_test_trf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7430167597765364

## KNN Imputer transformations using neighbors

In [18]:
# n_neighbors=5 is KNNImputer's default, so this produces the same imputation
# as a bare KNNImputer(); change the value here to see the effect of k.
knn = KNNImputer(n_neighbors=5)
knn.fit(X_train)
X_train_trf = knn.transform(X_train)
X_test_trf = knn.transform(X_test)

In [19]:
# Re-train and score on the features produced by the n_neighbors=5 imputer.
lr = LogisticRegression().fit(X_train_trf, y_train)
y_pred = lr.predict(X_test_trf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7430167597765364

## KNN Imputer transformations using weights and neighbors

`KNNImputer` offers two ways to weight the neighbors when computing the imputed value: `uniform` and `distance`.

- `uniform`: this is the simple calculation which we know (NOTES):

    `[valueOfPoint_1 + valueOfPoint_2 + ...]/N`

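- `distance`: each neighbor is weighted by the inverse of its distance, so closer neighbors have more influence; the weighted sum is divided by the sum of the weights:

  `[(1/nan_distance_1)*valueOfPoint_1 + ... + (1/nan_distance_N)*valueOfPoint_N] / [(1/nan_distance_1) + ... + (1/nan_distance_N)]`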

In [21]:
# weights='distance' makes closer neighbors count more than farther ones
# (inverse-distance weighting) instead of averaging the 5 neighbors equally.
knn = KNNImputer(n_neighbors=5, weights='distance')
knn.fit(X_train)
X_train_trf = knn.transform(X_train)
X_test_trf = knn.transform(X_test)

In [22]:
# Re-train and score on the features produced by the distance-weighted imputer.
lr = LogisticRegression().fit(X_train_trf, y_train)
y_pred = lr.predict(X_test_trf)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7374301675977654

## Comparison with Mean Imputation with SimpleImputer

In [29]:
# Baseline: replace each missing value with its column mean (the default
# SimpleImputer strategy, spelled out here). Fit on the training split only.
si = SimpleImputer(strategy='mean')
si.fit(X_train)
X_train_trf_si = si.transform(X_train)
X_test_trf_si = si.transform(X_test)

In [30]:
# Train and score on the mean-imputed features for comparison with KNN imputation.
lr = LogisticRegression().fit(X_train_trf_si, y_train)
y_pred = lr.predict(X_test_trf_si)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7374301675977654