# Fill Missing values either using SimpleImputer OR KNNImputer

In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer

In [2]:
df = pd.read_csv('train.csv')[['Age','Pclass','Fare','Survived']]

In [3]:
df.head()

Unnamed: 0,Age,Pclass,Fare,Survived
0,22.0,3,7.25,0
1,38.0,1,71.2833,1
2,26.0,3,7.925,1
3,35.0,1,53.1,1
4,35.0,3,8.05,0


In [4]:
X = df.drop(columns="Survived")

In [5]:
y = df["Survived"]

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.2)

In [7]:
X_train

Unnamed: 0,Age,Pclass,Fare
225,22.0,3,9.3500
128,,3,22.3583
845,42.0,3,7.5500
371,18.0,3,6.4958
387,36.0,2,13.0000
...,...,...,...
36,,3,7.2292
689,15.0,1,211.3375
118,24.0,1,247.5208
839,,1,29.7000


In [8]:
X_train.isnull().mean()

Age       0.206461
Pclass    0.000000
Fare      0.000000
dtype: float64

# Fill missing values using KNN imputer

In [9]:
knn = KNNImputer(n_neighbors=3,weights='distance')

In [10]:
X_train_tnf = knn.fit_transform(X_train)
X_test_tnf = knn.transform(X_test)

In [11]:
type(X_train_tnf)

numpy.ndarray

In [12]:
pd.DataFrame(X_train_tnf).isnull().sum()

0    0
1    0
2    0
dtype: int64

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
m1 = LogisticRegression()

In [15]:
m1.fit(X_train_tnf,y_train)

LogisticRegression()

In [16]:
y_pred = m1.predict(X_test_tnf)

In [17]:
from sklearn.metrics import accuracy_score

In [18]:
print("USING KNN ",accuracy_score(y_test,y_pred))

USING KNN  0.6759776536312849


# Fill Missing Values using Simpleimputer

In [19]:
si = SimpleImputer()

X_train_tnf1 = si.fit_transform(X_train)
X_test_tnf1 = si.transform(X_test)

In [20]:
m2 = LogisticRegression()

m2.fit(X_train_tnf1,y_train)

LogisticRegression()

In [21]:
y_pred1 = m1.predict(X_test_tnf1)

In [22]:
print("USING SIMPLEIMPUTER :",accuracy_score(y_test,y_pred1))

USING SIMPLEIMPUTER : 0.6759776536312849
