In [118]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [119]:
df = pd.read_csv('train.csv', usecols=['Fare', 'Pclass', 'Age', 'Survived'])

In [120]:
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare
0,0,3,22.0,7.25
1,1,1,38.0,71.2833
2,1,3,26.0,7.925
3,1,1,35.0,53.1
4,0,3,35.0,8.05


In [121]:
df.isnull().mean()*100

Survived     0.00000
Pclass       0.00000
Age         19.86532
Fare         0.00000
dtype: float64

In [122]:
from sklearn.model_selection import train_test_split

x = df.iloc[:, 1:]
y = df.iloc[:, 0]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# using multivar imputation

In [137]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

tnf1 = ColumnTransformer([
    ('age_transformed', KNNImputer(n_neighbors=4, weights='distance', add_indicator=True), ['Age'])
], remainder='passthrough')

In [138]:
tnf2 = ColumnTransformer([
    ('trf', StandardScaler(), ['Pclass', 'Age', 'Fare'])
])

In [139]:
new_x_train = tnf1.fit_transform(x_train)
new_x_test = tnf1.transform(x_test)

In [148]:
new_x_train = pd.DataFrame(data=new_x_train, columns=['Age', 'Pclass', 'Fare', 'Age_NA'])
new_x_test = pd.DataFrame(data=new_x_test, columns=['Age', 'Age_NA', 'Pclass', 'Fare'])

In [153]:
new_x_train.columns

Index(['Age', 'Pclass', 'Fare', 'Age_NA'], dtype='object')

In [None]:
transformed_xtrain = tnf2.fit_transform(new_x_train)
transformed_xtest = tnf2.transform(new_x_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()
clf2 = DecisionTreeClassifier()

clf.fit(transformed_xtrain, y_train)
clf2.fit(transformed_xtrain, y_train)

pred = clf.predict(transformed_xtest)
pred2 = clf2.predict(transformed_xtest)

In [128]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

print(accuracy_score(y_test, pred))
print(accuracy_score(y_test, pred2))

0.7374301675977654
0.6927374301675978


In [129]:
print(np.mean(cross_val_score(estimator=clf, X=transformed_xtrain, y=y_train, cv=10)))
print(np.mean(cross_val_score(estimator=clf2, X=transformed_xtrain, y=y_train, cv=10)))

0.693838028169014
0.6446596244131454


# using univariate imputation

In [41]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

def imputer(data=df, col_imputed=str, col_original=str):
    mask = df[col_imputed].isnull()
    vals_required = df[col_original].isnull().sum()
    df[col_imputed][mask] = df[col_original].sample(vals_required).values

tnf2 = ColumnTransformer([
    ('trf', StandardScaler(), ['Pclass', 'Age', 'Fare'])
])