# Univariate Imputation

In [31]:
import pandas as pd 
import numpy as np

## Numerical

#### Mean/Median Imputation
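1. Skewed distribution: impute with the median.
2. Normal (roughly symmetric) distribution: impute with the mean.
3. Very easy to implement.
4. Easy to reproduce on a server (in production): only the stored mean/median is needed.
5. Works best when less than 5% of the values are missing.
6. Disadvantages: outliers can appear, the shape of the distribution changes, and correlations with other variables change.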

In [32]:
# Load the toy Titanic data and show the percentage of missing values per column.
df = pd.read_csv('titanic_toy.csv')
df.isna().mean().mul(100)

Age         19.865320
Fare         5.050505
Family       0.000000
Survived     0.000000
dtype: float64

In [33]:
# Preview the first rows of the loaded frame to check its columns and values.
df.head()

Unnamed: 0,Age,Fare,Family,Survived
0,22.0,7.25,1,0
1,38.0,71.2833,1,1
2,26.0,7.925,0,1
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [34]:
from sklearn.model_selection import train_test_split

# Features are the first three columns of df; the target is its last column.
# random_state is fixed so that the split -- and therefore every statistic
# computed from xtrain in the following cells -- is identical on every
# Restart & Run All instead of changing from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    df.iloc[:, :3],
    df.iloc[:, -1],
    test_size=0.3,
    random_state=42,
)

In [35]:
# Imputation statistics are computed from the training split only.
age_train = xtrain['Age']
fare_train = xtrain['Fare']

mean_age, median_age = age_train.mean(), age_train.median()
mean_fare, median_fare = fare_train.mean(), fare_train.median()

In [36]:
# Add one imputed copy of each column per strategy, keeping the raw 'Age' and
# 'Fare' columns so the original and imputed distributions can be compared.
# `assign` returns a new frame rather than writing into the object handed back
# by train_test_split; in-place column assignment on that object may trigger
# SettingWithCopyWarning on some pandas versions.
xtrain = xtrain.assign(
    Age_median=xtrain['Age'].fillna(median_age),
    Age_mean=xtrain['Age'].fillna(mean_age),
    Fare_median=xtrain['Fare'].fillna(median_fare),
    Fare_mean=xtrain['Fare'].fillna(mean_fare),
)

In [37]:
# Inspect the first rows of the training features, including the imputed columns.
xtrain.head()

Unnamed: 0,Age,Fare,Family,Age_median,Age_mean,Fare_median,Fare_mean
165,9.0,20.525,2,9.0,9.0,20.525,20.525
303,,12.35,0,28.0,29.651405,12.35,12.35
537,30.0,106.425,0,30.0,30.0,106.425,106.425
463,48.0,13.0,0,48.0,48.0,13.0,13.0
657,32.0,15.5,2,32.0,32.0,15.5,15.5


In [38]:
# Compare the variance of each column before and after imputation.
# A (None, None) entry stands for the blank separator line between features.
variance_report = (
    ('Original Age variable variance: ', 'Age'),
    ('Age Variance after median imputation: ', 'Age_median'),
    ('Age Variance after mean imputation: ', 'Age_mean'),
    (None, None),
    ('Original Fare variable variance: ', 'Fare'),
    ('Fare Variance after median imputation: ', 'Fare_median'),
    ('Fare Variance after mean imputation: ', 'Fare_mean'),
)

for label, column in variance_report:
    if label is None:
        print()
    else:
        print(label, xtrain[column].var())


Original Age variable variance:  220.79608638846167
Age Variance after median imputation:  174.39516460080617
Age Variance after mean imputation:  173.93903911631224

Original Fare variable variance:  1350.8952983660654
Fare Variance after median imputation:  1295.0188308372904
Fare Variance after mean imputation:  1283.5677191870493


In [39]:
# Outliers occur: draw the raw and imputed Age columns side by side to compare them.
xtrain.boxplot(column=['Age', 'Age_median', 'Age_mean'])

<Axes: >

### Sklearn

In [49]:
# Fresh split from df so xtrain again holds only the first three (raw) columns
# before the scikit-learn imputers are applied. random_state is fixed so the
# split is reproducible across kernel restarts.
xtrain, xtest, ytrain, ytest = train_test_split(
    df.iloc[:, :3],
    df.iloc[:, -1],
    test_size=0.3,
    random_state=42,
)

Unnamed: 0,Age,Fare,Family
626,57.0,12.3500,0
647,56.0,35.5000,0
76,,7.8958,0
312,26.0,26.0000,2
711,,26.5500,0
...,...,...,...
722,34.0,13.0000,0
757,18.0,11.5000,0
84,17.0,10.5000,0
35,42.0,52.0000,1


In [41]:
from sklearn.impute import SimpleImputer

# One imputer per strategy: imp1 fills with the median, imp2 with the mean.
imp1, imp2 = SimpleImputer(strategy='median'), SimpleImputer(strategy='mean')

In [42]:
from sklearn.compose import ColumnTransformer

# Route 'Age' through imp1 and 'Fare' through imp2; all remaining columns are
# passed through untouched.
trf = ColumnTransformer(
    transformers=[
        ('imp1', imp1, ['Age']),
        ('imp2', imp2, ['Fare']),
    ],
    remainder='passthrough',
)

In [43]:
# Fit the column transformer (and the imputers inside it) on the training features only.
trf.fit(xtrain)

In [50]:
# Apply the fitted transformer to the training features.
# NOTE(review): the result is presumably a NumPy array without column labels -- confirm before indexing by name.
xtrain_trans = trf.transform(xtrain)