# Missing value handling

In [55]:
import pandas as pd

# titanic dataset
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [56]:
# Total missing values
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            178
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          692
Embarked         2
dtype: int64

In [57]:
# Percentage of missing
(df.isnull().sum() / len(df)) * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.866071
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.232143
Embarked        0.223214
dtype: float64

In [58]:
df_no_missing_embarked = df.dropna(subset=["Embarked"])

df_no_missing_embarked.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            178
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          692
Embarked         0
dtype: int64

In [59]:
# index of missing values in Age col
age_missing_index = df[df['Age'].isnull()].index.tolist()
df.iloc[age_missing_index]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S



## The imputation strategy.

- If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.

- If “median”, then replace missing values using the median along each column. Can only be used with numeric data.

- If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.

- If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.

In [60]:
# Replace missing values with the mean of the "Age" column
mean_age = df["Age"].mean()
df["Age_imputed_mean"] = df["Age"].fillna(mean_age)

# Replace missing values with the median of the "Age" column
median_age = df["Age"].median()
df["Age_imputed_median"] = df["Age"].fillna(median_age)

# Replace missing values with the mode of the "Age" column
df["Age_imputed_mode"] = df["Age"].fillna(df["Age"].mode().iloc[0])


In [61]:
const = -1
df["Age_imputed_const"] = df["Age"].fillna(const)

In [62]:
df.iloc[age_missing_index]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_imputed_mean,Age_imputed_median,Age_imputed_mode,Age_imputed_const
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,29.710543,28.0,24.0,-1.0
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,29.710543,28.0,24.0,-1.0
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,29.710543,28.0,24.0,-1.0
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,29.710543,28.0,24.0,-1.0
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,29.710543,28.0,24.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,29.710543,28.0,24.0,-1.0
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,29.710543,28.0,24.0,-1.0
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,29.710543,28.0,24.0,-1.0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,29.710543,28.0,24.0,-1.0


# Category wise mean/median

In [63]:
df.groupby('Sex')['Age'].mean()

Sex
female    27.961832
male      30.715285
Name: Age, dtype: float64

In [64]:
df.groupby("Sex")['Age'].median()

Sex
female    27.0
male      29.0
Name: Age, dtype: float64

In [65]:
# groupby category
# mean
df["groupby_mean_Age"] = df.groupby("Sex")['Age'].transform(lambda x: x.fillna(x.mean()))
# median
df["groupby_median_Age"] = df.groupby("Sex")['Age'].transform(lambda x: x.fillna(x.median()))


# check missing values
df[["Sex", "Age", "groupby_mean_Age", "groupby_median_Age"]].iloc[age_missing_index]

Unnamed: 0,Sex,Age,groupby_mean_Age,groupby_median_Age
5,male,,30.715285,29.0
17,male,,30.715285,29.0
19,female,,27.961832,27.0
26,male,,30.715285,29.0
28,female,,27.961832,27.0
...,...,...,...,...
863,female,,27.961832,27.0
868,male,,30.715285,29.0
878,male,,30.715285,29.0
888,female,,27.961832,27.0


# K-Nearest Neighbor Imputation
The KNNImputer class provides imputation for filling in missing values using the k-Nearest Neighbors approach.Each missing feature is imputed using values from n_neighbors nearest neighbors that have a value for the feature. The feature of the neighbors are averaged uniformly or weighted by distance to each neighbor.

In [68]:
from sklearn.impute import KNNImputer
# data copy
train_knn = df.copy()

knn_imputer = KNNImputer(n_neighbors=3, weights="uniform")
train_knn[['Age', 'Pclass', 'Fare', 'Survived']] = knn_imputer.fit_transform(train_knn[['Age', 'Pclass', 'Fare', 'Survived']])

In [69]:
train_knn.iloc[age_missing_index]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_imputed_mean,Age_imputed_median,Age_imputed_mode,Age_imputed_const,groupby_mean_Age,groupby_median_Age
5,6,0.0,3.0,"Moran, Mr. James",male,31.666667,0,0,330877,8.4583,,Q,29.710543,28.0,24.0,-1.0,30.715285,29.0
17,18,1.0,2.0,"Williams, Mr. Charles Eugene",male,32.833333,0,0,244373,13.0000,,S,29.710543,28.0,24.0,-1.0,30.715285,29.0
19,20,1.0,3.0,"Masselmani, Mrs. Fatima",female,16.666667,0,0,2649,7.2250,,C,29.710543,28.0,24.0,-1.0,27.961832,27.0
26,27,0.0,3.0,"Emir, Mr. Farred Chehab",male,38.500000,0,0,2631,7.2250,,C,29.710543,28.0,24.0,-1.0,30.715285,29.0
28,29,1.0,3.0,"O'Dwyer, Miss. Ellen ""Nellie""",female,22.333333,0,0,330959,7.8792,,Q,29.710543,28.0,24.0,-1.0,27.961832,27.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
863,864,0.0,3.0,"Sage, Miss. Dorothy Edith ""Dolly""",female,39.333333,8,2,CA. 2343,69.5500,,S,29.710543,28.0,24.0,-1.0,27.961832,27.0
868,869,0.0,3.0,"van Melkebeke, Mr. Philemon",male,24.000000,0,0,345777,9.5000,,S,29.710543,28.0,24.0,-1.0,30.715285,29.0
878,879,0.0,3.0,"Laleff, Mr. Kristo",male,31.333333,0,0,349217,7.8958,,S,29.710543,28.0,24.0,-1.0,30.715285,29.0
888,889,0.0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,30.000000,1,2,W./C. 6607,23.4500,,S,29.710543,28.0,24.0,-1.0,27.961832,27.0


In [70]:
# !pip install scikit-learn

### For self study: https://scikit-learn.org/stable/modules/impute.html