# Imputation of missing values - By supervised learning imputation

# a. Simple imputation 

i. Simple imputation through mean and median : for numerical features

In [55]:
import seaborn as sb 
df_mean_impute = sb.load_dataset('titanic') 
# Printing the first five rows of the dataset
df_mean_impute.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [56]:
# check for null values 
df_mean_impute.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [57]:
# handling null values in age column through simple mean imputation 
df_mean_impute['age'] = df_mean_impute['age'].fillna(df_mean_impute['age'].mean())
# checking for null values in age column 
df_mean_impute['age'].isna().sum()

np.int64(0)

In [58]:
# handling null values in age column through simple median imputation 
df_median_impute = sb.load_dataset('titanic')
df_median_impute['age'] = df_median_impute['age'].fillna(df_median_impute['age'].median())
# checking for null values in age column 
df_median_impute['age'].isna().sum()

np.int64(0)

ii. Simple imputation through mode : for categorical values 

In [59]:
# handling null values in deck column through simple mode imputation 
df_mode_impute = sb.load_dataset('titanic')
df_mode_impute['deck'] = df_mode_impute['deck'].fillna(df_mode_impute['deck'].mode()) 
# checking for null values in deck column 
df_mode_impute['deck'].isna().sum()

np.int64(687)

We were unable to handle the null values in deck column through mode imputation so we gonna drop the entire deck column

In [60]:
# dropping deck column from the dataframe 
df_mode_impute.drop('deck',axis=1,inplace=True)
df_mode_impute.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


## b. Decision Tree imputation 

i. Imputation through decision tree classifier : for categorical variable

In [61]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
df_decision_impute = sb.load_dataset('titanic')
df_decision_impute['sex_encoded'] = LabelEncoder().fit_transform(df_decision_impute['sex'])
feature_cols = ['fare', 'age', 'pclass', 'sex_encoded']
known_mask = df_decision_impute['embark_town'].notna()
unknown_mask = df_decision_impute['embark_town'].isna()
df_known = df_decision_impute.loc[known_mask, feature_cols + ['embark_town']].dropna()
X_train = df_known[feature_cols]
y_train_raw = df_known['embark_town']
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)
dtc_imputer = DecisionTreeClassifier()
dtc_imputer.fit(X_train, y_train)
df_missing = df_decision_impute.loc[unknown_mask, feature_cols].dropna()
missing_indices = df_missing.index
y_pred_encoded = dtc_imputer.predict(df_missing)
y_pred = le.inverse_transform(y_pred_encoded)
df_decision_impute.loc[missing_indices, 'embark_town'] = y_pred

In [62]:
# checking for null values in embark_town column
df_decision_impute['embark_town'].isna().sum()

np.int64(0)

ii. Imputation through decision tree regressor : for numerical variable

In [63]:
from sklearn.tree import DecisionTreeRegressor

df_age_impute = sb.load_dataset('titanic')

feature_cols = ['fare', 'pclass', 'sex', 'embark_town']

df_age_impute['sex_encoded'] = LabelEncoder().fit_transform(df_age_impute['sex'])
df_age_impute['embark_town_encoded'] = LabelEncoder().fit_transform(df_age_impute['embark_town'])


feature_cols = ['fare', 'pclass', 'sex_encoded', 'embark_town_encoded']

known_mask = df_age_impute['age'].notna()
unknown_mask = df_age_impute['age'].isna()

df_known = df_age_impute.loc[known_mask, feature_cols + ['age']].dropna()

X_train = df_known[feature_cols]
y_train = df_known['age']

dtr_imputer = DecisionTreeRegressor()
dtr_imputer.fit(X_train, y_train)

df_missing = df_age_impute.loc[unknown_mask, feature_cols].dropna()
missing_indices = df_missing.index

y_pred = dtr_imputer.predict(df_missing)

df_age_impute.loc[missing_indices, 'age'] = y_pred

In [64]:
# checking for null values in age column 
df_age_impute['age'].isna().sum()

np.int64(0)

# c. Random forest imputation

i. Imputation through random forest classifier : for categorical variable

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
df_rfc_impute = sb.load_dataset('titanic')
df_rfc_impute['sex_encoded'] = LabelEncoder().fit_transform(df_rfc_impute['sex'])
feature_cols = ['fare', 'age', 'pclass', 'sex_encoded']
known_mask = df_rfc_impute['embark_town'].notna()
unknown_mask = df_rfc_impute['embark_town'].isna()
df_known = df_rfc_impute.loc[known_mask, feature_cols + ['embark_town']].dropna()
X_train = df_known[feature_cols]
y_train_raw = df_known['embark_town']
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)
rfc_imputer = RandomForestClassifier()
rfc_imputer.fit(X_train, y_train)
df_missing = df_rfc_impute.loc[unknown_mask, feature_cols].dropna()
missing_indices = df_missing.index
y_pred_encoded = rfc_imputer.predict(df_missing)
y_pred = le.inverse_transform(y_pred_encoded)
df_rfc_impute.loc[missing_indices, 'embark_town'] = y_pred

In [66]:
# checking for null values in embark_town column 
df_rfc_impute['embark_town'].isna().sum()

np.int64(0)

ii. imputation through random forest regressor: for numerical variable

In [67]:
from sklearn.ensemble import RandomForestRegressor

df_rfr_impute = sb.load_dataset('titanic')

feature_cols = ['fare', 'pclass', 'sex', 'embark_town']

df_rfr_impute['sex_encoded'] = LabelEncoder().fit_transform(df_rfr_impute['sex'])
df_rfr_impute['embark_town_encoded'] = LabelEncoder().fit_transform(df_rfr_impute['embark_town'])


feature_cols = ['fare', 'pclass', 'sex_encoded', 'embark_town_encoded']

known_mask = df_rfr_impute['age'].notna()
unknown_mask = df_rfr_impute['age'].isna()

df_known = df_rfr_impute.loc[known_mask, feature_cols + ['age']].dropna()

X_train = df_known[feature_cols]
y_train = df_known['age']

rfr_imputer = RandomForestRegressor()
rfr_imputer.fit(X_train, y_train)

df_missing = df_age_impute.loc[unknown_mask, feature_cols].dropna()
missing_indices = df_missing.index

y_pred = dtr_imputer.predict(df_missing)

df_rfr_impute.loc[missing_indices, 'age'] = y_pred

In [68]:
# checking for null values in age column 
df_rfr_impute['age'].isna().sum()

np.int64(0)

# d. KNN(K-Nearest Neighbors) Imputation

i. imputation through KNN : for categorical variable.

In [69]:
from sklearn.impute import KNNImputer 
df_knn_impute = sb.load_dataset('titanic') 
# Creating a KNNImputer instance with 5 neighbors 
knn_imputer = KNNImputer(n_neighbors=5) 
# filling missing values in the age column through simple mean imputation
df_knn_impute['age'] = df_knn_impute['age'].fillna(int(df_knn_impute['age'].mean()))
# encoding categorical variable - embark_town for inverse transformation 
df_knn_impute['embark_town_encoded'] = le.fit_transform(df_knn_impute['embark_town'].astype(str))

# Apply KNN Imputer
knn = KNNImputer(n_neighbors=5)
df_knn_impute[['age','fare','pclass','embark_town_encoded']] = knn.fit_transform(df_knn_impute[['age','fare','pclass','embark_town_encoded']])

# Rounding and decoding back
df_knn_impute['embark_town'] = le.inverse_transform(df_knn_impute['embark_town_encoded'].round().astype(int))
# checking for null values in embark_town column
df_knn_impute['embark_town'].isna().sum()


np.int64(0)

ii. imputation through KNN : for numerical variable

In [None]:
df_knn_impute_age = sb.load_dataset('titanic') 
df_knn_impute_age['sex_encoded'] = le.fit_transform(df_knn_impute_age['sex']) 
# imputing missing values in embark_town column with mode 
df_knn_impute_age['embark_town'] = df_knn_impute_age['embark_town'].fillna(df_knn_impute_age['embark_town'].mode()) 
df_knn_impute_age['embark_town_encoded'] = le.fit_transform(df_knn_impute_age['embark_town'].astype(str))  
# features for KNN imputation
features = ['age','fare','pclass','sex_encoded','embark_town_encoded']
imputed = knn.fit_transform(df_knn_impute_age[features]) 
df_knn_impute_age['age'] = imputed[:,0] 
df_knn_impute_age['age'].apply(lambda x: int(x))
# checking for null values in age column 
df_knn_impute_age['age'].isna().sum()

np.int64(0)