# Imputation of missing values

# a. Simple imputation 

i. Simple imputation through mean and median : for numerical features

In [120]:
import seaborn as sb 
df_mean_impute = sb.load_dataset('titanic') 
# Printing the first five rows of the dataset
df_mean_impute.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [121]:
# check for null values 
df_mean_impute.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [122]:
# handling null values in age column through simple mean imputation 
df_mean_impute['age'] = df_mean_impute['age'].fillna(df_mean_impute['age'].mean())
# checking for null values in age column 
df_mean_impute['age'].isna().sum()

np.int64(0)

In [123]:
# handling null values in age column through simple median imputation 
df_median_impute = sb.load_dataset('titanic')
df_median_impute['age'] = df_median_impute['age'].fillna(df_median_impute['age'].median())
# checking for null values in age column 
df_median_impute['age'].isna().sum()

np.int64(0)

ii. Simple imputation through mode : for categorical values 

In [124]:
# handling null values in deck column through simple mode imputation 
df_mode_impute = sb.load_dataset('titanic')
df_mode_impute['deck'] = df_mode_impute['deck'].fillna(df_mode_impute['deck'].mode()) 
# checking for null values in deck column 
df_mode_impute['deck'].isna().sum()

np.int64(687)

We were unable to handle the null values in deck column through mode imputation so we gonna drop the entire deck column

In [125]:
# dropping deck column from the dataframe 
df_mode_impute.drop('deck',axis=1,inplace=True)
df_mode_impute.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True


## b. Decision Tree imputation 

i. Imputation through decision tree classifier : for categorical variable

In [126]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
df_decision_impute = sb.load_dataset('titanic')
df_decision_impute['sex_encoded'] = LabelEncoder().fit_transform(df_decision_impute['sex'])
feature_cols = ['fare', 'age', 'pclass', 'sex_encoded']
known_mask = df_decision_impute['embark_town'].notna()
unknown_mask = df_decision_impute['embark_town'].isna()
df_known = df_decision_impute.loc[known_mask, feature_cols + ['embark_town']].dropna()
X_train = df_known[feature_cols]
y_train_raw = df_known['embark_town']
le = LabelEncoder()
y_train = le.fit_transform(y_train_raw)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
df_missing = df_decision_impute.loc[unknown_mask, feature_cols].dropna()
missing_indices = df_missing.index
y_pred_encoded = clf.predict(df_missing)
y_pred = le.inverse_transform(y_pred_encoded)
df_decision_impute.loc[missing_indices, 'embark_town'] = y_pred

In [127]:
# checking for null values in embark_town column
df_decision_impute['embark_town'].isna().sum()

np.int64(0)

ii. Imputation through decision tree regressor : for numerical variable

In [128]:
from sklearn.tree import DecisionTreeRegressor

df_age_impute = sb.load_dataset('titanic')

feature_cols = ['fare', 'pclass', 'sex', 'embark_town']

df_age_impute['sex_encoded'] = LabelEncoder().fit_transform(df_age_impute['sex'].astype(str))
df_age_impute['embark_town_encoded'] = LabelEncoder().fit_transform(df_age_impute['embark_town'])


feature_cols = ['fare', 'pclass', 'sex_encoded', 'embark_town_encoded']

known_mask = df_age_impute['age'].notna()
unknown_mask = df_age_impute['age'].isna()

df_known = df_age_impute.loc[known_mask, feature_cols + ['age']].dropna()

X_train = df_known[feature_cols]
y_train = df_known['age']

regressor = DecisionTreeRegressor()
regressor.fit(X_train, y_train)

df_missing = df_age_impute.loc[unknown_mask, feature_cols].dropna()
missing_indices = df_missing.index

y_pred = regressor.predict(df_missing)

df_age_impute.loc[missing_indices, 'age'] = y_pred

In [129]:
# checking for null values in age column 
df_age_impute['age'].isna().sum()

np.int64(0)