# Importing library

In [8]:
import pandas as pd

# Importing train_test_split, SimpleImputer and ColumnTransformer

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Loading dataset

In [10]:
# Read the toy Titanic CSV (path is relative to the working directory)
# and preview five randomly chosen rows.
titanic_csv = "titanic_toy.csv"
df = pd.read_csv(titanic_csv)
df.sample(n=5)

Unnamed: 0,Age,Fare,Family,Survived
716,38.0,227.525,0,1
220,16.0,8.05,0,1
162,26.0,,0,0
651,18.0,23.0,1,1
786,18.0,7.4958,0,1


# Dividing the dataset into two subsets: train & test

In [11]:
# Split the frame into the target ('y' = the 'Survived' column) and the
# inputs ('X' = every remaining column).

y = df["Survived"]
X = df.drop(columns="Survived")

In [12]:
# Hold out 20% of the rows as a test set: the model is trained on (X_train, y_train) and
# evaluated on (X_test, y_test) to see how well it generalizes to new, unseen data.
# random_state is fixed so the same split is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

# Creating Imputers 

In [13]:
# Two imputers with different fill strategies: 'imputer1' fills missing values
# with the column median, 'imputer2' with the column mean.

imputer1, imputer2 = SimpleImputer(strategy="median"), SimpleImputer(strategy="mean")

# Column Transformer

In [18]:
# 'trf' routes each column to its imputer: 'Age' -> imputer1 (median), 'Fare' -> imputer2 (mean).
# remainder='passthrough' keeps every other column as-is.

column_imputers = [
    ("imputer1", imputer1, ["Age"]),
    ("imputer2", imputer2, ["Fare"]),
]
trf = ColumnTransformer(transformers=column_imputers, remainder="passthrough")

# Fitting 'trf' on the training data

In [19]:
# Learn the imputation statistics from the training split only, so nothing
# from the test split influences the fill values.

trf.fit(X=X_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('imputer1', SimpleImputer(strategy='median'),
                                 ['Age']),
                                ('imputer2', SimpleImputer(), ['Fare'])])

In [22]:
# Look up the fitted 'imputer1' by name and read the median it learned for 'Age'
# (statistics_ is an array with one entry per imputed column).

age_imputer = trf.named_transformers_['imputer1']
median = age_imputer.statistics_

print('The median of age column is', median)

The median of age column is [28.75]


In [23]:
# Look up the fitted 'imputer2' by name and read the mean it learned for 'Fare'
# (statistics_ is an array with one entry per imputed column).

fare_imputer = trf.named_transformers_['imputer2']
mean = fare_imputer.statistics_

print('The mean of fare column is', mean)

The mean of fare column is [32.61759689]


In [24]:
# Apply the already-fitted 'trf' to both splits; the imputed results replace the
# original 'X_train' and 'X_test'.
# NOTE(review): after this cell both names hold NumPy arrays rather than DataFrames, so
# re-running it without first re-running the split cell will likely fail, because 'trf'
# selects 'Age' and 'Fare' by column name -- confirm, and consider separate output names.

X_train, X_test = trf.transform(X_train), trf.transform(X_test)

In [25]:
# Imputed training features: missing 'Age' / 'Fare' entries now hold the fitted
# median / mean; the remaining column was passed through unchanged.

X_train

array([[ 40.    ,  27.7208,   0.    ],
       [  4.    ,  16.7   ,   2.    ],
       [ 47.    ,   9.    ,   0.    ],
       ...,
       [ 71.    ,  49.5042,   0.    ],
       [ 28.75  , 221.7792,   0.    ],
       [ 28.75  ,  25.925 ,   0.    ]])

In [28]:
# Imputed test features: missing 'Age' / 'Fare' entries now hold the median / mean
# learned from the training split; the remaining column was passed through unchanged.

X_test

array([[ 42.        ,  26.2875    ,   0.        ],
       [ 21.        ,   8.05      ,   0.        ],
       [ 24.        ,  65.        ,   3.        ],
       [ 28.        ,  56.4958    ,   0.        ],
       [ 17.        ,   7.925     ,   6.        ],
       [ 30.        ,   7.8958    ,   0.        ],
       [ 80.        ,  30.        ,   0.        ],
       [ 25.        ,   7.25      ,   0.        ],
       [ 50.        , 133.65      ,   2.        ],
       [ 25.        ,  26.        ,   1.        ],
       [ 35.        ,  26.        ,   0.        ],
       [ 35.        ,  90.        ,   1.        ],
       [ 55.        ,  16.        ,   0.        ],
       [ 28.75      ,  56.4958    ,   0.        ],
       [ 28.75      ,  56.4958    ,   0.        ],
       [ 19.        ,   7.8542    ,   1.        ],
       [ 28.75      ,  15.2458    ,   2.        ],
       [ 49.        ,   0.        ,   0.        ],
       [ 18.        ,  32.61759689,   1.        ],
       [ 65.        ,   7.75   