In [11]:
import pandas as pd
import numpy as np

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [13]:
# Load the toy Titanic dataset and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run elsewhere without editing it.
df=pd.read_csv("D:\\DATASETS\\titanic_toy.csv")
df.head()

Unnamed: 0,Age,Fare,Family,Survived
0,22.0,7.25,1,0
1,38.0,71.2833,1,1
2,26.0,7.925,0,1
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [14]:
# Summarize column dtypes and non-null counts to spot columns with missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Age       714 non-null    float64
 1   Fare      846 non-null    float64
 2   Family    891 non-null    int64  
 3   Survived  891 non-null    int64  
dtypes: float64(2), int64(2)
memory usage: 28.0 KB


In [15]:
# Percentage of missing values per column in the full dataset.
df.isna().mean() * 100

Age         19.865320
Fare         5.050505
Family       0.000000
Survived     0.000000
dtype: float64

In [16]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    test_size=0.2,
    random_state=56,
)

In [17]:
# Check the (rows, columns) shape of the training and test feature frames.
X_train.shape,X_test.shape

((712, 3), (179, 3))

In [18]:
# Percentage of missing values per feature in the training split.
X_train.isna().mean() * 100

Age       20.08427
Fare       5.05618
Family     0.00000
dtype: float64

In [19]:
# Percentage of missing values per feature in the test split.
X_test.isna().mean() * 100

Age       18.994413
Fare       5.027933
Family     0.000000
dtype: float64

In [20]:
# Manually impute Age two ways (mean and median), using statistics computed on
# the training split only; the original Age column is left untouched.
age_train = X_train['Age']
X_train['Age_Mean'] = age_train.fillna(age_train.mean())
X_train['Age_Median'] = age_train.fillna(age_train.median())

In [21]:
# Manually impute Fare two ways (mean and median), using statistics computed on
# the training split only; the original Fare column is left untouched.
fare_train = X_train['Fare']
X_train['Fare_Mean'] = fare_train.fillna(fare_train.mean())
X_train['Fare_Median'] = fare_train.fillna(fare_train.median())

In [22]:
# Preview the training frame, which now also holds the mean/median-imputed
# Age and Fare comparison columns.
X_train.head()

Unnamed: 0,Age,Fare,Family,Age_Mean,Age_Median,Fare_Mean,Fare_Median
226,19.0,10.5,0,19.0,19.0,10.5,10.5
278,7.0,29.125,5,7.0,7.0,29.125,29.125
31,,146.5208,1,29.587153,28.0,146.5208,146.5208
449,52.0,30.5,0,52.0,52.0,30.5,30.5
632,32.0,30.5,0,32.0,32.0,30.5,30.5


In [23]:
# Covariance matrix across original and imputed columns, to see how each
# imputation strategy changes variance and covariances.
X_train.cov()

Unnamed: 0,Age,Fare,Family,Age_Mean,Age_Median,Fare_Mean,Fare_Median
Age,215.921815,73.366045,-6.924343,215.921815,215.921815,69.125727,68.362657
Fare,73.366045,2303.918915,16.215155,58.168019,61.24295,2303.918915,2303.918915
Family,-6.924343,16.215155,2.824981,-5.531684,-5.503617,15.394134,15.452308
Age_Mean,215.921815,58.168019,-5.531684,172.494502,172.494502,55.222803,54.613206
Age_Median,215.921815,61.24295,-5.503617,172.494502,172.899392,58.142041,57.407232
Fare_Mean,69.125727,2303.918915,15.394134,55.222803,58.142041,2187.264793,2187.264793
Fare_Median,68.362657,2303.918915,15.452308,54.613206,57.407232,2187.264793,2201.759091


In [24]:
# Correlation matrix across original and imputed columns, to see how each
# imputation strategy changes relationships with the other features.
X_train.corr()

Unnamed: 0,Age,Fare,Family,Age_Mean,Age_Median,Fare_Mean,Fare_Median
Age,1.0,0.096124,-0.313331,1.0,1.0,0.094243,0.092821
Fare,0.096124,1.0,0.198714,0.091551,0.096284,1.0,1.0
Family,-0.313331,0.198714,1.0,-0.250589,-0.249025,0.195838,0.19593
Age_Mean,1.0,0.091551,-0.250589,1.0,0.998828,0.089904,0.088619
Age_Median,1.0,0.096284,-0.249025,0.998828,1.0,0.094546,0.093043
Fare_Mean,0.094243,1.0,0.195838,0.089904,0.094546,1.0,0.996703
Fare_Median,0.092821,1.0,0.19593,0.088619,0.093043,0.996703,1.0


In [25]:
# Re-create the split with the same seed so X_train is back to the raw
# features, without the exploratory imputed columns added to it earlier.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    test_size=0.2,
    random_state=56,
)

In [26]:
# Display the freshly split training features (still containing missing values).
X_train

Unnamed: 0,Age,Fare,Family
226,19.0,10.5000,0
278,7.0,29.1250,5
31,,146.5208,1
449,52.0,30.5000,0
632,32.0,30.5000,0
...,...,...,...
418,30.0,,0
192,19.0,7.8542,1
399,28.0,12.6500,0
484,25.0,91.0792,1


In [27]:
# Two imputers: one fills missing values with the column mean, the other with
# the column median. They are assigned to specific columns in the ColumnTransformer.
imputer_mean=SimpleImputer(strategy='mean')
imputer_median=SimpleImputer(strategy='median')

In [28]:
# Route each column to its imputer: Age gets the median, Fare gets the mean.
# Every other column is passed through unchanged.
tran = ColumnTransformer(
    transformers=[
        ('imputerOne', imputer_median, ['Age']),
        ('imputerTwo', imputer_mean, ['Fare']),
    ],
    remainder='passthrough',
)

In [29]:
# Fit on the training split only, so the imputation statistics (Age median,
# Fare mean) are learned without looking at the test rows.
tran.fit(X_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [30]:
# Replace the DataFrame splits with their imputed versions; the transformer
# returns NumPy arrays here.
# `tran` selects its columns by name, which only works on DataFrames, so
# running this cell a second time on the already-transformed arrays would
# raise. The guards make the cell safe to re-run: each split is transformed
# only while it is still a DataFrame, and first-run behavior is unchanged.
if isinstance(X_train, pd.DataFrame):
    X_train = tran.transform(X_train)
if isinstance(X_test, pd.DataFrame):
    X_test = tran.transform(X_test)

In [31]:
# Inspect the transformed training data. It is now a NumPy array whose columns
# follow the transformer order: Age, Fare, then the passthrough column (Family).
X_train

array([[ 19.    ,  10.5   ,   0.    ],
       [  7.    ,  29.125 ,   5.    ],
       [ 28.    , 146.5208,   1.    ],
       ...,
       [ 28.    ,  12.65  ,   0.    ],
       [ 25.    ,  91.0792,   1.    ],
       [ 16.    ,  39.4   ,   1.    ]])