In [1]:
import seaborn as sns
import pandas as pd
import numpy as np

In [56]:
from sklearn.compose import ColumnTransformer

In [20]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

In [9]:
from sklearn.model_selection import train_test_split

In [5]:
# Load the "titanic" example dataset through seaborn into a DataFrame.
df = sns.load_dataset(name="titanic")

In [27]:
# Survival split per `who` group; normalising over columns makes each
# child/man/woman column sum to 1, so row 1 reads as the survival rate.
pd.crosstab(index=df["survived"], columns=df["who"], normalize="columns")

who,child,man,woman
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.409639,0.836127,0.243542
1,0.590361,0.163873,0.756458


In [6]:
# Count missing values in every column to decide which ones need imputing.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
# Feature frame: five predictors. `age` and `embarked` contain missing values
# (see the null counts above) and are handled in the preprocessing cells.
X = df.loc[:, ["pclass", "age", "fare", "who", "embarked"]]
# Target kept as a one-column DataFrame.
y = df.loc[:, ["survived"]]

In [10]:
# Display the selected feature frame (pclass, age, fare, who, embarked).
X

Unnamed: 0,pclass,age,fare,who,embarked
0,3,22.0,7.2500,man,S
1,1,38.0,71.2833,woman,C
2,3,26.0,7.9250,woman,S
3,1,35.0,53.1000,woman,S
4,3,35.0,8.0500,man,S
...,...,...,...,...,...
886,2,27.0,13.0000,man,S
887,1,19.0,30.0000,woman,S
888,3,,23.4500,woman,S
889,1,26.0,30.0000,man,C


In [11]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and every array derived from it below, reproducible on Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Before Column Transformer

In [19]:
# Simple Imputer -> Age
# SimpleImputer() fills missing ages with the column mean (its default
# strategy). The statistic is learned from the training split only and then
# reused for the test split, so no test information leaks into preprocessing.

si = SimpleImputer()

# Learn the fill value from the training rows and apply it to them.
X_train_age = si.fit_transform(X_train[["age"]])

# Apply the already-fitted imputer; never re-fit on the test split.
X_test_age = si.transform(X_test[["age"]])

In [45]:
# Sanity check: a single imputed age column, one row per training sample.
X_train_age.shape

(712, 1)

In [21]:
# Sanity check: a single imputed age column, one row per test sample.
X_test_age.shape

(179, 1)

In [31]:
# Raw survivor / non-survivor counts for each `who` group.
pd.crosstab(index=df["survived"], columns=df["who"])

who,child,man,woman
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,34,449,66
1,49,88,205


In [28]:
# Ordinal Encoder -> who
# The category order is given explicitly as woman (0), child (1), man (2),
# i.e. from the highest to the lowest survival rate seen in the crosstab.

oe = OrdinalEncoder(categories=[["woman","child","man"]])

# Fit on the training split only ...
X_train_gender = oe.fit_transform(X_train[["who"]])

# ... and reuse the fitted encoder on the test split instead of re-fitting.
X_test_gender = oe.transform(X_test[["who"]])

X_train_gender.shape

(712, 1)

In [46]:
# One Hot Encoding -> Embarked (all categories kept)
# The encoder is fitted on the training split only. Re-fitting it on the test
# split could yield a different number or order of columns (for example when a
# category is absent from one split), which would silently misalign the train
# and test matrices. With the default handle_unknown="error", a category that
# was never seen in training raises instead of being encoded inconsistently.
# NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2; switch the
# keyword when upgrading.

ohe = OneHotEncoder(sparse=False)

X_train_embark = ohe.fit_transform(X_train[["embarked"]])

# Reuse the categories learned from the training split.
X_test_embark = ohe.transform(X_test[["embarked"]])

X_train_embark,X_train_embark.shape

(array([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 0., 1., 0.]]),
 (712, 4))

In [35]:
# One Hot Encoding -> Embarked

ohe = OneHotEncoder(sparse=False,drop="first")

X_train_embark = ohe.fit_transform(X_train[["embarked"]])

X_test_embark = ohe.fit_transform(X_test[["embarked"]])

X_train_embark

array([[0., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 1., 0.]])

In [51]:
# Extracting Fare and Pclass Values
# These columns need no preprocessing; they are only turned into (n_rows, 1)
# arrays so they can be concatenated with the encoded features. reshape(-1, 1)
# infers the row count, so the cell keeps working if the dataset size or
# test_size changes (the row counts used to be hard-coded as 712 and 179).
X_train_fare = X_train["fare"].to_numpy().reshape(-1, 1)
X_test_fare = X_test["fare"].to_numpy().reshape(-1, 1)

X_train_pclass = X_train["pclass"].to_numpy().reshape(-1, 1)
X_test_pclass = X_test["pclass"].to_numpy().reshape(-1, 1)

In [52]:
# Sanity check: fare is now a 2-D column array with one row per test sample.
X_test_fare.shape

(179, 1)

In [53]:
# Stitch the per-column arrays back together side by side. The column order
# (age, embarked indicators, fare, who, pclass) must be identical for the
# training and test matrices.
X_train_transformed = np.hstack(
    [X_train_age, X_train_embark, X_train_fare, X_train_gender, X_train_pclass]
)

X_test_transformed = np.hstack(
    [X_test_age, X_test_embark, X_test_fare, X_test_gender, X_test_pclass]
)

In [59]:
# Inspect the first transformed training row to eyeball the column layout.
X_train_transformed[0]

array([29.72439236,  1.        ,  0.        ,  0.        ,  0.        ,
        7.2292    ,  2.        ,  3.        ])

# After Column Transformer — Mentos Zindagi 😂 (the easy way)

In [57]:
# One declarative object replacing the manual per-column cells:
#   age      -> SimpleImputer (default strategy)
#   who      -> OrdinalEncoder with the explicit order woman, child, man
#   embarked -> OneHotEncoder with the first category dropped
# Columns not listed (pclass, fare) are passed through unchanged and are
# appended after the transformed columns.
transformer = ColumnTransformer(
    transformers=[
        ("age_imputer", SimpleImputer(), ["age"]),
        (
            "ordinal_encode",
            OrdinalEncoder(categories=[["woman", "child", "man"]]),
            ["who"],
        ),
        ("OHE_Embark", OneHotEncoder(sparse=False, drop="first"), ["embarked"]),
    ],
    remainder="passthrough",
)

In [58]:
# Fit every sub-transformer on the training split and return the assembled
# feature matrix (transformed columns first, passthrough columns last).
transformer.fit_transform(X_train)

array([[ 29.72439236,   2.        ,   0.        , ...,   0.        ,
          3.        ,   7.2292    ],
       [ 25.        ,   2.        ,   0.        , ...,   0.        ,
          2.        ,  13.        ],
       [ 25.        ,   0.        ,   0.        , ...,   0.        ,
          1.        , 151.55      ],
       ...,
       [ 25.        ,   0.        ,   0.        , ...,   0.        ,
          3.        ,   7.775     ],
       [ 18.        ,   0.        ,   0.        , ...,   0.        ,
          1.        , 262.375     ],
       [ 31.        ,   2.        ,   0.        , ...,   0.        ,
          3.        ,   7.775     ]])