In [154]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier


df = pd.read_csv('train.csv')
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### going to drop passenger ID and ticket

In [2]:
df.drop(['PassengerId','Ticket','Name'],axis =1, inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


#### Data cleaning check

In [3]:
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [4]:
# Check percentage of missing values
df.isna().sum() / len(df)

Survived    0.000000
Pclass      0.000000
Sex         0.000000
Age         0.198653
SibSp       0.000000
Parch       0.000000
Fare        0.000000
Cabin       0.771044
Embarked    0.002245
dtype: float64

#### 20% of age rows are empty, it will be filled with median values as such large number of missing rows can not be dropped.  only 2 rows of embarked are missing, it will be filled with most frequently occuring value as it is categorical data. Cabin column will be completly transformed into two columns

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [41]:
num_cols = df.select_dtypes(include=np.number).columns.tolist()
cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()

In [43]:
num_cols
cat_cols

['Sex', 'Cabin', 'Embarked']

#### first step is train test split always

In [6]:
X = df.drop(['Survived'], axis =1)
Y = df['Survived']

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2,random_state=42)

In [8]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
331,1,male,45.5,0,0,28.5,C124,S
733,2,male,23.0,0,0,13.0,,S
382,3,male,32.0,0,0,7.925,,S
704,3,male,26.0,1,0,7.8542,,S
813,3,female,6.0,4,2,31.275,,S


#### going to create a custom column transformer to transform the cabin column 

In [133]:
from sklearn.base import BaseEstimator, TransformerMixin

class CabinTransformer(BaseEstimator, TransformerMixin):
        def __init__(self):
            print('in the CabinFeatureTransformer init method: ')

        def fit(self, X, y=None):
            X.Cabin.fillna('U1', inplace=True)
            return self

        def transform(self, X):
            X_copy = X.copy()
            X_copy.Cabin.fillna('U1', inplace=True)
            X_copy['cabin_letter'] = X_copy['Cabin'].str[0] #captures first letter
            X_copy['cabin_number'] = X_copy['Cabin'].str.extract(r'(\d+)') # captures numerical part
            X_copy = X_copy.drop('Cabin', axis=1)
            return X_copy

In [134]:
x2 = X_train.copy()
x2.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
331,1,male,45.5,0,0,28.5,C124,S
733,2,male,23.0,0,0,13.0,,S
382,3,male,32.0,0,0,7.925,,S
704,3,male,26.0,1,0,7.8542,,S
813,3,female,6.0,4,2,31.275,,S


In [135]:
tfr3 = ColumnTransformer([
    ('cabin_tfrmer',CabinTransformer(),[6])
],remainder='passthrough')

in the CabinFeatureTransformer init method: 


In [136]:
tfr3.fit_transform(x2)

in the CabinFeatureTransformer init method: 


array([['C', '124', 1, ..., 0, 28.5, 'S'],
       ['U', '1', 2, ..., 0, 13.0, 'S'],
       ['U', '1', 3, ..., 0, 7.925, 'S'],
       ...,
       ['U', '1', 3, ..., 0, 14.1083, 'S'],
       ['B', '96', 1, ..., 2, 120.0, 'S'],
       ['D', '26', 1, ..., 1, 77.2875, 'S']], dtype=object)

#### custom transformer with a Column transformer always returning a numpy array

In [137]:
class ColumnsSelector(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, columns):
        # save the features list internally in the class
        self.columns = columns
        
    def fit(self, X, y = None):
        return self
    def transform(self, X, y = None):
        # return the dataframe with the specified features
        return X[self.columns]

In [138]:
numeric_transformer = Pipeline(steps=[
    ('columns selector', ColumnsSelector(['Age','Fare'])),
])

In [139]:
tfr4 = ColumnTransformer([
    ('col_sel',ColumnsSelector(['Age','Fare']),slice(0,8))
])

In [140]:
tfr4.fit_transform(x2)

array([[ 45.5   ,  28.5   ],
       [ 23.    ,  13.    ],
       [ 32.    ,   7.925 ],
       ...,
       [ 41.    ,  14.1083],
       [ 14.    , 120.    ],
       [ 21.    ,  77.2875]])

In [141]:
numeric_transformer.fit_transform (x2)

Unnamed: 0,Age,Fare
331,45.5,28.5000
733,23.0,13.0000
382,32.0,7.9250
704,26.0,7.8542
813,6.0,31.2750
...,...,...
106,21.0,7.6500
270,,31.0000
860,41.0,14.1083
435,14.0,120.0000


#### As we can see can see above column selector class is not a column transformer and is used directly inside the pipeline. There is no need for a column transformer class here

In [142]:
cabin_tfmr = Pipeline(steps=[
    ('cabin_transformer', tfr3),
])

In [143]:
cabin_tfmr.fit_transform(x2)

in the CabinFeatureTransformer init method: 


array([['C', '124', 1, ..., 0, 28.5, 'S'],
       ['U', '1', 2, ..., 0, 13.0, 'S'],
       ['U', '1', 3, ..., 0, 7.925, 'S'],
       ...,
       ['U', '1', 3, ..., 0, 14.1083, 'S'],
       ['B', '96', 1, ..., 2, 120.0, 'S'],
       ['D', '26', 1, ..., 1, 77.2875, 'S']], dtype=object)

#### above block is just to understand behaviour of Pipeline with column transformer and without column transformer
#### next will try to make a pipeline with Custom column transformer and One HOT encoder

In [126]:
# first create a pipeline with two steps, custom transformation class and OHE
cabin_transformer = Pipeline(steps=[
    ('cabin_transformer', CabinTransformer()),
    ('cabin_ohe',OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

in the CabinFeatureTransformer init method: 


In [127]:
#then create a column transformer using the previously created pipeline
transformer5 = ColumnTransformer(
    transformers=[
        ('cabin_data_preprocessing', cabin_transformer, [6])
    ])

In [128]:
transformer5.fit_transform(x2)

in the CabinFeatureTransformer init method: 




array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [129]:
transformer5.named_transformers_['cabin_data_preprocessing']['cabin_ohe'].feature_names_in_

array(['cabin_letter', 'cabin_number'], dtype=object)

In [130]:
transformer5.named_transformers_['cabin_data_preprocessing']['cabin_ohe'].get_feature_names_out(['cabin_letter','cabin_number'])

array(['cabin_letter_A', 'cabin_letter_B', 'cabin_letter_C',
       'cabin_letter_D', 'cabin_letter_E', 'cabin_letter_F',
       'cabin_letter_G', 'cabin_letter_T', 'cabin_letter_U',
       'cabin_number_1', 'cabin_number_10', 'cabin_number_101',
       'cabin_number_102', 'cabin_number_103', 'cabin_number_104',
       'cabin_number_106', 'cabin_number_11', 'cabin_number_111',
       'cabin_number_118', 'cabin_number_12', 'cabin_number_121',
       'cabin_number_123', 'cabin_number_124', 'cabin_number_125',
       'cabin_number_128', 'cabin_number_14', 'cabin_number_16',
       'cabin_number_17', 'cabin_number_18', 'cabin_number_19',
       'cabin_number_2', 'cabin_number_20', 'cabin_number_22',
       'cabin_number_23', 'cabin_number_24', 'cabin_number_26',
       'cabin_number_28', 'cabin_number_3', 'cabin_number_30',
       'cabin_number_31', 'cabin_number_32', 'cabin_number_33',
       'cabin_number_34', 'cabin_number_35', 'cabin_number_36',
       'cabin_number_37', 'cabin_number_

#### problem above is that the cabin_number also has got OHE as we could not specify that only cabin_letter should be OHE

```python
num_pipeline = Pipeline([
                         ('imputer', SimpleImputer(strategy="median")),
                         ('std_scaler', StandardScaler()),
])

cabin_pipeline = Pipeline([
                           ('has_cabin', CabinTransformer()),
])

cat_pipeline = Pipeline([
                              ('imputer', SimpleImputer(strategy="most_frequent")),
                              ('cat', OneHotEncoder()),
])
```
using pipelines like above first and creating a generic column transformer shown below:

```Python
num_attribs = ['Age', 'SibSp', 'Parch', 'Fare']
cabin_attribs = ['Cabin']
ordinal_attribs = ['Sex']
cat_attribs = ['Embarked', 'Pclass']

full_pipeline = ColumnTransformer([
                                   ("num", num_pipeline, num_attribs),
                                   ("has_cabin", cabin_pipeline, cabin_attribs),
                                   ("ord", OrdinalEncoder(), ordinal_attribs),
                                   ("cat", cat_pipeline, cat_attribs),
])
titanic_prepared = full_pipeline.fit_transform(titanic)
```

There is no need to specify remainder='passthrough' for each pipeline in transformer. This is because the output of column transformer is not fed into a pipeline, instead each pipeline is meant to operate of distinct column of dataframe.
however, if we make separate column transformers and then combine them using a pipeline as shown below in rest of example, then we need to specify remainder =passthrough because the output of one column transformer in the pipeline is fed to the input of next column transformer in the pipeline.
Above is an example implementation from https://medium.com/analytics-vidhya/zero-to-pipeline-beginners-guide-to-building-a-scikit-learn-pipeline-to-predict-survival-on-the-bd9730f6f13b

Below is another example of the same approach of using pipelines first with several different trasformations done on a set of columns and then combining all of them using a column transformer:

```Python
cat_cols = ['embarked', 'sex', 'pclass', 'title', 'is_alone']
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ('pca', PCA(n_components=10))
])

num_cols = ['age', 'fare', 'family_size']
num_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', RobustScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier())])

cross_val_score(clf, X_train, y_train, cv=5, scoring="accuracy").mean()
```
Above is example from https://jaketae.github.io/study/sklearn-pipeline/

In our example we will build a few column transformers and then combine them inside a pipeline

In [147]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [146]:
X_train.drop(['Cabin'],axis =1, inplace=True)

In [149]:
X_train.columns.get_loc("Embarked")

6

In [157]:
transformer1 = ColumnTransformer(transformers=[("age_impute",SimpleImputer(),[2]),
                                              ("embared_impute",SimpleImputer(strategy="most_frequent"),[6])], remainder='passthrough')

In [151]:
transformer2 = ColumnTransformer(transformers=[("ohe_sex_embarked", OneHotEncoder(sparse=False, handle_unknown='ignore'),[1,6])],remainder='passthrough')

In [152]:
transformer3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10)) # scale all columns
])

In [155]:
transformer4 = DecisionTreeClassifier()

In [158]:
pipe = Pipeline([
    ('tranformer1',transformer1),
    ('transformer2',transformer2),
    ('transformer3',transformer3),
    ('transformer4',transformer4)
])

In [160]:
pipe.fit(X_train,Y_train)



In [161]:
pipe.named_steps

{'tranformer1': ColumnTransformer(remainder='passthrough',
                   transformers=[('age_impute', SimpleImputer(), [2]),
                                 ('embared_impute',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'transformer2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'transformer3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'transformer4': DecisionTreeClassifier()}

In [200]:
#pipe.named_steps['transformer2'].named_transformers_['ohe_sex_embarked'].feature_names_in_()
pipe.named_steps['transformer2'].named_transformers_['ohe_sex_embarked'].get_feature_names_out(['Sex','Embarked'])

array(['Sex_C', 'Sex_Q', 'Sex_S', 'Embarked_0.0', 'Embarked_4.0125',
       'Embarked_5.0', 'Embarked_6.2375', 'Embarked_6.4375',
       'Embarked_6.45', 'Embarked_6.4958', 'Embarked_6.75',
       'Embarked_6.8583', 'Embarked_6.95', 'Embarked_6.975',
       'Embarked_7.05', 'Embarked_7.0542', 'Embarked_7.125',
       'Embarked_7.1417', 'Embarked_7.225', 'Embarked_7.2292',
       'Embarked_7.25', 'Embarked_7.3125', 'Embarked_7.4958',
       'Embarked_7.5208', 'Embarked_7.55', 'Embarked_7.6292',
       'Embarked_7.65', 'Embarked_7.725', 'Embarked_7.7333',
       'Embarked_7.7375', 'Embarked_7.7417', 'Embarked_7.75',
       'Embarked_7.775', 'Embarked_7.7958', 'Embarked_7.8',
       'Embarked_7.8292', 'Embarked_7.8542', 'Embarked_7.8792',
       'Embarked_7.8875', 'Embarked_7.8958', 'Embarked_7.925',
       'Embarked_8.0292', 'Embarked_8.05', 'Embarked_8.1125',
       'Embarked_8.1375', 'Embarked_8.3', 'Embarked_8.3625',
       'Embarked_8.4042', 'Embarked_8.5167', 'Embarked_8.6542',
    

In [162]:
y_pred = pipe.predict(X_test)

In [163]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [165]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test,y_pred)

0.6256983240223464

In [167]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, Y_train, cv=5, scoring='accuracy').mean()



0.6391214419383433

In [172]:
params = {
    'transformer4__max_depth':[1,2,3,4,5,None]
}

In [173]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, Y_train)



In [175]:
grid.best_score_

0.6391214419383433