In [4]:
import numpy as np
import pandas as pd

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [6]:
df = pd.read_csv('../../data/titanic_train.csv')

In [7]:
usable_data=df.drop(columns=['PassengerId','Name','Ticket','Cabin'])

In [8]:
X_train,X_test,y_train,y_test = train_test_split(usable_data.drop(columns=['Survived']),
                                                 usable_data['Survived'],
                                                 test_size=0.2,
                                                 shuffle=True,
                                                random_state=42)

In [9]:
usable_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


Let's Start Transformation which is usable in Pipelines


 we just creating usable box for further use....

01. Fill NAN

In [10]:

transformation1=ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [11]:
transformation2=ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [12]:
transformation3=ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [13]:
transformation4=SelectKBest(score_func=chi2,k=8)

In [14]:
transformation5=DecisionTreeClassifier()

Let's create pipeline

In [15]:
pipe0 = Pipeline([
    ('trf1',transformation1),
    ('trf2',transformation2),
    ('trf3',transformation3),
    ('trf5',transformation5)
])

In [16]:
pipe0

In [17]:
pipe=make_pipeline(transformation1,
                   transformation2,
                   transformation3,
                   transformation4,
                   transformation5)

In [18]:
pipe

In [19]:
pipe0.fit(X_train,y_train)

### Explore the pipeline

In [20]:
from sklearn import set_config
set_config(display='diagram')

In [21]:
y_pred=pipe0.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score

In [23]:
accuracy_score(y_pred=y_pred,y_true=y_test)

0.6256983240223464

In [24]:
pipe.fit(X_train,y_train)

In [25]:
y_pred1=pipe.predict(X_test)

In [26]:
accuracy_score(y_pred=y_pred1,y_true=y_test)

0.6256983240223464

In [27]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x00000221D915D3A0>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [28]:
pipe0.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf5': DecisionTreeClassifier()}

In [29]:
pipe0.named_steps['trf1']['impute_age'].statistics_

array([29.49884615])

In [30]:
pipe0.named_steps['trf1'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [31]:
pipe0.named_steps['trf1'].transformers_[0]

('impute_age', SimpleImputer(), [2])

In [32]:
pipe0.named_steps['trf1'].transformers_[0][1].statistics_

array([29.49884615])

## Cross validation using Pipelines

In [33]:
from sklearn.model_selection import cross_val_score

cva=cross_val_score(pipe,X_train,y_train,scoring='accuracy')

In [34]:
cva

array([0.6013986 , 0.62237762, 0.68309859, 0.65492958, 0.63380282])

In [35]:
cva.mean()

0.6391214419383433

## GridSearch using Pipeline

In [36]:
params={
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [37]:
from sklearn.model_selection import GridSearchCV
grid=GridSearchCV(pipe0,params,cv=5,scoring='accuracy')
grid.fit(X_train,y_train)

In [38]:
pipe

In [39]:
params={
    'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]
}
grid1=GridSearchCV(pipe,params,cv=5,scoring='accuracy')
grid1.fit(X_train,y_train)

In [40]:
grid.best_estimator_

In [41]:
grid.best_score_

0.6391214419383433

In [42]:
grid1.best_score_

0.6391214419383433

In [43]:
grid.best_params_

{'trf5__max_depth': 2}

In [44]:
grid1.best_params_

{'decisiontreeclassifier__max_depth': 2}

# Exporting the pipeline

In [45]:
import pickle
pickle.dump(pipe,open('../../models/pipe.pkl','wb'))