In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest,chi2

In [43]:
df = pd.read_csv(r'D:\krish naik - udemy ML and Deep Learning\CampusX\Feature Engineering\titanic.csv')

In [45]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
706,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
79,80,1,3,"Dowdell, Miss. Elizabeth",female,30.0,0,0,364516,12.475,,S
538,539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S
159,160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
57,58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C


In [47]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [49]:
df.drop(columns = ['PassengerId','Name','Ticket','Cabin'],inplace = True)

In [51]:
# Train Test Split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),df['Survived'],test_size=0.2,random_state=42)

In [53]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(712, 7) (179, 7) (712,) (179,)


In [55]:
X_train.sample(5)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
85,3,female,33.0,3,0,15.85,S
42,3,male,,0,0,7.8958,C
425,3,male,,0,0,7.25,S
354,3,male,,0,0,7.225,C
399,2,female,28.0,0,0,12.65,S


In [57]:
# Imputation Transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]), # The output from the 1st transformer will be an array and not dataframe and 
    # numpy array don't have column names
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [59]:
# One Hot Encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output= False,handle_unknown = 'ignore'),[1,6])
],remainder='passthrough')

In [61]:
#Scaling
trf3 = ColumnTransformer([
    ('Scale',MinMaxScaler(),slice(0,10))
])

In [81]:
# Feature Selection
trf4 = SelectKBest(score_func=chi2,k=8) # Selecting top 5 columns out of 10 based on argument chi square test

In [83]:
# Train the model
trf5 = DecisionTreeClassifier()

#### Create Pipeline

In [91]:
# To pass a list of tuples
pipe = Pipeline([
    ('trf1',trf1),# (chain name,object)
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

pipe.fit(X_train,y_train)

#### Pipeline vs Make_Pipeline

Pipeline requires naming of steps,make_pipeline does not require naming of steps

(Same applies to Column Transformer Vs make_column_transformer)

In [87]:
# Alternate Syntax
make_pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

# train
make_pipe.fit(X_train,y_train)


In [74]:
''' If we performed steps such as Imputation , OneHotEncoding ,Scaling only in the Pipeline and we have not use/put
M.L Algorithm model in the pipeline then we would have use 'fit_transform' instead of 'fit' because we are only doing
data preprocessing and we are not doing model training.'''

" If we performed steps such as Imputation , OneHotEncoding ,Scaling only in the Pipeline and we have not use/put\nM.L Algorithm model in the pipeline then we would have use 'fit_transform' instead of 'fit' because we are only doing\ndata preprocessing and we are not doing model training."

#### Explore the Pipeline

In [93]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('Scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x00000221E950DE40>),
 'trf5': DecisionTreeClassifier()}

In [111]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_

array([29.49884615])

In [131]:
# Helpful in Backtracking/Debugging to check parameters
pipe.named_steps['trf1'].transformers_[1][1].statistics_[0]

'S'

In [79]:
# Display Pipeline
from sklearn import set_config
set_config(display='diagram')

In [133]:
y_pred = pipe.predict(X_test)

In [135]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

##### Cross validation using Pipeline 

In [140]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe,X_train,y_train,cv = 5,scoring='accuracy').mean()

0.6391214419383433

#### GridSearch using Pipeline

In [143]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [145]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv = 5,scoring = 'accuracy')
grid.fit(X_train,y_train)

In [147]:
grid.best_score_

0.6391214419383433

In [149]:
grid.best_params_

{'trf5__max_depth': 2}

### Exporting the Pipeline

In [156]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))