In [280]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier # to create a dummy predictive model

In [281]:
df = pd.read_csv('/content/Titanic-Dataset.csv')

In [282]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True) # dropping the unwanted columns
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [283]:
X_test, X_train, Y_test, Y_train = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.2, random_state=0)

In [284]:
df.isnull().sum()

Unnamed: 0,0
Survived,0
Pclass,0
Sex,0
Age,177
SibSp,0
Parch,0
Fare,0
Embarked,2


# Applying pre-processing techniques using column transformer

We need to apply both simple imputing (to handel missing values) and OHE in the embarked column.

In [285]:
from sklearn.compose import ColumnTransformer

# Its a good strategy to use indexes rather than name in the transformer, beacause it returns a numpy array, and its easier to use it further in the pipeline.

In [286]:
# imputation transformer

trf1 = ColumnTransformer(transformers=[
    ('impute_age', SimpleImputer(), [2]),
    ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
], remainder='passthrough'
)

In [287]:
# one-hot-encoding

trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1,6])
], remainder='passthrough'
)

In [288]:
# scaling

trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0, 10))
])

In [289]:
# feature selection
trf4 = SelectKBest(score_func=chi2, k=8)

In [290]:
# decision tree classifier
trf5 = DecisionTreeClassifier()

# Making the pipeline

In [291]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5)
])

## Pipeline vs make_pipeline

pipeline requires the names of the steps, make_pipeline doesnt. We can directly pass a touple (trf1, trf2......., trf5)

Same for ColumnTransformer and make_column_transformer

In [292]:
pipe.fit(X_train, Y_train)

# Exploring the pipeline

In [293]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_
# .named_steps returns a dictionary of steps
# ['trf1'] goes to the tranformer with name trf1
# .transformers_ gives a list of transformer in trf1
# .transformers_[0] goes to the 1st element of the list, returns a tuple
# .transformers_[0][1] returns the item, simple imputer object in this case
# Now we can apply the attributes to the simple imputer

array([29.51517483])

In [294]:
Y_pred = pipe.predict(X_test)

# Cross Validation using Pipelines

In [295]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, Y_train, cv=5, scoring='accuracy').mean()

np.float64(0.6822222222222223)

# Grid Search using Pipeline

In [296]:
# gridsearchcv
params = {
    'trf5__max_depth':[1,2,3,4,5,None] # specifying the name of the model transformer, the double underscore
}

In [297]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, Y_train)

In [298]:
grid.best_score_

np.float64(0.6822222222222223)

In [299]:
grid.best_params_

{'trf5__max_depth': 2}

# Exporting the Pipeline

In [300]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))