In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('titanic.csv')
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
136,137,1,1,"Newsom, Miss. Helen Monypeny",female,19.0,0,2,11752,26.2833,D47,S
115,116,0,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S
706,707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45.0,0,0,223596,13.5,,S
774,775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54.0,1,3,29105,23.0,,S
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S


In [3]:
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
439,0,2,male,31.0,0,0,10.5,S
482,0,3,male,50.0,0,0,8.05,S
284,0,1,male,,0,0,26.0,S
91,0,3,male,20.0,0,0,7.8542,S
353,0,3,male,25.0,1,0,17.8,S


In [4]:
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:,0], test_size = 0.2, random_state = 42)

**Imputation Transformer**

In [6]:

trf1 = ColumnTransformer(
    [
        ('impute_age', SimpleImputer(), [2]),
        ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
    ], remainder='passthrough'
)

**One-Hot Encoding**

In [8]:
trf2 = ColumnTransformer([
    ('ohe_sex_empbarked', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), [1,6])
], remainder='passthrough'        
)

**Min-Max Scaler**

In [10]:
trf3 = ColumnTransformer(
    [
        ('scale', MinMaxScaler(), slice(0,10))
    ]
)

**Feature Selection**

In [12]:
trf4 = SelectKBest(score_func=chi2, k=8)

**Train the Model**

In [14]:
trf5 = DecisionTreeClassifier()

In [15]:
pipe = Pipeline(
    [
        ('trf1', trf1),
        ('trf2', trf2),
        ('trf3', trf3),
        ('trf4', trf4),
        ('trf5', trf5), 
    ]
)

In [16]:
from sklearn import set_config
set_config(display='diagram')

In [17]:
pipe.fit(x_train, y_train)

**Exploring the pipeline**

In [19]:
pipe.named_steps.trf1.transformers_[0][1].statistics_

array([29.49884615])

**Predict**

In [21]:
y_pred = pipe.predict(x_test)

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6256983240223464

**Cross Validation using Pipeline**

In [24]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, x_train, y_train, scoring='accuracy').mean()

0.6391214419383433

**Exporting the Pipeline**

In [26]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))