# <font color='green'> Import Libraries </font>

In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# <font color='green'> Data importing and preprocessing </font>

In [64]:
data = pd.read_csv('titanic.csv')
data.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
279,1171,0,2,"Oxenham, Mr. Percy Thomas",male,22.0,0,0,W./C. 14260,10.5,,S
59,951,1,1,"Chaudanson, Miss. Victorine",female,36.0,0,0,PC 17608,262.375,B61,C
286,1178,0,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S
356,1248,1,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59.0,2,0,11769,51.4792,C101,S
63,955,1,3,"Bradley, Miss. Bridget Delia",female,22.0,0,0,334914,7.725,,Q


In [65]:
data.shape

(418, 12)

In [66]:
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [67]:
data.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [68]:
data.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
250,1,2,female,0.92,1,2,27.75,S
294,0,3,male,36.0,0,0,9.5,S
71,0,3,male,21.0,0,0,7.8958,S
312,0,3,male,,0,0,7.575,S
61,0,2,male,32.0,0,0,13.5,S


In [69]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
383,3,female,19.0,1,0,16.1,S
380,3,male,,0,0,7.75,Q
27,3,male,22.5,0,0,7.225,C
89,2,male,2.0,1,1,23.0,S
326,2,female,12.0,2,1,39.0,S


In [70]:
x_train,x_test,y_train,y_test = train_test_split(data.drop(columns=['Survived']),data['Survived'],test_size=0.3,random_state=1)

In [71]:
#1st transformer simple Imputer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_fare',SimpleImputer(strategy='most_frequent'),[5])
    ],remainder='passthrough')

In [72]:
#2nd transformer OHE
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [73]:
#3rd transformer ->MinMaxScaler for scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10)) #slice defines the column range for applying minmaxscaler
])

In [74]:
#train the model
trf4 = DecisionTreeClassifier()

# <font color='green'> Create Pipeline </font>

In [75]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4)
])

In [76]:
pipe.fit(x_train,y_train)



In [77]:
#pipe has a list of imp attributes refer documentation for that

In [78]:
y_pred = pipe.predict(x_test)

In [81]:
accuracy_score(y_test,y_pred)

0.6746031746031746

# <font color='green'> Exporting the pipeline </font>

In [82]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))