In [1]:
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest , chi2
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv("C:/Users/NIKUNJ KIKANI/Desktop/machine learning/pipeline/Book2.csv")

In [3]:
df.drop(['Unnamed: 12','PassengerId','Name','Ticket','Cabin'],axis=1,inplace=True)

In [4]:
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [5]:
x_train , x_test , y_train , y_test = train_test_split(df.drop(['Survived'],axis=1),
                                                       df['Survived'],test_size=0.2,random_state=42)

In [6]:
x_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [7]:
#impute transformer . 
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

In [8]:
trf2 = ColumnTransformer([
      ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [9]:
#scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

In [28]:
#feature Selection
trf4 = SelectKBest(score_func=chi2,k=10)

In [29]:
#train the model 
trf5 = DecisionTreeClassifier()

In [30]:
#create pipeline 

In [31]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

In [32]:
pipe.fit(x_train,y_train)

In [33]:
#explore the pipeline  

In [39]:
#code here
#pipe.

In [40]:
from sklearn import set_config

In [41]:
set_config(display='diagram')

In [42]:
#predict

In [43]:
y_pred = pipe.predict(x_test)

In [44]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.6256983240223464

In [45]:
#cross validation using pipeline

In [54]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe ,x_train , y_train , cv=5 , scoring='accuracy' ).mean()

0.6391214419383433

In [55]:
#GridSearch Using pipeline 

In [57]:
#gridsearch 
params = {
    'trf5__max_depth':[1,2,3,4,5,None]
}

In [59]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe,params,cv=5,scoring="accuracy")
grid.fit(x_train,y_train)

In [60]:
grid.best_score_

0.6391214419383433

In [62]:
grid.best_params_

{'trf5__max_depth': 2}

In [63]:
# exporting the pipeline 

In [65]:
#export
import pickle 
pickle.dump(pipe,open('pipeline.pkl','wb'))