In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import OrdinalEncoder 
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline



In [3]:
dfs=pd.read_csv("https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day29-sklearn-pipelines/train.csv")

In [10]:
dfs.head(4)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S


In [12]:
X_train,X_test,Y_train,y_test=train_test_split(dfs.drop(columns=['Survived']),dfs['Survived'],test_size=0.25,random_state=45)

In [13]:
X_train.shape

(668, 7)

In [14]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
798,3,male,30.0,0,0,7.2292,C
581,1,female,39.0,1,1,110.8833,C


In [15]:
X_test.shape

(223, 7)

In [16]:
# applying Imputation 

In [17]:
from sklearn.compose import ColumnTransformer 

In [18]:
transform1=ColumnTransformer(transformers=[
    ("Age",SimpleImputer(),[2]), #applied on age and Embarked columns 
    ("Embarked",SimpleImputer(strategy="most_frequent"),[6])
],remainder='passthrough')

In [19]:
X_train["Embarked"]

798    C
581    C
541    S
381    C
791    S
      ..
725    S
607    S
544    C
643    S
414    S
Name: Embarked, Length: 668, dtype: object

In [20]:
# applying onehotencoding on Sex and Embarked columns
transform2=ColumnTransformer(transformers=[
    ("Sex_embarked",OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder="passthrough")

In [21]:
transform3=ColumnTransformer(transformers=[
    (('scale',MinMaxScaler(),slice(0,10))),
])

In [22]:
# feature selection 
transform4=SelectKBest(score_func=chi2,k=8)

In [23]:
model=DecisionTreeClassifier()

In [24]:
Pipe=Pipeline([
    'tr1',transform1,
    "tr2",transform2,
    "tr3",transform3,
    "tr4",transform4,
    "tr5",model])

In [25]:
Pipe

# make_pipeline vs Pipeline

In [26]:
Pipe=make_pipeline(transform1,transform2,transform3,transform4,model)

In [27]:
from sklearn import set_config

In [28]:
set_config(display="diagram")

In [29]:
X_train.shape

(668, 7)

In [30]:
Y_train.shape
X_test.shape
# y_test.shape

(223, 7)

In [31]:
Pipe.fit(X_train,Y_train)



In [33]:
ypred=Pipe.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
accuracy_score(ypred,y_test)

0.7040358744394619

In [37]:
# cross validation using pipelne

In [40]:
from sklearn.model_selection import cross_val_score
cross_val_score(Pipe,X_train,Y_train,cv=15,scoring="accuracy").mean()



0.6034343434343434

In [41]:
#grid search 

In [70]:
param_grid={'decisiontreeclassifier__max_depth': [ 5, 10, 15,None]}

In [82]:
#apply hyperparameters 

In [71]:
from sklearn.model_selection import GridSearchCV

In [72]:
GD=GridSearchCV(Pipe,param_grid,cv=5,scoring='accuracy')

In [73]:
GD.fit(X_train,Y_train)



In [74]:
GD.best_score_

0.597284255414656

In [75]:
GD.best_params_

{'decisiontreeclassifier__max_depth': 5}

In [None]:
#saving the model 

In [77]:
import pickle
pickle.dump(Pipe,open("pipeline.pkl",'wb'))

In [None]:
#load the save model

In [78]:
pickle.load(open("pipeline.pkl",'rb'))

In [79]:
test_output=np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

In [81]:
Pipe.predict(test_output)



array([0], dtype=int64)