In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline , make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

In [2]:
df = pd.read_csv(r'C:\Users\alisa\OneDrive\Desktop\Machine Learning Note\Pandas Profiling\train (1).csv')
df.sample(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
385,386,0,2,"Davies, Mr. Charles Henry",male,18.0,0,0,S.O.C. 14879,73.5,,S
202,203,0,3,"Johanson, Mr. Jakob Alfred",male,34.0,0,0,3101264,6.4958,,S


In [4]:
df.drop(columns=['PassengerId','Name' , 'Ticket' , 'Cabin'] , inplace=True)

In [5]:
df.sample(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
592,0,3,male,47.0,0,0,7.25,S


In [6]:
X_train , X_test , y_train , y_test=train_test_split(df.iloc[:,1:] , df.iloc[:,1:2] , test_size=.2 , random_state=42)

In [8]:
X_train.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S


In [9]:
X_train.isnull().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

**Pipeline**

handling missing value-> OHE->scaling->feature selection-> model

In [11]:
# inputing transformer
pipe1 = ColumnTransformer([
    ('impute_age' , SimpleImputer(strategy='mean') ,[2]),#ehy [2] not ['age'] -> [age] will give np arry , it will create problem while doing OHE(iss lia eithout me problem aa raha tha)
    ('impute_Embaarked' , SimpleImputer(strategy='most_frequent') , [6])
], remainder='passthrough')

In [12]:
# applying OHE
pipe2 = ColumnTransformer([
    ('ohe_sex_and_embarked' , OneHotEncoder(sparse_output=False ,handle_unknown='ignore' ),[1,6])#[1,6] -> [age,embarked]
],remainder='passthrough')

drop='first' iss lia nahi kia Q ki DT algorithm hai

In [13]:
# scaling
from sklearn.preprocessing import MinMaxScaler
pipe3 = ColumnTransformer([
    ('scale' ,MinMaxScaler(),slice(0,10))
]) # pass through iss lia nahi laga q ki nahi chahiye like we all ready complete our work that's thy no need o pass through

Slice(0,8) wo total col. ke lia hi like OHE ke baad total 9 cols. hoga 

In [15]:
from sklearn.feature_selection import SelectKBest,chi2
# Feature selection
pipe4 = SelectKBest(score_func=chi2,k=5) # top 5 feature

In [16]:
pipe5 = DecisionTreeClassifier()

Create pipeline

In [17]:
# pipe =  Pipeline([])
pipe =  Pipeline([
    ('pipe1' , pipe1),
    ('pipe2' , pipe2),
    ('pipe3' , pipe3),
    ('pipe4' , pipe4),
    ('pipe5' , pipe5)
])

Pipeline Vs make_pipeline
Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

In [19]:
# Alternate Syntax
pipe_Alter = make_pipeline(pipe1,pipe2,pipe3,pipe4,pipe5)

In [20]:
# train
pipe.fit(X_train,y_train)

In [38]:
# Predict
y_pred = pipe.predict(X_test)
y_pred

array([1, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 3, 1, 3, 3, 3, 3, 3,
       1, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 1, 1, 3, 1, 1, 1, 3, 3, 3, 1, 3, 3, 3, 3, 1, 3, 1, 1, 1, 1,
       2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 1,
       3, 3, 1, 3, 3, 3, 1, 1, 3, 3, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 1,
       2, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 1, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 1, 3, 3, 1, 3, 1, 3, 3, 3, 3, 1, 3, 3, 1, 3, 1,
       3, 3, 3], dtype=int64)

In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6033519553072626

***Cross Validation using Pipeline***

In [23]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.5632620900226534

**Explore the Pipeline**

In [24]:
pipe.named_steps

{'pipe1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_Embaarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'pipe2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_and_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'pipe3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'pipe4': SelectKBest(k=5, score_func=<function chi2 at 0x00000165223B4540>),
 'pipe5': DecisionTreeClassifier()}

In [31]:
# simple impuer ka age ka value kitna hai
# pipe dictionary me hai
pipe.named_steps['pipe1'].transformers_[0][1].statistics_

array([29.49884615])

Exporting the Pipeline


In [37]:
# export 
import pickle
pickle.dump(pipe,open(r'C:\Users\alisa\OneDrive\Desktop\Machine Learning Note\ML Pipeline\pickle file/pipe.pkl','wb'))