# without using pipeline

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [5]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
df['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [7]:
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## TrainTest Split

In [8]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [9]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [10]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

## PreProcessing and Transformation

In [11]:
# Applying imputation

si_age = SimpleImputer()                         # in age column, missing value is imputed by mean
si_embarked = SimpleImputer(strategy='most_frequent')        # in embarked column missing value is imputed by mode  (thats why we use strategy = 'most_frequent')

x_train_age = si_age.fit_transform(x_train[['Age']])
x_train_embarked = si_embarked.fit_transform(x_train[['Embarked']])

x_test_age = si_age.transform(x_test[['Age']])
x_test_embarked = si_embarked.transform(x_test[['Embarked']])

In [12]:
# one hot encoding for Sex and Embarked column

ohe_sex = OneHotEncoder(sparse=False,handle_unknown='ignore')               # here it is not necessary to avoid multi colinearty since at last we have to apply Decision tree algorithm (for linear model, it is necessary to avoid multi colinearity)
ohe_embarked = OneHotEncoder(sparse=False,handle_unknown='ignore')          # embarked have missing value so we cant't apply OHE simultaneously on both age and embarked

x_train_sex = ohe_sex.fit_transform(x_train[['Sex']])
x_train_embarked = ohe_embarked.fit_transform(x_train_embarked)

x_test_sex = ohe_sex.transform(x_test[['Sex']])
x_test_embarked = ohe_embarked.transform(x_test_embarked)

In [13]:
x_train_rem = x_train.drop(columns=['Sex','Age','Embarked'])    # after applying feature engineering we store the remaining column (on which feature engineering is not applied) in x_train_rem
x_test_rem = x_test.drop(columns=['Sex','Age','Embarked'])

In [14]:
x_train_transformed = np.concatenate((x_train_rem,x_train_age,x_train_sex,x_train_embarked),axis=1)    # we concatenate the column on which feature engineering applied with the remaining column
x_test_transformed = np.concatenate((x_test_rem,x_test_age,x_test_sex,x_test_embarked),axis=1)

In [15]:
x_train_transformed.shape

(712, 10)

## Decision Tree Model

In [16]:
clf = DecisionTreeClassifier()

clf.fit(x_train_transformed,y_train)

In [17]:
y_pred = clf.predict(x_test_transformed)
y_pred

array([0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.770949720670391

# Using Pipeline

In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2

In [20]:
df = pd.read_csv('train.csv')

In [21]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

## TrainTest Split

In [23]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [24]:
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


## Feature Engineering

In [25]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),                   # in this case we pass column index instead of column name since in building pipelines column is call by index value while it passes from one step to another (since column_transformer return numpy array which does not have column name)
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
     ],remainder='passthrough')

In [26]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
     ],remainder='passthrough')

In [27]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))          # here multi colinearity doesn't taken care of so after one hot encoding we produce 2(from gender) + 3(from embarks) = 5 additional column along with 2 column get minus (one for gender and one for embarks), so total number of column is 7(previously) + 5 - 2 = 10 and so min max scaler took place for column index 0 to 9
     ])

In [28]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=8)          # k = 8 means we are using top 8 column out of 10

In [29]:
# train the model
trf5 = DecisionTreeClassifier()

all the individual chain has been created and now we join then using pipeline

## Create Pipeline

In [30]:
pipe = Pipeline([                   # here we pass a list of tuple of all the object created above
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
     ])

* Alternate Syntax:

pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [31]:
# train
pipe.fit(x_train,y_train)         # if we are not training any model through the pipeline or at the end of the pipeline then we use pipi.fit_transform(x_train,y_train)

In [32]:
pipe.named_steps             # it shows all the individual steps 

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000001F553FA3D90>),
 'trf5': DecisionTreeClassifier()}

In [33]:
pipe.named_steps['trf1'].transformers_           # it shows all the steps occured in trf1

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder', 'passthrough', [0, 1, 3, 4, 5])]

In [34]:
pipe.named_steps['trf1'].transformers_[0]           # it shows 1st steps occured in trf1

('impute_age', SimpleImputer(), [2])

In [35]:
pipe.named_steps['trf1'].transformers_[0][1]          # it shows 1st object of 1st steps occured in trf1

In [36]:
pipe.named_steps['trf1'].transformers_[0][1].statistics_     # we can use the attributes of 1st object of 1st step of trf1 (here statics_ gives the mean calculated by the imputer)

array([29.49884615])

In [37]:
pipe.named_steps['trf1'].transformers_[1]              # it shows 2nd steps occured in trf1

('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])

In [38]:
pipe.named_steps['trf1'].transformers_[1][1]          # it shows 1st object of 2nd steps occured in trf1

In [39]:
pipe.named_steps['trf1'].transformers_[1][1].statistics_     # we can use the attributes of 1st object of 2nd step of trf1 (here statics_ gives the mode calculated by the imputer)

array(['S'], dtype=object)

the above steps helps in debugging

In [40]:
# Predict
y_pred = pipe.predict(x_test)
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [41]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.6256983240223464

here accuracy is lesser then the model without pipeline because by using feature selection we have forcefully reduced the significant column so due to this accuracy decreases

## Cross Validation using Pipeline

In cross validation, we made train test split in our data different times and run the algorithm different times and then we calculate the mean score of that

In [42]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score

cross_val_score(pipe, x_train, y_train, cv=5, scoring='accuracy').mean()      # cv = 5 mean the train test split occur 5 times

0.6391214419383433

## GridSearch using Pipeline

In this we do hyper parameter tuning and by doing this we tune the performance of algorithm

In [43]:
# gridsearchcv
params = {
          'trf5__max_depth':[1,2,3,4,5,None]            # in this we set the max depth parameter one by one and calculate the accuracy of decision tree corresponding to each max depth
         }

In [44]:
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(x_train, y_train)

In [45]:
grid.best_score_

0.6391214419383433

In [46]:
grid.best_params_

{'trf5__max_depth': 2}