In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [49]:
df = pd.read_csv('/content/train.csv')

In [50]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Plan
`Age` and `Embarked` are the columns where the data are missing. So our first goal is to handle the missing value, using simple-impute, this will be done by using the column transformer by one tuple.

The output of that will be send to the second transformer whose work will be to do the OneHotEncoding for the two columns that are: `sex` and `embarked`.

Again the output of the OHE, that will be send to the another transformer i.e. scaling because all the transformers can be in the same numerical values.

After this will be doing the feature selection, we will take best 5 features. [This is the optional step here, still I am adding coz it can be used in future by you in other dataset, however here you can take reference how we do it].

After this we will train the model with decision tree and that will give me output.

In [51]:
df.drop(columns = ['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace = True)

## Train and Test Split

In [52]:
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']), df['Survived'], test_size = 0.2, random_state = 42)

In [53]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [54]:
y_train.sample(5)

Unnamed: 0,Survived
167,0
645,1
154,0
378,0
555,0


# Steps of Column Transformer

## Column Transformer 1: Imputation Transformer


In [55]:
trf1 = ColumnTransformer([
    ('impute_age', SimpleImputer(),[2]),
    ('impute_embarked', SimpleImputer(strategy = 'most_frequent'),[6])
], remainder = 'passthrough')

Here in the simple imputer we have given the index of the column meanwhile you can notice that we didn't mention through with the column name, so by this I can say to you that, if you use the name instead of the index then we know that when we are using the simple-imputer then it doesn't give in the Dataframe it gives in the numpy array, and in this array there is no name of the column its just an array, when we will forward to the another transformer then the transformer will get the numpy array and that transformer will search the name of that column, by this the pipeline will get bursts.

Suggestion is when you are using the pipeline, then all transformer should use the index type calling to the columns. Otherwise the pipeline will bursts.

Or we can also say in Hindi: "Pipeline fatt gya".

## Column Transformer 2: OneHotEncoding

In [56]:
trf2 = ColumnTransformer([
    ('ohe_sex_embarked', OneHotEncoder(sparse_output = False, handle_unknown = 'ignore'), [1,6])
], remainder = 'passthrough')

## Column Transformer 3: Scaling

In [57]:
trf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0,10))
])

## Column Transformer 4: Feature Selection

In [58]:
trf4 = SelectKBest(score_func = chi2, k=5)

## Column Transformer 5: Train the model

In [59]:
trf5 = DecisionTreeClassifier()

# Creating Pipeline

In [60]:
pipe = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3),
    ('trf4', trf4),
    ('trf5', trf5)
])

# Pipeline vs make_pipeline

Pipeline requires naming of steps, make_pipeline doesn't.

(Same applies to Column Transformer vs make_column_transformer)

In [61]:
# Alternative Syntax [Optional]
pipe = make_pipeline(trf1, trf2, trf3, trf4, trf5)

In [62]:
# Train
pipe.fit(X_train, y_train)

Suppose, if were using only three steps of transformation: `Imputation`, `OneHotEncoding`, `Scaling` and we didn't implement the algorithm, then we use `pipe.fit_transfrom` instead of `pipe.fit`. Because we aren't doing model training here, we have just completed the data preprocessing. We can use the `pipe.fit` however you have to do the transformation of this training again.

Summary:

- With Algorithm: `fit` and `predict`
- Without Algorithm: `fit_transform`

# Explroe the Pipelines

In [63]:
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=5, score_func=<function chi2 at 0x78d9421e2200>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

In [64]:
pipe.named_steps['columntransformer-1']

In [65]:
pipe.named_steps['columntransformer-1'].transformers_

[('impute_age', SimpleImputer(), [2]),
 ('impute_embarked', SimpleImputer(strategy='most_frequent'), [6]),
 ('remainder',
  FunctionTransformer(accept_sparse=True, check_inverse=False,
                      feature_names_out='one-to-one'),
  [0, 1, 3, 4, 5])]

In [66]:
print(pipe.named_steps['columntransformer-1'].transformers_[0])
print(pipe.named_steps['columntransformer-1'].transformers_[0][1])

('impute_age', SimpleImputer(), [2])
SimpleImputer()


In [67]:
pipe.named_steps['columntransformer-1'].transformers_[0][1].statistics_

array([29.49884615])

In [68]:
print(pipe.named_steps['columntransformer-1'].transformers_[1])
print(pipe.named_steps['columntransformer-1'].transformers_[1][1])
print(pipe.named_steps['columntransformer-1'].transformers_[1][1].statistics_)

('impute_embarked', SimpleImputer(strategy='most_frequent'), [6])
SimpleImputer(strategy='most_frequent')
['S']


In [69]:
from sklearn import set_config
set_config(display = 'diagram')

In [70]:
y_pred = pipe.predict(X_test)

In [71]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0])

In [72]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6256983240223464

# Cross Validation using Pipeline

In [73]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv = 5, scoring = 'accuracy')

array([0.6013986 , 0.62237762, 0.68309859, 0.65492958, 0.63380282])

In [74]:
cross_val_score(pipe, X_train, y_train, cv = 5, scoring = 'accuracy').mean()

0.6391214419383433

# GridSearch using Pipeline

In [77]:
# GridSearch CV
# Assuming 'decisiontreeclassifier' is the name of your DecisionTreeClassifier step in the pipeline
params = {
    'decisiontreeclassifier__max_depth':[1,2,3,4,5,None]
}

In [78]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, params, cv = 5, scoring = 'accuracy')
grid.fit(X_train, y_train)

In [79]:
grid.best_score_

0.6391214419383433

In [80]:
grid.best_params_

{'decisiontreeclassifier__max_depth': 2}

# Exporting the Pipeline

Using the pipeline in production

In [82]:
import pickle
pickle.dump(pipe, open('pipe.pkl', 'wb'))

# Production Code

In [83]:
import pickle
import numpy as np

In [84]:
pipe = pickle.load(open('pipe.pkl', 'rb'))

In [85]:
test_input2 = np.array([2,'male', 31.0, 0, 0, 10.5, 'S'], dtype = object).reshape(1,7)

In [86]:
pipe.predict(test_input2)



array([0])