In [17]:
! pip install numpy 
! pip install pandas 
! pip install sklearn 



In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## What are Pipelines?

**Pipelines chains together multiple steps so that the output of each syep is used as input to the next step
Pipelines make it easy to apply the same preprocessing to train and test**

## Let's Plan

In [19]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [20]:
# Step 1 -> train/test/split
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['Survived']),
                                                 df['Survived'],
                                                 test_size=0.2,
                                                random_state=42)

The random_state parameter in the train_test_split function is used to control the random shuffling and splitting of the data. It determines the randomization applied to the data before splitting. By setting a specific value for random_state, you can ensure that the random shuffling of the data is reproducible.

In [21]:
X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [22]:
y_train.sample(5)

242    0
441    0
142    1
310    1
355    0
Name: Survived, dtype: int64

In [23]:
# imputation transformer
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

Here the remainder = passthrough means that the data will be continue print after the imputation

In [24]:
# one hot encoding
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

In [25]:
df = trf2.fit_transform(df)
print(df)

[[0.0 0.0 1.0 ... 1 0 'S']
 [1.0 0.0 0.0 ... 1 0 'C']
 [0.0 0.0 1.0 ... 0 0 'S']
 ...
 [0.0 0.0 1.0 ... 1 2 'S']
 [1.0 0.0 0.0 ... 0 0 'C']
 [0.0 0.0 1.0 ... 0 0 'Q']]


In [26]:
# Scaling
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

The slice(0,10) argument to the ColumnTransformer transformer specifies that the min-max scaling should be applied to the first 10 columns of the dataset.

In [27]:
# Transform the data
df = trf3.fit_transform(df)

# Print the transformed data
print(df)

[[0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


In [28]:
# Feature selection
trf4 = SelectKBest(score_func=chi2,k=8)

In [29]:
# train the model
trf5 = DecisionTreeClassifier()

# Create Pipeline & GridSearch using Pipeline

In [30]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('trf1', ColumnTransformer(remainder='passthrough',
                                  transformers=[('impute_age', SimpleImputer(), [2]),
                                                ('impute_embarked',
                                                 SimpleImputer(strategy='most_frequent'),
                                                 [6])])),
    ('trf2', ColumnTransformer(remainder='passthrough',
                                  transformers=[('ohe_sex_embarked',
                                                 OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                 [1, 6])])),
    ('trf3', ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))])),
    ('trf4', SelectKBest(k=8, score_func= chi2 )),
    ('trf5', DecisionTreeClassifier())
])

# Define the parameter grid
params = {
    'trf5__max_depth': [1, 2, 3, 4, 5, None]
}

# Perform grid search
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('trf1',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('impute_age',
                                                                         SimpleImputer(),
                                                                         [2]),
                                                                        ('impute_embarked',
                                                                         SimpleImputer(strategy='most_frequent'),
                                                                         [6])])),
                                       ('trf2',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('ohe_sex_embarked',
                                                                         OneHotEnc

In [31]:
pipe.steps

[('trf1',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('impute_age', SimpleImputer(), [2]),
                                  ('impute_embarked',
                                   SimpleImputer(strategy='most_frequent'),
                                   [6])])),
 ('trf2',
  ColumnTransformer(remainder='passthrough',
                    transformers=[('ohe_sex_embarked',
                                   OneHotEncoder(handle_unknown='ignore',
                                                 sparse=False),
                                   [1, 6])])),
 ('trf3',
  ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))])),
 ('trf4', SelectKBest(k=8, score_func=<function chi2 at 0x00000237488F84C0>)),
 ('trf5', DecisionTreeClassifier())]

# Pipeline Vs make_pipeline

Pipeline requires naming of steps, make_pipeline does not.

(Same applies to ColumnTransformer vs make_column_transformer)

Grid search is a technique for finding the best hyperparameters for a model by searching through a grid of possible hyperparameter values. Hyperparameters are the parameters of a model that are not learned from the data. They are typically set by the user, and they can have a significant impact on the performance of the model.

We use grid search when we want to find the best hyperparameters for a model. Grid search can be helpful in a number of situations, including:

* When we have a large number of hyperparameters to tune.
* When we want to find the best model for a specific task.
* When we want to compare the performance of different models.

Grid search can be computationally expensive, so it is important to use it judiciously. However, it can be a valuable tool for finding the best hyperparameters for a model and improving the performance of the model.

Here are some of the benefits of using grid search:

* It can help you find the best hyperparameters for your model, which can lead to improved performance.
* It can help you compare the performance of different models, which can help you choose the best model for your needs.
* It can help you make sure that your model is not overfitting the training data.

However, there are also some drawbacks to using grid search:

* It can be computationally expensive, especially if you have a large number of hyperparameters to tune.
* It can be time-consuming, especially if you need to run grid search multiple times.
* It can be difficult to interpret the results of grid search, especially if you have a large number of hyperparameters to tune.

Overall, grid search is a powerful tool that can be used to find the best hyperparameters for a model. However, it is important to use it judiciously and to be aware of the potential drawbacks.

In [32]:
# Alternate Syntax
pipe = make_pipeline(trf1,trf2,trf3,trf4,trf5)

In [33]:
# train
pipe.fit(X_train,y_train)


Pipeline(steps=[('columntransformer-1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('impute_age', SimpleImputer(),
                                                  [2]),
                                                 ('impute_embarked',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  [6])])),
                ('columntransformer-2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('ohe_sex_embarked',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  [1, 6])])),
                ('columntransformer-3',
                 ColumnTransformer(transformers=[('scale', MinMaxScaler(),
                             

# Explore the Pipeline

In [34]:
# Code here
pipe.named_steps

{'columntransformer-1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'columntransformer-2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'columntransformer-3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'selectkbest': SelectKBest(k=8, score_func=<function chi2 at 0x00000237488F84C0>),
 'decisiontreeclassifier': DecisionTreeClassifier()}

## Explore the Pipeline


The set_config() function in scikit-learn is used to configure the behavior of the library. The display parameter can be set to diagram to display a diagram of the pipeline. This can be helpful for understanding how the pipeline works and for debugging problems.

In [35]:
# Display Pipeline

from sklearn import set_config
set_config(display='diagram')
pipe

In [36]:
# Predict
y_pred = pipe.predict(X_test)

In [37]:
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

In [38]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.6256983240223464

## Cross Validation using Pipeline

 Cross-validation is a technique for evaluating the performance of a model by training the model on a subset of the data and then testing the model on the remaining data.

The code then calls the cross_val_score function with the following arguments:

**pipe**: The pipeline that will be used to train and test the model.

**X_train**: The training data

**y_train**: The labels for the training data.

**cv**: The number of folds to use for cross-validation

**scoring**: The scoring metric to use for evaluating the model. In this case, the accuracy metric is being used, which measures the proportion of correctly classified samples.

In [39]:
# cross validation using cross_val_score
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean()

0.6391214419383433