# Pipeline
Pipeline is a data processing technique that allows you to transform data from one representation to another through a sequence of stages. Each stage is a transformation of the data, and the output of one stage is the input of the next stage.

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [6]:
df=sns.load_dataset('titanic')

X = df[['pclass','sex','age','fare','embarked']]
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_features = ['age', 'fare']
cat_features = ['pclass','sex','embarked']

num_trans = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])
cat_trans = Pipeline(steps=[
    ('encoder', OneHotEncoder()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

preprocessor = ColumnTransformer(transformers=[
    ('num',num_trans,num_features),
    ('cat',cat_trans,cat_features)
])

In [7]:
pipeline=Pipeline(steps=[('preprocessor',preprocessor),('classifier',RandomForestClassifier(random_state=42))])
# fit the pipeline with the training data
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.7877094972067039
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       105
           1       0.74      0.74      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

[[86 19]
 [19 55]]


# Hyperparameter Tuning using Pipeline

In [8]:
from sklearn.model_selection import GridSearchCV

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42) 

pipeline = Pipeline([
    ('Imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder',OneHotEncoder(handle_unknown='ignore')),
    ('model',RandomForestClassifier(random_state=42))
])
# Define the hyperparameters to tune
hyperparameters = {
    'model__n_estimators': [100, 200, 300, 500],
    'model__max_depth': [None, 5, 10, 30],
    'model__min_samples_split': [2, 5, 10, 15]
}
grid=GridSearchCV(pipeline, hyperparameters, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)

{'model__max_depth': 10, 'model__min_samples_split': 10, 'model__n_estimators': 300}


In [14]:
best_model = grid.best_estimator_
y_pred = best_model.predict(X_val)
print(accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

0.8493150684931506
              precision    recall  f1-score   support

           0       0.90      0.88      0.89        49
           1       0.76      0.79      0.78        24

    accuracy                           0.85        73
   macro avg       0.83      0.83      0.83        73
weighted avg       0.85      0.85      0.85        73

[[43  6]
 [ 5 19]]


# Selecting best model in Pipeline

In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [20]:
models = [
    ('RandomForestClassifier',RandomForestClassifier(random_state=42)),
    ('GradientBoostingClassifier',GradientBoostingClassifier(random_state=42)),
    ('XGBClassifier',XGBClassifier(random_state=42)),
    ('SVM',SVC(random_state=42)),
    ('LogisticReg',LogisticRegression(random_state=42))
]
best_models = None
best_accuracy = 0

for name,model in models:
    pipeline = Pipeline([
        ('Imputer',SimpleImputer(strategy='most_frequent')),
        ('encoder',OneHotEncoder(handle_unknown='ignore')),
        ('model',model)
    ])
    score = cross_val_score(pipeline, X_train, y_train, cv=5)
    mean_acc= score.mean()
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Cross-Val accuracy: {mean_acc}")
    print(f"{name} accuracy: {accuracy}")
    #print(classification_report(y_test, y_pred))
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = pipeline
        best_model_name = name
print(f"Best model: {best_model_name} with accuracy: {best_accuracy}")

RandomForestClassifier Cross-Val accuracy: 0.8247808299240209
RandomForestClassifier accuracy: 0.8212290502793296
GradientBoostingClassifier Cross-Val accuracy: 0.8454120397428404
GradientBoostingClassifier accuracy: 0.8100558659217877
XGBClassifier Cross-Val accuracy: 0.8041496201052016
XGBClassifier accuracy: 0.7653631284916201
SVM Cross-Val accuracy: 0.821332554061952
SVM accuracy: 0.7932960893854749
LogisticReg Cross-Val accuracy: 0.8281706604324957
LogisticReg accuracy: 0.8044692737430168
Best model: RandomForestClassifier with accuracy: 0.8212290502793296
