In [1]:
# To Data importing and processing
import pandas as pd
import numpy as np

# The data
from sklearn.datasets import load_breast_cancer

# For Scaling the data in range of 0-1
from sklearn.preprocessing import MinMaxScaler

# For reducing the dimentionality of the data
from sklearn.decomposition import PCA

# For splitting the data
from sklearn.model_selection import train_test_split

# Creating a Pipeline
from sklearn.pipeline import Pipeline

# For creating a model1
from sklearn.linear_model import LogisticRegression

# For creating a model2
from sklearn.ensemble import RandomForestClassifier

# For creating a model3
from xgboost import XGBClassifier

# For evaluating our performance
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
df = load_breast_cancer()

In [3]:
X = df.data
y = df.target

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Pipelines Creation
#### 1. Data Preprocessing by using Standard Scaler
#### 2. Reduce Dimension using PCA
#### 3. Apply  Classifier

In [5]:
pipeline_lr = Pipeline([('MinMaxscaler1', MinMaxScaler()), 
                       ('PCA1', PCA(n_components=10)), 
                       ('lr_classification', LogisticRegression())])

In [6]:
pipeline_rf = Pipeline([('MinMaxscaler2', MinMaxScaler()), 
                       ('PCA2', PCA(n_components=10)), 
                       ('rf_classification', RandomForestClassifier())])

In [7]:
pipeline_xg = Pipeline([('MinMaxscaler3', MinMaxScaler()), 
                       ('PCA3', PCA(n_components=10)), 
                       ('xg_classification', XGBClassifier())])

In [8]:
# Making List of Pipeline

mypipelines = [pipeline_lr, pipeline_rf, pipeline_xg]

In [9]:
# Creating variables to store values

best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [10]:
# Dictionary of pipelines and classifier types for ease of reference

pipe_dict = {0: 'Logistic Regression', 1: 'Random Forest', 2: 'XGB Classifier'}

# Fit the pipelines

for pipe in mypipelines:
    pipe.fit(X_train, y_train)

In [11]:
for i, model in enumerate(mypipelines):
    print(f'{pipe_dict[i]} Test Accuracy : {model.score(X_test,y_test)}')

Logistic Regression Test Accuracy : 0.956140350877193
Random Forest Test Accuracy : 0.9473684210526315
XGB Classifier Test Accuracy : 0.9649122807017544


In [12]:
# Choosing the best Model for the given data

for i, model in enumerate(mypipelines):
    if model.score(X_test, y_test) > best_accuracy:
        accuracy = model.score(X_test, y_test)
        
        best_pipeline = model
        
        best_classifier = i
        
print(f"Classification With Best Accuracy: {pipe_dict[best_classifier]}")

Classification With Best Accuracy: XGB Classifier


<br>
_____________________________________________________________________________________________________________________________

# Hyperparameter Tuning With Grid SearchCV Using Pipelines

In [16]:
from sklearn.model_selection import GridSearchCV

my_pip = Pipeline([("classification", XGBClassifier())])

# Create dictionary with candidate learning algorithms and their hyperparameters

grid_parameters = [{"classification": [LogisticRegression()], 
                    "classification__penalty": ['l2','l1'], 
                    "classification__C": np.logspace(0,4,10)}, 
                  
                   {"classification": [RandomForestClassifier()], 
                    "classification__n_estimators": [10,100,1000], 
                    "classification__max_depth": [5,10,15,20,25,50,100]}, 
                   
                   {"classification": [XGBClassifier()], 
                    "classification__penalty": ['l2','l1'], 
                    "classification__booster": ['gbtree', 'gblinear']}]

# create a gridsearch of the pipeline, the fit the best model

gridsearch = GridSearchCV(my_pip, grid_parameters, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [17]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(steps=[('classification', RandomForestClassifier(max_depth=10))])
The mean accuracy of the model is: 0.9649122807017544


<br>
_____________________________________________________________________________________________________________________________

# Make_pipelines in Sklearn

In [18]:
from sklearn.pipeline import make_pipeline

pipe = make_pipeline((RandomForestClassifier()))

# Create dictionary with candidate learning algorithms and their hyperparameters

grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 100, 1000],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]

# create a gridsearch of the pipeline, the fit the best model

gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)


best_model.score(X_test,y_test)

0.9385964912280702