In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline 
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
Data=load_iris()
#print(Data.data,Data.target)
#print(Data.feature_names,Data.target_names)
X_train,X_test,y_train,y_test = train_test_split(Data.data,Data.target,test_size=0.25,random_state=0)
# Create Pipeline essentials
pipeline_LR =Pipeline([('Scaler1',StandardScaler()),('PCA1',PCA(n_components=2)),('LR',LogisticRegression(random_state=0))])
pipeline_DTC =Pipeline([('Scaler2',StandardScaler()),('PCA2',PCA(n_components=2)),('DTC',DecisionTreeClassifier())])
pipeline_RF=Pipeline([('Scaler3', StandardScaler()),('PCA3',PCA(n_components=2)),('RF',RandomForestClassifier())])
pipelines=[pipeline_LR, pipeline_DTC, pipeline_RF]
pipe_dict={0:'LogisticRegression',1:'DecisionTreeClassifier', 2:'RandomForestClassifier'}
#Fit dataset'iris' to pipeline
for pipe in pipelines:
    pipe.fit(X_train,y_train)
#Enumerate Test Accuracy for each model and print the results as statement
for i,pipe in enumerate(pipelines):
    print("{} Test Accuracy : {} ".format(pipe_dict[i],pipe.score(X_test,y_test)))
# To verify which pipe is at best
best_accuracy=0
for i,pipe in enumerate(pipelines):
    if pipe.score(X_test,y_test)>best_accuracy:
        best_accuracy = pipe.score(X_test,y_test)
        best_pipeline=pipe
print("Classifier with best accuracy : {}".format(pipe_dict[i]))
#Perform hyperparameter tuning for the pipe - RandomForestClassifier    
from sklearn.model_selection import GridSearchCV
best_pipeline = RandomForestClassifier()
#Defining the parameters
n_estimators = [int(x) for x in np.linspace(10,200,10)]
max_features = ['auto','sqrt']
max_depth = [2,4]
min_samples_split=[2,5]
min_samples_leaf=[1,2]
bootstrap = [True,False]
#Create the paramgrid
param_grid = {'n_estimators':n_estimators,
             'max_features':max_features,
             'max_depth':max_depth,
             'min_samples_split':min_samples_split,
             'min_samples_leaf':min_samples_leaf,
             'bootstrap':bootstrap}
Grid_RF = GridSearchCV(best_pipeline,param_grid=param_grid,cv=3,verbose=2,n_jobs=-1)
Grid_RF.fit(X_train,y_train)
y_pred=Grid_RF.predict(X_test)
Grid_RF.best_params_
print("Accuracy: %.2f" % accuracy_score(y_test, y_pred))

LogisticRegression Test Accuracy : 0.8421052631578947 
DecisionTreeClassifier Test Accuracy : 0.8947368421052632 
RandomForestClassifier Test Accuracy : 0.8947368421052632 
Classifier with best accuracy : RandomForestClassifier
Fitting 3 folds for each of 320 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:   45.4s
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  2.2min finished


Accuracy: 0.95
