In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier 
from sklearn import tree #Decision tree--eager learner

from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, precision_score

from sklearn.pipeline import make_pipeline

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [4]:
#Load dataset
irisDF=pd.read_csv('iris_dataset.csv')
irisDF.head(3)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa


In [5]:
#Encode categorical label
speciesEncoder=LabelEncoder()
irisDF['class']=speciesEncoder.fit_transform(irisDF['class'])

In [6]:
irisDF.head(3)

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0


In [68]:
X=irisDF.iloc[:,:-1]

In [69]:
X

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   sepallength  150 non-null    float64
 1   sepalwidth   150 non-null    float64
 2   petallength  150 non-null    float64
 3   petalwidth   150 non-null    float64
dtypes: float64(4)
memory usage: 4.8 KB


In [9]:
y=irisDF.iloc[:,-1]
type(y)

pandas.core.series.Series

In [70]:
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: class, Length: 150, dtype: int32

# Cross Validation: k-Fold Cross Validation

In [10]:
k=5 #set the number of splits (k)

In [11]:
#create cross validation object
kfolds=StratifiedKFold(n_splits=k,shuffle=False) #for 5-fold cross validation

In [12]:
print(kfolds)

StratifiedKFold(n_splits=5, random_state=None, shuffle=False)


In [71]:
#Instantiate classifier
rfClf=RandomForestClassifier(n_estimators=70)

In [72]:
logRegClf=LogisticRegression(max_iter=250)

In [73]:
cols=irisDF.columns
cols=cols[:-1] #retrieve all feature columns

In [74]:
cols

Index(['sepallength', 'sepalwidth', 'petallength', 'petalwidth'], dtype='object')

In [17]:
irisDF.iloc[[2,4,5]]

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
2,4.7,3.2,1.3,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0


In [18]:
accuracy={}
fold_count=1
for train_index, test_index in kfolds.split(X, y):
        #print(train_index,'\n',test_index)
        
        X_train=irisDF.iloc[train_index].loc[:,cols]
        X_test=irisDF.iloc[test_index].loc[:,cols]
        
        y_train=irisDF.iloc[train_index].loc[:,'class']
        y_test=irisDF.iloc[test_index].loc[:,'class']
        #y_test=irisDF.loc[test_index]['class']
        #fit the model
        
        scaler=StandardScaler()
        X_train=scaler.fit_transform(X_train)
        rfClf.fit(X_train,y_train)
        #Work out predictions
        X_test=scaler.transform(X_test)
        rf_predictions=rfClf.predict(X_test)
        key='fold#'+str(fold_count)
        fold_accuracy=accuracy_score(y_test,rf_predictions)
        accuracy[key]=fold_accuracy
        print(f"{key } accuracy = {fold_accuracy:.3f}")
        fold_count+=1
        
print(accuracy.values())   
avg_accuracy=sum(accuracy.values())/k
print(f"Avg accuracy: {avg_accuracy:.3f}")

fold#1 accuracy = 0.967
fold#2 accuracy = 0.967
fold#3 accuracy = 0.933
fold#4 accuracy = 0.900
fold#5 accuracy = 1.000
dict_values([0.9666666666666667, 0.9666666666666667, 0.9333333333333333, 0.9, 1.0])
Avg accuracy: 0.953


In [75]:
X.shape

(150, 4)

In [76]:
y.shape

(150,)

In [89]:
#Or simply use cross_val_score...with random forest
fold_scores = cross_val_score(rfClf, X, y, cv=k, scoring='accuracy')
print(fold_scores)
print(len(fold_scores))
print(f"Average accuracy:{np.mean(fold_scores):.3f}")

[0.96666667 0.96666667 0.93333333 0.96666667 1.        ]
5
Average accuracy:0.967


In [91]:
#Or simply use cross_val_score...with LogisticRegression
fold_scores = cross_val_score(logRegClf, X, y, cv=k, scoring='accuracy')
print(fold_scores)
print(f"Average accuracy:{np.mean(fold_scores):.3f}")

[0.96666667 1.         0.93333333 0.96666667 1.        ]
Average accuracy:0.973


In [92]:
irisDF.iloc[[1,2,3]]

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth,class
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0


In [93]:
irisDF.iloc[[1,2,3]].loc[:,cols] #retrieve row indexes 1,2,3 then retrieve all rows, excluding class

Unnamed: 0,sepallength,sepalwidth,petallength,petalwidth
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2


In [96]:
for train_index, test_index in kfolds.split(X, y):
    print(train_index)
    print(test_index)

[ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
[  0   1   2   3   4   5   6   7   8   9  50  51  52  53  54  55  56  57
  58  59 100 101 102 103 104 105 106 107 108 109]
[  0   1   2   3   4   5   6   7   8   9  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  50  51  52  53  54  55  56  57  58  59  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108 109
 120 121 122 123 124 125

In [94]:
accuracy={}
fold_count=1
for train_index, test_index in kfolds.split(X, y):
        X_train=X.iloc[train_index]
        X_test=X.iloc[test_index]
        
        y_train=y[train_index]
        y_test=y[test_index]
        print('*'*80)
        print(train_index,'\n\n',test_index)
        #fit the model
        rfClf.fit(X_train,y_train)
        #Work out predictions
        rf_predictions=rfClf.predict(X_test)
        key='fold#'+str(fold_count)
        fold_accuracy=accuracy_score(y_test,rf_predictions)
        accuracy[key]=fold_accuracy
        print('*'*80)
        print(f"\n{key } accuracy = {fold_accuracy:.3f}")

        fold_count+=1
        
print(list(accuracy.values()))   
avg_accuracy=sum(accuracy.values())/k
print(f"Avg accuracy: {avg_accuracy:.3f}")

[ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95  96  97  98  99 110 111 112 113 114 115 116 117 118 119
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
 138 139 140 141 142 143 144 145 146 147 148 149]
[  0   1   2   3   4   5   6   7   8   9  50  51  52  53  54  55  56  57
  58  59 100 101 102 103 104 105 106 107 108 109]
********************************************************************************
[ 10  11  12  13  14  15  16  17  18  19  20  21  22  23  24  25  26  27
  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45
  46  47  48  49  60  61  62  63  64  65  66  67  68  69  70  71  72  73
  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90  91
  92  93  94  95

In [26]:
#Or simply use cross_val_score...with random forest
fold_scores = cross_val_score(rfClf, X, y, cv=k, scoring='accuracy')
print(fold_scores)
print(f"Accuracy:{np.mean(fold_scores):.3f}")

[0.96666667 0.96666667 0.93333333 0.93333333 1.        ]
Accuracy:0.960


In [27]:
#Or simply  use cross_val_score...with random forest
fold_scores = cross_val_score(rfClf, X, y, cv=k, scoring='recall_weighted')
print(fold_scores)
print(f"Recall:{np.mean(fold_scores):.3f}")

[0.96666667 0.96666667 0.93333333 0.96666667 1.        ]
Recall:0.967


In [28]:
#Or simply use cross_val_score...with random forest
fold_scores = cross_val_score(rfClf, X, y, cv=k, scoring='f1_weighted')
print("f1-weighted\n",fold_scores)


f1-weighted
 [0.96658312 0.96658312 0.93265993 0.96658312 1.        ]


In [29]:
#Or simply  use cross_validate...with random forest---for multiple metrics use cross_validate function instead of cross_val_score

fold_scores = cross_validate(rfClf, X, y, cv=k, scoring=['accuracy','precision_weighted','recall_weighted','f1_weighted'])
#print(fold_scores)

for key,value in fold_scores.items():
    value=[float(f"{x:.3f}") for x in value] #render value in 3 decimal places---list comprehension.
    print("\n",key,"\t", value,f"\nAverage {key}:", f"{np.mean(value):.3f}")



 fit_time 	 [0.084, 0.064, 0.097, 0.114, 0.068] 
Average fit_time: 0.085

 score_time 	 [0.016, 0.016, 0.014, 0.021, 0.016] 
Average score_time: 0.017

 test_accuracy 	 [0.967, 0.967, 0.933, 0.9, 1.0] 
Average test_accuracy: 0.953

 test_precision_weighted 	 [0.97, 0.97, 0.944, 0.902, 1.0] 
Average test_precision_weighted: 0.957

 test_recall_weighted 	 [0.967, 0.967, 0.933, 0.9, 1.0] 
Average test_recall_weighted: 0.953

 test_f1_weighted 	 [0.967, 0.967, 0.933, 0.9, 1.0] 
Average test_f1_weighted: 0.953


In [30]:
#If data needs scaling--make a pipeline---see imports
#pipeline includes a)transformers and 2)final estimator/model learner
#single scoring function

kfold=KFold(n_splits=k)
knnClf = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
cross_val_score(knnClf, X, y, cv=kfold,scoring='accuracy')

array([1.        , 1.        , 0.8       , 0.93333333, 0.8       ])

In [31]:
#See the pipeline here.
knnClf

In [32]:
#If data needs scaling--make a pipeline
#with multiple scoring functions

kfold=KFold(n_splits=k)
scaler=StandardScaler()
knnClf = make_pipeline(scaler, KNeighborsClassifier(n_neighbors=3)) #make pipeline
#cross_validate(knnClf, X, y, cv=kfold) #Or
cross_validate(knnClf, X, y, cv=kfold,scoring=['accuracy','precision_micro','recall_micro'])

{'fit_time': array([0.00470114, 0.        , 0.00203896, 0.        , 0.00066519]),
 'score_time': array([0.01480746, 0.00914001, 0.00600648, 0.        , 0.        ]),
 'test_accuracy': array([1.        , 1.        , 0.8       , 0.93333333, 0.8       ]),
 'test_precision_micro': array([1.        , 1.        , 0.8       , 0.93333333, 0.8       ]),
 'test_recall_micro': array([1.        , 1.        , 0.8       , 0.93333333, 0.8       ])}

# Hyperparameter tuning

In [33]:
#Create random forest classifier
rf=RandomForestClassifier()

In [34]:
#Define the search space
search_space={
             'n_estimators':[10,40,70,90,120],\
             'max_depth':[2,3,5,10,20]\
             }

In [35]:
#Create a GridSearch object
gd_search=GridSearchCV(estimator=rf, param_grid=search_space,\
                        cv=k, n_jobs=1,verbose=2,scoring='accuracy')

In [36]:
gd_search=gd_search.fit(X_train,y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=10; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=40; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=40; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=40; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=40; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=40; total time=   0.0s
[CV] END .......................max_depth=2, n_estimators=70; total time=   0.0s
[CV] END .......................max_depth=2, n_

[CV] END ......................max_depth=20, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=40; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=90; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=90; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=90; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=90; total time=   0.0s
[CV] END ...................

In [37]:
gd_search

In [38]:
gd_search.cv_results_ #see the results---kizungumkuti (learn some Swahili :-) )

{'mean_fit_time': array([0.013726  , 0.0447855 , 0.06933484, 0.09826488, 0.13996043,
        0.01213059, 0.06029291, 0.1067636 , 0.14074225, 0.17776585,
        0.01920061, 0.05514984, 0.07560749, 0.10161963, 0.1765727 ,
        0.01613178, 0.04685102, 0.10272274, 0.12733483, 0.16239185,
        0.01089969, 0.04887938, 0.0880012 , 0.11539488, 0.13471045]),
 'std_fit_time': array([0.00366186, 0.00638584, 0.00574354, 0.01343264, 0.02946626,
        0.00631066, 0.01009869, 0.00732931, 0.01181428, 0.01336734,
        0.00360232, 0.00688499, 0.01202727, 0.00394744, 0.02142513,
        0.00199518, 0.00588446, 0.00927825, 0.02513145, 0.02848325,
        0.00489517, 0.00497143, 0.01046183, 0.01113172, 0.0087679 ]),
 'mean_score_time': array([0.00062275, 0.00624342, 0.00463314, 0.01193528, 0.00441675,
        0.00670328, 0.00510688, 0.00976748, 0.00833902, 0.01213326,
        0.0023509 , 0.00538363, 0.01257105, 0.00502787, 0.00160341,
        0.00050144, 0.00853057, 0.00803313, 0.00320563, 0.00

In [39]:
#Transform cv_results ( a dictionary) into a  Pandas data frame.
accuracy_df = pd.DataFrame(gd_search.cv_results_)
accuracy_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.013726,0.003662,0.000623,0.001245,2,10,"{'max_depth': 2, 'n_estimators': 10}",0.916667,1.0,0.916667,0.916667,0.875,0.925,0.040825,25
1,0.044785,0.006386,0.006243,0.003136,2,40,"{'max_depth': 2, 'n_estimators': 40}",0.916667,1.0,0.916667,0.958333,0.875,0.933333,0.042492,21
2,0.069335,0.005744,0.004633,0.003177,2,70,"{'max_depth': 2, 'n_estimators': 70}",0.958333,1.0,0.916667,0.916667,0.875,0.933333,0.042492,21
3,0.098265,0.013433,0.011935,0.008704,2,90,"{'max_depth': 2, 'n_estimators': 90}",0.916667,1.0,0.916667,0.958333,0.875,0.933333,0.042492,21
4,0.13996,0.029466,0.004417,0.006172,2,120,"{'max_depth': 2, 'n_estimators': 120}",0.916667,1.0,0.916667,0.958333,0.875,0.933333,0.042492,21


In [40]:
#Obtain a subset of the ddata frame
accuracy_df=accuracy_df[['param_n_estimators','param_max_depth','mean_test_score']]
accuracy_df

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score
0,10,2,0.925
1,40,2,0.933333
2,70,2,0.933333
3,90,2,0.933333
4,120,2,0.933333
5,10,3,0.941667
6,40,3,0.95
7,70,3,0.958333
8,90,3,0.95
9,120,3,0.941667


In [41]:
#Sort for easy viewing
accuracy_df.sort_values(['mean_test_score'], ascending=False)

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score
24,120,20,0.958333
22,70,20,0.958333
14,120,5,0.958333
20,10,20,0.958333
7,70,3,0.958333
19,120,10,0.958333
11,40,5,0.95
15,10,10,0.95
13,90,5,0.95
18,90,10,0.95


In [42]:
#Retrive details of the best parameters
gd_search.best_params_

{'max_depth': 3, 'n_estimators': 70}

In [43]:
#Retrieve details of the best estimator
best_estimator= gd_search.best_estimator_
best_estimator

In [44]:
#Obtain the best score
gd_search.best_score_

0.9583333333333334

In [45]:
#Set up RandomizedSearch

In [46]:
r_search=RandomizedSearchCV(estimator=rf, param_distributions=search_space,\
                        cv=k, n_jobs=1,verbose=2,scoring='accuracy')

In [47]:
#Perform search
r_search=r_search.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.0s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.0s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.0s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.0s
[CV] END .......................max_depth=3, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=10, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=10, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=10, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=10, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=10, n_estimators=70; total time=   0.0s
[CV] END ......................max_depth=20, n_estimators=90; total time=   0.0s
[CV] END ......................max_depth=20, n_e

In [48]:
#See kizungumkuti again
r_search.cv_results_

{'mean_fit_time': array([0.07492366, 0.07421737, 0.09322381, 0.04569454, 0.00906134,
        0.08941097, 0.0407866 , 0.01247897, 0.07206388, 0.06926012]),
 'std_fit_time': array([0.00369291, 0.00096063, 0.00341848, 0.00332706, 0.0074216 ,
        0.0106171 , 0.00428293, 0.00623967, 0.00522156, 0.00423946]),
 'mean_score_time': array([0.00525732, 0.00673656, 0.00554471, 0.0013021 , 0.00312381,
        0.01331372, 0.        , 0.003124  , 0.00640163, 0.00618515]),
 'std_score_time': array([0.00435828, 0.0016751 , 0.00289342, 0.0026042 , 0.00624762,
        0.00364364, 0.        , 0.006248  , 0.00320082, 0.00315826]),
 'param_n_estimators': masked_array(data=[70, 70, 90, 40, 10, 90, 40, 10, 70, 70],
              mask=[False, False, False, False, False, False, False, False,
                    False, False],
        fill_value='?',
             dtype=object),
 'param_max_depth': masked_array(data=[3, 10, 20, 10, 2, 3, 20, 3, 20, 5],
              mask=[False, False, False, False, False, Fa

In [49]:
#Dump the results into a data frame
accuracy_df = pd.DataFrame(r_search.cv_results_)
accuracy_df.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.074924,0.003693,0.005257,0.004358,70,3,"{'n_estimators': 70, 'max_depth': 3}",0.958333,1.0,0.916667,0.958333,0.916667,0.95,0.03118,6
1,0.074217,0.000961,0.006737,0.001675,70,10,"{'n_estimators': 70, 'max_depth': 10}",0.958333,1.0,0.916667,0.958333,0.875,0.941667,0.042492,7
2,0.093224,0.003418,0.005545,0.002893,90,20,"{'n_estimators': 90, 'max_depth': 20}",0.958333,1.0,0.916667,0.958333,0.958333,0.958333,0.026352,1
3,0.045695,0.003327,0.001302,0.002604,40,10,"{'n_estimators': 40, 'max_depth': 10}",0.958333,1.0,0.916667,0.958333,0.958333,0.958333,0.026352,1
4,0.009061,0.007422,0.003124,0.006248,10,2,"{'n_estimators': 10, 'max_depth': 2}",0.916667,1.0,0.958333,0.958333,0.875,0.941667,0.042492,7


In [50]:
#Obtain a subset
accuracy_df=accuracy_df[['param_n_estimators','param_max_depth','mean_test_score']]
accuracy_df

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score
0,70,3,0.95
1,70,10,0.941667
2,90,20,0.958333
3,40,10,0.958333
4,10,2,0.941667
5,90,3,0.941667
6,40,20,0.958333
7,10,3,0.941667
8,70,20,0.958333
9,70,5,0.958333


In [51]:
#Sort the results for ease of interpretation
accuracy_df.sort_values('mean_test_score', ascending=False)

Unnamed: 0,param_n_estimators,param_max_depth,mean_test_score
2,90,20,0.958333
3,40,10,0.958333
6,40,20,0.958333
8,70,20,0.958333
9,70,5,0.958333
0,70,3,0.95
1,70,10,0.941667
4,10,2,0.941667
5,90,3,0.941667
7,10,3,0.941667


In [52]:
#Retrieve the best estimator
r_search.best_estimator_

In [53]:
#Obtain the best score
r_search.best_score_

0.9583333333333334

In [54]:
r_search.best_params_

{'n_estimators': 90, 'max_depth': 20}

In [55]:
#Let us store the best estimator
best_estimator = r_search.best_estimator_
best_estimator

# Grid Search Hyperparameter Tuning and Model Selection

In [56]:
#Let us define the search space
hyperparams={'decision_tree':
                 {'model':tree.DecisionTreeClassifier(),
                  'params':{
                    'criterion':['gini','entropy'],
                    'max_depth':[5,10,20]
                    }
                },
             'knn':
                 {'model':KNeighborsClassifier(),
                  'params':{
                    'n_neighbors':[3,5,7,9],
                    'weights':['uniform','distance'],
                    'metric':['minkowski','euclidean','manhattan','chebyshev']
                    }
                },
             'r_forest':
                 {'model':RandomForestClassifier(),
                  'params':{
                    'n_estimators':[20,60,80,100],
                    'criterion':['gini','entropy'],
                    'max_depth':[5,10,20],
                    'bootstrap':[True,False]
                    }
                }
            }

In [57]:
accuracy_values=[]
results={}
#loop=1
#keys=hyperparams.keys()
#print(keys)
#print(hyperparams['knn'])
for key in hyperparams.keys():
    g_search=GridSearchCV(hyperparams[key]['model'],hyperparams[key]['params'],
                          cv=k,return_train_score=False,verbose=2)
    g_search.fit(X,y)
    #print(loop)
    accuracy_values.append({'model':key,
                            'best_score':g_search.best_score_,
                            'best_params':g_search.best_params_
                           })
    results[key]=g_search.cv_results_
    #loop+=1

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END ........................criterion=gini, max_depth=5; total time=   0.0s
[CV] END .......................criterion=gini, max_depth=10; total time=   0.0s
[CV] END .......................criterion=gini, max_depth=10; total time=   0.0s
[CV] END .......................criterion=gini, max_depth=10; total time=   0.0s
[CV] END .......................criterion=gini, max_depth=10; total time=   0.0s
[CV] END .......................criterion=gini, max_depth=10; total time=   0.0s
[CV] END .......................criterion=gini, max_depth=20; total time=   0.0s
[CV] END .......................criterion=gini, m

[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=3, weights=distance; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ...metric=manhattan, n_neighbors=5, weights=uniform; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ..metric=manhattan, n_neighbors=5, weights=distance; total time=   0.0s
[CV] END ...metric=manhattan

[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=60; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=60; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=60; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, criterion=gini, max_depth=10, n_estimators=100; total time=   0

[CV] END bootstrap=True, criterion=entropy, max_depth=20, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, criterion=entropy, max_depth=20, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, criterion=entropy, max_depth=20, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, criterion=entropy, max_depth=20, n_estimators=100; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=20; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=20; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=20; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=20; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=20; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=60; total time=   0.0s
[CV] END bootstrap=False, criterion=gini, max_depth=5, n_estimators=60; to

[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=60; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=60; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=80; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=100; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=100; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, max_depth=10, n_estimators=100; total time=   0.0s
[CV] END bootstrap=False, criterion=entropy, ma

In [58]:
accuracy_values

[{'model': 'decision_tree',
  'best_score': 0.9600000000000002,
  'best_params': {'criterion': 'gini', 'max_depth': 5}},
 {'model': 'knn',
  'best_score': 0.9866666666666667,
  'best_params': {'metric': 'chebyshev',
   'n_neighbors': 5,
   'weights': 'uniform'}},
 {'model': 'r_forest',
  'best_score': 0.9666666666666668,
  'best_params': {'bootstrap': True,
   'criterion': 'gini',
   'max_depth': 10,
   'n_estimators': 20}}]

In [59]:
pd.set_option('display.max_colwidth', None) #using this to avoid strings being truncated in column output
accuracy_df=pd.DataFrame(accuracy_values)

accuracy_df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.96,"{'criterion': 'gini', 'max_depth': 5}"
1,knn,0.986667,"{'metric': 'chebyshev', 'n_neighbors': 5, 'weights': 'uniform'}"
2,r_forest,0.966667,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'n_estimators': 20}"


In [60]:
accuracy_df=pd.DataFrame(accuracy_values,columns=['model','best_params','best_score'])
accuracy_df

Unnamed: 0,model,best_params,best_score
0,decision_tree,"{'criterion': 'gini', 'max_depth': 5}",0.96
1,knn,"{'metric': 'chebyshev', 'n_neighbors': 5, 'weights': 'uniform'}",0.986667
2,r_forest,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 10, 'n_estimators': 20}",0.966667


In [61]:
accuracy_df['best_score'].max()

0.9866666666666667

In [62]:
#best model ---create a filter to return details of model will maximum best_score
accuracy_df[accuracy_df['best_score'] == accuracy_df['best_score'].max()]

Unnamed: 0,model,best_params,best_score
1,knn,"{'metric': 'chebyshev', 'n_neighbors': 5, 'weights': 'uniform'}",0.986667


In [63]:
result_dfs=[] #a list of data frames
for key,results in results.items():
    result_df=pd.DataFrame(results)
    if key=='decision_tree':
        result_df=result_df[['param_criterion','param_max_depth','mean_test_score']]
        result_dfs.append(result_df)
    elif key=='knn':
        result_df=result_df[['param_metric','param_n_neighbors','param_weights','mean_test_score']]
        result_dfs.append(result_df)
    elif key=='r_forest': 
        result_df=result_df[['param_criterion','param_max_depth','param_n_estimators','param_bootstrap','mean_test_score']]
        result_dfs.append(result_df)


In [64]:
result_dfs[0]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.96
1,gini,10,0.96
2,gini,20,0.953333
3,entropy,5,0.953333
4,entropy,10,0.953333
5,entropy,20,0.953333


In [65]:
result_dfs[1]

Unnamed: 0,param_metric,param_n_neighbors,param_weights,mean_test_score
0,minkowski,3,uniform,0.966667
1,minkowski,3,distance,0.966667
2,minkowski,5,uniform,0.973333
3,minkowski,5,distance,0.966667
4,minkowski,7,uniform,0.98
5,minkowski,7,distance,0.98
6,minkowski,9,uniform,0.973333
7,minkowski,9,distance,0.973333
8,euclidean,3,uniform,0.966667
9,euclidean,3,distance,0.966667


In [66]:
result_dfs[2]

Unnamed: 0,param_criterion,param_max_depth,param_n_estimators,param_bootstrap,mean_test_score
0,gini,5,20,True,0.96
1,gini,5,60,True,0.953333
2,gini,5,80,True,0.96
3,gini,5,100,True,0.96
4,gini,10,20,True,0.966667
5,gini,10,60,True,0.96
6,gini,10,80,True,0.966667
7,gini,10,100,True,0.96
8,gini,20,20,True,0.966667
9,gini,20,60,True,0.966667


In [67]:
result_dfs

[  param_criterion param_max_depth  mean_test_score
 0            gini               5         0.960000
 1            gini              10         0.960000
 2            gini              20         0.953333
 3         entropy               5         0.953333
 4         entropy              10         0.953333
 5         entropy              20         0.953333,
    param_metric param_n_neighbors param_weights  mean_test_score
 0     minkowski                 3       uniform         0.966667
 1     minkowski                 3      distance         0.966667
 2     minkowski                 5       uniform         0.973333
 3     minkowski                 5      distance         0.966667
 4     minkowski                 7       uniform         0.980000
 5     minkowski                 7      distance         0.980000
 6     minkowski                 9       uniform         0.973333
 7     minkowski                 9      distance         0.973333
 8     euclidean                 3       