## Week 7: Day 3 – Grid Search

### Load Dataset
Load the iris dataset from scikit-learn's datasets module. Iris is a famous dataset in the world of statistics and has been used in numerous tutorials.

If you run print(iris) you will see that it returns a dictionary-like object. The "data" entry is a 150 x 4 numpy array of measurements, the "target" entry is an array of integer class labels, and the "feature_names" entry is a list of column names. We shall use these components, together with "target_names", to build the pandas dataframe shown in the next cell.

In [None]:
# Fetch the iris dataset bundled with scikit-learn.
from sklearn import svm, datasets

iris = datasets.load_iris()

In [None]:
# Importing packages
import numpy as np
import pandas as pd

# Build a data frame with one column per measured feature.
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Add the species name as an extra column. Indexing the array of class names
# with the integer labels maps every row in one vectorized step, so no
# intermediate integer column or per-row lambda is needed.
df["flower"] = np.asarray(iris.target_names)[iris.target]
# Show a few rows around the point where the species changes.
df[47:52]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),flower
47,4.6,3.2,1.4,0.2,setosa
48,5.3,3.7,1.5,0.2,setosa
49,5.0,3.3,1.4,0.2,setosa
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor


In [None]:
# Data splitting

from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Fixing random_state makes the split
# (and therefore any score computed from it) reproducible between runs, and
# stratifying on the labels keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=42,
    stratify=iris.target,
)

In [None]:
# Build a support vector classifier that uses an RBF kernel.
model = svm.SVC(C=30, kernel="rbf", gamma="auto")

# Fit the classifier on the training portion of the data.
model.fit(X_train, y_train)

# score() predicts labels for X_test, compares them with y_test and returns
# the resulting score.
model.score(X_test, y_test)

0.9555555555555556

In [None]:
# the cross_val_score splits the data repeatedly into a training and a testing set, trains the estimator using the training set
# and computes the scores based on the testing set for each iteration of cross-validation.

from sklearn.model_selection import cross_val_score

In [None]:
# Try models with different parameters by hand: pass each candidate to
# cross_val_score and evaluate it with 5-fold cross-validation.
cross_val_score(
    svm.SVC(kernel="linear", C=10, gamma="auto"),
    iris.data,
    iris.target,
    cv=5,
)

array([1.        , 1.        , 0.9       , 0.96666667, 1.        ])

In [None]:
# Same 5-fold evaluation, this time with an RBF kernel.
cross_val_score(
    svm.SVC(kernel="rbf", C=10, gamma="auto"),
    iris.data,
    iris.target,
    cv=5,
)

array([0.96666667, 1.        , 0.96666667, 0.96666667, 1.        ])

In [None]:
# Doing this by hand for every combination is tedious, so iterate over all
# kernel/C pairs instead and keep the average 5-fold score of each one.
import numpy as np

kernels = ["rbf", "linear"]
C = [1, 10, 20]
avg_scores = {
    f"{kval}_{cval}": np.average(
        cross_val_score(
            svm.SVC(kernel=kval, C=cval, gamma="auto"),
            iris.data,
            iris.target,
            cv=5,
        )
    )
    for kval in kernels
    for cval in C
}
avg_scores

{'rbf_1': 0.9800000000000001,
 'rbf_10': 0.9800000000000001,
 'rbf_20': 0.9666666666666668,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

GridSearch is a tool that is used for hyperparameter tuning. As
stated before, Machine Learning in practice comes down to
comparing different models to each other and trying to find the
best working model.

In [None]:
# GridSearchCV automates the loop above: it cross-validates every
# combination in the parameter grid.
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    estimator=svm.SVC(gamma="auto"),
    param_grid={
        "C": [1, 10, 20],
        "kernel": ["rbf", "linear"],
    },
    cv=5,
    return_train_score=False,
)

clf.fit(iris.data, iris.target)
clf.cv_results_

{'mean_fit_time': array([0.00182762, 0.00251689, 0.00266457, 0.00083151, 0.00088377,
        0.00074863]),
 'std_fit_time': array([1.03230981e-03, 2.12550037e-03, 1.42266186e-03, 1.97029645e-04,
        6.46942389e-05, 1.03450039e-04]),
 'mean_score_time': array([0.00168281, 0.00109291, 0.00112681, 0.00044379, 0.00054383,
        0.00037646]),
 'std_score_time': array([1.57997604e-03, 1.00247942e-03, 6.16248253e-04, 9.93062581e-05,
        1.03492669e-04, 2.74220754e-05]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'ker

In [None]:
# Load the cross-validation results into a pandas data frame for inspection.
df = pd.DataFrame(data=clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.001828,0.001032,0.001683,0.00158,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002517,0.002126,0.001093,0.001002,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.002665,0.001423,0.001127,0.000616,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.000832,0.000197,0.000444,9.9e-05,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000884,6.5e-05,0.000544,0.000103,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000749,0.000103,0.000376,2.7e-05,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [None]:
# Keep only the columns of interest: the parameters tried and the mean score.
df.loc[:, ["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [None]:
# dir() lists the attribute and method names available on the clf object
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_is_fitted',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_in_',
 'n_jobs',
 'n_splits

In [None]:
# best_score_ is the mean cross-validated score of the best hyperparameter combination found by the search.
clf.best_score_

0.9800000000000001

In [None]:
# best_params_ holds the hyperparameter combination that gave the best mean cross-validated score.
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [None]:
# RandomizedSearchCV samples n_iter parameter combinations from the grid
# instead of trying them all; here only 2 of the 6 candidates are evaluated.
from sklearn.model_selection import RandomizedSearchCV

rs = RandomizedSearchCV(
    svm.SVC(gamma="auto"),
    {
        "C": [1, 10, 20],
        "kernel": ["rbf", "linear"],
    },
    cv=5,
    return_train_score=False,
    n_iter=2,
    # Fix the seed so the same candidates are sampled on every run.
    random_state=0,
)
rs.fit(iris.data, iris.target)
pd.DataFrame(rs.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,10,linear,0.973333
1,20,rbf,0.966667


In [None]:
# Importing required packages
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# GRID SEARCH to find the best model and parameters
model_params = {
    "svm" : {
        "model" : svm.SVC(gamma="auto"),
        "params" : {
           "C" : [1,10,20],
           "kernel" : ["rbf", "linear"]
      }
  },
  "random_forest" : {
      "model" : RandomForestClassifier(),
      "params" : {
          "n_estimators" : [1,5,10]
      }
      
  },
  "logistic_regression" : {
      "model" : LogisticRegression(solver="liblinear", multi_class="auto"),
      "params" : {
          "C" : [1,5,10]
      }
  }
}   

In [None]:
# Tune every candidate model with a 5-fold grid search and record its best
# score and best parameters so the models can be compared.
scores = []

for model_name, mp in model_params.items():
    clf = GridSearchCV(
        estimator=mp["model"],
        param_grid=mp["params"],
        cv=5,
        return_train_score=False,
    ).fit(iris.data, iris.target)
    scores.append(
        dict(
            model=model_name,
            best_score=clf.best_score_,
            best_params=clf.best_params_,
        )
    )

In [None]:
# Turn the collected results into a data frame for display.
df = pd.DataFrame(data=scores, columns=["model", "best_score", "best_params"])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.98,"{'C': 1, 'kernel': 'rbf'}"
1,random_forest,0.96,{'n_estimators': 10}
2,logistic_regression,0.966667,{'C': 5}
