### __Hyperparameter Tuning Experiment - GridSearchCV__
> - Test GradSearch on classification data & models
> - The following is tested:
    * Running Time
    * Best Parameters Determined
    * Performance Test

> - __Models tested:__
    * __Scaled Data without PCA:__ Logistic Regression, Random Forest, XGBoost
    * __Data pipeline scaling & PCA:__ SVC, SGD

#### Reference Links:
> - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
> - https://scikit-learn.org/stable/modules/model_evaluation.html
> - https://towardsdatascience.com/gridsearchcv-for-beginners-db48a90114ee

> - https://stackoverflow.com/questions/39409866/correlation-heatmap
> - https://datascience.stackexchange.com/questions/17540/make-seaborn-heatmap-bigger
> - https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/
> - https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/
> - https://stackoverflow.com/questions/43214978/seaborn-barplot-displaying-values
> - https://medium.com/aiplusoau/hyperparameter-tuning-a5fe69d2a6c7
> - https://www.kaggle.com/code/prashant111/catboost-classifier-in-python/notebook
> - https://catboost.ai/en/docs/concepts/python-reference_catboostclassifier_grid_search
> - https://towardsdatascience.com/logistic-regression-model-tuning-with-scikit-learn-part-1-425142e01af5
> - https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74


In [1]:
#import the required packages here
# import libraries
import numpy as np
import pandas as pd
import io
from datetime import datetime, timezone, timedelta
import datetime as dt
import time
from collections import Counter
from PIL import Image


# import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
#plt.imshow(mpimg.imread('jan_aug_tempanom.png'))

# import seaborn
import seaborn as sns

# import plotly graph objects
import plotly.graph_objs as go
import plotly.express as px


# import scipy.cluster.hierarchy
import scipy
import scipy.linalg as la
import scipy.cluster.hierarchy as sch
from scipy.cluster.vq import whiten, kmeans, vq
from scipy.stats import multivariate_normal
from pathlib import Path

import statsmodels.api as sm

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
import sklearn.model_selection as model_selection
import sklearn.model_selection as cross_validation
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, FeatureUnion
import sklearn.metrics as metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import average_precision_score, f1_score, classification_report
from sklearn.metrics import plot_precision_recall_curve, plot_confusion_matrix
from sklearn import linear_model, datasets
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.svm import SVC
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier

import xgboost as xgb
from xgboost import XGBClassifier

#for showing 2D plot
%matplotlib inline 

# to be able to see multiple ouputs from sungle cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### __Import data from csv files__

In [2]:
# import data
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

print('\nTrain Data:')
train_df.info()
train_df

print('\nTest Data:')
test_df.info()
test_df


Train Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    344 non-null    int64  
 1   mean radius              344 non-null    float64
 2   mean texture             344 non-null    float64
 3   mean perimeter           344 non-null    float64
 4   mean area                344 non-null    float64
 5   mean smoothness          344 non-null    float64
 6   mean compactness         344 non-null    float64
 7   mean concavity           344 non-null    float64
 8   mean concave points      344 non-null    float64
 9   mean symmetry            344 non-null    float64
 10  mean fractal dimension   344 non-null    float64
 11  radius error             344 non-null    float64
 12  texture error            344 non-null    float64
 13  perimeter error          344 non-null    float64
 14  area error   

Unnamed: 0,index,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,1,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.086900,0.07017,0.1812,...,23.41,158.80,1956.0,0.12380,0.18660,0.24160,0.18600,0.2750,0.08902,0
1,2,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.197400,0.12790,0.2069,...,25.53,152.50,1709.0,0.14440,0.42450,0.45040,0.24300,0.3613,0.08758,0
2,3,11.42,20.38,77.58,386.1,0.14250,0.28390,0.241400,0.10520,0.2597,...,26.50,98.87,567.7,0.20980,0.86630,0.68690,0.25750,0.6638,0.17300,0
3,4,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.198000,0.10430,0.1809,...,16.67,152.20,1575.0,0.13740,0.20500,0.40000,0.16250,0.2364,0.07678,0
4,5,12.45,15.70,82.57,477.1,0.12780,0.17000,0.157800,0.08089,0.2087,...,23.75,103.40,741.6,0.17910,0.52490,0.53550,0.17410,0.3985,0.12440,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
339,556,10.16,19.59,64.73,311.7,0.10030,0.07504,0.005025,0.01116,0.1791,...,22.88,67.88,347.3,0.12650,0.12000,0.01005,0.02232,0.2262,0.06742,1
340,560,14.05,27.15,91.38,600.4,0.09929,0.11260,0.044620,0.04304,0.1537,...,33.17,100.20,706.7,0.12410,0.22640,0.13260,0.10480,0.2250,0.08321,1
341,562,15.22,30.62,103.40,716.9,0.10480,0.20870,0.255000,0.09429,0.2128,...,42.79,128.70,915.0,0.14170,0.79170,1.17000,0.23560,0.4089,0.14090,0
342,565,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.144000,0.09791,0.1752,...,38.25,155.00,1731.0,0.11660,0.19220,0.32150,0.16280,0.2572,0.06637,0



Test Data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   index                    100 non-null    int64  
 1   mean radius              100 non-null    float64
 2   mean texture             100 non-null    float64
 3   mean perimeter           100 non-null    float64
 4   mean area                100 non-null    float64
 5   mean smoothness          100 non-null    float64
 6   mean compactness         100 non-null    float64
 7   mean concavity           100 non-null    float64
 8   mean concave points      100 non-null    float64
 9   mean symmetry            100 non-null    float64
 10  mean fractal dimension   100 non-null    float64
 11  radius error             100 non-null    float64
 12  texture error            100 non-null    float64
 13  perimeter error          100 non-null    float64
 14  area error     

Unnamed: 0,index,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,418,12.70,12.17,80.88,495.0,0.08785,0.05794,0.023600,0.024020,0.1583,...,16.92,88.12,566.9,0.13140,0.16070,0.09385,0.08224,0.2775,0.09464,1
1,170,12.32,12.39,78.85,464.1,0.10280,0.06981,0.039870,0.037000,0.1959,...,15.64,86.97,549.1,0.13850,0.12660,0.12420,0.09391,0.2827,0.06771,1
2,84,12.00,15.65,76.95,443.3,0.09723,0.07165,0.041510,0.018630,0.2079,...,24.90,87.78,567.9,0.13770,0.20030,0.22670,0.07632,0.3379,0.07924,1
3,456,11.63,29.29,74.87,415.1,0.09357,0.08574,0.071600,0.020170,0.1799,...,38.81,86.04,527.8,0.14060,0.20310,0.29230,0.06835,0.2884,0.07220,1
4,85,18.46,18.52,121.10,1075.0,0.09874,0.10530,0.133500,0.087950,0.2132,...,27.68,152.20,1603.0,0.13980,0.20890,0.31570,0.16420,0.3695,0.08579,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,375,16.17,16.07,106.30,788.5,0.09880,0.14380,0.066510,0.053970,0.1990,...,19.14,113.10,861.5,0.12350,0.25500,0.21140,0.12510,0.3153,0.08960,1
96,307,9.00,14.40,56.36,246.3,0.07005,0.03116,0.003681,0.003472,0.1788,...,20.07,60.90,285.5,0.09861,0.05232,0.01472,0.01389,0.2991,0.07804,1
97,219,19.53,32.47,128.00,1223.0,0.08420,0.11300,0.114500,0.066370,0.1428,...,45.41,180.20,2477.0,0.14080,0.40970,0.39950,0.16250,0.2713,0.07568,0
98,564,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.243900,0.138900,0.1726,...,26.40,166.10,2027.0,0.14100,0.21130,0.41070,0.22160,0.2060,0.07115,0


#### __Data Preparation__
> - for the second set of testing (scaling the original data)

In [3]:
# start time to measure the time of the program execution
start_time = time.time()

# Scale data
train_copy = train_df.copy()

# prepare data for scaling
train_copy = train_copy.drop(['index'], axis=1)
y_copy = train_copy['target']
X_copy = train_copy.drop(['target'], axis=1)

# perform a robust scaler transform of the dataset
scale = MinMaxScaler()
X_scaled = scale.fit_transform(X_copy)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")



--- 0.003989696502685547 seconds ---


#### __Train/Test Split__
> - 1st Set: train/test split on the original data
> - 2nd Set: train/yesy split on the scaled data

In [4]:
# start time to measure the time of the program execution
start_time = time.time()

# 1st Set:
# split train/test
y = train_copy['target']
X = train_copy.drop(['target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")



--- 0.009271860122680664 seconds ---


In [5]:
# start time to measure the time of the program execution
start_time = time.time()

# 2nd Set:
# split train/test
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X_scaled, y_copy, test_size=0.20, random_state=42)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")



--- 0.0009832382202148438 seconds ---


### __1st Set - GridSearchCV:__

#### Scaled Data without PCA: Logistic Regression, Random Forest, XGBoost
> - Logistic Regression: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
> - Random Foreat: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
> - XGBoost: https://towardsdatascience.com/binary-classification-xgboost-hyperparameter-tuning-scenarios-by-non-exhaustive-grid-search-and-c261f4ce098d


#### Test 1: Logistic Regression

In [9]:
# start time to measure the time of the program execution
start_time = time.time()

# Create first pipeline for base.
pipe_lg = Pipeline([('classifier' , LogisticRegression())])

# Create param grid.
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

param_grid_lg = [
    {'classifier__penalty' : ['l1', 'l2'],
    'classifier__C' : param_range,
     'classifier__max_iter': [50, 100, 150, 200, 250],
     'classifier__random_state': [5, 10, 20, 30, 40, 50, 80],
    'classifier__solver' : ['liblinear', 'saga']}
]

# Create grid search object
grid_model_lg = GridSearchCV(pipe_lg, param_grid = param_grid_lg, cv = 5, verbose=True, n_jobs=-1, scoring='f1')

# Fit on data
grid_model_lg.fit(X_train2, y_train2)

# find best model score
grid_model_lg.best_params_
grid_model_lg.best_index_
grid_model_lg.best_score_

# find best model score
grid_model_lg.score(X_train2, y_train2)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Fitting 5 folds for each of 980 candidates, totalling 4900 fits




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('classifier', LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'classifier__C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0,
                                            100.0],
                          'classifier__max_iter': [50, 100, 150, 200, 250],
                          'classifier__penalty': ['l1', 'l2'],
                          'classifier__random_state': [5, 10, 20, 30, 40, 50,
                                                       80],
                          'classifier__solver': ['liblinear', 'saga']}],
             scoring='f1', verbose=True)

{'classifier__C': 100.0,
 'classifier__max_iter': 200,
 'classifier__penalty': 'l2',
 'classifier__random_state': 5,
 'classifier__solver': 'saga'}

939

0.9755206022719636

0.9788732394366197



--- 8.355851650238037 seconds ---


#### Test 2: Random Forest

In [None]:
# start time to measure the time of the program execution
start_time = time.time()

# Create first pipeline for base.
pipe_rf = Pipeline([('classifier' , RandomForestClassifier())])

# Create param grid.
param_grid_rf = [
    {'classifier__n_estimators': 
     [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)],
    'classifier__criterion' : ['gini', 'entropy'],
     'classifier__max_features': ['auto', 'sqrt'],
    'classifier__max_depth' : [int(x) for x in np.linspace(10, 110, num = 11)],
     'classifier__min_samples_split': [2, 5, 10],
     'classifier__min_samples_leaf': [1, 2, 4],
     #'classifier__random_state': [5, 10, 20, 30, 40, 50, 80],
    'classifier__bootstrap' : [True, False]}
]

# Create grid search object
grid_model_rf = GridSearchCV(pipe_rf, param_grid = param_grid_rf, cv = 5, verbose=True, n_jobs=-1, scoring='f1')

# Fit on data
grid_model_rf.fit(X_train2, y_train2)

# find best model score
grid_model_rf.best_params_
grid_model_rf.best_index_
grid_model_rf.best_score_

# find best model score
grid_model_rf.score(X_train2, y_train2)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

Fitting 5 folds for each of 83160 candidates, totalling 415800 fits


#### Test 3: XGBoost

In [None]:
# start time to measure the time of the program execution
start_time = time.time()

# Create first pipeline for base.
pipe_xgb = Pipeline([('classifier' , XGBClassifier())])

# Create param grid.
param_grid_xgb = [
        {'classifier__gamma': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4, 200],
        'classifier__learning_rate': [0.01, 0.03, 0.06, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7],
        'classifier__max_depth': [5,6,7,8,9,10,11,12,13,14],
        'classifier__n_estimators': [50,65,80,100,115,130,150],
        'classifier_reg_alpha': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200],
        'classifier__reg_lambda': [0,0.1,0.2,0.4,0.8,1.6,3.2,6.4,12.8,25.6,51.2,102.4,200]}
                ]

# Create grid search object
grid_model_xgb = GridSearchCV(pipe_xgb, param_grid = param_grid_xgb, cv = 5, verbose=True, n_jobs=-1, scoring='f1')

# Fit on data
grid_model_xgb.fit(X_train2, y_train2)

# find best model score
grid_model_xgb.best_params_
grid_model_xgb.best_index_
grid_model_xgb.best_score_

# find best model score
grid_model_xgb.score(X_train2, y_train2)

# print the overall program runtime.
print(f"\n\n--- {(time.time() - start_time)} seconds ---")

### __2nd Set - GridSearchCV:__

#### Data pipeline scaling & PCA: SVC, SGD