## Finding the Optimal Parameters for Machine Learning Models
### Dr. Robert G. de Luna, PECE

## Agenda

- How to search for an **optimal tuning parameter**?
- How do you search for **multiple tuning parameters** at once?
- What do you do with those tuning parameters before making **real predictions**?

TO CHECK THE VERSION OF LIBRARIES

In [None]:
# Import the interpreter module and each scientific library up front, then
# report the version of every one so the environment can be reproduced.
import sys
import scipy
import numpy
import matplotlib
import pandas
import sklearn

print(f'Python: {sys.version}')
print(f'scipy: {scipy.__version__}')
print(f'numpy: {numpy.__version__}')
print(f'matplotlib: {matplotlib.__version__}')
print(f'pandas: {pandas.__version__}')
print(f'sklearn: {sklearn.__version__}')

TO IMPORT LIBRARIES

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

TO LOAD THE DATASET

In [None]:
# Read the herbal-plants CSV into a DataFrame. Use the `pd` alias from the
# library-import cell so this cell does not rely on the bare `import pandas`
# that only happens inside the version-check cell.
dataset = pd.read_csv('Herbal_Plants.csv')

TO DETERMINE THE DIMENSIONS OF THE DATASET

In [None]:
# Print the (rows, columns) shape of the loaded DataFrame.
print(dataset.shape)

TO PEEK AT THE DATA

In [None]:
# Print the first 20 rows of the dataset.
print(dataset.head(20))

TO SEE THE STATISTICAL SUMMARY

In [None]:
# Print the summary statistics produced by DataFrame.describe().
print(dataset.describe())

TO SEE THE CLASS DISTRIBUTION

In [None]:
# Print how many rows fall under each distinct value of the 'Herbal' column.
print(dataset.groupby('Herbal').size())

TO SHOW THE UNIVARIATE PLOT (BOX and WHISKER PLOTS)

In [None]:
# Draw box-and-whisker plots of the dataset's columns and render the figure.
# NOTE(review): layout, sharex and sharey appear to matter only when
# subplots=True; with subplots=False they are likely ignored - confirm intent.
dataset.plot(kind='box', subplots=False, layout=(1,5), sharex=False, sharey=False)
plt.show()

##### TO SHOW THE HISTOGRAM FOR THE DISTRIBUTION

In [None]:
# Draw the histograms generated by DataFrame.hist() and render the figure.
dataset.hist()
plt.show()

FOR THE MULTIVARIATE PLOT

In [None]:
# For the Scatter Plot Matrix: pass the whole DataFrame to pandas'
# scatter_matrix helper, then render the resulting figure.
from pandas.plotting import scatter_matrix
scatter_matrix(dataset)
plt.show()

TO CREATE THE MATRIX OF INDEPENDENT VARIABLE, X

In [None]:
# Feature matrix: every column except the last one, as a NumPy array.
X = dataset.iloc[:, :-1].values
# Bare expression so the notebook displays the array.
X

TO CREATE THE MATRIX OF DEPENDENT VARIABLE, Y

In [None]:
# Label vector: the last column, i.e. exactly the column that X leaves out
# (X is built from `[:, :-1]`). Indexing with -1 instead of a hard-coded 5
# keeps X and Y complementary even if the number of feature columns changes.
Y = dataset.iloc[:, -1].values
# Bare expression so the notebook displays the array.
Y

TO ENCODE THE CATEGORICAL DATA IN THE DEPENDENT VARIABLE, Y

In [None]:
# Replace the class labels in Y with the integer codes assigned by a
# LabelEncoder fitted on Y itself. The fitted encoder stays available as
# `labelencoder_Y`.
from sklearn.preprocessing import LabelEncoder 
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
# Bare expression so the notebook displays the encoded labels.
Y

#### TO SPLIT THE DATASET INTO TRAINING DATASET AND TESTING DATASET

In [None]:
# Hold out 20% of the samples as the testing dataset; random_state=0 is passed
# so the split is the same on every run.
# NOTE(review): no `stratify=` argument is given, so class proportions in the
# two splits are not controlled - confirm this is acceptable for this dataset.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=0)

In [None]:
# Dimensions of the complete feature matrix and label vector, one per line.
print(X.shape, Y.shape, sep='\n')

In [None]:
# Dimensions of the training and testing feature matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Dimensions of the training and testing label vectors, one per line.
print(Y_train.shape, Y_test.shape, sep='\n')

#### TO IMPORT DIFFERENT MACHINE LEARNING MODELS

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

###### To Build Different Machine Learning Models

In [None]:
# Candidate classifiers, each paired with the short label used when the
# results are printed and plotted.
models = [
    ('LR', LogisticRegression(max_iter=1000000)),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('NN', MLPClassifier()),
]
# Bare expression so the notebook displays the list.
models

#### To Evaluate Each Model in Turn Using Default Parameters of All Models

In [None]:
# For Test Options and Evaluation Metric
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# One seeded splitter shared by every model. With shuffle=True and
# random_state=None each model was scored on a different random partition and
# the numbers changed on every run; a fixed seed makes the comparison fair and
# reproducible, and matches the random_state=0 used by the search cells.
k_Fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# Per-model arrays of fold accuracies and the matching labels; both lists are
# consumed by the comparison box plot.
results = []
names = []

for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=k_Fold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report "label: mean accuracy (standard deviation)" across the 10 folds.
    printed_results = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(printed_results)

#### To Select the Best Machine Learning Model

In [None]:
# One box per model, built from the per-fold accuracies in `results` and
# labelled with the short names collected alongside them.
figure, axis = plt.subplots()
figure.suptitle('Algorithm Comparison')
axis.boxplot(results)
axis.set_xticklabels(names)
plt.show()

#### A. To Create the Logistic Regression Model 

In [None]:
# To Instantiate the baseline Model: no tuning parameters are set; only
# max_iter=100000 and a fixed random_state are passed explicitly.
logistic_regression = LogisticRegression(max_iter=100000, random_state=0)

# To Fit the Training Dataset into Logistic Regression Model
logistic_regression.fit(X_train, Y_train)

# To Predict the Output of the Testing Dataset
Y_predict_LogReg = logistic_regression.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
Y_predict_LogReg

###### To Evaluate the Performance of the Logistic Regression Model

In [None]:
# To Show the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
cm_LogReg = confusion_matrix(Y_test, Y_predict_LogReg)

import seaborn as sns
plt.figure(figsize=(10,7))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm_LogReg, annot=True, fmt='d')
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the last label object.
plt.show()

In [None]:
# Accuracy of the baseline Logistic Regression predictions on the testing
# dataset, printed to four decimals and followed by a blank line.
from sklearn.metrics import accuracy_score
classification_accuracy = accuracy_score(Y_test, Y_predict_LogReg)
print(f'Classification Accuracy: {classification_accuracy:.4f}')
print()

In [None]:
# Classification report for the baseline Logistic Regression predictions,
# printed under a heading line.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:", classification_report(Y_test, Y_predict_LogReg), sep='\n')

#### Applying GridSearchCV to find the Best Parameters for the Logistic Regression Model

Grid Search is an exhaustive search over hyperparameter values. The data scientist sets up a grid of hyperparameter values and, for each combination, trains a model and scores it on held-out validation data (the cross-validation folds, not the final testing dataset). Because every combination of hyperparameter values is tried, this approach can be very inefficient. For example, searching 20 different values for each of 4 parameters requires 20^4 = 160,000 trials of cross-validation. This equates to 1,600,000 model fits and 1,600,000 predictions if 10-fold cross-validation is used. While scikit-learn offers the GridSearchCV class to simplify the process, such a search would be extremely costly in both computing power and time.

In [None]:
# To Import the StratifiedKFold Class
from sklearn.model_selection import StratifiedKFold
k_Fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# To Import the GridSearch Class
from sklearn.model_selection import GridSearchCV

# To Set Parameters to be Optimized Under the Logistic Regression Model.
# Each sub-grid pairs a penalty only with solvers that support it: 'l1' is
# accepted by liblinear and saga, while newton-cg, lbfgs and sag accept 'l2'
# only. Requesting 'l1' from those three makes every such fit fail, and
# listing saga + 'l1' in both sub-grids evaluates the same candidate twice.
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
parameters = [{'C': C_values, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
              {'C': C_values, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'saga', 'sag']}]

grid_search = GridSearchCV(estimator = logistic_regression,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = k_Fold,
                           n_jobs = -1)
# Tune on the training split only. The tuned model is later scored on X_test,
# so letting the search see X_test would make that score optimistic.
grid_search = grid_search.fit(X_train, Y_train)
print(grid_search)

In [None]:
# Tabulate, for every parameter combination the grid search tried, its mean
# and standard deviation of the cross-validated score.
pd.DataFrame(grid_search.cv_results_).loc[:, ['mean_test_score', 'std_test_score', 'params']]

In [None]:
# Pull the best cross-validated score and the parameter combination that
# achieved it out of the fitted grid search, then print both.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

print("BEST ACCURACY SCORE:", best_accuracy, '', sep='\n')
print("BEST PARAMETERS:", best_parameters, sep='\n')

#### Applying RandomizedSearchCV to find the Best Parameters for the Logistic Regression Model

By contrast, Random Search sets up a grid of hyperparameter values and selects random combinations to train the model and score. This allows you to explicitly control the number of parameter combinations that are attempted. The number of search iterations is set based on time or resources. Scikit Learn offers the RandomizedSearchCV function for this process.

While it is possible that RandomizedSearchCV will not find as accurate a result as GridSearchCV, it surprisingly often picks the best result, and does so in a fraction of the time GridSearchCV would have taken. Given the same resources, Randomized Search can even outperform Grid Search.

In [None]:
# To Import the StratifiedKFold Class
from sklearn.model_selection import StratifiedKFold
k_Fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# To Import the RandomizedSearchCV Class
from sklearn.model_selection import RandomizedSearchCV

# To Set Parameters to be Optimized Under the Logistic Regression Model.
# Each sub-grid pairs a penalty only with solvers that support it: 'l1' is
# accepted by liblinear and saga, while newton-cg, lbfgs and sag accept 'l2'
# only. Sampling 'l1' for those three would only produce failed fits.
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
parameters = [{'C': C_values, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
              {'C': C_values, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'saga', 'sag']}]

# The two sub-grids hold 14 + 28 = 42 combinations. n_iter must stay below
# that for the search to remain a random sample rather than a full sweep.
randomized_search = RandomizedSearchCV(estimator = logistic_regression,
                                       param_distributions = parameters,
                                       n_iter = 20,
                                       scoring = 'accuracy',
                                       cv = k_Fold,
                                       n_jobs = -1,
                                       random_state = 0)
# Tune on the training split only. The tuned model is later scored on X_test,
# so letting the search see X_test would make that score optimistic.
best_fit = randomized_search.fit(X_train, Y_train)
print(randomized_search)

In [None]:
# To View the Results of the RandomizedSearch: mean and standard deviation of
# the test score for each sampled parameter combination.
pd.DataFrame(randomized_search.cv_results_)[['mean_test_score', 'std_test_score', 'params']]

In [None]:
# Pull the best cross-validated score and the parameter combination that
# achieved it out of the fitted randomized search, then print both.
best_accuracy, best_parameters = randomized_search.best_score_, randomized_search.best_params_

print("BEST ACCURACY SCORE:", best_accuracy, '', sep='\n')
print("BEST PARAMETERS:", best_parameters, sep='\n')

###### To Create New Logistic Regression Model Using the Optimal Parameters

In [None]:
# To Instantiate the Model (Using the Optimized Parameters).
# max_iter=100000 is carried over from the baseline estimator that both
# searches cloned: C, penalty and solver were scored under that iteration
# budget, so the tuned model must get the same budget to reproduce the result.
logistic_regression = LogisticRegression(C=1000, penalty='l2', solver='newton-cg',
                                         max_iter=100000, random_state=0)

# To Fit the Training Dataset into Logistic Regression Model
logistic_regression.fit(X_train, Y_train)

# To Predict the Output of the Testing Dataset
Y_predict_LogReg = logistic_regression.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
Y_predict_LogReg

###### To Evaluate the Performance of the Logistic Regression Model

In [None]:
# To Show the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
cm_LogReg = confusion_matrix(Y_test, Y_predict_LogReg)

import seaborn as sns
plt.figure(figsize=(10,7))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm_LogReg, annot=True, fmt='d')
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the last label object.
plt.show()

In [None]:
# Accuracy of the tuned Logistic Regression predictions on the testing
# dataset, printed to four decimals and followed by a blank line.
from sklearn.metrics import accuracy_score
classification_accuracy = accuracy_score(Y_test, Y_predict_LogReg)
print(f'Classification Accuracy: {classification_accuracy:.4f}')
print()

In [None]:
# Classification report for the tuned Logistic Regression predictions,
# printed under a heading line.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:", classification_report(Y_test, Y_predict_LogReg), sep='\n')

#### B. To Create the K-Nearest Neighbors Model 

In [None]:
# To Instantiate the baseline Model (Using the Default Parameters)
k_nearest_neighbors = KNeighborsClassifier()

# To Fit the Training Dataset into K Nearest Neighbors Model
k_nearest_neighbors.fit(X_train, Y_train)

# To Predict the Output of the Testing Dataset
Y_predict_KNN = k_nearest_neighbors.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
Y_predict_KNN


###### To Evaluate the Performance of the K-Nearest Neighbors Model

In [None]:
# To Show the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
cm_KNN = confusion_matrix(Y_test, Y_predict_KNN)

import seaborn as sns
plt.figure(figsize=(10,7))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm_KNN, annot=True, fmt='d')
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the last label object.
plt.show()

In [None]:
# Accuracy of the baseline K-Nearest Neighbors predictions on the testing
# dataset, printed to four decimals and followed by a blank line.
from sklearn.metrics import accuracy_score
classification_accuracy = accuracy_score(Y_test, Y_predict_KNN)
print(f'Classification Accuracy: {classification_accuracy:.4f}')
print()

In [None]:
# Classification report for the baseline K-Nearest Neighbors predictions,
# printed under a heading line.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:", classification_report(Y_test, Y_predict_KNN), sep='\n')

#### Applying GridSearchCV to find the Best Parameters for the K-Nearest Neighbors Model

In [None]:
# To Import the StratifiedKFold Class
from sklearn.model_selection import StratifiedKFold
k_Fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# To Import the GridSearch Class
from sklearn.model_selection import GridSearchCV

# To Set Parameters to be Optimized Under the K Nearest Neighbors Model:
# every combination of neighbour count (1-50), vote weighting, neighbour-search
# algorithm and leaf size is evaluated.
k_range = list(range(1, 51))
weight = ['uniform', 'distance']
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
leaf_size = [10, 20, 30, 40, 50, 60, 70, 80, 100]
parameters = dict(n_neighbors=k_range, weights=weight, algorithm=algorithm, leaf_size=leaf_size)
grid_search = GridSearchCV(estimator = k_nearest_neighbors,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = k_Fold,
                           n_jobs = -1)
# Tune on the training split only. The tuned model is later scored on X_test,
# so letting the search see X_test would make that score optimistic.
grid_search = grid_search.fit(X_train, Y_train)
print(grid_search)

In [None]:
# Tabulate, for every parameter combination the grid search tried, its mean
# and standard deviation of the cross-validated score.
pd.DataFrame(grid_search.cv_results_).loc[:, ['mean_test_score', 'std_test_score', 'params']]

In [None]:
# Pull the best cross-validated score and the parameter combination that
# achieved it out of the fitted grid search, then print both.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

print("BEST ACCURACY SCORE:", best_accuracy, '', sep='\n')
print("BEST PARAMETERS:", best_parameters, sep='\n')

###### To Create New K-Nearest Neighbors Model Using the Optimal Parameters

In [None]:
# To Instantiate the Model with explicitly chosen tuning parameters
k_nearest_neighbors = KNeighborsClassifier(n_neighbors=4, weights='distance', algorithm='auto', leaf_size=10)

# To Fit the Training Dataset into K Nearest Neighbors Model
k_nearest_neighbors.fit(X_train, Y_train)

# To Predict the Output of the Testing Dataset
Y_predict_KNN = k_nearest_neighbors.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
Y_predict_KNN

###### To Evaluate the Performance of the K-Nearest Neighbors Model

In [None]:
# To Show the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
cm_KNN = confusion_matrix(Y_test, Y_predict_KNN)

import seaborn as sns
plt.figure(figsize=(10,7))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm_KNN, annot=True, fmt='d')
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the last label object.
plt.show()

In [None]:
# Accuracy of the tuned K-Nearest Neighbors predictions on the testing
# dataset, printed to four decimals and followed by a blank line.
from sklearn.metrics import accuracy_score
classification_accuracy = accuracy_score(Y_test, Y_predict_KNN)
print(f'Classification Accuracy: {classification_accuracy:.4f}')
print()

In [None]:
# Classification report for the tuned K-Nearest Neighbors predictions,
# printed under a heading line.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:", classification_report(Y_test, Y_predict_KNN), sep='\n')

#### C. To Create the Support Vector Machine Model 

In [None]:
# To Instantiate the baseline Model: no tuning parameters are set; only a
# fixed random_state is passed.
support_vector_machine = SVC(random_state=0)

# To Fit the Training Dataset into Support Vector Machine Model
support_vector_machine.fit(X_train, Y_train)

# To Predict the Output of the Testing Dataset
Y_predict_SVM = support_vector_machine.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
Y_predict_SVM

###### To Evaluate the Performance of the Support Vector Machine Model

In [None]:
# To Show the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
cm_SVM = confusion_matrix(Y_test, Y_predict_SVM)

import seaborn as sns
plt.figure(figsize=(10,7))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm_SVM, annot=True, fmt='d')
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the last label object.
plt.show()

In [None]:
# Accuracy of the baseline Support Vector Machine predictions on the testing
# dataset, printed to four decimals and followed by a blank line.
from sklearn.metrics import accuracy_score
classification_accuracy = accuracy_score(Y_test, Y_predict_SVM)
print(f'Classification Accuracy: {classification_accuracy:.4f}')
print()

In [None]:
# Classification report for the baseline Support Vector Machine predictions,
# printed under a heading line.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:", classification_report(Y_test, Y_predict_SVM), sep='\n')


#### Applying GridSearch to find the Best Parameters for the Support Vector Machine Model

In [None]:
# To Import the StratifiedKFold Class
from sklearn.model_selection import StratifiedKFold
k_Fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# To Import the GridSearch Class
from sklearn.model_selection import GridSearchCV

# To Set Parameters to be Optimized Under the Support Vector Machine Model.
# The C values are shared by all four kernels. The list now includes 1, which
# was missing between 0.1 and 10, so the untuned baseline's C is among the
# candidates.
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
gamma_values = [0.1, 0.5, 1, 5, 10]
# NOTE(review): decision_function_shape is searched for three kernels but not
# for 'sigmoid'; it may not influence predict() at all - confirm whether it
# belongs in the grid.
parameters = [{'C': C_values, 'kernel': ['linear'], 'decision_function_shape' : ['ovo', 'ovr']},
              {'C': C_values, 'kernel': ['rbf'], 'gamma': gamma_values, 'decision_function_shape' : ['ovo', 'ovr']},
              {'C': C_values, 'kernel': ['poly'], 'gamma': gamma_values, 'degree': [2, 3, 4, 5], 'decision_function_shape' : ['ovo', 'ovr']},
              {'C': C_values, 'kernel': ['sigmoid'], 'gamma': gamma_values}]
grid_search = GridSearchCV(estimator = support_vector_machine,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = k_Fold,
                           n_jobs = -1)
# Tune on the training split only. The tuned model is later scored on X_test,
# so letting the search see X_test would make that score optimistic.
grid_search = grid_search.fit(X_train, Y_train)
print(grid_search)

In [None]:
# Other Parameters for SVM
# To Set Parameters to be Optimized Under the Support Vector Machine Model
#parameters = [{'C': [1, 10, 100, 1000], 'kernel': ['linear'], 'decision_function_shape' : ['ovo', 'ovr']},
              #{'C': [1, 10, 100, 1000], 'kernel': ['rbf'], 'gamma': [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1, 5, 10], 'decision_function_shape' : ['ovo', 'ovr']},
              #{'C': [1, 10, 100, 1000], 'kernel': ['poly'], 'gamma': [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1, 5, 10], 'degree': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 'decision_function_shape' : ['ovo', 'ovr']},
              #{'C': [1, 10, 100, 1000], 'kernel': ['sigmoid'], 'gamma': [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 1, 5, 10], 'decision_function_shape' : ['ovo', 'ovr']}]

In [None]:
# Tabulate, for every parameter combination the grid search tried, its mean
# and standard deviation of the cross-validated score.
pd.DataFrame(grid_search.cv_results_).loc[:, ['mean_test_score', 'std_test_score', 'params']]

In [None]:
# Pull the best cross-validated score and the parameter combination that
# achieved it out of the fitted grid search, then print both.
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

print("BEST ACCURACY SCORE:", best_accuracy, '', sep='\n')
print("BEST PARAMETERS:", best_parameters, sep='\n')

###### To Create New Support Vector Machine Model Using the Optimized Parameters

In [None]:
# To Instantiate the Model (Using the Optimized Parameters).
# The variable is named `support_vector_machine` (it was misspelled
# `state_vector_machine`), so the tuned model replaces the baseline one the
# same way the Logistic Regression and K-Nearest Neighbors sections do.
support_vector_machine = SVC(C=0.001, decision_function_shape='ovo', gamma=10, kernel='poly', degree=4, random_state=0)

# To Fit the Training Dataset into Support Vector Machine Model
support_vector_machine.fit(X_train, Y_train)

# To Predict the Output of the Testing Dataset
Y_predict_SVM = support_vector_machine.predict(X_test)
# Bare expression so the notebook displays the predicted labels.
Y_predict_SVM


In [None]:
# To Show the Confusion Matrix
from sklearn.metrics import confusion_matrix
# Keep the result under its own name: assigning it to `confusion_matrix`
# would replace the imported function with an array for the rest of the session.
cm_SVM = confusion_matrix(Y_test, Y_predict_SVM)

import seaborn as sns
plt.figure(figsize=(10,7))
# fmt='d' prints the integer counts in full; seaborn's default '.2g' format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(cm_SVM, annot=True, fmt='d')
plt.xlabel("Predicted Value")
plt.ylabel("Actual Value")
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the last label object.
plt.show()

In [None]:
# Accuracy of the tuned Support Vector Machine predictions on the testing
# dataset, printed to four decimals and followed by a blank line.
from sklearn.metrics import accuracy_score
classification_accuracy = accuracy_score(Y_test, Y_predict_SVM)
print(f'Classification Accuracy: {classification_accuracy:.4f}')
print()

In [None]:
# Classification report for the tuned Support Vector Machine predictions,
# printed under a heading line.
from sklearn.metrics import classification_report
print("CLASSIFICATION REPORT:", classification_report(Y_test, Y_predict_SVM), sep='\n')

###### Dr. Robert G. de Luna, PECE
robert.deluna@dlsl.edu.ph / robert_g_deluna@dlsu.edu.ph