## Implementing GridSearchCV using Scikit-Learn

In [None]:
from sklearn import preprocessing
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

# Read the dataset from "bc.csv" (path relative to the working directory)
# into a pandas DataFrame.
dataset_path = "bc.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Separate the features (X) from the label/output/target (Y).
# The feature slice drops the last column, so the target is taken from that
# same last column by position (-1) rather than the hard-coded index 33.
# The two statements then always agree, whatever the column count of the CSV.
# NOTE(review): assumes the label really is the final column of bc.csv - confirm.
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values

In [None]:
# Rescale the features with a MinMaxScaler (default settings).
# NOTE: the scaler is fitted on every row of X, i.e. before any train/test
# split, so X_mm reflects the min/max of the complete dataset.
mm_scaler = preprocessing.MinMaxScaler()
mm_scaler.fit(X)
X_mm = mm_scaler.transform(X)

In [None]:
# Split the raw features into training (80%) and test (20%) sets, stratified
# on the labels, and only then scale them. The scaler is fitted on the training
# rows alone so that the test rows' min/max never influence the transformation
# (fitting on the full dataset before splitting leaks test-set information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)
split_scaler = preprocessing.MinMaxScaler()
X_train = split_scaler.fit_transform(X_train_raw)
# Apply the training-set scaling to the test rows; never re-fit on them.
X_test = split_scaler.transform(X_test_raw)

## Tuning model hyperparameters using GridSearchCV

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score 

# Candidate values to try: the same six powers of ten for both C and gamma.
param_grid = {
    name: [0.001, 0.01, 0.1, 1, 10, 100]
    for name in ('C', 'gamma')
}
print(f"Parameter grid:\n{param_grid}")

# Grid search over param_grid with an SVC estimator and 5-fold cross-validation.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)

In [None]:
# Run the grid search on the training data only; the test set stays untouched.
grid_search.fit(X=X_train, y=y_train)

In [None]:
# Report the best parameter combination and its cross-validation score.
print(
    f"Best parameters: {grid_search.best_params_}",
    f"Best cross-validation score: {grid_search.best_score_:.2f}",
    sep="\n",
)

In [None]:
# Show the estimator selected by the grid search.
print(f"Best estimator:\n{grid_search.best_estimator_}")

In [None]:
# Evaluate the model with the optimum parameters on the training and test sets.
# GridSearchCV (refit=True, the default) has already refitted best_estimator_
# on the full training set, so no additional fit() call is needed here.
# The estimator is bound to `best_svc` rather than `svm` so that it does not
# shadow the `svm` module imported from sklearn at the top of the file.
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test)
training_score = best_svc.score(X_train, y_train)
test_score = best_svc.score(X_test, y_test)

print("Training set score with best parameters: {:.2f}".format(training_score))
print("Test set score with best parameters: {:.2f}".format(test_score))

In [None]:
from sklearn.metrics import classification_report
# Build the text classification report for the test-set predictions and print it.
cr = classification_report(y_true=y_test, y_pred=y_pred)
print(cr)

In [None]:
# Visualize confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the test-set predictions against the true labels.
result = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap and label the axes on the returned Axes.
heatmap_ax = sns.heatmap(result, annot=True, cmap='crest', fmt="g", linewidth=2)
heatmap_ax.set_xlabel('Predicted labels')
heatmap_ax.set_ylabel('True labels')
plt.show()

## Exercise 

1. Repeat the experiment with a KNN model: use GridSearchCV with 10-fold cross-validation to tune the `n_neighbors` parameter over the values 1 to 10.

2. What is the optimal value for the n_neighbors parameter?

3. Rebuild the model with the optimal `n_neighbors` value and conduct the final evaluation.