In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,StratifiedKFold,GridSearchCV,RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn import datasets

In [2]:
data = datasets.load_breast_cancer()
data_df = pd.DataFrame(data = data["data"],columns=data.feature_names)
data_df["target"] = data.target

In [3]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [4]:
ind_features = data.feature_names

In [5]:
sc = StandardScaler()
x_scaled = sc.fit_transform(data_df[ind_features])

In [6]:
X = pd.DataFrame(data = x_scaled, index=data_df.index, columns= ind_features)
y = data_df["target"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42, stratify=y)

In [8]:
svc_model = SVC(random_state=42)
svc_model.fit(X_train,y_train)

y_train_pred=svc_model.predict(X_train)
y_test_pred=svc_model.predict(X_test)

print("Training Accuracy",np.round(accuracy_score(y_train,y_train_pred)))
print("Testing Accuracy",np.round(accuracy_score(y_test,y_test_pred)))

Training Accuracy 1.0
Testing Accuracy 1.0


In [9]:
## Default params of the model
svc_model.get_params()
param = {
    
    "C":np.logspace(-1, 2, 5),
    "kernel":["linear","rbf","poly"],
    "gamma": list(np.logspace(-1, 2, 5))+["auto","scale"]   
}
scoring_metrics = ["accuracy"]
kfold = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

We specified a few options for `GridSearchCV`.
* `estimator=svc` means we are using Support Vector Classifier as the model.
* `param_grid=param_grid` takes our pre-defined search space for the grid search.
* `scoring=scoring` set the performance evaluation metric. Because we set the scoring to  'accuracy', the model will use accuracy as the evaluation metric.
* `refit='accuracy'` enables refitting the model with the best parameters on the whole training dataset.
* `n_jobs=-1` means parallel processing using all the processors.
* `cv=kfold` takes the `StratifiedKFold` we defined.
* `verbose` controls the number of messages returned by the grid search. The higher the number, the more information is returned. `verbose=0` means silent.

In [10]:
grid_search = GridSearchCV(estimator=svc_model, param_grid= param,
                           scoring=scoring_metrics,
                           n_jobs=-1,
                           refit="accuracy",
                            cv=kfold,
                            verbose=0,
                            pre_dispatch='2*n_jobs',
                            return_train_score=True)

In [11]:
# Fit grid search
grid_result = grid_search.fit(X_train, y_train)

# Print grid search summary
grid_result

In [12]:
grid_search_df=pd.DataFrame(grid_search.cv_results_)
grid_search_df.to_excel("1_Gridsearch_results.xlsx")

In [13]:
grid_result.get_params()

{'cv': StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
 'error_score': nan,
 'estimator__C': 1.0,
 'estimator__break_ties': False,
 'estimator__cache_size': 200,
 'estimator__class_weight': None,
 'estimator__coef0': 0.0,
 'estimator__decision_function_shape': 'ovr',
 'estimator__degree': 3,
 'estimator__gamma': 'scale',
 'estimator__kernel': 'rbf',
 'estimator__max_iter': -1,
 'estimator__probability': False,
 'estimator__random_state': 42,
 'estimator__shrinking': True,
 'estimator__tol': 0.001,
 'estimator__verbose': False,
 'estimator': SVC(random_state=42),
 'n_jobs': -1,
 'param_grid': {'C': array([  0.1       ,   0.56234133,   3.16227766,  17.7827941 ,
         100.        ]),
  'kernel': ['linear', 'rbf', 'poly'],
  'gamma': [0.1,
   0.5623413251903491,
   3.1622776601683795,
   17.78279410038923,
   100.0,
   'auto',
   'scale']},
 'pre_dispatch': '2*n_jobs',
 'refit': 'accuracy',
 'return_train_score': True,
 'scoring': ['accuracy'],
 'verbose': 0}

In [14]:
pd.DataFrame(data = grid_result.get_params(),index=[0])

Unnamed: 0,cv,error_score,estimator__C,estimator__break_ties,estimator__cache_size,estimator__class_weight,estimator__coef0,estimator__decision_function_shape,estimator__degree,estimator__gamma,...,estimator__tol,estimator__verbose,estimator,n_jobs,param_grid,pre_dispatch,refit,return_train_score,scoring,verbose
0,"StratifiedKFold(n_splits=5, random_state=42, s...",,1.0,False,200,,0.0,ovr,3,scale,...,0.001,False,SVC(random_state=42),-1,,2*n_jobs,accuracy,True,accuracy,0


In [15]:
# Print the best accuracy score for the training dataset
print(f'The best accuracy score for the training dataset is {grid_result.best_score_}')

# Print the hyperparameters for the best score
print(f'The best hyperparameters are {grid_result.best_params_}')

# Print the best accuracy score for the testing dataset
print(f'The accuracy score for the testing dataset is {grid_search.score(X_test, y_test)}')

The best accuracy score for the training dataset is 0.9741997264021889
The best hyperparameters are {'C': 0.1, 'gamma': 0.1, 'kernel': 'linear'}
The accuracy score for the testing dataset is 0.986013986013986
