In [33]:
# import data
import pandas as pd
from sklearn import datasets
wine = datasets.load_wine()
wine_df = pd.DataFrame(data = wine.data, columns=wine.feature_names)
wine_df['target'] = wine.target
wine_df.head(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0


In [34]:
X = wine_df.drop('target', axis = 1)
y = wine_df['target']

# split train/test data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((133, 13), (45, 13), (133,), (45,))

In [35]:
# scaling
from sklearn.preprocessing import StandardScaler
std_scale = StandardScaler()
std_scale.fit(X_train)
X_train_std = std_scale.transform(X_train)
X_test_std = std_scale.transform(X_test)

In [36]:
# grid search
from sklearn import svm
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

param_grid = {'kernel': ('linear', 'rbf'),
              'C': [0.5, 1, 10, 100]}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
svc = svm.SVC(random_state=0)
grid_cv = GridSearchCV(svc, param_grid, cv = kfold, scoring='accuracy')
grid_cv.fit(X_train_std, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVC(random_state=0),
             param_grid={'C': [0.5, 1, 10, 100], 'kernel': ('linear', 'rbf')},
             scoring='accuracy')

In [37]:
grid_cv.cv_results_
import numpy as np
np.transpose(pd.DataFrame(grid_cv.cv_results_))

Unnamed: 0,0,1,2,3,4,5,6,7
mean_fit_time,0.009676,0.008124,0.005123,0.007855,0.007723,0.00611,0.005407,0.004584
std_fit_time,0.003537,0.000541,0.004233,0.004153,0.001637,0.00457,0.005855,0.005771
mean_score_time,0.002092,0.00384,0.00405,0.005677,0.001104,0.003171,0.000261,0.002082
std_score_time,0.003012,0.002733,0.003017,0.003351,0.002209,0.004853,0.000521,0.003666
param_C,0.5,0.5,1,1,10,10,100,100
param_kernel,linear,rbf,linear,rbf,linear,rbf,linear,rbf
params,"{'C': 0.5, 'kernel': 'linear'}","{'C': 0.5, 'kernel': 'rbf'}","{'C': 1, 'kernel': 'linear'}","{'C': 1, 'kernel': 'rbf'}","{'C': 10, 'kernel': 'linear'}","{'C': 10, 'kernel': 'rbf'}","{'C': 100, 'kernel': 'linear'}","{'C': 100, 'kernel': 'rbf'}"
split0_test_score,0.888889,0.962963,0.888889,0.925926,0.888889,0.925926,0.888889,0.925926
split1_test_score,0.962963,1.0,0.962963,0.962963,0.962963,0.962963,0.962963,0.962963
split2_test_score,0.925926,0.962963,0.925926,0.962963,0.925926,0.962963,0.925926,0.962963


In [38]:
# best score & hyper parameter
print(grid_cv.best_score_)
print(grid_cv.best_params_)

0.9774928774928775
{'C': 0.5, 'kernel': 'rbf'}


In [39]:
# final model
clf = grid_cv.best_estimator_
print(clf)

SVC(C=0.5, random_state=0)


In [40]:
# cross validation score(1)
from sklearn.model_selection import cross_validate
metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cv_scores = cross_validate(clf, X_train_std, y_train, cv = kfold, scoring = metrics)
cv_scores



{'fit_time': array([0.00428152, 0.00311446, 0.        , 0.00552058, 0.00500917]),
 'score_time': array([0.00558782, 0.00650716, 0.00951838, 0.0108273 , 0.00813985]),
 'test_accuracy': array([0.96296296, 1.        , 0.96296296, 0.96153846, 1.        ]),
 'test_precision_macro': array([0.96296296, 1.        , 0.96969697, 0.96969697, 1.        ]),
 'test_recall_macro': array([0.96666667, 1.        , 0.96296296, 0.95833333, 1.        ]),
 'test_f1_macro': array([0.9628483 , 1.        , 0.96451914, 0.96190476, 1.        ])}

In [41]:
# cross validation score(2)
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(clf, X_train, y_train, cv = kfold, scoring='accuracy')
print(cv_score)
print(cv_score.mean())
print(cv_score.std())




[0.66666667 0.62962963 0.62962963 0.53846154 0.57692308]
0.6082621082621082
0.04511870745260207


In [43]:
# prediction
pred_svm = clf.predict(X_test_std)
print(pred_svm)

[0 2 1 0 1 1 0 2 1 1 2 2 0 1 2 1 0 0 1 0 1 0 0 1 1 1 1 1 1 2 0 0 1 0 0 0 2
 1 1 2 0 0 1 1 1]


In [46]:
# accuracy
from sklearn.metrics import accuracy_score
accuracy_score = accuracy_score(y_test, pred_svm)
print(accuracy_score)

# confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, pred_svm)

# classification report
from sklearn.metrics import classification_report
class_report = classification_report(y_test, pred_svm)
print(class_report)

1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00         8

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45

