In [99]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, auc, f1_score, roc_curve, classification_report
from keras.datasets import mnist
import os

In [81]:
TRAIN_RATIO = 0.6
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.2

(x_train, y_train), (x_test, y_test) = mnist.load_data()

X = np.concatenate([x_train, x_test])
y = np.concatenate([y_train, y_test])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=(1-TRAIN_RATIO))
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=((TEST_RATIO/(VALIDATION_RATIO+TEST_RATIO))))

In [82]:
#minimize the data by 1/16

X_train = np.reshape(X_train, (-1, 784)).astype('float32')
X_test = np.reshape(X_test, (-1, 784)).astype('float32')
X_val = np.reshape(X_val, (-1, 784)).astype('float32')

X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
X_test = pd.DataFrame(X_test)
y_test = pd.DataFrame(y_test)
X_val = pd.DataFrame(X_val)
y_val = pd.DataFrame(y_val)

X_train = X_train.sample(frac = 1/16)
y_train = y_train.sample(frac = 1/16)
X_test = X_test.sample(frac = 1/16)
y_test = y_test.sample(frac = 1/16)
X_val = X_val.sample(frac = 1/16)
y_val = y_val.sample(frac = 1/16)

In [83]:
#check for bias
print(y_train.value_counts())

1    293
9    291
6    280
7    279
5    264
3    263
4    243
2    242
8    236
0    234
dtype: int64


In [84]:
#set up the Pipeline object with StandardScaler as transformer and SVC as an estimator
steps = [('scaler', StandardScaler()), ('SVM', SVC(kernel='poly'))]
#define Pipeline object
pipeline = Pipeline(steps)

#use the GridSearchCV method with 5 folds cross-validation
parameters = {'SVM__C':[0.001, 0.1, 100, 10e5], 'SVM__gamma':[10,1,0.1,0.01]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)

In [85]:
grid.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('SVM', SVC(kernel='poly'))]),
             param_grid={'SVM__C': [0.001, 0.1, 100, 1000000.0],
                         'SVM__gamma': [10, 1, 0.1, 0.01]})

In [86]:
y_pred = grid.predict(X_test)

print("score = %3.2f" %(grid.score(X_test, y_test)))
print("best parameters from train data: ", grid.best_params_)

score = 0.12
best parameters from train data:  {'SVM__C': 0.001, 'SVM__gamma': 0.01}


In [87]:
#performance measures
print("confusion matrix: \n ", confusion_matrix(y_test, y_pred))

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("accuracy = ", accuracy)
print("f1 = ", f1)

confusion matrix: 
  [[ 3 54  0  1  1  0  2  1  1 20]
 [ 1 77  0  0  1  0  2  2  1 19]
 [ 3 74  2  0  0  1  3  2  2 20]
 [ 1 61  2  0  0  2  1  3  1 19]
 [ 1 58  1  1  0  2  2  0  2 11]
 [ 3 57  1  0  0  0  1  0  2 19]
 [ 0 61  0  0  0  1  1  2  2 19]
 [ 2 54  1  0  2  0  2  2  0 14]
 [ 2 58  0  0  0  0  1  0  1 20]
 [ 2 66  1  0  0  0  1  1  0 15]]
accuracy =  0.11542857142857142
f1 =  0.05404809198750919


In [89]:
steps1 = [('scaler', StandardScaler()), ('SVM', SVC(kernel='poly'))]
pipeline1 = Pipeline(steps1)

parameters1 = {'SVM__C':[grid.best_params_['SVM__C']], 'SVM__gamma':[grid.best_params_['SVM__gamma']]} 
grid1 = GridSearchCV(pipeline1, param_grid=parameters1, cv=5)
grid1.fit(X_val, y_val.values.ravel())



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('SVM', SVC(kernel='poly'))]),
             param_grid={'SVM__C': [0.001], 'SVM__gamma': [0.01]})

In [90]:
y_pred2 = grid1.predict(X_test)

print("score on the test data set= %3.2f" %(grid1.score(X_test, y_test)))
print("best parameters from train data: ", grid1.best_params_ )

score on the test data set= 0.12
best parameters from train data:  {'SVM__C': 0.001, 'SVM__gamma': 0.01}


In [91]:
#performance measures
confusionMat = confusion_matrix(y_test, y_pred2)
confusionMat_df = pd.DataFrame(confusionMat)
print("confusion matrix: \n ", confusionMat)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print("accuracy = ", accuracy)
print("f1 = ", f1)

confusion matrix: 
  [[ 1 75  1  0  1  1  2  1  1  0]
 [ 0 99  0  1  0  0  2  0  1  0]
 [ 0 99  1  0  3  0  2  2  0  0]
 [ 0 83  0  1  1  0  2  1  2  0]
 [ 1 75  0  1  0  0  1  0  0  0]
 [ 0 78  0  1  2  0  1  0  1  0]
 [ 1 83  0  1  1  0  0  0  0  0]
 [ 1 72  2  0  1  0  0  0  0  1]
 [ 1 80  0  0  1  0  0  0  0  0]
 [ 0 82  0  0  1  1  2  0  0  0]]
accuracy =  0.11542857142857142
f1 =  0.05404809198750919


In [101]:
#performance measures
print(classification_report(y_test, y_pred, target_names=['0', '1', '2', '3','4','5','6','7','8','9']))

classification_report_df = classification_report(y_test, y_pred, target_names=['0', '1', '2', '3','4','5','6','7','8','9'])

parameters = pd.DataFrame.from_dict(grid1.get_params(), orient = 'index')
print(parameters)

              precision    recall  f1-score   support

           0       0.17      0.04      0.06        83
           1       0.12      0.75      0.21       103
           2       0.25      0.02      0.03       107
           3       0.00      0.00      0.00        90
           4       0.00      0.00      0.00        78
           5       0.00      0.00      0.00        83
           6       0.06      0.01      0.02        86
           7       0.15      0.03      0.04        77
           8       0.08      0.01      0.02        82
           9       0.09      0.17      0.11        86

    accuracy                           0.12       875
   macro avg       0.09      0.10      0.05       875
weighted avg       0.10      0.12      0.05       875

                                                                                         0
cv                                                                                       5
error_score                                                

In [102]:
outname = 'confusionMat_SVM.csv'
outname2 = 'parameters_SVM.csv'


outdir = 'Exports'

if not os.path.exists(outdir):
    os.mkdir(outdir)
    

confusionMat_df.to_csv(f"{outdir}/{outname}")
parameters.to_csv(f"{outdir}/{outname2}")