### Imports

In [13]:
import pandas as pd
import time

from sklearn.datasets import fetch_openml
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA

### Fetch the data

In [2]:
mnist = fetch_openml('mnist_784')

X = mnist.data
y = mnist.target

### Scale data to (0,1) & (-1,1)

In [3]:
scaler_0_1 = MinMaxScaler(feature_range=(0, 1))
scaler_m1_1 = MinMaxScaler(feature_range=(-1, 1))

X_0 = scaler_0_1.fit_transform(X)
X_1 = scaler_m1_1.fit_transform(X)

### Use a simple split, where we'll use 60000 examples for training and 10000 for testing/validation.

In [4]:
# Use a simple split, where we'll use 60000 examples for training and 10000 for testing/validation.
X_0_train, X_0_test, y_0_train, y_0_test = train_test_split(X_0, y, test_size=10000, random_state=0)
X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y, test_size=10000, random_state=0)

### Method that prints accuracy & confusion matrix

In [5]:
def print_accuracy_confusion_matrix(y_true, y_pred):
    print('Accuracy:', accuracy_score(y_true=y_true, y_pred=y_pred))
    print('Confusion Matrix\n', confusion_matrix(y_true=y_true, y_pred=y_pred), '\n')

### For the split datasets above, execute SVM for linear and rbf kernels

In [6]:
# Models
model_linear = SVC(kernel='linear')
model_rbf = SVC(kernel='rbf')

model_linear.fit(X_0_train, y_0_train)
y_pred = model_linear.predict(X_0_test)
print('Linear Kernel for (0,1) normalization')
print_accuracy_confusion_matrix(y_0_test, y_pred)

model_rbf.fit(X_0_train, y_0_train)
y_pred = model_rbf.predict(X_0_test)
print('RBF Kernel for (0,1) normalization')
print_accuracy_confusion_matrix(y_0_test, y_pred)

model_linear.fit(X_1_train, y_1_train)
y_pred = model_linear.predict(X_1_test)
print('Linear Kernel for (-1,1) normalization')
print_accuracy_confusion_matrix(y_1_test, y_pred)

model_rbf.fit(X_1_train, y_1_train)
y_pred = model_rbf.predict(X_1_test)
print('RBF Kernel for (-1,1) normalization')
print_accuracy_confusion_matrix(y_1_test, y_pred)

Linear Kernel for (0,1) normalization
Accuracy: 0.936
Confusion Matrix
 [[ 969    0    2    3    1   11    7    0    1    2]
 [   0 1125    4    2    1    2    0    3    4    0]
 [   6   10  980   10   11    2    4    7    8    2]
 [   2    7   34  919    2   20    0    5   16    8]
 [   1    2    6    1  907    1    7    9    1   27]
 [   9    3   10   29    7  782    7    0    9    7]
 [   7    1    8    0    8   14  947    0    4    0]
 [   5    6   16    7   11    4    1  985    1   28]
 [   2   12   12   25    5   14    7    1  875   10]
 [   2    4    5    9   37    8    0   25    8  871]] 

RBF Kernel for (0,1) normalization
Accuracy: 0.9777
Confusion Matrix
 [[ 988    0    1    0    0    3    3    0    1    0]
 [   0 1134    2    0    1    0    0    2    1    1]
 [   1    0 1020    2    4    0    2    5    5    1]
 [   0    1   11  973    0    9    0    4   13    2]
 [   1    1    1    0  946    1    1    4    0    7]
 [   0    1    3    7    1  838    7    0    5    1]
 [   5 

### In order to extract the best hyper parameters, use Grid Search for SVM with rbf kernel

In [7]:
def grid_search(X_train, y_train):
    # Creating a KFold object with 5 splits
    folds = KFold(n_splits = 5, shuffle = True, random_state = 10)
    
    # Specify range of hyperparameters
    hyper_params = [ {'gamma': [1e-2, 1e-3, 1e-4], 'C': [5,10]}]
    
    # Specify model
    model = SVC(kernel='rbf')
    
    # Set up GridSearchCV()
    model_cv = GridSearchCV(estimator = model,
                            param_grid = hyper_params,
                            scoring= 'accuracy',
                            cv = folds,
                            verbose = 1,
                            return_train_score=True,
                            n_jobs=4)
    
    # Fit the model
    model_cv.fit(X_train, y_train)
    
    # Cross Validation results
    cv_results = pd.DataFrame(model_cv.cv_results_)
    print(cv_results)
    
    
    # Print the optimal accuracy score and hyperparameters
    best_score = model_cv.best_score_
    best_hyperparams = model_cv.best_params_
    
    print("The best test score is {0} corresponding to hyperparameters {1}".format(best_score, best_hyperparams))

### Apply grid search for (0,1) normalized dataset

In [8]:
grid_search(X_0_train, y_0_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0     363.364452      6.725101       165.236679       10.694748       5   
1     434.663380     32.911280       225.978360       22.853936       5   
2     848.417112     36.132054       364.932385        9.025371       5   
3     292.935847      9.668948       170.052067        5.066450      10   
4     325.692875      4.533128       196.359152        9.759201      10   
5     611.516203     58.077355       292.725572       44.495067      10   

  param_gamma                      params  split0_test_score  \
0        0.01     {'C': 5, 'gamma': 0.01}           0.981583   
1       0.001    {'C': 5, 'gamma': 0.001}           0.951417   
2      0.0001   {'C': 5, 'gamma': 0.0001}           0.926083   
3        0.01    {'C': 10, 'gamma': 0.01}           0.982250   
4       0.001   {'C': 10, 'gamma': 0.001}           0.955083   
5      0.0001  {'C': 10, 'gamm

### Apply grid search for (-1,1) normalized dataset

In [9]:
grid_search(X_1_train, y_1_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
   mean_fit_time  std_fit_time  mean_score_time  std_score_time param_C  \
0     993.658104     60.722012       279.927370        3.001816       5   
1     328.816208      8.526209       173.200430        2.727178       5   
2     560.992705     23.156123       277.633363        7.351470       5   
3    1057.380470     47.105132       358.286031       20.506534      10   
4     336.114696     15.946812       192.357687        2.743108      10   
5     575.514290     40.576777       274.885116       58.242152      10   

  param_gamma                      params  split0_test_score  \
0        0.01     {'C': 5, 'gamma': 0.01}           0.984833   
1       0.001    {'C': 5, 'gamma': 0.001}           0.973917   
2      0.0001   {'C': 5, 'gamma': 0.0001}           0.940500   
3        0.01    {'C': 10, 'gamma': 0.01}           0.984833   
4       0.001   {'C': 10, 'gamma': 0.001}           0.976917   
5      0.0001  {'C': 10, 'gamm

### Create SVM model with kernel=rbf, C=5, gamma=0.01. For all the train/predict from this point forward we use the (-1,1) normalized dataset.

In [10]:
model = SVC(kernel='rbf', C=5, gamma=0.01)

### Fit SVM Model without PCA transformation initially

In [11]:
start_time = time.time()
model.fit(X_1_train, y_1_train)
print('Elapsed time: %s secs'%(time.time()-start_time))

y_pred = model_rbf.predict(X_1_test)
print('RBF Kernel for (-1,1) normalization')
print_accuracy_confusion_matrix(y_1_test, y_pred)

Elapsed time: 479.74951577186584 secs
RBF Kernel for (-1,1) normalization
Accuracy: 0.9777
Confusion Matrix
 [[ 988    0    1    0    0    3    3    0    1    0]
 [   0 1134    2    0    1    0    0    2    1    1]
 [   1    0 1020    2    4    0    2    5    5    1]
 [   0    1   11  973    0    9    0    4   13    2]
 [   1    1    1    0  946    1    1    4    0    7]
 [   0    1    3    7    1  838    7    0    5    1]
 [   5    0    0    0    3    5  974    0    2    0]
 [   1    3    7    0    9    1    1 1033    0    9]
 [   0    5    3    5    2    2    3    0  937    6]
 [   1    1    1    4   13    4    1    9    1  934]] 



### Apply PCA transformation to the (-1,1) normalized dataset with 0.95 number of components and fit and predict the SVM Model

In [21]:
scaler = StandardScaler()
X_1_train = scaler.fit_transform(X_1_train)
X_1_test = scaler.fit_transform(X_1_test)

In [22]:
pca = PCA(.95)

pca.fit(X_1_train)
X_train_pca = pca.transform(X_1_train)
X_test_pca = pca.transform(X_1_test)
print(X_train_pca.shape)

start_time = time.time()
model.fit(X_train_pca, y_1_train)
print('Elapsed time using %s components: %s secs'%(X_train_pca.shape[1], time.time()-start_time))

y_pred = model.predict(X_test_pca)
print('RBF Kernel for (-1,1) normalization')
print_accuracy_confusion_matrix(y_1_test, y_pred)

(60000, 327)
Elapsed time using 327 components: 2230.0537192821503 secs
RBF Kernel for (-1,1) normalization
Accuracy: 0.8934
Confusion Matrix
 [[ 925    0   65    1    0    1    3    0    1    0]
 [   0 1120   17    0    0    0    0    2    1    1]
 [   1    0 1028    1    1    0    0    3    5    1]
 [   0    0  149  842    0    6    0    4   10    2]
 [   1    0   96    0  857    0    0    2    1    5]
 [   0    0  108    9    0  736    5    2    2    1]
 [   2    0  135    0    1    5  844    0    2    0]
 [   1    4  170    0    4    1    0  877    0    7]
 [   0    3  103    7    0    6    1    0  840    3]
 [   0    0   74    7    8    1    0   12    2  865]] 



### Apply PCA transformation to the (-1,1) normalized dataset with 0.65 number of components and fit and predict the SVM Model

In [23]:
pca = PCA(.65)

pca.fit(X_1_train)
X_train_pca = pca.transform(X_1_train)
X_test_pca = pca.transform(X_1_test)
print(X_train_pca.shape)

start_time = time.time()
model.fit(X_train_pca, y_1_train)
print('Elapsed time using %s components: %s secs'%(X_train_pca.shape[1], time.time()-start_time))

y_pred = model.predict(X_test_pca)
print('RBF Kernel for (-1,1) normalization')
print_accuracy_confusion_matrix(y_1_test, y_pred)

(60000, 78)
Elapsed time using 78 components: 290.6317615509033 secs
RBF Kernel for (-1,1) normalization
Accuracy: 0.9543
Confusion Matrix
 [[ 967    0   21    1    0    1    5    0    1    0]
 [   0 1125   10    1    0    0    0    2    2    1]
 [   3    0 1027    2    1    0    0    3    2    2]
 [   0    0   41  951    0    5    0    3   11    2]
 [   1    0   41    0  910    1    0    5    0    4]
 [   1    0   21   11    1  820    5    2    2    0]
 [   2    0   50    0    1    5  931    0    0    0]
 [   0    2   58    0    2    0    0  996    0    6]
 [   0    1   40    6    0    2    2    2  905    5]
 [   0    1   28    8    9    1    0    9    2  911]] 



### Apply PCA transformation to the (-1,1) normalized dataset with 0.35 number of components and fit and predict the SVM Model

In [24]:
pca = PCA(.35)

pca.fit(X_1_train)
X_train_pca = pca.transform(X_1_train)
X_test_pca = pca.transform(X_1_test)
print(X_train_pca.shape)

start_time = time.time()
model.fit(X_train_pca, y_1_train)
print('Elapsed time using %s components: %s secs'%(X_train_pca.shape[1], time.time()-start_time))

y_pred = model.predict(X_test_pca)
print('RBF Kernel for (-1,1) normalization')
print_accuracy_confusion_matrix(y_1_test, y_pred)

(60000, 17)
Elapsed time using 17 components: 29.632370233535767 secs
RBF Kernel for (-1,1) normalization
Accuracy: 0.9665
Confusion Matrix
 [[ 990    0    0    2    0    1    2    0    1    0]
 [   0 1129    3    2    0    0    0    2    2    3]
 [   2    0 1010    8    3    3    1    4    6    3]
 [   1    4   18  946    1    9    1    6   22    5]
 [   1    0    3    1  932    0    0    4    2   19]
 [   2    1    2   11    2  828    7    1    7    2]
 [   3    0    1    0    2    8  974    0    1    0]
 [   1    3    8    4    3    1    0 1029    1   14]
 [   2    7    3   15    1   16    4    3  904    8]
 [   0    0    2   11   13    4    0   11    5  923]] 

