# SVM on MNIST

In [1]:
import pandas as pd
import os

import numpy as np
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

from sklearn import svm
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

### Loading in data

In [2]:
# Get the current working directory
current_dir = os.getcwd()

# Navigate to the parent directory
data_dir = os.path.abspath(os.path.join(current_dir, os.pardir, 'Pset3', 'data'))

In [3]:
# loading in data
file_path_train = os.path.join(data_dir, "mnist_train.csv")
file_path_test = os.path.join(data_dir, "mnist_test.csv")

train_df = pd.read_csv(file_path_train)
test_df = pd.read_csv(file_path_test)

x_train = train_df.drop('label', axis = 1)
y_train = train_df['label']

x_test = test_df.drop('label', axis = 1)
y_test = test_df['label']

### Data Preprocessing
We need to normalize the data by dividing everything but the label by 255. 255 because we're working with computer vision.

In [4]:
# Divide all of predictors by 255
x_train = x_train/255
x_test = x_test/255

#### Pre-processing with PCA for dimension reduction

In [5]:
# fitting the PCA
pca = PCA()
pca.fit(x_train)

# getting total number of principal components
num_pcs = x_train.shape[1]

# pca.transform returns a numpy array
pca_train = pca.transform(x_train)[:, :num_pcs]
pca_test = pca.transform(x_test)[:, :num_pcs]

# we turn the numpy array into a pandas dataframe
pca_train_df = pd.DataFrame(pca_train, columns=[f"PC{i+1}" for i in range(num_pcs)])
pca_test_df = pd.DataFrame(pca_test, columns=[f"PC{i+1}" for i in range(num_pcs)])

# Insert the new column at the front
#pca_train_df.insert(0, 'label', y_train)
#pca_test_df.insert(0, 'label', y_test)

In [6]:
# doing what's explained in above markdown
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

def explain_perc_var(perc_explain):
    num_pcs = np.argmax(cumulative_variance_ratio >= perc_explain) + 1
    perc_explain = float(perc_explain) * 100
    print(num_pcs,'pcs to explain', perc_explain, '% of variance')
    return(num_pcs)

# getting enough principal components to explain 90% of variance
num_pcs_90 = explain_perc_var(0.9)

87 pcs to explain 90.0 % of variance


So we keep 87 principal components in our matrix now.

In [7]:
# + 1 because we also have to include the label as well.
pca_train_df = pca_train_df.iloc[:, 0:num_pcs_90+1]
pca_test_df = pca_test_df.iloc[:, 0:num_pcs_90+1]

pca_train_df.shape

(60000, 88)

To make sure the code actually works ok, let's do a sample of 1,000.

In [8]:
# Sample 1000 rows from the predictor DataFrame
pca_train_df_sample = pca_train_df.sample(n=1000, random_state=254)

# Sample the corresponding 1000 rows from the labels DataFrame
y_train_sample = y_train.loc[pca_train_df_sample.index]

In [9]:
# initializing svm
svc = svm.SVC(kernel='rbf')

# Create a RepeatedKFold object
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=254)

# Define a grid of hyperparameters
param_grid = {
    'C': np.power(10, np.linspace(-5, 5, num=20)),
    'gamma': np.power(10, np.linspace(-5, 5, num=20))
}

# Perform grid search
grid_clf = GridSearchCV(svc, param_grid, cv=rkf, scoring='accuracy', n_jobs=-1)
grid_clf.fit(pca_train_df_sample, y_train_sample)

# Print the best parameters and the corresponding score
print("Best parameters: ", grid_clf.best_params_)
print("Best cross-validation score: ", grid_clf.best_score_)

Best parameters:  {'C': 6.1584821106602545, 'gamma': 0.01438449888287663}
Best cross-validation score:  0.9329999999999999


In [10]:
# Predict on a test set
y_pred = grid_clf.predict(pca_test_df)

from sklearn.metrics import accuracy_score

# printing accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Test accuracy:', accuracy)

Test accuracy: 0.9302
