In [2]:
# library preparations
import scipy.io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import joblib
import seaborn as sns
import time
import pickle

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline

# Support Vector Machine Approach

More details can be found in the scikit-learn documentation: https://scikit-learn.org/stable/modules/svm.html#svm

Pros of SVM:

* Effective in high dimensional space, so a large number of features post de-compression per image is ok
* Effective even when the number of features exceeds the number of samples (a situation that usually violates some modelling assumptions)
* Memory efficient since using support vectors only
* Can supply custom kernel functions specific to dataset (something we are looking at...)

Cons of SVM:
* Kernel function is often critical especially if features >> samples
* No direct probability estimates (due to the mathematical nature of finding the optimal equation under the hood)

In [4]:
# Load the pre-split train/test arrays and flatten each sample to one row.
#
# np.load on an .npz archive returns a lazy NpzFile that keeps the archive
# open until it is closed, so each archive is read inside a `with` block and
# released as soon as its arrays have been extracted.
#
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code. Only load archives from a trusted source, and
# drop the flag if the stored arrays are plain numeric -- TODO confirm.
with np.load('./data/train.npz', allow_pickle=True) as load_train:
    X_train, y_train = load_train['X_train'], load_train['y_train']
with np.load('./data/test.npz', allow_pickle=True) as load_test:
    X_test, y_test = load_test['X_test'], load_test['y_test']

# Unpacking into three names also fails fast if X_train is not 3-D.
samples, rows, cols = X_train.shape
print("Before flattening:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

# Labels become 1-D vectors; every sample is flattened into a single
# feature row, which is the 2-D layout scikit-learn estimators expect.
y_train = y_train.reshape(y_train.shape[0], )
y_test = y_test.reshape(y_test.shape[0], )
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

print("After flattening:")
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Before flattening:
(10000, 160, 2)
(10000, 1)
(1000, 160, 2)
(1000, 1)
After flattening:
(10000, 320)
(10000,)
(1000, 320)
(1000,)


In [5]:
from sklearn.svm import SVC

# Fixed seed: PCA's random_state only matters when a randomized/arpack SVD
# solver is used, but pinning it keeps the search reproducible either way.
RANDOM_STATE = 42

# Standardize -> reduce dimensionality -> classify. Wrapping the steps in a
# Pipeline means the scaler and PCA are re-fit on each training fold only.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(random_state=RANDOM_STATE)),
    ('svm', SVC()),
])

# PCA cannot keep more components than there are input features, so the
# original upper bound of 499 is capped at the flattened feature count.
# Assumes every training fold has at least as many samples as features
# (true for the 10000 x 320 training set loaded earlier in this notebook).
n_features = X_train.shape[1]
pca_components = list(range(3, min(500, n_features + 1)))

# Ten evenly spaced C values, 1.0 .. 1.9. linspace + round avoids the
# floating-point noise (e.g. 1.2000000000000002) that np.arange with a
# fractional step produces.
svm_C = np.round(np.linspace(1.0, 1.9, 10), 1)

# `degree` is only used by the polynomial kernel. Giving GridSearchCV a list
# of grids searches it for 'poly' alone instead of refitting identical
# linear/rbf models once per degree value.
param_grid = [
    {
        'pca__n_components': pca_components,
        'svm__kernel': ['linear', 'rbf'],
        'svm__C': svm_C,
    },
    {
        'pca__n_components': pca_components,
        'svm__kernel': ['poly'],
        'svm__C': svm_C,
        'svm__degree': list(range(3, 10)),
    },
]

# NOTE(review): the search cells visible in this notebook pass
# scoring='accuracy' directly and do not read this list.
scoring = ['accuracy', 'precision']

In [None]:
# Nested cross-validation: an inner 5-fold grid search tunes the pipeline
# inside each of the 10 outer folds, and cross_val_predict collects the
# out-of-fold prediction for every training sample.
inner_search = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=5, n_jobs=4)
train_pred = cross_val_predict(inner_search, X_train, y_train, cv=10, n_jobs=4)

print('Cross validated Accuracy is: ', accuracy_score(y_train, train_pred))

# Confusion matrix as a labelled table: rows are true classes, columns are
# predicted classes.
confusion_table = pd.DataFrame(
    confusion_matrix(y_train, train_pred),
    index=["True 0", "True 1"],
    columns=["Pred 0", "Pred 1"],
)
print(confusion_table)

# Per-class precision / recall / F1 summary.
report = classification_report(y_train, train_pred)
print('\n' + report)

# Predictions using SVM

An SVM does not generate probabilities; instead it generates a *score* (the output of the mathematical function found by minimizing classification error while also seeking to widen the margin).



In [5]:
# Run the grid search on the full training set, report the winning
# hyper-parameters, then score the fitted search on the held-out test set.
final_model = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=8,
)
final_model.fit(X_train, y_train)
print(final_model.best_params_)

test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_pred)
print('Test set Accuracy is: ', test_accuracy)

KeyboardInterrupt: 