In [1]:
import numpy as np
import os
import pandas as pd

# Load the .npz file
with np.load('cifar4-train.npz') as data:
    cifar4_data = dict(data.items())

print('It is a dictionary with keys:', list(cifar4_data.keys()))

It is a dictionary with keys: ['pixels', 'overfeat', 'labels', 'names', 'allow_pickle']


In [2]:
X=cifar4_data['overfeat']
y=cifar4_data['labels']

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

# Split data into train/test sets
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=0)

# SVM with Linear Kernel

In [4]:
# Create a k-NN classifier with default values
# i set the # of componenents of the PCA to 87 because it represent the 85% of deviations

# for the Linear Kernel I will use the LinearSVC as it looks faster and then I will use the SVC estimator as it use by default the RBF
pipeLSVM = Pipeline([
     ('PCA', PCA(n_components=87)),
        ('LSV', LinearSVC(random_state=0)
        )])


# Fit to train data
pipeLSVM.fit(X_tr, y_tr)

# Evaluate on test set
accuracy = pipeLSVM.score(X_tr, y_tr)
print('Accuracy: {:.3f}'.format(accuracy))

Accuracy: 0.779


In [5]:
from sklearn.model_selection import KFold

# Create k-fold object
kfold = KFold(n_splits=5)


from sklearn.model_selection import cross_validate
rf_scores = cross_validate(pipeLSVM, X_tr, y_tr, cv=kfold,)

In [6]:
# I Create the list of C parameters to test

param=[]
for i in range (10):
    p=0.05+0.05*5*i
    param.append(p)

In [7]:
from sklearn.model_selection import cross_validate

# Save accuracy on test set
test_scores_mean = []
test_scores_std = []
param_lin=[]


for i in range(9):
# Set parameters
    pipeLSVM= Pipeline([
     ('PCA', PCA(n_components=87)),
        ('LSV', LinearSVC(random_state=0, C=param[i])
        )])

    rf_scores = cross_validate(pipeLSVM, X_tr, y_tr, cv=kfold,)
    print('C value',param[i])
    print('SVM - mean test {:.3f}'.format(
    np.mean(rf_scores['test_score'])))
    print('SVM - std test {:.3f}'.format(
    np.std(rf_scores['test_score'])))
    
    test_scores_mean.append(format(np.mean(rf_scores['test_score'])))
    test_scores_std.append(format(np.std(rf_scores['test_score'])))
    param_lin.append(param[i])

C value 0.05
SVM - mean test 0.822
SVM - std test 0.011
C value 0.3
SVM - mean test 0.805
SVM - std test 0.008
C value 0.55
SVM - mean test 0.781
SVM - std test 0.005
C value 0.8
SVM - mean test 0.761
SVM - std test 0.013
C value 1.05
SVM - mean test 0.754
SVM - std test 0.018
C value 1.3
SVM - mean test 0.768
SVM - std test 0.020
C value 1.55
SVM - mean test 0.761
SVM - std test 0.013
C value 1.8
SVM - mean test 0.745
SVM - std test 0.022
C value 2.05
SVM - mean test 0.765
SVM - std test 0.018


In [8]:
svm_linear=pd.DataFrame(param_lin,columns=['C'])
df=pd.DataFrame(test_scores_mean, columns=['Mean'])
svm_linear=pd.concat([svm_linear,df], axis=1, )
df2=pd.DataFrame(test_scores_std,columns=['Std Dev'])
svm_linear=pd.concat([svm_linear,df2], axis=1)

svm_linear.sort_values(by='Mean', ascending=False).head(1)

Unnamed: 0,C,Mean,Std Dev
0,0.05,0.8219999999999998,0.011


# Non-Linear Kernel classification

In [9]:
from sklearn.svm import SVC

# the standard for the kernel is rbf so I do not need to mention it
pipeSVC = Pipeline([
     ('PCA', PCA(n_components=87)),
        ('SVC', SVC()
        )])

test_scores_mean_svc = []
test_scores_std_svc = []
param_svc=[]
# Fit to train data
pipeSVC.fit(X_tr, y_tr)

# Evaluate on test set
accuracy = pipeSVC.score(X_tr, y_tr)
print('Accuracy: {:.3f}'.format(accuracy))

from sklearn.model_selection import ParameterGrid

# Define a set of reasonable values
C = np.arange(0.05, 2,0.5)
gamma = np.arange(0.0001,0.001,0.0002)


# Define a grid of values
grid = ParameterGrid({
    'SVC__C': C,
    'SVC__gamma': gamma,
    
})



for params_dict in grid:
    # Set parameters
    pipeSVC.set_params(**params_dict)

    # Fit a k-NN classifier
    pipeSVC.fit(X_tr, y_tr)
    
    
    # Evaluate on test set
   
    
    
    
    
    rf_scores_nonliner = cross_validate(pipeSVC, X_tr, y_tr, cv=kfold)
    print(params_dict)
    print('SVM - mean test {:.3f}'.format(
    np.mean(rf_scores_nonliner['test_score'])))
    print('SVM - mean test {:.3f}'.format(
    np.std(rf_scores_nonliner['test_score'])))
    test_scores_mean_svc.append(format(np.mean(rf_scores_nonliner['test_score'])))
    test_scores_std_svc.append(format(np.std(rf_scores_nonliner['test_score'])))
    param_svc.append(params_dict)
    

Accuracy: 1.000
{'SVC__C': 0.05, 'SVC__gamma': 0.0001}
SVM - mean test 0.769
SVM - mean test 0.012
{'SVC__C': 0.05, 'SVC__gamma': 0.00030000000000000003}
SVM - mean test 0.762
SVM - mean test 0.010
{'SVC__C': 0.05, 'SVC__gamma': 0.0005000000000000001}
SVM - mean test 0.733
SVM - mean test 0.017
{'SVC__C': 0.05, 'SVC__gamma': 0.0007000000000000001}
SVM - mean test 0.685
SVM - mean test 0.024
{'SVC__C': 0.05, 'SVC__gamma': 0.0009000000000000002}
SVM - mean test 0.610
SVM - mean test 0.054
{'SVC__C': 0.55, 'SVC__gamma': 0.0001}
SVM - mean test 0.819
SVM - mean test 0.006
{'SVC__C': 0.55, 'SVC__gamma': 0.00030000000000000003}
SVM - mean test 0.824
SVM - mean test 0.007
{'SVC__C': 0.55, 'SVC__gamma': 0.0005000000000000001}
SVM - mean test 0.822
SVM - mean test 0.007
{'SVC__C': 0.55, 'SVC__gamma': 0.0007000000000000001}
SVM - mean test 0.809
SVM - mean test 0.003
{'SVC__C': 0.55, 'SVC__gamma': 0.0009000000000000002}
SVM - mean test 0.791
SVM - mean test 0.006
{'SVC__C': 1.05, 'SVC__gamma': 0

In [10]:
svc=pd.DataFrame(param_svc)
df=pd.DataFrame(test_scores_mean_svc, columns=['Mean'])
svc=pd.concat([svc,df], axis=1, )
df2=pd.DataFrame(test_scores_std_svc,columns=['Std Dev'])
svc=pd.concat([svc,df2], axis=1)

svc.sort_values(by='Mean', ascending=False).head(1)

Unnamed: 0,SVC__C,SVC__gamma,Mean,Std Dev
11,1.05,0.0003,0.8317499999999999,0.0066895440801298


# Evaluate accuracy on test set

In [11]:
# I run the tuned linear kernel
pipeLSVM = Pipeline([
     ('PCA', PCA(n_components=87)),
        ('LSV', LinearSVC(random_state=0, C=0.05)
        )])


# Fit to train data
pipeLSVM.fit(X_tr, y_tr)

# Evaluate on test set
accuracy = pipeLSVM.score(X_te, y_te)
print('Accuracy: {:.3f}'.format(accuracy))

Accuracy: 0.819


In [12]:
# I ran the tuned Non-linear Kernel
pipeSVC = Pipeline([
     ('PCA', PCA(n_components=87)),
        ('SVC', SVC(C=1.55,gamma=0.0003)
        )])

test_scores_mean_svc = []
test_scores_std_svc = []
param_svc=[]
# Fit to train data
pipeSVC.fit(X_tr, y_tr)

# Evaluate on test set
accuracy = pipeSVC.score(X_te, y_te)
print('Accuracy: {:.3f}'.format(accuracy))

Accuracy: 0.815


### As expected with the non linear kernel I get a better accuracy but with an higher computational cost