## Importing necessary modules

In [1]:
import os
import time
import numpy as np

import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline

## Loading the CIFAR10 dataset

In [2]:
from tensorflow.keras.datasets import cifar10

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Load the CIFAR10 dataset
baseDir = os.path.dirname(os.path.abspath('__file__')) + '/'
classesName = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
(xTrain, yTrain), (xTest, yTest) = cifar10.load_data()
xTrain = xTrain.astype(np.float)
yTrain = np.squeeze(yTrain)
yTest = np.squeeze(yTest)
xTest = xTest.astype(np.float)

In [None]:
# Show dimension for each variable
print ('Train image shape:    {0}'.format(xTrain.shape))
print ('Train label shape:    {0}'.format(yTrain.shape))
print ('Test image shape:     {0}'.format(xTest.shape))
print ('Test label shape:     {0}'.format(yTest.shape))

## Normalizing the data

In [None]:
#Reshaping Data into a Vector and Normalizing it (-1 to 1)
print(xTrain.shape)
print(yTrain.shape)
xTrain = np.reshape(xTrain, (xTrain.shape[0], -1)) 
# The -1 means that the corresponding dimension is calculated from the other given dimensions.
xTest = np.reshape(xTest, (xTest.shape[0], -1))
print(xTrain.shape) 
print(xTrain[0])
#Normalize 
xTrain=((xTrain/255)*2)-1
xTest=((xTest/255)*2)-1
print(xTrain.shape)
print(xTrain[0])

## Tuning parameters

In [None]:
#Choosing a smaller dataset
xTrain_s=xTrain[:1000,:]
yTrain_s=yTrain[:1000]
print(xTrain_s.shape)
print(yTrain_s.shape)

### SVM Linear Kernel

In [None]:
#creating function for svm linear kernel 
from sklearn import svm
def svm_linear(c):
    svc = svm.SVC(probability = False, kernel = 'linear', C = c)
    
    svc.fit(xTrain_s, yTrain_s) 
    
    # Find the prediction and accuracy on the training set.
    svc_linear_train = svc.predict(xTrain_s)
    acc_train = np.mean(svc_linear_train == yTrain_s)
    acc_train_svm_linear.append(acc_train)
    print('Train Accuracy = {0:f}'.format(acc_train))
    
    # Find the prediction and accuracy on the test set.
    svc_linear_test = svc.predict(xTest)
    acc_test = np.mean(svc_linear_test == yTest)
    acc_test_svm_linear.append(acc_test)
    print('Test Accuracy = {0:f}'.format(acc_test))

In [None]:
#finding the c which gives the highest accuracy
c_svm_linear = [0.0001,0.001,0.01,0.1,1,10,100]
acc_train_svm_linear = []
acc_test_svm_linear = []

for c in c_svm_linear:
    svm_linear(c)

plt.plot(c_svm_linear, acc_train_svm_linear,'.-',color='red')
plt.plot(c_svm_linear, acc_test_svm_linear,'.-',color='orange')
plt.xlabel('c')
plt.ylabel('Accuracy')
plt.title("Plot of accuracy vs c for Training and Test data")
plt.grid()

Best model:

Linear kernel with c-0.1

Train Accuracy = 1.000000

Test Accuracy = 0.297200

## PCA for dimensionality reduction

In [None]:
#cumulative Explained Variance against Number of Components
combined=np.vstack((xTrain,xTest))
from sklearn.decomposition import PCA
pca = PCA().fit(combined)
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.title("Plot of Cumulative Explained Variance vs Number of Components")

In [None]:
print(pca.explained_variance_ratio_.cumsum()[499])
print(pca.explained_variance_ratio_.cumsum()[149])
print(pca.explained_variance_ratio_.cumsum()[55])
print(pca.explained_variance_ratio_.cumsum()[45])
print(pca.explained_variance_ratio_.cumsum()[15])

Therefore, if we choose to reduce the number of components to 15, we can retain 71.6% of the variance in the data and it is also very computationally efficient.

# Fitting best model

In [None]:
#reduce to 15 dimensions.
pca = PCA(n_components=15)
pca.fit(combined)
projected = pca.transform(combined)
print(pca.explained_variance_.shape)
print(pca.components_.shape)
print(combined.shape)
print(projected.shape)

In [None]:
#splitting x train and x test
x_train=projected[:50000,:]
y_train=yTrain[:50000]
x_test=projected[50000:,:]
y_test=yTest

In [None]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

In [None]:
#fitting the linear svm model on 15 principal components
import time
start_time = time.time()
svc = svm.SVC(probability=False,  kernel="linear", C=0.1)
svc.fit(x_train, y_train)
print("--- %s seconds ---" % (time.time() - start_time))

## Performance Assessment

In [None]:
#predicting train accuracy
pred = svc.predict(x_train)
acc_train = np.mean(pred == y_train)
print('Train Accuracy = {0:f}'.format(acc_train))

In [None]:
#predicting test accuracy
pred = svc.predict(x_test)
acc_test = np.mean(pred == y_test)
print('Test Accuracy = {0:f}'.format(acc_test))

In [None]:
#calculating confusion matrix

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true=y_test, y_pred=pred)
cm

In [None]:
#confusion matrix heatmap

import seaborn as sns
import pandas as pd

class_names = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck']
cm =  pd.DataFrame(cm, index=class_names,columns=class_names)
fig = plt.figure(figsize=(10,8))
ax = sns.heatmap(cm,annot=True,cbar=False, cmap='Greens',linewidths=0.5,fmt='.0f')
ax.set_title('Confusion Matrix',fontsize=16,y=1.25)
ax.set_ylabel('Ground Truth',fontsize=14)
ax.set_xlabel('Predicted',fontsize=14)
ax.xaxis.set_ticks_position('top')
ax.xaxis.set_label_position('top')
ax.tick_params(labelsize=12)