In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np
from numpy import genfromtxt
import math
import pandas as pd
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn import linear_model

### Opening the dataset files and storing in a dictionary.

In [0]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [None]:
dict1 = unpickle('/content/drive/My Drive/Assignment-2_Dataset//Datasets/Question-1/cifar-10-python/cifar-10-batches-py/data_batch_1')
dict2 = unpickle('/content/drive/My Drive/Assignment-2_Dataset//Datasets/Question-1/cifar-10-python/cifar-10-batches-py/data_batch_2')
dict3 = unpickle('/content/drive/My Drive/Assignment-2_Dataset//Datasets/Question-1/cifar-10-python/cifar-10-batches-py/data_batch_3')
dict4 = unpickle('/content/drive/My Drive/Assignment-2_Dataset//Datasets/Question-1/cifar-10-python/cifar-10-batches-py/data_batch_4')
dict5 = unpickle('/content/drive/My Drive/Assignment-2_Dataset//Datasets/Question-1/cifar-10-python/cifar-10-batches-py/data_batch_5')
# print(dict1)
# print(dict1.keys())
X = np.concatenate((dict1[b'data'], dict2[b'data'], dict3[b'data'], dict4[b'data'], dict5[b'data']), axis=0)
y = np.concatenate((dict1[b'labels'], dict2[b'labels'], dict3[b'labels'], dict4[b'labels'], dict5[b'labels']), axis=0)
# print(X.shape)
# print(y.shape)


### Applying Principal Component Analysis (PCA) on the data to reduce it to 150 components.

In [0]:
pca = PCA(n_components=150)
X = pca.fit_transform(X)
print(X.shape)

(50000, 150)


### Splitting the training data into train and test sets.


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

## Testing Linear SVC accuracy with different values of C

### Linear SVC with C=1

In [0]:
clf_c1 = LinearSVC(C=1.0, max_iter=10000)
clf_c1.fit(X_train, y_train)

In [0]:
#LinearSVC with n_components=300
y_pred_c1 = clf_c1.predict(X_test)
accuracy_c1 = accuracy_score(y_test, y_pred_c1)
print("Accuracy: ",accuracy_c1*100)

Accuracy:  23.419999999999998


In [0]:
f1 = f1_score(y_test, y_pred_c1,  average='macro')
print("F1 score: ",f1)

F1 score:  0.22716911481830668


In [0]:
print(confusion_matrix(y_test, y_pred_c1))

[[430  65  50  36  91  31  51  85 113  73]
 [ 60 274  54  23 177  82  55  45  58 197]
 [162  50 108  53 244  64 105 102  40 111]
 [ 78  63  90  93 255  86  95  56  39 107]
 [ 73  35  72  35 352  69  94  92  24 113]
 [ 75  40  94  78 376  79  68  52  34  90]
 [ 60  44  58  41 279  49 221  27  24 167]
 [100  39  66  44 251  52  82 258  34 111]
 [209 110  37  24 153  41  32  51 279  78]
 [ 72 212  46  27  68  45 115  72  78 248]]


### Linear SVC with C=2

In [0]:
clf_c2 = LinearSVC(C=2.0)
clf_c2.fit(X_train, y_train)

In [0]:
y_pred_c2 = clf_c2.predict(X_test)
accuracy_c2 = accuracy_score(y_test, y_pred_c2)
print("Accuracy: ",accuracy_c2*100)

Accuracy:  21.560000000000002


In [0]:
f1 = f1_score(y_test, y_pred_c2,  average='macro')
print("F1 score: ",f1)

F1 score:  0.19212740152981594


In [0]:
print(confusion_matrix(y_test, y_pred_c2))

[[ 24 298  46  24  92  26  59  52 351  53]
 [ 11 260  59  55 145  37 139  23 180 116]
 [ 24  82 154  81 176  49 238  70 139  26]
 [ 10  67 100 122  99  91 281  51 121  20]
 [  6  61 152  94 134  50 291  51  91  29]
 [ 12  56 134 142  88  92 177  58 202  25]
 [  6  52 146  68  88  50 458  16  65  21]
 [ 23  75 105  86 157  48 249 135  95  64]
 [ 16 191  38  27  59  14  22  30 579  38]
 [ 17 193  43  34 111  31 134  25 197 198]]


### Linear SVC with C=4

In [0]:
clf_c4 = LinearSVC(C=4.0)
clf_c4.fit(X_train, y_train)

In [0]:
y_pred_c4 = clf_c4.predict(X_test)
accuracy_c4 = accuracy_score(y_test, y_pred_c4)
print("Accuracy: ",accuracy_c4*100)

Accuracy:  23.35


In [0]:
f1 = f1_score(y_test, y_pred_c4,  average='macro')
print("F1 score: ",f1)

F1 score:  0.22996864647968432


In [0]:
print(confusion_matrix(y_test, y_pred_c4))

[[409  21 193  53  73  17  18 153  47  41]
 [ 74 162 106 100 130  42  90  90  73 158]
 [110  54 112  90 268 113  86 157  20  29]
 [ 85  37  43 212 138 157 128  98  25  39]
 [ 82  39  73  67 339 136  70 114   8  31]
 [109  43  36 157 179 172 132  93  28  37]
 [ 31  40  22 157 255  80 224 121   7  33]
 [ 43  51  61  87 183 126  75 335  21  55]
 [221  34 331  62  65  13  23  60 137  68]
 [ 60  43 151 136  63  31  50 131  85 233]]


In [0]:
pca = PCA(n_components=200)
X = pca.fit_transform(X)
print(X.shape)

In [0]:
#LinearSVC with n_components=200
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ",accuracy*100)

Accuracy:  23.200000000000003


In [0]:
pca = PCA(n_components=100)
X = pca.fit_transform(X)
print(X.shape)

(50000, 100)


In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [0]:
clf_pca_55 = LinearSVC()
clf_pca_55.fit(X_train, y_train)

In [0]:
#LinearSVC with n_components=200
y_pred_pca_55 = clf_pca_55.predict(X_test)
accuracy_pca_55 = accuracy_score(y_test, y_pred_pca_55)
print("Accuracy: ",accuracy_pca_55*100)

Accuracy:  14.96


## Testing the accuracy, F1 score and confusion matrix for SGD Classifier

In [0]:
#SGD
clf_sgd = linear_model.SGDClassifier(alpha=1000,n_jobs=-1,max_iter=100000)
clf_sgd.fit(X_train, y_train)

In [0]:
#SGD
y_pred_sgd=clf_sgd.predict(X_test)
accuracy_sgd = accuracy_score(y_test, y_pred_sgd)
print("Accuracy: ",accuracy_sgd*100)

Accuracy:  37.19


In [0]:
f1 = f1_score(y_test, y_pred_sgd,  average='macro')
print("F1 score: ",f1)

F1 score:  0.354736999861951


In [0]:
print(confusion_matrix(y_test,y_pred_sgd))

[[404  50  19  72  34   6  13  80 288  30]
 [ 43 545  11  41  51  13  26  51 174  82]
 [ 88  43 167 129 233  22  65 102 124  24]
 [ 57  56  45 304 124  81  67  52 132  28]
 [ 74  32  68  89 466  21  83 125  96  15]
 [ 53  57  65 286 161  76  61  92 147  13]
 [ 17  55  40 180 205  28 280  60  72  24]
 [ 41  58  34  86 162  22  12 459  75  44]
 [100  57   7  38  24   7   7  22 681  27]
 [ 63 246   2  38  21   7  24  65 217 337]]


## We see the accuracy obtained is 37.19 %