# Data prep

In [14]:
import os
import numpy as np

dir = "cifar-10-batches-py"

def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict
#train data
train_data = []
train_labels = []

for i in range(1,6):
    data_file = unpickle(dir + f"\\data_batch_{i}")
    train_data.extend(data_file[b"data"])
    train_labels.extend(data_file[b"labels"])
#test data
test_data = []
test_labels = []

test_data_file = unpickle(dir + "\\test_batch")
test_data = test_data_file[b"data"]

test_labels = test_data_file[b"labels"]

#convert to numpy
train_data = np.array(train_data)
train_labels = np.array(train_labels)

test_data = np.array(test_data)
test_labels = np.array(test_labels)

#test
print(train_data.shape)
print(train_labels.shape)
print(test_data.shape)
print(test_labels.shape)

(50000, 3072)
(50000,)
(10000, 3072)
(10000,)


In [17]:
#put all training data in vertical stack
all_train_data = np.vstack(train_data)
normalized_train_data = all_train_data / 255.0

#test
print(normalized_train_data.shape)
print(normalized_train_data)

all_test_data = np.vstack(test_data)

normalized_test_data = all_test_data / 255.0

(50000, 3072)
[[0.23137255 0.16862745 0.19607843 ... 0.54901961 0.32941176 0.28235294]
 [0.60392157 0.49411765 0.41176471 ... 0.54509804 0.55686275 0.56470588]
 [1.         0.99215686 0.99215686 ... 0.3254902  0.3254902  0.32941176]
 ...
 [0.1372549  0.15686275 0.16470588 ... 0.30196078 0.25882353 0.19607843]
 [0.74117647 0.72941176 0.7254902  ... 0.6627451  0.67058824 0.67058824]
 [0.89803922 0.9254902  0.91764706 ... 0.67843137 0.63529412 0.63137255]]


# KNN

In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn_results = {}

for k in range(1,9):
    knn = KNeighborsClassifier(k)
    knn.fit(normalized_train_data,train_labels)
    pred = knn.predict(normalized_test_data)
    knn_results[k] = pred

In [21]:
from sklearn.metrics import accuracy_score

for k in knn_results.items():
    print(f"k = {k} accuracy: {accuracy_score(test_labels, pred)}")

k = (1, array([4, 8, 8, ..., 5, 3, 4], shape=(10000,))) accuracy: 0.3415
k = (2, array([2, 1, 8, ..., 5, 3, 4], shape=(10000,))) accuracy: 0.3415
k = (3, array([2, 8, 8, ..., 5, 6, 4], shape=(10000,))) accuracy: 0.3415
k = (4, array([2, 8, 8, ..., 5, 6, 4], shape=(10000,))) accuracy: 0.3415
k = (5, array([2, 8, 8, ..., 5, 6, 4], shape=(10000,))) accuracy: 0.3415
k = (6, array([2, 8, 8, ..., 5, 6, 4], shape=(10000,))) accuracy: 0.3415
k = (7, array([2, 8, 8, ..., 5, 6, 4], shape=(10000,))) accuracy: 0.3415
k = (8, array([2, 8, 8, ..., 5, 6, 4], shape=(10000,))) accuracy: 0.3415


# Decision Tree

In [23]:
from sklearn.tree import DecisionTreeClassifier
#max depth = 15
dt = DecisionTreeClassifier(random_state=1, max_depth=15)

dt.fit(normalized_train_data, train_labels)

pred = dt.predict(normalized_test_data)

print(accuracy_score(test_labels, pred))

0.2851


In [24]:
#max depth = 25
dt = DecisionTreeClassifier(random_state=1, max_depth=25)

dt.fit(normalized_train_data, train_labels)

pred = dt.predict(normalized_test_data)

print(accuracy_score(test_labels, pred))

0.2714


# Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier
#estimators = 100
rf = RandomForestClassifier(n_estimators=100)
rf.fit(normalized_train_data, train_labels)
pred = rf.predict(normalized_test_data)

print(accuracy_score(test_labels, pred))

0.4664


In [27]:
#estimators = 150
rf = RandomForestClassifier(n_estimators=150)
rf.fit(normalized_train_data, train_labels)
pred = rf.predict(normalized_test_data)

print(accuracy_score(test_labels, pred))

0.4832


# SVM

In [28]:
from sklearn.svm import SVC

svc = SVC(C=1.0)
svc.fit(normalized_train_data, train_labels)
pred = svc.predict(normalized_test_data)

print(accuracy_score(test_labels, pred))

0.5437


In [29]:
svc = SVC(C=0.5)
svc.fit(normalized_train_data, train_labels)
pred = svc.predict(normalized_test_data)

print(f"C = 0.5, accuracy: {accuracy_score(test_labels, pred)}")

C = 0.5, accuracy: 0.5193


In [None]:
svc = SVC(C=12.0)
svc.fit(normalized_train_data, train_labels)
pred = svc.predict(normalized_test_data)

print(f"C = 12.0, accuracy: {accuracy_score(test_labels, pred)}")

# MLP