In [6]:
import matplotlib.pyplot as plt
import heapq
import numpy as np
from matplotlib.ticker import PercentFormatter
from sklearn.model_selection import train_test_split
from collections import Counter
from scipy.spatial.distance import cdist

# Read the training CSV. In every data row the first column is the label and
# the remaining columns are the image values.
images = []          # one numpy array of image values per row
labels = []          # label of the row at the same position in `images`
sample_digits = {}   # label -> values (plain list) of the first row seen with that label

# `with` guarantees the file is closed even if a row fails to parse
with open('data/train.csv', 'r') as f:
    header = f.readline()  # column-name line; consumed so the loop starts at the data
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of failing on int('')
            continue
        temp = [int(x) for x in line.split(',')]
        labels.append(temp[0])
        images.append(np.array(temp[1:]))
        # remember only the first example of each label
        if temp[0] not in sample_digits:
            sample_digits[temp[0]] = temp[1:]

In [7]:
# question b
# lay out a 2x5 grid with one panel per digit and hide both axes of every panel
fig = plt.figure(tight_layout=True)
ax = []
for position in range(1, 11):
    panel = fig.add_subplot(2, 5, position)
    panel.axes.get_xaxis().set_visible(False)
    panel.axes.get_yaxis().set_visible(False)
    ax.append(panel)

# panel k shows the stored sample of digit k, reshaped to a 28x28 grayscale image
for digit, panel in enumerate(ax):
    panel.imshow(np.reshape(sample_digits[digit], (28, 28)), cmap='gray')
fig.savefig('sample_digits.png')
fig.clf()

<Figure size 432x288 with 0 Axes>

In [8]:
# question c
# ref: https://matplotlib.org/gallery/statistics/hist.html
fig, ax = plt.subplots(tight_layout=True)
# bin edges 0, 1, ..., 10 give one unit-wide bin per digit
digit_bin_edges = list(range(11))
ax.hist(labels, bins=digit_bin_edges, density=True)  # normalized histogram
# the normalized heights are fractions of 1; show them as percentages on the y-axis
ax.yaxis.set_major_formatter(PercentFormatter(xmax=1))
fig.savefig('digit_distributions.png')
fig.clf()

<Figure size 432x288 with 0 Axes>

In [9]:
# question d
# For each sample digit, find the training image at the smallest non-zero
# Euclidean distance and write one line per digit with the neighbour's label
# and the distance. A trailing '*' marks a neighbour whose label differs from
# the sample's own digit.
min_dist = {}  # digit -> (label of nearest image, distance to it)
# `with` guarantees the report is flushed and closed even if the search raises
with open('sample_digits_closest.txt', 'w') as f:
    for i in range(10):
        digit = sample_digits[i]
        min_dist[i] = (0, float('inf'))
        for j in range(len(images)):
            dist = np.linalg.norm(digit-images[j])
            # a distance of exactly 0 is skipped, so an identical image (such as
            # the sample itself) can never be reported as the nearest one
            if dist != 0 and dist < min_dist[i][1]:
                min_dist[i] = (labels[j], dist)
        marker = '*' if i != min_dist[i][0] else ''
        f.write(str(i)+' closest to '+str(min_dist[i][0])+' with distance '+str(min_dist[i][1])+marker+'\n')

In [10]:
# question e
# gather the images labelled 0 and the images labelled 1 into two separate
# groups, keeping each group's labels in a parallel list
zero_images, zero_labels = [], []
one_images, one_labels = [], []
for image, label in zip(images, labels):
    if label == 0:
        zero_images.append(image)
        zero_labels.append(label)
    elif label == 1:
        one_images.append(image)
        one_labels.append(label)

In [11]:
# calculate genuine distance
# genuine pairs are two different images from the same group (0 with 0, 1 with 1)
t_match = []
for group in (zero_images, one_images):
    temp = cdist(group, group, 'euclidean')
    for row_idx, dist_row in enumerate(temp):
        # keep only the entries right of the diagonal so every unordered pair
        # is counted once and no image is paired with itself
        t_match.extend(dist_row[row_idx + 1:])

In [12]:
# calculate imposter distance
# imposter pairs combine one image of digit 1 with one image of digit 0;
# every entry of the cross-distance matrix is such a pair, taken row by row
temp = cdist(one_images, zero_images, 'euclidean')
f_match = [dist for dist_row in temp for dist in dist_row]

In [13]:
# draw histogram
# overlay the genuine and imposter distance distributions; the legend and axis
# labels make the saved figure readable without the surrounding code
plt.hist(t_match, bins=100, alpha=0.5, label='genuine')
plt.hist(f_match, bins=100, alpha=0.5, label='imposter')
plt.xlabel('euclidean distance')
plt.ylabel('number of pairs')
plt.legend()
plt.savefig('genuine_imposter.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [14]:
# question f
# put both distance lists in ascending order (smallest distance first)
t_match.sort()
f_match.sort()

In [15]:
# Sweep a distance threshold from the largest to the smallest observed distance
# in NUM_STEPS equal steps (NUM_STEPS + 1 thresholds) and record one
# (fpr, tpr) pair per threshold. A pair of images counts as "positive" when its
# distance exceeds the threshold:
#   fpr = fraction of genuine distances (t_match) above the threshold
#   tpr = fraction of imposter distances (f_match) above the threshold
# Also calculate the equal error rate (stored in `err`) and the baseline accuracy.
# Both lists must already be sorted in ascending order.
NUM_STEPS = 400
t_size, f_size = len(t_match), len(f_match)
ma = max(t_match[-1], f_match[-1])
mi = min(t_match[0], f_match[0])
interval = (ma - mi) / NUM_STEPS

tpr = []
fpr = []
# tc / fc: index of the last distance that is <= the current threshold
# (-1 once every distance exceeds it); they only ever move left
tc, fc = t_size - 1, f_size - 1
err = None
for step in range(NUM_STEPS + 1):
    # derive each threshold from the step index rather than by repeated
    # subtraction, so rounding error cannot accumulate or change the step count
    threshold = ma - step * interval
    while tc >= 0 and t_match[tc] > threshold:
        tc -= 1
    while fc >= 0 and f_match[fc] > threshold:
        fc -= 1
    # exactly size - 1 - index distances lie above the threshold, which keeps
    # both rates inside [0, 1] (0 at the largest threshold)
    fpr.append((t_size - 1 - tc) / t_size)
    tpr.append((f_size - 1 - fc) / f_size)
    # first threshold where fpr >= 1 - tpr, i.e. where the false positive rate
    # reaches the false negative rate
    if err is None and fpr[-1] + tpr[-1] >= 1:
        err = fpr[-1]
# share of the larger of the two digit groups among all 0/1 images
baseline = max(len(zero_labels), len(one_labels)) / (len(zero_labels) + len(one_labels))

In [37]:
# draw ROC curve
# the title reports the error rate and the baseline accuracy rounded to 5 decimals
roc_title = 'ROC curve with ERR = {}, baseline accuracy = {}'.format(round(err, 5), round(baseline, 5))
plt.plot(fpr, tpr)
# dashed grey diagonal from (0, 0) to (1, 1) as a visual reference
plt.plot([0, 1], [0, 1], ls="--", c="0.3")
plt.title(roc_title)
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.savefig('ROC_curve.png')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [19]:
# question g, h
# hold out half of the labelled data for evaluation; the fixed random_state
# makes the split (and every number derived from it) identical on each run
image_train, image_test, label_train, label_test = train_test_split(
    images, labels, test_size=0.5, random_state=42)

In [20]:
# pairwise Euclidean distances: row r holds the distances from image_test[r]
# to every image in image_train
temp = cdist(image_test, image_train, metric='euclidean')

In [21]:
# label every row of the distance matrix by majority vote among the labels of
# its 10 nearest training images
predict = []
for dist_row in temp:
    # (distance, training index) pairs; equal distances order by the lower index
    candidates = zip(dist_row, range(len(dist_row)))
    nearest = heapq.nsmallest(10, candidates)
    votes = Counter(label_train[idx] for _, idx in nearest)
    predict.append(votes.most_common(1)[0][0])

In [23]:
# calculate data accuracy
# number of held-out images whose predicted label equals the true label
correct = sum(1 for truth, guess in zip(label_test, predict) if truth == guess)
# calculate confusion matrix
# 10x10 counts: the row index is the predicted digit, the column index the true digit
confusion_matrix = [[0] * 10 for _ in range(10)]
for truth, guess in zip(label_test, predict):
    confusion_matrix[guess][truth] += 1

In [33]:
# Write the accuracy line, a header row with the digits 0-9, then one line per
# confusion-matrix row prefixed with its row index.
# `with` guarantees the file is flushed and closed even if a write fails.
with open('confusion_matrix.txt', 'w') as f:
    f.write('split training data accuracy: '+str(round(correct/len(label_test), 5))+'\n')
    f.write('   '+'  '.join([str(i) for i in range(10)]))
    f.write('\n')
    for (i, row) in enumerate(confusion_matrix):
        f.write(str(i)+'  ')
        f.write('  '.join([str(val) for val in row]))
        f.write('\n')

In [2]:
# question j
# Read the test CSV. Every column of a data row is kept as part of the image
# (no label column is split off).
test_images = []

# `with` guarantees the file is closed even if a row fails to parse
with open('data/test.csv', 'r') as f:
    header = f.readline()  # column-name line; consumed so the loop starts at the data
    for line in f:
        line = line.strip()
        if not line:
            # tolerate blank lines instead of failing on int('')
            continue
        temp = [int(x) for x in line.split(',')]
        test_images.append(temp)

In [3]:
# pairwise Euclidean distances: row r holds the distances from test_images[r]
# to every image in images (one dense len(test_images) x len(images) matrix)
temp = cdist(test_images, images, metric='euclidean')

In [6]:
# label every row of the distance matrix by majority vote among the labels of
# its 10 nearest training images
predict = []
for dist_row in temp:
    # (distance, training index) pairs; equal distances order by the lower index
    candidates = zip(dist_row, range(len(dist_row)))
    nearest = heapq.nsmallest(10, candidates)
    votes = Counter(labels[idx] for _, idx in nearest)
    predict.append(votes.most_common(1)[0][0])

In [7]:
# Write the predictions as CSV: a header line, then one "ImageId,Label" row per
# prediction with ImageId counting up from 1.
# `with` guarantees the file is flushed and closed even if a write fails.
with open('result.csv', 'w') as f:
    f.write('ImageId,Label\n')
    for i, val in enumerate(predict, 1):
        f.write(str(i)+','+str(val)+'\n')