In [18]:
from sklearn.naive_bayes import GaussianNB
from scipy.stats import multivariate_normal
import numpy as np

In [2]:
def get_data_and_labels(images_filename, labels_filename):
    """Load images and their labels from a pair of binary files.

    The images file must start with a 16-byte header: 4 bytes that are
    skipped (presumably a magic number), then three big-endian 32-bit
    integers giving the number of items, rows and columns. The first 8 bytes
    of the labels file are skipped. After the headers, every pixel and every
    label is stored as a single unsigned byte.

    Args:
        images_filename: Path of the images file.
        labels_filename: Path of the labels file.

    Returns:
        A ``(data, labels)`` tuple. ``data`` is a list with one entry per
        image, each a flat list of ``rows * columns`` ints in 0..255.
        ``labels`` is a list of ints, one per image.

    Raises:
        OSError: If either file cannot be opened.
        ValueError: If either file ends before all items announced in the
            images header have been read.
    """
    print("Opening files ...")
    # `with` closes both files on every exit path, including the case where
    # opening the second file fails after the first one succeeded.
    with open(images_filename, "rb") as images_file, \
            open(labels_filename, "rb") as labels_file:
        print("Reading files ...")
        images_file.read(4)  # skip the leading 4 bytes of the header
        num_of_items = int.from_bytes(images_file.read(4), byteorder="big")
        num_of_rows = int.from_bytes(images_file.read(4), byteorder="big")
        num_of_columns = int.from_bytes(images_file.read(4), byteorder="big")
        labels_file.read(8)  # skip the labels header

        num_of_image_values = num_of_rows * num_of_columns
        data = []
        labels = []
        for item in range(num_of_items):
            if item % 10000 == 0:
                print("Current image number: %7d" % item)
            # One read per image; iterating a bytes object yields ints 0..255,
            # which is exactly what a per-byte int.from_bytes would produce.
            pixels = images_file.read(num_of_image_values)
            label = labels_file.read(1)
            if len(pixels) != num_of_image_values or len(label) != 1:
                # A short read means the file is truncated; fail loudly
                # instead of silently padding the data with zeros.
                raise ValueError(
                    "Unexpected end of file while reading item %d of %d"
                    % (item, num_of_items))
            data.append(list(pixels))
            labels.append(label[0])
    print("Files closed.")
    return data, labels


In [3]:
# Read every training image and its label from the local data files.
train_x, train_y = get_data_and_labels(
    images_filename="mnist_train_images",
    labels_filename="mnist_train_labels",
)

Opening files ...
Reading files ...
Current image number:       0
Current image number:   10000
Current image number:   20000
Current image number:   30000
Current image number:   40000
Current image number:   50000
Files closed.


In [4]:
# Read every test image and its label from the local data files.
test_x, test_y = get_data_and_labels(
    images_filename="mnist_test_images",
    labels_filename="mnist_test_labels",
)

Opening files ...
Reading files ...
Current image number:       0
Files closed.


In [5]:
# Keep the first 50,000 examples for training and hold out everything after
# them as the validation set.
trainx, valx = train_x[:50000], train_x[50000:]
trainy, valy = train_y[:50000], train_y[50000:]

# Report the split sizes: validation first, then training.
print(len(valx), len(valy), len(trainx), len(trainy), sep="\n")

10000
10000
50000
50000


In [6]:
# Fit scikit-learn's Gaussian naive Bayes on the training split; its class
# labels and empirical class priors are reused by the cells below.
helper = GaussianNB().fit(trainx, trainy)
classes, priors = helper.classes_, helper.class_prior_
print(classes)
priors

[0 1 2 3 4 5 6 7 8 9]


array([ 0.09864,  0.11356,  0.09936,  0.10202,  0.09718,  0.09012,
        0.09902,  0.1035 ,  0.09684,  0.09976])

In [7]:
# Group the training images by their label: label -> list of images.
digit_train = {}
for image, label in zip(trainx, trainy):
    digit_train.setdefault(label, []).append(image)

In [12]:
# Per-class Gaussian parameters, stored in the same order as `classes`:
# posteriors_helper[k] == [mean vector, covariance matrix] of classes[k].
posteriors_helper = []

for klass in classes:
    # Plain ndarray (rows = examples, columns = features) instead of the
    # discouraged np.matrix class; the resulting numbers are the same.
    examples = np.asarray(digit_train[klass], dtype=float)
    mean = examples.mean(axis=0)
    # rowvar=False: each column is a variable, each row an observation.
    cov = np.cov(examples, rowvar=False)
    posteriors_helper.append([mean, cov])

In [35]:
def pipeline(c, testx, testy):
    """Classify ``testx`` with per-class Gaussians and print the error rate.

    For every class a multivariate normal is built from the mean and
    covariance stored in the global ``posteriors_helper``, with ``c`` times
    the identity added to the covariance. Each example is assigned the class
    that maximises ``log(prior) + log-density``; the predictions are then
    compared with ``testy``.

    Reads the globals ``classes``, ``priors`` and ``posteriors_helper``,
    which must all be in the same class order.

    Args:
        c: Smoothing constant added to the diagonal of every covariance.
        testx: Sequence of feature vectors to classify.
        testy: Sequence of true labels, one per entry of ``testx``.

    Returns:
        None. The result is printed as ``Error rate: errors/total = rate``.

    Raises:
        ValueError: If ``testx`` is empty or its length differs from
            ``testy``.
    """
    samples = np.asarray(testx, dtype=float)
    labels = np.asarray(testy)
    total = len(samples)
    if total == 0:
        raise ValueError("testx must contain at least one example")
    if len(labels) != total:
        raise ValueError(
            "testx and testy differ in length: %d vs %d" % (total, len(labels)))

    # log_scores[k, i] = log prior of class k + log density of sample i.
    log_scores = np.empty((len(classes), total))
    # Index by position rather than by label value, so the lookup stays
    # correct even when the labels are not exactly 0..K-1.
    for idx in range(len(classes)):
        mean, cov = posteriors_helper[idx]
        cov_smoothed = cov + (c * np.eye(mean.shape[0]))
        density = multivariate_normal(mean=mean, cov=cov_smoothed)
        # One batched logpdf call per class instead of one call per sample.
        log_scores[idx] = np.log(priors[idx]) + density.logpdf(samples)

    # argmax returns the first maximum, matching the tie-breaking of max().
    predicted_y = np.asarray(classes)[np.argmax(log_scores, axis=0)]

    errors = int(np.count_nonzero(predicted_y != labels))
    print("Error rate: %d/%d = %f" % ((errors,total,(errors/float(total)))))

In [36]:
# Validation error with smoothing constant c = 10.
pipeline(c=10, testx=valx, testy=valy)

Error rate: 1225/10000 = 0.122500


In [None]:
# Validation error with smoothing constant c = 100.
pipeline(c=100, testx=valx, testy=valy)

In [None]:
# Validation error with smoothing constant c = 1000.
pipeline(c=1000, testx=valx, testy=valy)