In [0]:
import numpy as np
import pandas as pd
import math
import urllib.request

In [0]:
# Function for reading the image files of training and test data

def readImageFiles(path):
    # Open the url with the path passed and read the lines into lists
    content = urllib.request.urlopen(path)
    lines = []
    with urlopen(path) as content:
       for line in content:
           line = line.decode('utf-8').rstrip("\n\r")
           lines.append(list(line))
    print(lines)
    # Converting the lists into array and resizing the array as required
    a_images = np.array(lines, order='C')
    a_images.resize((5000,784))
    # Converting array to dataframe and replacing features to binary
    df_images = pd.DataFrame(a_images)
    df1 = df_images.replace('+',1)
    df2 = df1.replace('#',1)
    d_images = df2.replace(' ',0)
    # Final images data as array
    images=np.asarray(d_images)
    return images

In [0]:
# Function for reading the label files of training and test data

def readLabelFiles(path):
    lines = []
    with urlopen(path) as content:
       for line in content:
           line = line.decode('utf-8').rstrip("\n\r")
           lines.append(list(line))
    # Converting the lists into array - final label data
    labels=np.asarray(lines)
    return labels

In [0]:
# Getting the data
train_images = readImageFiles("https://raw.githubusercontent.com/PriyanjaniCh/DigitData/master/trainingimages")
train_labels = readLabelFiles("https://raw.githubusercontent.com/PriyanjaniCh/DigitData/master/traininglabels")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [0]:
# Calculating frequency of training labels  - prior
digits, digit_freq = np.unique(train_labels, return_counts=True)
print(digit_freq)

[479 563 488 493 535 434 501 550 462 495]


In [0]:
# Training
# Calculating likelihood
tr_lab = pd.DataFrame(train_labels)
tr_img = pd.DataFrame(train_images)

# Concatenating label and image data for counting the number of F i,j values
n_train = pd.concat([tr_lab, tr_img], axis = 1,ignore_index=True)
n_train2 = n_train.groupby([0],as_index = False).sum()
image_sum= np.asarray(n_train2)
image_sum = image_sum.astype('int')

lapV = 0.1 # Laplace Value
like = []
for i in range(10):
    c = image_sum[i][0]
    for j in range(784):
        pro = ((lapV + image_sum[i][j])/(2*lapV + digit_freq[i]))
        like.append([c, pro])
# Array with likelihood values
likelihood= np.array(like, order='K')
likelihood.resize((10,784,2))


In [0]:

# Reading test images
test_images = readImageFiles("https://raw.githubusercontent.com/PriyanjaniCh/DigitData/master/testimages")

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



(5000, 784)

In [0]:
# Classifying
# Calculating posterior  - implementing Naive Bayes
from operator import itemgetter

# List for storing the calculated value with it's probability
calculated_test = []

for k in range(1000):
    each = []
    for j in range(10):
        post = 0
        for i in range(784):
            if(test_images[k][i] != 0):
                post = post + math.log(likelihood[j][i][1])
        post = post + math.log(digit_freq[j]/1000)
        each.append([j,post])
    # For fetching the value with maximum probability
    max_value = max(each,key=itemgetter(1))
    calculated_test.append(max_value)

calc = pd.DataFrame(calculated_test)

# Predicted labels for test data
predicted = calc[0]


In [0]:
# Reading test label data
test_labels = readLabelFiles("https://raw.githubusercontent.com/PriyanjaniCh/DigitData/master/testlabels")

# Changing the data type of test labels to int
test_labels = test_labels.astype('int')

In [0]:
# Calculating confusion matrix and accuracy

from sklearn.metrics import confusion_matrix, accuracy_score

confusion_matrix= confusion_matrix(test_labels, predicted)
accuracy = accuracy_score(test_labels, predicted)


In [0]:
# Printing the confusion matrix and accuracy

print("============ CONFUSION MATRIX ============")
print(confusion_matrix)
print("\n Accuracy score for test images: ", accuracy)


[[ 85   0   1   0   0   0   0   0   4   0]
 [  0   1   1   0   0   0   1   0 105   0]
 [  2   0  81   6   2   0   4   0   8   0]
 [  1   0   1  83   0   0   1   1  10   3]
 [  2   0   5   0  59   0   3   0  18  20]
 [ 14   0   1  18   2  10   1   1  39   6]
 [  6   0  14   0   1   1  60   0   9   0]
 [  1   0   4   4   4   0   0  63  25   5]
 [  4   0   4   9   0   1   0   1  84   0]
 [  2   0   1   2   6   0   0   0  16  73]]

 Accuracy score for test images:  0.599


In [0]:
# Printing the actual and predicted values

print("\nS.No.\t\t Actual\t\t Predicted \n")
for i in range(1,1001):
    print(i, '\t\t', int(test_labels[i-1]), '\t\t', predicted[i-1])
    


S.No.		 Actual		 Predicted 

1 		 9 		 9
2 		 0 		 0
3 		 2 		 2
4 		 5 		 3
5 		 1 		 8
6 		 9 		 9
7 		 7 		 7
8 		 8 		 8
9 		 1 		 8
10 		 0 		 0
11 		 4 		 4
12 		 1 		 8
13 		 7 		 9
14 		 9 		 9
15 		 6 		 0
16 		 4 		 8
17 		 2 		 2
18 		 6 		 2
19 		 8 		 8
20 		 1 		 8
21 		 3 		 3
22 		 7 		 7
23 		 5 		 9
24 		 4 		 4
25 		 4 		 8
26 		 1 		 8
27 		 8 		 8
28 		 1 		 8
29 		 3 		 3
30 		 8 		 8
31 		 1 		 8
32 		 2 		 2
33 		 5 		 8
34 		 8 		 8
35 		 0 		 0
36 		 6 		 6
37 		 2 		 3
38 		 1 		 8
39 		 1 		 8
40 		 7 		 8
41 		 1 		 8
42 		 5 		 5
43 		 3 		 3
44 		 4 		 4
45 		 6 		 8
46 		 9 		 8
47 		 5 		 8
48 		 0 		 0
49 		 9 		 9
50 		 2 		 2
51 		 2 		 2
52 		 4 		 4
53 		 8 		 8
54 		 2 		 2
55 		 1 		 8
56 		 7 		 7
57 		 2 		 2
58 		 4 		 4
59 		 9 		 9
60 		 4 		 4
61 		 4 		 4
62 		 0 		 0
63 		 3 		 9
64 		 9 		 9
65 		 2 		 2
66 		 2 		 2
67 		 3 		 3
68 		 3 		 3
69 		 8 		 8
70 		 3 		 3
71 		 5 		 0
72 		 7 		 7
73 		 3 		 8
74 		 5 		 8
75 		 8 		 8
76 	