In [5]:
import math
import struct as st
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from future.utils import iteritems
from scipy.stats import multivariate_normal as mvn


def plot_images(images,labels):
    n_cols = min(5,len(images))
    n_rows = len(images) // n_cols
    fig = plt.figure(figsize=(8,8))
    
    for i in range(n_rows*n_cols):
        sp = fig.add_subplot(n_rows, n_cols, i+1)
        plt.axis("off")
        plt.imshow(images[i], cmap = plt.cm.gray)
        sp.set_title(labels[i])
    plt.show()
    
def convert_pixel(x):
    if (x >= 0 and x <= 7):
        return 0
    elif (x >= 8 and x <= 15):
        return 1
    elif (x >= 16 and x <= 23):
        return 2
    elif (x >= 24 and x <= 31):
        return 3
    elif (x >= 32 and x <= 39):
        return 4
    elif (x >= 40 and x <= 47):
        return 5
    elif (x >= 48 and x <= 55):
        return 6
    elif (x >= 56 and x <= 63):
        return 7
    elif (x >= 64 and x <= 71):
        return 8
    elif (x >= 72 and x <= 79):
        return 9
    elif (x >= 80 and x <= 87):
        return 10
    elif (x >= 88 and x <= 95):
        return 11
    elif (x >= 96 and x <= 103):
        return 12
    elif (x >= 104 and x <= 111):
        return 13
    elif (x >= 112 and x <= 119):
        return 14
    elif (x >= 120 and x <= 127):
        return 15
    elif (x >= 128 and x <= 135):
        return 16
    elif (x >= 136 and x <= 143):
        return 17
    elif (x >= 144 and x <= 151):
        return 18
    elif (x >= 152 and x <= 159):
        return 19
    elif (x >= 160 and x <= 167):
        return 20
    elif (x >= 168 and x <= 175):
        return 21
    elif (x >= 176 and x <= 183):
        return 22
    elif (x >= 184 and x <= 191):
        return 23
    elif (x >= 192 and x <= 199):
        return 24
    elif (x >= 200 and x <= 207):
        return 25
    elif (x >= 208 and x <= 215):
        return 26
    elif (x >= 216 and x <= 223):
        return 27
    elif (x >= 224 and x <= 231):
        return 28
    elif (x >= 232 and x <= 239):
        return 29
    elif (x >= 240 and x <= 247):
        return 30
    elif (x >= 248 and x <= 255):
        return 31
    
def regroup(x, y):
    for i in range(60000):
        for j in range(784):
            x[i][j]=convert_pixel(x[i][j])
    for i in range(10000):
        for j in range(784):
            y[i][j]=convert_pixel(y[i][j])
    return x,y

#tried doing one by myself but the result is about 33% only, so using API instead
class MultinomialNaiveBayes(object):

    
    def fit(self, X, Y):
        #create look up table
        self.count = [0]*10
        self.lut = []
        for i in range(10):
            self.lut.append([])
            for j in range(784):
                self.lut[i].append([])
                for k in range(32):
                    self.lut[i][j].append(0)
        for i in range(60000):
            for j in range(10):
                if Y[i] ==j:
                    self.count[j]+=1
                    class_now = j;
            for j in range(784):
                self.lut[class_now][j][int(X[i][j])] +=1
        #calculate log likelihood prob
        for i in range(10):
            for j in range(784):
                for k in range(32):
                    self.lut[i][j][k] /=self.count[i]
            self.count[i]  /= 60000

    def score(self, X, Y):
        logsum = 0.0
        m = 0.0
        error = 0
        for i in range(10000):
            for j in range(10):
                for k in range(784):
                    logsum += self.lut[j][k][int(X[i][k])] + 0.0000000001
                #logsum *= np.log(self.count[j])
                if logsum >= m:
                    m = logsum
                    ans = j
                #print(logsum)
                logsum = 0.0
            #print("Predicted ",ans," when it is ",Y[i])
            if ans == Y[i]:
                error +=1
            m = 0.0
        print("Accuracy: ",error/100,"%")
            
class GaussianNaiveBayes(object):
    def fit(self, X, Y, smoothing=1e-2):
        self.gaussians = dict()
        self.priors = dict()
        labels = set(Y)
        for c in labels:
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        N, D = X.shape
        K = len(self.gaussians)
        P = np.zeros((N, K))
        for c, g in iteritems(self.gaussians):
            mean, var = g['mean'], g['var']
            P[:,c] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])
        print(P)
        return np.argmax(P, axis=1)
    
def naiveBayes(trainingimage,traininglabel,testingimage,testinglabel,mode):
    #0 for discrete, 1 for continuous
    if(mode ==0):
        print('Multinomial NB:')
        print('Converting into 32 bins')
        x,y = regroup(trainingimage,testingimage)
        print('start fitting')
        nb = MultinomialNB()
        nb.fit(x,traininglabel.ravel())
        
        print(nb.predict_log_proba(y))
        print ('Accuracy: ', nb.score(y,test_labels_array.ravel())*100.0,'%')

    else:
        print('Gaussian NB: ')
        nb = GaussianNaiveBayes()
        nb.fit(trainingimage,traininglabel.ravel())
        print('Accuracy: ',nb.score(testingimage,testinglabel.ravel())*100.0,'%')
        
trainingfilenames = {'images' : 'training_set/train-images.idx3-ubyte' ,'labels' : 'training_set/train-labels.idx1-ubyte'}
testfilenames = {'images' : 'testing_set/t10k-images.idx3-ubyte' ,'labels' : 'testing_set/t10k-labels.idx1-ubyte'}

data_types = {
        0x08: ('ubyte', 'B', 1),
        0x09: ('byte', 'b', 1),
        0x0B: ('>i2', 'h', 2),
        0x0C: ('>i4', 'i', 4),
        0x0D: ('>f4', 'f', 4),
        0x0E: ('>f8', 'd', 8)}

#..........................................................For training dataset..............................................................
print ("Training Dataset.......")
for name in trainingfilenames.keys():
	if name == 'images':
		train_imagesfile = open(trainingfilenames[name],'rb')
	if name == 'labels':
		train_labelsfile = open(trainingfilenames[name],'rb')

train_imagesfile.seek(0)
magic = st.unpack('>4B',train_imagesfile.read(4))
if(magic[0] and magic[1])or(magic[2] not in data_types):
	raise ValueError("File Format not correct")

#Information
nDim = magic[3]
dataType = data_types[magic[2]][0]
dataFormat = data_types[magic[2]][1]
dataSize = data_types[magic[2]][2]


#offset = 0004 for number of images
#offset = 0008 for number of rows
#offset = 0012 for number of columns
#32-bit integer (32 bits = 4 bytes)
train_imagesfile.seek(4)
nImg = st.unpack('>I',train_imagesfile.read(4))[0] #num of images/labels
nR = st.unpack('>I',train_imagesfile.read(4))[0] #num of rows
nC = st.unpack('>I',train_imagesfile.read(4))[0] #num of columns

train_labelsfile.seek(8) #Since no. of items = no. of images and is already read

#Training set
#Reading the labels
train_labels_array = np.asarray(st.unpack('>'+dataFormat*nImg,train_labelsfile.read(nImg*dataSize))).reshape((nImg,1))
#Reading the Image data
nBatch = 10000
nIter = int(math.ceil(nImg/nBatch))
nBytes = nBatch*nR*nC*dataSize
nBytesTot = nImg*nR*nC*dataSize
train_images_array = np.array([])
for i in range(0,nIter):
	temp_images_array = 255 - np.asarray(st.unpack('>'+dataFormat*nBytes,train_imagesfile.read(nBytes))).reshape((nBatch,nR,nC))

	#Stacking each nBatch block to form a larger block
	if train_images_array.size == 0:
		train_images_array = temp_images_array
	else:
		train_images_array = np.vstack((train_images_array,temp_images_array))
	temp_images_array = np.array([])


#..........................................................For test dataset..................................................................
print ("Test Dataset.......")
for name in testfilenames.keys():
	if name == 'images':
		test_imagesfile = open(testfilenames[name],'rb')
	if name == 'labels':
		test_labelsfile = open(testfilenames[name],'rb')
test_imagesfile.seek(0)
magic = st.unpack('>4B',test_imagesfile.read(4))
if(magic[0] and magic[1])or(magic[2] not in data_types):
	raise ValueError("File Format not correct")

nDim = magic[3]

#offset = 0004 for number of images
#offset = 0008 for number of rows
#offset = 0012 for number of columns
#32-bit integer (32 bits = 4 bytes)
test_imagesfile.seek(4)
nImg = st.unpack('>I',test_imagesfile.read(4))[0] #num of images/labels
nR = st.unpack('>I',test_imagesfile.read(4))[0] #num of rows
nC = st.unpack('>I',test_imagesfile.read(4))[0] #num of columns

test_labelsfile.seek(8) #Since no. of items = no. of images and is already read
#Test set
#Reading the labels
test_labels_array = np.asarray(st.unpack('>'+dataFormat*nImg,test_labelsfile.read(nImg*dataSize))).reshape((nImg,1))
#Reading the Image data
nBatch = 10000
nIter = int(math.ceil(nImg/nBatch))
nBytes = nBatch*nR*nC*dataSize
nBytesTot = nImg*nR*nC*dataSize
test_images_array = np.array([])
for i in range(0,nIter):
	temp_images_array = 255 - np.asarray(st.unpack('>'+dataFormat*nBytes,test_imagesfile.read(nBytes))).reshape((nBatch,nR,nC))

	#Stacking each nBatch block to form a larger block
	if test_images_array.size == 0:
		test_images_array = temp_images_array
	else:
		test_images_array = np.vstack((test_images_array,temp_images_array))
	temp_images_array = np.array([])

nsamples, nx, ny = train_images_array.shape
d2_train_dataset = train_images_array.reshape((nsamples,nx*ny))
mtest,mx,my = test_images_array.shape
d2_test_dataset = test_images_array.reshape((mtest,mx*my))   

     
naiveBayes(d2_train_dataset,train_labels_array,d2_test_dataset,test_labels_array,1)  

Training Dataset.......
Test Dataset.......
Gaussian NB: 
[[-6.12012415e+06 -2.29318402e+06 -6.12201427e+06 ... -2.25201919e+03
  -4.41345228e+03 -2.24239465e+03]
 [-4.31316072e+04 -1.57295182e+04 -2.82189066e+03 ... -1.52386966e+07
  -9.48161122e+05 -1.21860754e+07]
 [-2.65337073e+03 -1.67369043e+03 -2.59827927e+03 ... -3.62900985e+06
  -2.30877344e+03 -2.81122160e+03]
 ...
 [-3.64471498e+03 -7.12499663e+03 -4.91854163e+03 ... -2.47518963e+03
  -2.33624015e+03 -2.17069755e+03]
 [-3.98937479e+06 -1.16761827e+06 -3.63491017e+05 ... -2.52724355e+05
  -2.66985804e+03 -3.49759877e+06]
 [-2.83701543e+06 -6.05707836e+05 -5.27337357e+03 ... -3.02791902e+07
  -3.87289553e+06 -1.21273010e+07]]
Accuracy:  61.29 %


In [25]:
from operator import mul    # or mul=lambda x,y:x*y
from fractions import Fraction
from functools import reduce

def nCr(n,r): 
  return int( reduce(mul, (Fraction(n-i, i+1) for i in range(r)), 1) )

def online_learning(file,a,b):
    with open(file) as f:
        content = f.readlines()
    l=0     #line number
    Ht=0    #total heads
    Tt=0    #total tails
    for line in content:
        l+=1
        print("In line " + str(l) + ", the alpha and beta for the prior is " + str(a) + " and " + str(b) + " respectively.")
        H=0  #heads in one line
        T=0  #tails in one line
        for i in line:
            if(i=='1'):
                H+=1
                Ht+=1
            elif(i=='0'):
                T+=1
                Tt+=1
            else:
                break
        a+=H  #alpha
        b+=T  #beta
        p=Ht/(Ht+Tt)
        print("In line " + str(l) + ", the p is " + str(p))
        binom = nCr(Ht+Tt,Ht)*(p**Ht)*((1-p)**(Tt))
        print("In line " + str(l) + ", the binomial likelihood is " + str(binom))
        print("In line " + str(l) + ", the alpha and beta for the posterior is " + str(a) + " and " + str(b) + " respectively.")
online_learning("data.txt",1,1)

In line 1, the alpha and beta for the prior is 1 and 1 respectively.
In line 1, the p is 0.5263157894736842
In line 1, the binomial likelihood is 0.18089756554150835
In line 1, the alpha and beta for the posterior is 11 and 10 respectively.
In line 2, the alpha and beta for the prior is 11 and 10 respectively.
In line 2, the p is 0.47368421052631576
In line 2, the binomial likelihood is 0.12876065177021082
In line 2, the alpha and beta for the posterior is 19 and 21 respectively.
In line 3, the alpha and beta for the prior is 19 and 21 respectively.
In line 3, the p is 0.48214285714285715
In line 3, the binomial likelihood is 0.10621385848750886
In line 3, the alpha and beta for the posterior is 28 and 30 respectively.
In line 4, the alpha and beta for the prior is 28 and 30 respectively.
In line 4, the p is 0.4931506849315068
In line 4, the binomial likelihood is 0.09307472205046372
In line 4, the alpha and beta for the posterior is 37 and 38 respectively.
In line 5, the alpha and bet