In [2]:
import scipy.io as scio
import numpy as np

dataFile=('mnist_data.mat')

data=scio.loadmat(dataFile) # read the dataset (dict: MATLAB variable name -> array)

# Raw arrays used by the rest of the notebook.
xTrain=data['trX'] # training images, one row per image
yTrain=data['trY'] # training labels
xTest=data['tsX']  # testing images, one row per image

# Labels as a column vector (one row per training image). Reshaping to
# (-1, 1) does not rely on the orientation in which the .mat file stores trY.
yTrain_label = np.reshape(yTrain, (-1, 1))

# Class membership is derived from the labels rather than from hard-coded
# row counts. Convention assumed here (the same one the prediction cells
# use for their outputs): label 0 = digit 7, label 1 = digit 8.
train_labels = np.ravel(yTrain)
is_seven = train_labels == 0
is_eight = train_labels == 1
n_seven = int(is_seven.sum())
n_eight = int(is_eight.sum())
if n_seven == 0 or n_eight == 0 or n_seven + n_eight != xTrain.shape[0]:
    # Fail loudly instead of silently producing NaN features or wrong priors.
    raise ValueError("trY must label every training row as 0 (digit 7) or 1 (digit 8)")

# Prior probabilities of each digit, estimated from the class frequencies.
p7 = n_seven / (n_seven + n_eight)
p8 = n_eight / (n_seven + n_eight)

# Two features per image: the mean and the standard deviation of its pixels.
xTest_mean = np.mean(xTest,axis=1)
xTest_std = np.std(xTest,axis=1)
xTrain_mean = np.mean(xTrain,axis=1)
xTrain_std = np.std(xTrain,axis=1)

# Split the training images by class.
xTrain_7=xTrain[is_seven] # images of digit 7
xTrain_8=xTrain[is_eight] # images of digit 8

# Per-image features of the 7s in the training set.
xTrain_7mean=np.mean(xTrain_7,axis=1)
xTrain_7std=np.std(xTrain_7,axis=1)

# Per-image features of the 8s in the training set.
xTrain_8mean=np.mean(xTrain_8,axis=1)
xTrain_8std=np.std(xTrain_8,axis=1)

# Implement the Naive Bayes classifier and use it to produce a predicted label for each testing sample.
# Class-conditional density: the product of two independent 1-D Gaussian pdfs (one per feature).
import math     
def pdf(t1,t2,m1,m2,s1,s2):
    """Joint density of two independent Gaussian features.

    t1, t2 -- observed values of feature 1 and feature 2
    m1, m2 -- means of feature 1 and feature 2
    s1, s2 -- standard deviations of feature 1 and feature 2

    Returns N(t1; m1, s1^2) * N(t2; m2, s2^2) as a float.
    """
    # Squared deviation of each feature, scaled by its variance.
    scaled_sq_1 = math.pow(t1 - m1, 2) / (s1 * s1)
    scaled_sq_2 = math.pow(t2 - m2, 2) / (s2 * s2)
    kernel = math.exp(-(1/2) * (scaled_sq_1 + scaled_sq_2))
    # Normalising constant of the product of the two 1-D Gaussians.
    norm_const = 1 / (2 * math.pi * s1 * s2)
    return norm_const * kernel

# Use maximum-likelihood estimation (MLE) to get each feature's mean and
# standard deviation for both the 7 and 8 digits.

In [3]:
# Gaussian parameters of each feature, per class. np.std defaults to the
# population form (ddof=0).
# Suffix 1 = "mean pixel value" feature, suffix 2 = "pixel std" feature.
seven_m1, seven_s1 = np.mean(xTrain_7mean), np.std(xTrain_7mean)
seven_m2, seven_s2 = np.mean(xTrain_7std), np.std(xTrain_7std)
eight_m1, eight_s1 = np.mean(xTrain_8mean), np.std(xTrain_8mean)
eight_m2, eight_s2 = np.mean(xTrain_8std), np.std(xTrain_8std)

print("Digit seven`s two features` means and stds:", seven_m1, seven_s1, seven_m2, seven_s2)
print("Digit eight`s two features` means and stds:", eight_m1, eight_s1, eight_m2, eight_s2)

Digit seven`s two features` means and stds: 0.11452769775108769 0.03063240469648838 0.28755656517748474 0.038201083694320306
Digit eight`s two features` means and stds: 0.15015598189369758 0.038632488373958954 0.3204758364888714 0.039960074370658606


In [13]:
# Naive Bayes decision rule: for every test image compare the two
# unnormalised posteriors (class-conditional density * class prior) and pick
# the larger one. Encoding of the prediction: 0 = digit 7, 1 = digit 8.
predict_test = []
for feat_mean, feat_std in zip(xTest_mean, xTest_std):
    p1 = p7 * pdf(feat_mean, feat_std, seven_m1, seven_m2, seven_s1, seven_s2)
    p2 = p8 * pdf(feat_mean, feat_std, eight_m1, eight_m2, eight_s1, eight_s2)
    # Ties go to digit 8, because only a strictly larger p1 selects 7.
    predict_test.append(0 if p1 > p2 else 1)

# The reports below treat the first 1028 test rows as 7s and the remaining
# 974 rows as 8s.
nb_correct_7 = predict_test[:1028].count(0)
nb_correct_8 = predict_test[1028:].count(1)

# Classification accuracy for "7" in the testing set.
print("The  Naïve Bayes classification accuracy of 7:", nb_correct_7 / 1028)

# Classification accuracy for "8" in the testing set.
print("The  Naïve Bayes classification accuracy of 8:", nb_correct_8 / 974)

# Overall accuracy: correct predictions of both digits over all test images.
print("The  Naïve Bayes classification accuracy of both 7 and 8:", (nb_correct_7 + nb_correct_8) / (xTest.shape[0]))

# Implement logistic regression and use it to produce a predicted label for each testing sample.
# Define the sigmoid (logistic) function.
def sigmoid (x):
    """Logistic function 1 / (1 + e^(-x)), applied element-wise.

    x may be a scalar or a NumPy array; the result has the same shape.
    """
    denominator = 1 + np.exp(-x)
    return 1. / denominator

# Training design matrix: one row per training image, with columns
# [pixel mean, pixel std]. No constant (bias) column is appended.
datamatrix = np.column_stack([xTrain_mean, xTrain_std])

# define how grad_ascent works, use sigmoid formula to calculate thetas
def grad_ascent(datamatrix,yTrain_label,lr,cycles):
    """Fit logistic-regression weights by batch gradient ascent.

    Each cycle applies the update
        thetas <- thetas + lr * X^T (y - sigmoid(X thetas))
    starting from a weight vector of all ones.

    Parameters
    ----------
    datamatrix : array-like, shape (n_samples, n_features)
        Feature matrix. Any number of feature columns is accepted; a 1-D
        input is treated as a single sample.
    yTrain_label : array-like with n_samples elements
        0/1 targets. Any orientation (flat, row or column) is accepted and
        reshaped to a column, so `labels - h` can never broadcast to an
        (n_samples, n_samples) array.
    lr : float
        Learning rate (step size).
    cycles : int
        Number of full-batch update steps.

    Returns
    -------
    numpy.matrix, shape (n_features, 1)
        The fitted weights. A matrix is returned (also when cycles == 0) so
        that `X * thetas` in calling code is a matrix product.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of samples.
    """
    # Plain ndarrays and `@` are used internally: `np.mat` is no longer
    # available in the NumPy 2.x main namespace.
    features = np.atleast_2d(np.asarray(datamatrix, dtype=float))
    labels = np.asarray(yTrain_label, dtype=float).reshape(-1, 1)
    if labels.shape[0] != features.shape[0]:
        raise ValueError(
            "grad_ascent: got %d labels for %d samples"
            % (labels.shape[0], features.shape[0])
        )

    thetas = np.ones((features.shape[1], 1)) # initial theta, one weight per feature
    # Loop-invariant: the learning rate is folded into X^T once, keeping the
    # (lr * X^T) @ (y - h) evaluation order.
    scaled_features_t = lr * features.T
    for _ in range(cycles):
        h = sigmoid(features @ thetas) # predicted probabilities, shape (n_samples, 1)
        thetas = thetas + scaled_features_t @ (labels - h)
    return np.asmatrix(thetas)

# Train the logistic-regression weights: learning rate 0.001, 10000 update cycles.
thetas= grad_ascent(datamatrix,yTrain_label,0.001,10000)

# Test design matrix with the same two columns as the training one:
# [pixel mean, pixel std].
testmatrix=np.column_stack((xTest_mean,xTest_std))

# Predicted probability of label 1 for every test image.
result=sigmoid(testmatrix*thetas)

# Threshold the probabilities: strictly above 0.5 -> 1, otherwise -> 0.
predict = [1 if score > 0.5 else 0 for score in result]

# The reports below treat the first 1028 test rows as 7s and the remaining
# 974 rows as 8s.
lr_correct_7 = predict[:1028].count(0)
lr_correct_8 = predict[1028:].count(1)

# Classification accuracy for "7" in the testing set.
print("The Logistic Regression classification accuracy of 7:", lr_correct_7 / 1028)

# Classification accuracy for "8" in the testing set.
print("The Logistic Regression classification accuracy of 8:", lr_correct_8 / 974)

# Overall accuracy over both "7" and "8" in the testing set.
print("The Logistic Regression classification accuracy of both 7 and 8:", (lr_correct_7 + lr_correct_8) / (xTest.shape[0]))

The  Naïve Bayes classification accuracy of 7: 0.7597276264591439
The  Naïve Bayes classification accuracy of 8: 0.6273100616016427
The  Naïve Bayes classification accuracy of both 7 and 8: 0.6953046953046953
The Logistic Regression classification accuracy of 7: 0.796692607003891
The Logistic Regression classification accuracy of 8: 0.6796714579055442
The Logistic Regression classification accuracy of both 7 and 8: 0.7397602397602397
