In [1]:
import pandas as pd
import numpy as np
import cv2
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_mnist_data():
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
    X,Y = mnist["data"],mnist["target"].astype(int)
    df_mnist=pd.DataFrame(X)
    df_mnist['target'] = Y
    distinct_class=pd.Series(Y).unique().astype(int)
    return df_mnist,distinct_class

def train_test_split(df_input,train_set=60000,print_ind=False):
    data_loc=np.arange(df_input.shape[0])
    train_splitloc=[data_loc[:60000]]
    test_splitloc=[data_loc[60000:]]
    return train_splitloc,test_splitloc

def priors_calc(df_input):
    priors=df_input.iloc[train_splitloc[0]].groupby('target').count()[0]
    return priors

def mnist_transformed(df_input):
    df_mnist_train = (df_input.iloc[train_splitloc[0]].drop('target',axis=1) >= 128).astype(int)
    df_mnist_test = (df_input.iloc[test_splitloc[0]].drop('target',axis=1) >= 128).astype(int)
    return df_mnist_train,df_mnist_test

def mnist_train_summary(df_input_train,df_train_target):
    #df_mnist_train['target'] = df_mnist['target']
    df_input_train['target'] = df_train_target
    df_train_summary=df_input_train.groupby('target').sum()
    for p in range(len(priors)):
        df_train_summary.iloc[p] = (df_train_summary.iloc[p]+0.01)/(priors[p]+0.02)
    return df_train_summary


def train_class_mean_std(input_train_splitloc,print_ind=False):
    dict_train_mean_stdev = {}
    eps = 1e-4 #Added a small value in order to avoid the variance to 0 (divisible by zero)
    for c in distinct_class:
        #print ("Running for the Class: {}".format(c))
        mean=df_mnist.iloc[input_train_splitloc][df_mnist.iloc[input_train_splitloc]['target'] == c].describe().loc['mean'][:-1]
        stdev=df_mnist.iloc[input_train_splitloc][df_mnist.iloc[input_train_splitloc]['target'] == c].describe().loc['std'][:-1]+eps
        dict_train_mean_stdev[c] = mean,stdev
    if (print_ind):
        print ("Len Train:{}. Number of 0:{} 1:{}".format(len(input_train_splitloc),df_pima.iloc[input_train_splitloc][df_pima.iloc[input_train_splitloc]['Class'] == 0].shape,df_pima.iloc[input_train_splitloc][df_pima.iloc[input_train_splitloc]['Class'] == 1].shape))
    return dict_train_mean_stdev

def mnist_cropped_func(input_df,width=20,height=20):
    i=0
    sr_mnist_cropped = []
    df_mnist_cropped = pd.DataFrame()
    for k in np.array(input_df.drop('target',axis=1)):
        x=k.reshape(28,28)
        coord=np.argwhere(x)
        x0,y0=np.min(coord,axis=0)
        x1,y1=np.max(coord,axis=0)
        X_cropped=x[x0:x1,y0:y1]
        
        dim = (width, height)
        
        X_stretched=cv2.resize(X_cropped, dim, interpolation = cv2.INTER_NEAREST)
        X_stretched=X_stretched.reshape(width*height,)
        sr_mnist_cropped.append(X_stretched)
    #df_sr_mnist_train_cropped=sr_mnist_train_cropped
    df_output_mnist_cropped=pd.DataFrame(sr_mnist_cropped)
    df_output_mnist_cropped['target'] = df_mnist['target']
    return df_output_mnist_cropped


def naive_bayes_pred(input_test_splitloc,input_dict_train_mean_stdev,input_distinct_class):
    fold_predict_class = np.zeros((len(input_test_splitloc),len(input_distinct_class)))
    for c in input_distinct_class:
        exp_nr = -((df_mnist.iloc[input_test_splitloc].drop('target',axis=1)-np.array(input_dict_train_mean_stdev[c][0]))**2)
        exp_dn = (2*((dict_train_mean_stdev[c][1]) ** 2 ))
        exp = exp_nr / exp_dn
        exp = np.exp(exp)
        coef = (1/((np.sqrt(2*np.pi))*input_dict_train_mean_stdev[c][1]))
        ndf = np.sum(np.log(coef * exp),axis=1)
        fold_predict_class[:,c] = ndf
    pred_test = pd.Series(pd.DataFrame(fold_predict_class).idxmax(axis=1).values,index=input_test_splitloc)
    return pred_test

def naive_bayes_bernoulli(df_input,input_test_splitloc,df_input_mnist_test,df_train_summary):
    pred_test_val=np.argmax(np.dot((np.log(1-df_train_summary)),(1-df_input_mnist_test).T)+np.dot((np.log(df_train_summary)),(df_input_mnist_test).T),axis=0)
    #np.array(df_input.iloc[test_splitloc[0]]['target'])
    return np.sum((np.array(df_input.iloc[input_test_splitloc[0]]['target'])) == pred_test_val)/100


In [3]:
df_mnist,distinct_class=load_mnist_data() #Load the Dataset
train_splitloc,test_splitloc=train_test_split(df_mnist)
fold=1

In [4]:
for f in range(fold): #For each Fold
    match_train_class = 0
    match_test_class = 0
    dict_train_mean_stdev=train_class_mean_std(train_splitloc[f])
    pred_train_val=naive_bayes_pred(train_splitloc[f],dict_train_mean_stdev,distinct_class) # Train Accuracy
    pred_test_val=naive_bayes_pred(test_splitloc[f],dict_train_mean_stdev,distinct_class) # Test Accuracy

    for train_val_loc in train_splitloc[f]:
        #print ("Fold:{} Train Loc:{} Pred:{}  Act:{}".format(f,train_val_loc,pred_test_val[train_val_loc],df_mnist.iloc[train_val_loc][-1]))
        if pred_train_val[train_val_loc] == df_mnist.iloc[train_val_loc][-1]:
            match_train_class += 1

    for test_val_loc in test_splitloc[f]:
        #print ("Fold:{} Test Loc:{} Pred:{}  Act:{}".format(f,test_val_loc,pred_test_val[test_val_loc],df_mnist.iloc[test_val_loc][-1]))
        if pred_test_val[test_val_loc] == df_mnist.iloc[test_val_loc][-1]:
            match_test_class += 1

    print ("folder: {} Train Accuracy: {}  Test Accuracy:{}".format(f,(match_train_class/len(train_splitloc[f]))*100,(match_test_class/len(test_splitloc[f]))*100))

folder: 0 Train Accuracy: 53.098333333333336  Test Accuracy:51.89


In [5]:
#priors=df_mnist.iloc[train_splitloc[0]].groupby('target').count()[0]
priors = priors_calc(df_mnist)
df_mnist_train,df_mnist_test=mnist_transformed(df_mnist)

# df_mnist_train['target'] = df_mnist['target']
# df_mnist_train_summary=df_mnist_train.groupby('target').sum()
# for p in range(len(priors)):
#     df_mnist_train_summary.iloc[p] = (df_mnist_train_summary.iloc[p]+0.01)/(priors[p]+0.02)

df_mnist_train_summary=mnist_train_summary(df_mnist_train,df_mnist['target'])

naive_bayes_bernoulli(df_mnist,test_splitloc,df_mnist_test,df_mnist_train_summary)

# pred_test_val=np.argmax(np.dot((np.log(1-df_mnist_train_summary)),(1-df_mnist_test).T)+np.dot((np.log(df_mnist_train_summary)),(df_mnist_test).T),axis=0)
# np.array(df_mnist.iloc[test_splitloc[0]]['target'])
# np.sum((np.array(df_mnist.iloc[test_splitloc[0]]['target'])) == pred_test_val)/100

84.38

In [6]:
df_mnist_cropped=mnist_cropped_func(df_mnist,20,20)

# i=0
# sr_mnist_cropped = []
# df_mnist_cropped = pd.DataFrame()
# for k in np.array(df_mnist.drop('target',axis=1)):
#     x=k.reshape(28,28)
#     coord=np.argwhere(x)
#     x0,y0=np.min(coord,axis=0)
#     x1,y1=np.max(coord,axis=0)
#     X_cropped=x[x0:x1,y0:y1]
    
#     width = 20
#     height = 20
#     dim = (width, height)

#     X_stretched=cv2.resize(X_cropped, dim, interpolation = cv2.INTER_NEAREST)
#     X_stretched=X_stretched.reshape(width*height,)
#     sr_mnist_cropped.append(X_stretched)
#     #df_sr_mnist_train_cropped=sr_mnist_train_cropped
# df_mnist_cropped=pd.DataFrame(sr_mnist_cropped)
# df_mnist_cropped['target'] = df_mnist['target']

In [7]:
df_mnist_cropped_train,df_mnist_cropped_test=mnist_transformed(df_mnist_cropped)
df_mnist_cropped_train_summary=mnist_train_summary(df_mnist_cropped_train,df_mnist['target'])
naive_bayes_bernoulli(df_mnist_cropped,test_splitloc,df_mnist_cropped_test,df_mnist_cropped_train_summary)

83.15

In [8]:
# df_mnist_transformed = (df_mnist_cropped.iloc[train_splitloc[0]].drop('target',axis=1) >= 128).astype(int)
# df_mnist_test = (df_mnist_cropped.iloc[test_splitloc[0]].drop('target',axis=1) >= 128).astype(int)
# df_mnist_transformed['target'] = df_mnist_cropped['target']
# df_mnist_transformed_summary=df_mnist_transformed.groupby('target').sum()
# for p in range(len(priors)):
#     df_mnist_transformed_summary.iloc[p] = (df_mnist_transformed_summary.iloc[p]+0.01)/(priors[p]+0.02)

# pred_test_val=np.argmax(np.dot((np.log(1-df_mnist_transformed_summary)),(1-df_mnist_test).T)+np.dot((np.log(df_mnist_transformed_summary)),(df_mnist_test).T),axis=0)
# np.array(df_mnist.iloc[test_splitloc[0]]['target'])
# np.sum((np.array(df_mnist.iloc[test_splitloc[0]]['target'])) == pred_test_val)/100