In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings('ignore')

In [2]:
def load_dataset(filepath="data/pima-indians-diabetes.csv"):
    df_pima = pd.read_csv(filepath)
    df_pima.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Class']
    distinct_class=df_pima['Class'].unique()
    return df_pima,distinct_class

def train_test_split(input_df=None,fold=10,print_ind=False,train_split=80):
    train_splitloc = []
    test_splitloc  = []
    train_end_loc = np.round(input_df.shape[0]*(train_split/100)).astype(int)
    for f in range(fold):
        loc_arr = np.arange(input_df.shape[0])
        np.random.shuffle(loc_arr)
        train_splitloc.append(loc_arr[:train_end_loc])
        test_splitloc.append(loc_arr[train_end_loc:])
    return train_splitloc,test_splitloc


def train_class_mean_std(input_df,input_train_splitloc,impute_ind=False):
    dict_train_mean_stdev_calc = {}
    dict_train_mean_stdev_impute_calc = {}
    for c in distinct_class:
        #print ("Running for the Class: {}".format(c))
        df_pima_train_set = input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c]
        #mean=input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c].describe().loc['mean'][:-1]
        #stdev=input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c].describe().loc['std'][:-1]
        if (impute_ind):
            #print ("Coming")
            df_pima_train_set['BloodPressure']=df_pima_train_set['BloodPressure'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['SkinThickness']=df_pima_train_set['SkinThickness'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['BMI']=df_pima_train_set['BMI'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['Age']=df_pima_train_set['Age'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
        #print (np.sum(df_pima_train_set.isna()),0)

        mean=df_pima_train_set.describe().loc['mean'][:-1]
        stdev=df_pima_train_set.describe().loc['std'][:-1]
        dict_train_mean_stdev_calc[c] = mean,stdev
        
#         input_df['BloodPressure']=input_df['BloodPressure'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
#         input_df['SkinThickness']=input_df['SkinThickness'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
#         input_df['BMI']=input_df['BMI'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
#         input_df['Age']=input_df['Age'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
#         mean_impute=input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c].describe().loc['mean'][:-1]
#         stdev_impute=input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c].describe().loc['std'][:-1]
#         dict_train_mean_stdev_impute_calc[c] = mean_impute,stdev_impute

    #print ("Len Train:{}. Number of 0:{} 1:{}".format(len(input_train_splitloc),df_pima.iloc[input_train_splitloc][df_pima.iloc[input_train_splitloc]['Class'] == 0].shape,df_pima.iloc[input_train_splitloc][df_pima.iloc[input_train_splitloc]['Class'] == 1].shape))
    return dict_train_mean_stdev_calc


def gaussian_naive_bayes_pred(input_test_splitloc,input_dict_train_mean_stdev,input_distinct_class):
    fold_predict_class = np.zeros((len(input_test_splitloc),len(input_distinct_class)))
    for c in input_distinct_class:
        exp_nr = -((df_pima.iloc[input_test_splitloc].drop('Class',axis=1)-np.array(input_dict_train_mean_stdev[c][0]))**2)
        exp_dn = (2*((dict_train_mean_stdev[c][1]) ** 2 ))
        exp = exp_nr / exp_dn
        exp = np.exp(exp)
        coef = (1/((np.sqrt(2*np.pi))*input_dict_train_mean_stdev[c][1]))
        ndf = np.sum(np.log(coef * exp),axis=1)
        fold_predict_class[:,c] = ndf
    pred_test = pd.Series(pd.DataFrame(fold_predict_class).idxmax(axis=1).values,index=input_test_splitloc)
    return pred_test

In [3]:
# fold = 10
# overall_match_class = 0
# overall_match_class_scikit=0
# df_pima,distinct_class=load_dataset(imputer_missing_ind=False) #Load the Dataset
# train_splitloc,test_splitloc=train_test_split(df_pima) #Split the Dataset

In [4]:
fold = 10
overall_match_class = 0
#overall_match_class_scikit=0
overall_match_class_ignore_missing=0

df_pima,distinct_class=load_dataset() #Load the Dataset
train_splitloc,test_splitloc=train_test_split(df_pima) #Split the Dataset

for f in range(fold): #For each Fold
    match_class = 0
    #match_scikit_class = 0
    dict_train_mean_stdev=train_class_mean_std(df_pima,train_splitloc[f],impute_ind=False)
    dict_train_mean_stdev_ignore_missing=train_class_mean_std(df_pima,train_splitloc[f],impute_ind=True)
    #print ("dict_train_mean_stdev:{}  dict_train_mean_stdev_ignore_missing:{}".format(dict_train_mean_stdev,dict_train_mean_stdev_ignore_missing))
    pred_test_val=gaussian_naive_bayes_pred(test_splitloc[f],dict_train_mean_stdev,distinct_class)
    pred_test_val_ignore_missing=gaussian_naive_bayes_pred(test_splitloc[f],dict_train_mean_stdev_ignore_missing,distinct_class)
    match_class = (np.sum(np.array(pred_test_val) == df_pima.iloc[test_splitloc[f]]['Class'].values)/len(test_splitloc[f]))*100
    match_class_ignore_missing=(np.sum(np.array(pred_test_val_ignore_missing) == df_pima.iloc[test_splitloc[f]]['Class'].values)/len(test_splitloc[f]))*100
    overall_match_class += match_class
    overall_match_class_ignore_missing += match_class_ignore_missing

#         clf = GaussianNB()
#         clf.fit(df_pima.iloc[train_splitloc[f]].drop('Class',axis=1),df_pima.iloc[train_splitloc[f]]['Class'])
#         pred_test_val_scikit=clf.predict(df_pima.iloc[test_splitloc[f]].drop('Class',axis=1))
#         match_class_scikit=(np.sum(pred_test_val_scikit == df_pima.iloc[test_splitloc[f]]['Class'].values)/len(test_splitloc[f]))*100
#         overall_match_class_scikit +=match_class_scikit
    #print ("folder: {} GaussianNaive Bayes Accuracy: {}  Scikit Accuracy:{}".format(f,match_class,match_class_scikit))
    print ("folder: {} GaussianNaive Bayes Accuracy: {}  Ignore Missing Accuracy:{}".format(f,match_class,match_class_ignore_missing))
#print ("Gaussian Naive Bayes Average Accuracy: {}  Scikit Accuracy:{}".format(overall_match_class/fold,overall_match_class_scikit/fold))
print ("Gaussian Naive Bayes Average Accuracy: {}  Ignore Missing Accuracy:{}".format(overall_match_class/fold,overall_match_class_ignore_missing/fold))


folder: 0 GaussianNaive Bayes Accuracy: 81.04575163398692  Ignore Missing Accuracy:81.04575163398692
folder: 1 GaussianNaive Bayes Accuracy: 77.12418300653596  Ignore Missing Accuracy:77.77777777777779
folder: 2 GaussianNaive Bayes Accuracy: 69.93464052287581  Ignore Missing Accuracy:69.93464052287581
folder: 3 GaussianNaive Bayes Accuracy: 73.8562091503268  Ignore Missing Accuracy:73.8562091503268
folder: 4 GaussianNaive Bayes Accuracy: 74.50980392156863  Ignore Missing Accuracy:74.50980392156863
folder: 5 GaussianNaive Bayes Accuracy: 73.8562091503268  Ignore Missing Accuracy:73.8562091503268
folder: 6 GaussianNaive Bayes Accuracy: 73.8562091503268  Ignore Missing Accuracy:73.8562091503268
folder: 7 GaussianNaive Bayes Accuracy: 76.47058823529412  Ignore Missing Accuracy:77.12418300653596
folder: 8 GaussianNaive Bayes Accuracy: 76.47058823529412  Ignore Missing Accuracy:76.47058823529412
folder: 9 GaussianNaive Bayes Accuracy: 76.47058823529412  Ignore Missing Accuracy:76.47058823529