In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [6]:
def load_dataset(filepath="data/pima-indians-diabetes.csv"):
    df_pima = pd.read_csv(filepath)
    df_pima.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Class']
    distinct_class=df_pima['Class'].unique()
    return df_pima,distinct_class

def train_test_split(input_df=None,fold=10,print_ind=False,train_split=80):
    train_splitloc = []
    test_splitloc  = []
    train_end_loc = np.round(input_df.shape[0]*(train_split/100)).astype(int)
    for f in range(fold):
        loc_arr = np.arange(input_df.shape[0])
        np.random.shuffle(loc_arr)
        train_splitloc.append(loc_arr[:train_end_loc])
        test_splitloc.append(loc_arr[train_end_loc:])
    return train_splitloc,test_splitloc


def train_class_mean_std(input_df,input_train_splitloc,impute_ind=False):
    dict_train_mean_stdev_calc = {}
    dict_train_mean_stdev_impute_calc = {}
    for c in distinct_class:
        df_pima_train_set = input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c]
        if (impute_ind):
            df_pima_train_set['BloodPressure']=df_pima_train_set['BloodPressure'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['SkinThickness']=df_pima_train_set['SkinThickness'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['BMI']=df_pima_train_set['BMI'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['Age']=df_pima_train_set['Age'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
        mean=df_pima_train_set.describe().loc['mean'][:-1]
        stdev=df_pima_train_set.describe().loc['std'][:-1]
        dict_train_mean_stdev_calc[c] = mean,stdev
    return dict_train_mean_stdev_calc


def gaussian_naive_bayes_pred(input_test_splitloc,input_dict_train_mean_stdev,input_distinct_class):
    fold_predict_class = np.zeros((len(input_test_splitloc),len(input_distinct_class)))
    for c in input_distinct_class:
        exp_nr = -((df_pima.iloc[input_test_splitloc].drop('Class',axis=1)-np.array(input_dict_train_mean_stdev[c][0]))**2)
        exp_dn = (2*((dict_train_mean_stdev[c][1]) ** 2 ))
        exp = exp_nr / exp_dn
        exp = np.exp(exp)
        coef = (1/((np.sqrt(2*np.pi))*input_dict_train_mean_stdev[c][1]))
        ndf = np.sum(np.log(coef * exp),axis=1)
        fold_predict_class[:,c] = ndf
    pred_test = pd.Series(pd.DataFrame(fold_predict_class).idxmax(axis=1).values,index=input_test_splitloc)
    return pred_test



In [8]:
fold = 10
overall_match_class = 0
overall_match_class_ignore_missing=0

df_pima,distinct_class=load_dataset() #Load the Dataset
train_splitloc,test_splitloc=train_test_split(df_pima) #Split the Dataset

for f in range(fold): #For each Fold
    match_class = 0
    dict_train_mean_stdev=train_class_mean_std(df_pima,train_splitloc[f],impute_ind=False)
    dict_train_mean_stdev_ignore_missing=train_class_mean_std(df_pima,train_splitloc[f],impute_ind=True)
    pred_test_val=gaussian_naive_bayes_pred(test_splitloc[f],dict_train_mean_stdev,distinct_class)
    pred_test_val_ignore_missing=gaussian_naive_bayes_pred(test_splitloc[f],dict_train_mean_stdev_ignore_missing,distinct_class)
    match_class = (np.sum(np.array(pred_test_val) == df_pima.iloc[test_splitloc[f]]['Class'].values)/len(test_splitloc[f]))*100
    match_class_ignore_missing=(np.sum(np.array(pred_test_val_ignore_missing) == df_pima.iloc[test_splitloc[f]]['Class'].values)/len(test_splitloc[f]))*100
    overall_match_class += match_class
    overall_match_class_ignore_missing += match_class_ignore_missing
    print ("folder: {} Gaussian NB Accuracy: {}  Ignore Missing Accuracy:{}".format(f,match_class,match_class_ignore_missing))
print ("Gaussian NB Average Accuracy: {}  Ignore Missing Accuracy:{}".format(overall_match_class/fold,overall_match_class_ignore_missing/fold))


folder: 0 Gaussian NB Accuracy: 73.8562091503268  Ignore Missing Accuracy:73.20261437908496
folder: 1 Gaussian NB Accuracy: 70.58823529411765  Ignore Missing Accuracy:71.24183006535948
folder: 2 Gaussian NB Accuracy: 69.93464052287581  Ignore Missing Accuracy:69.93464052287581
folder: 3 Gaussian NB Accuracy: 75.16339869281046  Ignore Missing Accuracy:75.16339869281046
folder: 4 Gaussian NB Accuracy: 75.81699346405229  Ignore Missing Accuracy:75.81699346405229
folder: 5 Gaussian NB Accuracy: 72.54901960784314  Ignore Missing Accuracy:73.20261437908496
folder: 6 Gaussian NB Accuracy: 71.89542483660131  Ignore Missing Accuracy:73.20261437908496
folder: 7 Gaussian NB Accuracy: 73.20261437908496  Ignore Missing Accuracy:73.20261437908496
folder: 8 Gaussian NB Accuracy: 74.50980392156863  Ignore Missing Accuracy:75.16339869281046
folder: 9 Gaussian NB Accuracy: 76.47058823529412  Ignore Missing Accuracy:76.47058823529412
Gaussian NB Average Accuracy: 73.39869281045752  Ignore Missing Accurac