In [6]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB

In [8]:
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class'],
      dtype='object')

In [54]:
def train_class_mean_std(input_df,input_train_splitloc,impute_ind=False):
    dict_train_mean_stdev_calc = {}
    dict_train_mean_stdev_impute_calc = {}
    for c in distinct_class:
        df_pima_train_set = input_df.iloc[input_train_splitloc][input_df.iloc[input_train_splitloc]['Class'] == c]
        if (impute_ind):
            df_pima_train_set['BloodPressure']=df_pima_train_set['BloodPressure'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['SkinThickness']=df_pima_train_set['SkinThickness'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['BMI']=df_pima_train_set['BMI'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
            df_pima_train_set['Age']=df_pima_train_set['Age'].replace(0,np.NAN) #impute to NAN, so it won't used in mean/std
        mean=df_pima_train_set.describe().loc['mean'][:-1]
        stdev=df_pima_train_set.describe().loc['std'][:-1]
        dict_train_mean_stdev_calc[c] = mean,stdev
    return dict_train_mean_stdev_calc


def gaussian_naive_bayes_pred(input_test_splitloc,input_dict_train_mean_stdev,input_distinct_class):
    fold_predict_class = np.zeros((len(input_test_splitloc),len(input_distinct_class)))
    for c in input_distinct_class:
        exp_nr = -((data.iloc[input_test_splitloc].drop('Class',axis=1)-np.array(input_dict_train_mean_stdev[c][0]))**2)
        exp_dn = (2*((dict_train_mean_stdev[c][1]) ** 2 ))
        exp = exp_nr / exp_dn
        exp = np.exp(exp)
        coef = (1/((np.sqrt(2*np.pi))*input_dict_train_mean_stdev[c][1]))
        ndf = np.sum(np.log(coef * exp),axis=1)
        fold_predict_class[:,c] = ndf
    pred_test = pd.Series(pd.DataFrame(fold_predict_class).idxmax(axis=1).values,index=input_test_splitloc)
    return pred_test


def train_test_split(input_df=None,fold=10,print_ind=False,train_split=80):
    train_splitloc = []
    test_splitloc  = []
    train_end_loc = np.round(input_df.shape[0]*(train_split/100)).astype(int)
    for f in range(fold):
        loc_arr = np.arange(input_df.shape[0])
        np.random.shuffle(loc_arr)
        train_splitloc.append(loc_arr[:train_end_loc])
        test_splitloc.append(loc_arr[train_end_loc:])
    return train_splitloc,test_splitloc

In [70]:
data = pd.read_csv("data/pima-indians-diabetes.csv")
data.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Class']

X = data.drop('Class',axis = 1)
y = data['Class']

distinct_class=y.unique()

nb_clf = GaussianNB()
no_fold = 5
kfold = KFold(no_fold,random_state = 1234)

overall_match_class = 0
overall_match_class_ignore_missing=0
for (train_idx, test_idx) in (kfold.split(data)):
    #print ("Processing for Train:{} Test:{}".format(len(train_idx), len(test_idx)))
    #dict_train_mean_stdev = train_class_mean_std(data,train_idx)
    dict_train_mean_stdev = train_class_mean_std(data,train_idx,impute_ind=False)
    dict_train_mean_stdev_ignore_missing=train_class_mean_std(data,train_idx,impute_ind=True)
    
    #pred = gaussian_naive_bayes_pred(test_idx,dict_train_mean_stdev,distinct_class)
    
    pred_test_val=gaussian_naive_bayes_pred(test_idx,dict_train_mean_stdev,distinct_class)
    
    pred_test_val_ignore_missing=gaussian_naive_bayes_pred(test_idx,dict_train_mean_stdev_ignore_missing,distinct_class)
    
    match_class = (np.sum(np.array(pred_test_val) == data.iloc[test_idx]['Class'].values)/len(test_idx))*100
    
    match_class_ignore_missing=(np.sum(np.array(pred_test_val_ignore_missing) == data.iloc[test_idx]['Class'].values)/len(test_idx))*100
    
    overall_match_class += match_class
    
    overall_match_class_ignore_missing += match_class_ignore_missing
    
    
    #print (match_class)
    print ("folder: Gaussian NB Accuracy: {}  Ignore Missing Accuracy:{}".format(match_class,match_class_ignore_missing))
#print (overall_match_class/no_fold)
print ("Gaussian NB Average Accuracy: {}  Ignore Missing Accuracy:{}".format(overall_match_class/no_fold,overall_match_class_ignore_missing/no_fold))

folder: Gaussian NB Accuracy: 74.67532467532467  Ignore Missing Accuracy:74.67532467532467
folder: Gaussian NB Accuracy: 69.48051948051948  Ignore Missing Accuracy:69.48051948051948
folder: Gaussian NB Accuracy: 73.20261437908496  Ignore Missing Accuracy:73.8562091503268
folder: Gaussian NB Accuracy: 79.73856209150327  Ignore Missing Accuracy:79.73856209150327
folder: Gaussian NB Accuracy: 75.16339869281046  Ignore Missing Accuracy:75.81699346405229
Gaussian NB Average Accuracy: 74.45208386384857  Ignore Missing Accuracy:74.7135217723453


In [102]:
no_fold = 10
kfold = KFold(no_fold,random_state = 1234)
kfold

KFold(n_splits=10, random_state=1234, shuffle=False)

In [104]:
#print (data.shape[0])
#print (data.shape[0]*80/100)
for (train_idx, test_idx) in (kfold.split(data)):
    #print (len(train_idx),len(test_idx))
    print (train_idx[90:120], test_idx[0:20])

[167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
 185 186 187 188 189 190 191 192 193 194 195 196] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184
 185 186 187 188 189 190 191 192 193 194 195 196] [77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96]
[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119] [154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
 172 173]
[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119] [231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
 249 250]
[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119] [308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325
 326 327]
[ 90  91  92

In [79]:
train_idx,test_idx=train_test_split(data)
print (len(train_idx[0]),len(test_idx[0]))
print (len(train_idx[1]),len(test_idx[1]))
print (len(train_idx[2]),len(test_idx[2]))
print (len(train_idx[3]),len(test_idx[3]))
print (len(train_idx[4]),len(test_idx[4]))

614 153
614 153
614 153
614 153
614 153


In [88]:
train_idx in test_idx[2]

  """Entry point for launching an IPython kernel.


False

In [90]:
train_idx

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [82]:
train_idx[2]

array([305, 190,  77,  63, 237, 160, 578, 732, 672,  98, 745, 450, 696,
       280, 703, 345, 701, 149,   1, 284, 264,  31, 435, 351, 646,  97,
       613, 269, 384, 292, 460, 485, 622,  38, 631,  85, 171, 614, 763,
       551, 749, 226, 125, 727, 680,  52, 462, 105, 321,  30, 162, 286,
       665, 498, 710,  50, 431, 313, 329, 252, 494, 387, 714, 607, 575,
       681, 560, 418, 495, 331, 589, 250, 164, 618, 486, 210, 133, 116,
       207, 751, 558, 541, 532, 103, 371, 702, 166, 419, 657,  93, 258,
        91, 139,  69,  17, 605,  12, 456, 529, 163, 294,  22, 448, 506,
       201, 608, 328, 726, 362, 309, 570, 142,  71,  86, 567, 373, 620,
        16, 521, 221, 283, 636, 583, 469,  54, 759, 765, 644, 479,  84,
       629, 156, 174, 585, 199,  60, 401, 679,  36, 303, 184, 325,  24,
        78,   3, 159, 120, 118, 761, 553, 251, 678, 343, 634,  81, 516,
       266, 122, 598, 323, 738, 234, 101, 650, 452, 561, 119, 588, 281,
       647, 189, 730, 220, 378, 202, 599, 509,  20, 244, 232, 32

In [60]:
data = pd.read_csv("data/pima-indians-diabetes.csv")
data.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Class']

X = data.drop('Class',axis = 1)
y = data['Class']

distinct_class=y.unique()

nb_clf = GaussianNB()
no_fold = 10
kfold = KFold(no_fold,random_state = 1234)

overall_match_class = 0
overall_match_class_ignore_missing=0

train_idx,test_idx=train_test_split(data) #Split the Dataset

#for (train_idx, test_idx) in (kfold.split(data)):
for f in (range(no_fold)):
    #print ("Processing for Train:{} Test:{}".format(len(train_idx), len(test_idx)))
    #dict_train_mean_stdev = train_class_mean_std(data,train_idx)
    dict_train_mean_stdev = train_class_mean_std(data,train_idx[f],impute_ind=False)
    dict_train_mean_stdev_ignore_missing=train_class_mean_std(data,train_idx[f],impute_ind=True)
    
    #pred = gaussian_naive_bayes_pred(test_idx,dict_train_mean_stdev,distinct_class)
    
    pred_test_val=gaussian_naive_bayes_pred(test_idx[f],dict_train_mean_stdev,distinct_class)
    
    pred_test_val_ignore_missing=gaussian_naive_bayes_pred(test_idx[f],dict_train_mean_stdev_ignore_missing,distinct_class)
    
    match_class = (np.sum(np.array(pred_test_val) == data.iloc[test_idx[f]]['Class'].values)/len(test_idx[f]))*100
    
    match_class_ignore_missing=(np.sum(np.array(pred_test_val_ignore_missing) == data.iloc[test_idx[f]]['Class'].values)/len(test_idx[f]))*100
    
    overall_match_class += match_class
    
    overall_match_class_ignore_missing += match_class_ignore_missing
    
    
    #print (match_class)
    print ("folder: Gaussian NB Accuracy: {}  Ignore Missing Accuracy:{}".format(match_class,match_class_ignore_missing))
#print (overall_match_class/no_fold)
print ("Gaussian NB Average Accuracy: {}  Ignore Missing Accuracy:{}".format(overall_match_class/no_fold,overall_match_class_ignore_missing/no_fold))

folder: Gaussian NB Accuracy: 79.73856209150327  Ignore Missing Accuracy:78.43137254901961
folder: Gaussian NB Accuracy: 71.24183006535948  Ignore Missing Accuracy:70.58823529411765
folder: Gaussian NB Accuracy: 75.81699346405229  Ignore Missing Accuracy:75.81699346405229
folder: Gaussian NB Accuracy: 69.93464052287581  Ignore Missing Accuracy:69.28104575163398
folder: Gaussian NB Accuracy: 71.24183006535948  Ignore Missing Accuracy:70.58823529411765
folder: Gaussian NB Accuracy: 69.93464052287581  Ignore Missing Accuracy:69.93464052287581
folder: Gaussian NB Accuracy: 73.20261437908496  Ignore Missing Accuracy:73.20261437908496
folder: Gaussian NB Accuracy: 69.28104575163398  Ignore Missing Accuracy:67.97385620915033
folder: Gaussian NB Accuracy: 76.47058823529412  Ignore Missing Accuracy:77.12418300653596
folder: Gaussian NB Accuracy: 72.54901960784314  Ignore Missing Accuracy:73.20261437908496
Gaussian NB Average Accuracy: 72.94117647058823  Ignore Missing Accuracy:72.61437908496733

In [36]:
#K FOLD
data = pd.read_csv("data/pima-indians-diabetes.csv")
data.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Class']

X = data.drop('Class',axis = 1)
y = data['Class']

distinct_class=y.unique()

nb_clf = GaussianNB()
no_fold = 10
kfold = KFold(no_fold,random_state = 1234)

match_class = 0
overall_match_class = 0
for (train_idx, test_idx) in (kfold.split(data)):
    #print ("Processing for Train:{} Test:{}".format(len(train_idx), len(test_idx)))
    nb_clf.fit(X.iloc[train_idx],y.iloc[train_idx])
    pred = nb_clf.predict(X.iloc[test_idx])
    match_class = sum(pred == y.iloc[test_idx])/len(test_idx)*100
    overall_match_class += match_class
    #print (match_class)
print (overall_match_class/no_fold)

75.49384825700615


In [106]:
X.iloc[train_idx]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30
5,3,78,50,32,88,31.0,0.248,26
6,10,115,0,0,0,35.3,0.134,29
7,2,197,70,45,543,30.5,0.158,53
8,8,125,96,0,0,0.0,0.232,54
9,4,110,92,0,0,37.6,0.191,30


In [57]:
#TRAIN - TEST SPLIT
data = pd.read_csv("data/pima-indians-diabetes.csv")
data.columns = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Class']

X = data.drop('Class',axis = 1)
y = data['Class']

distinct_class=y.unique()

nb_clf = GaussianNB()
no_fold = 10
kfold = KFold(no_fold,random_state = 1234)

match_class = 0
overall_match_class = 0

train_idx,test_idx=train_test_split(data) #Split the Dataset


for f in (range(no_fold)):
    #print ("Processing for Train:{} Test:{}".format(len(train_idx), len(test_idx)))
    nb_clf.fit(X.iloc[train_idx[f]],y.iloc[train_idx[f]])
    pred = nb_clf.predict(X.iloc[test_idx[f]])
    match_class = sum(pred == y.iloc[test_idx[f]])/len(test_idx[f])*100
    overall_match_class += match_class
    #print (match_class)
print (overall_match_class/no_fold)

74.83660130718955
