In [None]:
import pandas as pd
import numpy as np

# Load diabetes.csv and coerce every column to float in one step.
filename = 'diabetes.csv'
df = pd.read_csv(filename).astype(float)

# 80/20 hold-out split. random_state is a fixed seed, so the same rows are
# sampled into the training set on every run; the rows that were not sampled
# form the test set.
train = df.sample(frac=0.8, random_state=100)
print(train)
test = df.drop(train.index)
print(test)

# Columns (the last one is the class label):
# ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
#  'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
173          1.0     79.0           60.0           42.0     48.0  43.5   
253          0.0     86.0           68.0           32.0      0.0  35.8   
207          5.0    162.0          104.0            0.0      0.0  37.7   
737          8.0     65.0           72.0           23.0      0.0  32.0   
191          9.0    123.0           70.0           44.0     94.0  33.1   
..           ...      ...            ...            ...      ...   ...   
401          6.0    137.0           61.0            0.0      0.0  24.2   
82           7.0     83.0           78.0           26.0     71.0  29.3   
650          1.0     91.0           54.0           25.0    100.0  25.2   
721          1.0    114.0           66.0           36.0    200.0  38.1   
74           1.0     79.0           75.0           30.0      0.0  32.0   

     DiabetesPedigreeFunction   Age  Outcome  
173                     0.678  23.0      0.0  
253              

In [None]:
# Train model
# Partition the training rows by the class label (the last column of df).
outcome_group = train.groupby(df.columns[-1])
# Every column except the trailing label column is a feature.
n_attr = len(df.columns) - 1
# summaries maps class label -> [[mean, stdev], ...], one pair per feature,
# in column order.
summaries = {}
for classValue, instances in outcome_group:
    # Column-wise statistics over this class's rows. Each list is indexed by
    # column position; only the first n_attr entries (the features) are used.
    mean = list(instances.mean(axis=0).values)
    stdev = list(instances.std(axis=0).values)
    attr_mv = [[mean[i], stdev[i]] for i in range(n_attr)]

    summaries[classValue] = attr_mv
    # Progress output: the class just processed and the summaries so far.
    print(classValue)
    print(summaries)


0.0
{0.0: [[3.2130325814536342, 2.972610408317797], [109.76942355889724, 26.905603970990228], [68.4812030075188, 17.522677985264334], [20.100250626566417, 14.665790191703916], [72.43358395989975, 96.65287822555396], [30.435338345864665, 7.7580581756384905], [0.43053634085213033, 0.2915681446463359], [30.86967418546366, 11.496036356224192]]}
1.0
{0.0: [[3.2130325814536342, 2.972610408317797], [109.76942355889724, 26.905603970990228], [68.4812030075188, 17.522677985264334], [20.100250626566417, 14.665790191703916], [72.43358395989975, 96.65287822555396], [30.435338345864665, 7.7580581756384905], [0.43053634085213033, 0.2915681446463359], [30.86967418546366, 11.496036356224192]], 1.0: [[4.902325581395349, 3.852993033222635], [142.09302325581396, 29.85260829685783], [71.14418604651163, 21.708619485476845], [22.29767441860465, 18.08223024522044], [89.81860465116279, 114.64875097814037], [35.415813953488374, 7.527291849134547], [0.5482744186046512, 0.3672769639764627], [36.948837209302326, 1

In [None]:
import math
def calculateProb(x, mean, stdev):
    """Return the Gaussian (normal) probability density of ``x``.

    Evaluates ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.

    Args:
        x: Value at which to evaluate the density.
        mean: Mean of the normal distribution.
        stdev: Standard deviation of the normal distribution (non-zero).

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is 0.
    """
    variance = math.pow(stdev, 2)
    exponent = math.exp(-math.pow(x - mean, 2) / (2 * variance))
    # The normalising constant is 1 / (sqrt(2*pi) * stdev): it uses the
    # standard deviation, not the variance.
    return exponent / (math.sqrt(2 * math.pi) * stdev)
  

def calculateClassProb(summaries, X_vec):
    """Compute, for each class, the product of per-feature Gaussian densities.

    Args:
        summaries: Mapping of class label -> sequence of ``[mean, stdev]``
            pairs, one pair per feature.
        X_vec: Sequence of feature values, indexed in the same order as the
            pairs in ``summaries``. Entries beyond the number of pairs are
            ignored.

    Returns:
        dict mapping each class label to the product of ``calculateProb``
        over its features. No class prior is applied.
    """
    probabilities = {}
    for label, feature_stats in summaries.items():
        likelihood = 1
        for idx, (mean, stdev) in enumerate(feature_stats):
            likelihood *= calculateProb(X_vec[idx], mean, stdev)
        probabilities[label] = likelihood
    return probabilities
   
def predict(summaries, X_vec):
    """Return the class label with the highest score for ``X_vec``.

    Scores come from ``calculateClassProb``. On a tie the label seen first
    wins, because a later label must score strictly higher to replace it.
    Returns ``None`` when ``summaries`` is empty.
    """
    scores = calculateClassProb(summaries, X_vec)
    winner = None
    winner_score = -1
    for label, score in scores.items():
        # Keep the current winner unless there is none yet or this score
        # is strictly greater.
        if winner is not None and not (score > winner_score):
            continue
        winner, winner_score = label, score
    return winner
    


In [None]:
# test model
# Convert the held-out frame to plain row lists. Each row still carries the
# label as its last element.
testSet = test.values.tolist()
print(testSet)
print(len(testSet))

# Classify every test row, echoing each predicted label as it is produced.
predictions = []
for row in testSet:
    result = predict(summaries, row)
    print(result)
    predictions.append(result)

[[6.0, 148.0, 72.0, 35.0, 0.0, 33.6, 0.627, 50.0, 1.0], [8.0, 183.0, 64.0, 0.0, 0.0, 23.3, 0.672, 32.0, 1.0], [0.0, 137.0, 40.0, 35.0, 168.0, 43.1, 2.288, 33.0, 1.0], [1.0, 189.0, 60.0, 23.0, 846.0, 30.1, 0.398, 59.0, 1.0], [5.0, 166.0, 72.0, 19.0, 175.0, 25.8, 0.587, 51.0, 1.0], [0.0, 118.0, 84.0, 47.0, 230.0, 45.8, 0.551, 31.0, 1.0], [7.0, 107.0, 74.0, 0.0, 0.0, 29.6, 0.254, 31.0, 1.0], [5.0, 109.0, 75.0, 26.0, 0.0, 36.0, 0.546, 60.0, 0.0], [10.0, 122.0, 78.0, 31.0, 0.0, 27.6, 0.512, 45.0, 0.0], [11.0, 138.0, 76.0, 0.0, 0.0, 33.2, 0.42, 35.0, 0.0], [2.0, 90.0, 68.0, 42.0, 0.0, 38.2, 0.503, 27.0, 1.0], [7.0, 105.0, 0.0, 0.0, 0.0, 0.0, 0.305, 24.0, 0.0], [8.0, 176.0, 90.0, 34.0, 300.0, 33.7, 0.467, 58.0, 1.0], [0.0, 100.0, 88.0, 60.0, 110.0, 46.8, 0.962, 31.0, 0.0], [2.0, 141.0, 58.0, 34.0, 128.0, 25.4, 0.699, 24.0, 0.0], [2.0, 109.0, 92.0, 0.0, 0.0, 42.7, 0.845, 54.0, 0.0], [1.0, 95.0, 66.0, 13.0, 38.0, 19.6, 0.334, 25.0, 0.0], [7.0, 62.0, 78.0, 0.0, 0.0, 32.6, 0.391, 41.0, 0.0], [5.0

In [None]:
def getAccuracy(test, predictions):
    """Return the percentage of rows in ``test`` that were predicted correctly.

    Args:
        test: DataFrame whose last column holds the true class label.
        predictions: Sequence of predicted labels, positionally aligned with
            the rows of ``test``.

    Returns:
        Accuracy as a float percentage in the range 0-100. An empty ``test``
        yields 0.0.
    """
    # Size the denominator from the argument itself, not from a notebook
    # global, so the function works for any frame passed in.
    total = len(test)
    if total == 0:
        return 0.0
    correct = 0
    for i in range(total):
        if test.iloc[i, -1] == predictions[i]:
            correct += 1
    return (correct / float(total)) * 100.0

# Score the predictions collected for the held-out rows and report the result.
accuracy = getAccuracy(test, predictions)  # percentage (0-100), not a fraction
print(f'Split {len(df)} rows into train={len(train)} and test={len(test)}')
print(f'Accuracy: {accuracy}')


Split 768 rows into train=614 and test=154
Accuracy: 74.67532467532467


<h2>Using scikit-learn Gaussian NB</h2>

In [None]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

# Features are every column but the last; the last column is the label.
data_train = train.iloc[:, :-1]
target_train = train.iloc[:, -1]
data_test = test.iloc[:, :-1]

# Fit scikit-learn's Gaussian Naive Bayes on the same training split and
# classify the same held-out rows.
gnb = GaussianNB()
gnb.fit(data_train, target_train)
y_pred = gnb.predict(data_test)

# Model accuracy: the share of test rows classified correctly, as a percentage.
print(f'Split {len(df)} rows into train={len(data_train)} and test={len(data_test)}')
print("Accuracy:",(metrics.accuracy_score(test.iloc[:,-1], y_pred)*100))


Split 768 rows into train=614 and test=154
Accuracy: 74.02597402597402
