In [1]:
import pandas as pd
import numpy as np
from math import sqrt
from math import pi
from math import exp
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Load the dataset from "diabetes.csv"; the path is relative, so the file must
# sit in the notebook's working directory.
df = pd.read_csv("diabetes.csv")


In [3]:
# Preview the first rows to check the columns and the 'Outcome' label.
df.head()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Split the frame into model inputs and target: 'Outcome' is the label,
# every remaining column is a feature.
x = df.drop(columns=['Outcome'])
y = df['Outcome']

In [5]:
# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=42,
)

In [6]:
# Summary statistics of the training features (count, mean, std, quartiles).
x_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,691.0,691.0,691.0,691.0,691.0,691.0,691.0,691.0
mean,3.800289,120.824891,69.104197,20.422576,80.432706,31.972504,0.473645,32.706223
std,3.357112,32.098215,19.126069,15.621919,115.369159,7.884084,0.335638,11.317254
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.2,0.245,24.0
50%,3.0,117.0,72.0,23.0,38.0,32.0,0.38,29.0
75%,6.0,140.0,80.0,32.0,129.5,36.5,0.625,40.0
max,17.0,199.0,122.0,63.0,846.0,67.1,2.42,81.0


In [7]:
# Reference model: scikit-learn's Gaussian Naive Bayes fitted on the training
# split. Its test-set confusion matrix is the baseline the hand-written
# implementation further down is compared against.
model = GaussianNB()
model.fit(x_train,y_train)

GaussianNB()

In [8]:
# Confusion matrix of the scikit-learn model on the held-out rows.
sk_predictions = model.predict(x_test)
confusion_matrix(y_test, sk_predictions)

array([[35, 15],
       [ 8, 19]])

In [9]:
# Per-feature mean and standard deviation over the training rows whose
# label is 1 (positive class).
pos_rows = x_train[y_train == 1]
train_mean_pos = pos_rows.mean()
train_std_pos = pos_rows.std()

In [10]:
# Per-feature mean and standard deviation over the training rows whose
# label is 0 (negative class).
neg_rows = x_train[y_train == 0]
train_mean_neg = neg_rows.mean()
train_std_neg = neg_rows.std()

In [11]:
def cond_probability(x, mean, std):
    """Evaluate the Gaussian (normal) probability density at ``x``.

    Args:
        x: Value at which the density is evaluated.
        mean: Mean of the distribution.
        std: Standard deviation of the distribution; a value of zero makes
            the divisions below fail.

    Returns:
        The density ``1 / (sqrt(2*pi) * std) * exp(-(x - mean)**2 / (2 * std**2))``.
    """
    squared_deviation = (x - mean) ** 2
    twice_variance = 2 * std ** 2
    normaliser = 1 / (sqrt(2 * pi) * std)
    return normaliser * exp(-(squared_deviation / twice_variance))

In [12]:
def _class_score(row, prior, means, stds):
    """Multiply a class prior by the Gaussian density of every feature in ``row``."""
    score = prior
    for i, value in enumerate(row):
        # The statistics are pandas Series indexed by column name, so use
        # explicit positional access (.iloc). A plain integer key such as
        # means[i] relies on a deprecated positional fallback.
        score = score * cond_probability(value, means.iloc[i], stds.iloc[i])
    return score


def predict(row):
    """Score one sample against both classes with Gaussian Naive Bayes.

    Each score is the class prior (the share of training rows with that label)
    multiplied by the per-feature Gaussian densities. The scores are not
    normalised, so only their relative size is meaningful.

    Reads the notebook-level ``x_train``, ``y_train``, ``train_mean_pos``,
    ``train_std_pos``, ``train_mean_neg`` and ``train_std_neg``.

    Args:
        row: Feature values of one sample, in the same column order as
            ``x_train``.

    Returns:
        A two-element list ``[prob_pos, prob_neg]`` holding the score for
        label 1 followed by the score for label 0.
    """
    n_train = len(x_train)
    prior_pos = len(x_train[y_train == 1]) / n_train
    prior_neg = len(x_train[y_train == 0]) / n_train

    prob_pos = _class_score(row, prior_pos, train_mean_pos, train_std_pos)
    prob_neg = _class_score(row, prior_neg, train_mean_neg, train_std_neg)

    return [prob_pos, prob_neg]

In [13]:
# Score every held-out sample with the hand-written classifier; each entry is
# the [prob_pos, prob_neg] pair returned by predict().
predictions_raw = [predict(sample) for sample in x_test.values.tolist()]

In [14]:
# Inspect the [prob_pos, prob_neg] scores of the first test sample.
predictions_raw[0]

[2.127978382846273e-13, 5.907718784883406e-13]

In [15]:
# Turn each score pair into a label: 1 when the positive score is strictly
# larger, otherwise 0 (ties go to the negative class).
predictions = [1 if scores[0] > scores[1] else 0 for scores in predictions_raw]

In [16]:
# Accuracy of the hand-written classifier on the test split, as a percentage.
accuracy_score(y_test.tolist(),predictions)*100

70.12987012987013

In [17]:
# Confusion matrix of the hand-written classifier, for comparison with the
# GaussianNB matrix computed earlier in the notebook.
confusion_matrix(y_test.tolist(),predictions)

array([[35, 15],
       [ 8, 19]])