In [2]:
import pandas as pd
import numpy as np

In [3]:
class StandardScaler(object):
    """Standardize features column-wise: ``z = (x - mean) / std``.

    Attributes set by ``fit``:
        mean_:  per-column mean of the fitted data.
        scale_: per-column population standard deviation (ddof=0) of the
                fitted data, with zero entries replaced by 1 so that constant
                columns transform to 0 instead of NaN/inf.
    """

    def __init__(self):
        pass

    def fit(self, X):
        """Learn the per-column mean and standard deviation of ``X``.

        Args:
            X: 2-D array-like (NumPy array or pandas DataFrame), one row per
               sample and one column per feature.

        Returns:
            self, so that calls can be chained.
        """
        self.mean_ = np.mean(X, axis=0)
        scale = np.std(X - self.mean_, axis=0)
        # A constant column has std == 0, which would make transform() divide
        # by zero. Adding the boolean mask turns every exact 0 into 1 and
        # leaves all other entries untouched, while keeping the container type
        # (ndarray stays ndarray, pandas Series stays a labelled Series).
        self.scale_ = scale + (scale == 0)
        return self

    def transform(self, X):
        """Standardize ``X`` with the statistics learned by ``fit``.

        Raises:
            AttributeError: if called before ``fit``.
        """
        return (X - self.mean_) / self.scale_

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardized ``X`` in one step."""
        return self.fit(X).transform(X)

In [18]:
def train_test_split_(*arrays, test_size=None, train_size=None, random_state=None):
    """Shuffle and split one or more equally long arrays into train/test parts.

    Args:
        *arrays: indexable sequences of the same length (NumPy arrays, lists,
            pandas DataFrame/Series). All are shuffled with the same
            permutation so rows stay aligned.
        test_size: int -> absolute number of test samples;
            float -> fraction of samples used for testing (rounded up).
            Takes precedence over ``train_size``.
        train_size: int -> absolute number of training samples;
            float -> fraction of samples used for training (rounded down).
            Only consulted when ``test_size`` is neither int nor float.
        random_state: optional seed. When given (including 0) the global NumPy
            random generator is seeded with it, making the split reproducible.

    Returns:
        A flat list ``[a_train, a_test, b_train, b_test, ...]`` in the order
        the arrays were passed. When neither size is given, 25% of the samples
        (rounded up) go to the test part.

    Raises:
        ValueError: if no array is passed or the arrays differ in length.
    """
    if not arrays:
        raise ValueError("At least one array is required")
    length = len(arrays[0])
    if any(len(a) != length for a in arrays):
        raise ValueError("All arrays must have the same length")

    # `is not None` (rather than truthiness) so that a seed of 0 is honoured.
    if random_state is not None:
        np.random.seed(random_state)
    p = np.random.permutation(length)

    # The split position must be a plain int: np.ceil returns a float, and a
    # float cannot be used as a slice bound.
    if isinstance(test_size, (int, np.integer)):
        index = length - int(test_size)
    elif isinstance(test_size, (float, np.floating)):
        index = length - int(np.ceil(length * test_size))
    elif isinstance(train_size, (int, np.integer)):
        index = int(train_size)
    elif isinstance(train_size, (float, np.floating)):
        index = int(length * train_size)
    else:
        index = length - int(np.ceil(length * 0.25))

    parts = []
    for a in arrays:
        if hasattr(a, "iloc"):
            # pandas objects: index by position, not by label/column name.
            shuffled = a.iloc[p]
            parts.extend((shuffled.iloc[:index], shuffled.iloc[index:]))
        else:
            # np.asarray lets plain lists be fancy-indexed as well.
            shuffled = np.asarray(a)[p]
            parts.extend((shuffled[:index], shuffled[index:]))
    return parts

# Naive Bayes

## Bayes Theorem
**$ P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)} $**

## In our case
**$ P(y|x) = \frac{P(x|y) \cdot P(y)}{P(x)} $**

* P(y|x) is the posterior probability of class (y) given predictor (x) 
* P(x|y) is the likelihood which is the probability of predictor (x) given class (y) 
* P(y) is the prior probability of class (y), i.e. before observing the predictor 
* P(x) is the marginal probability of the predictor (x), also called the evidence 

## with features as
**$ x = (x _1, x _2, x _3,..., x _n) $**

## Assuming all features are conditionally independent given the class
**$ P(y|x) = \frac{P(x _1|y) \cdot P(x _2|y) \cdot ... \cdot P(x _n|y) \cdot P(y)}{P(x)} $**

## Finding class with maximum probability
**$ y = argmax _y P(y|x) = argmax _y \frac{P(x _1|y) \cdot P(x _2|y) \cdot ... \cdot P(x _n|y) \cdot P(y)}{P(x)} $**

**$ y = argmax _y P(x _1|y) \cdot P(x _2|y) \cdot ... \cdot P(x _n|y) \cdot P(y)$**

**$ y = argmax _y \log(P(x _1|y)) + \log(P(x _2|y)) + ... + \log(P(x _n|y)) + \log(P(y))$**

## Prior probability P(y): relative frequency of the class in the training data

## Conditional Probability P(xi|y)
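**$ P(x _i|y) = \frac{1}{\sqrt{2 \pi \sigma^{2} _{y,i} }} \cdot \exp(- \frac{(x _i - \mu _{y,i})^{2}}{2 \sigma^{2} _{y,i} }) $**

where $ \mu _{y,i} $ and $ \sigma^{2} _{y,i} $ are the mean and variance of feature $ x _i $ over the training samples of class $ y $.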


In [4]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal distribution.
    Prediction picks the class with the largest log-posterior
    ``log P(y) + sum_i log P(x_i | y)``.

    Args:
        var_smoothing: fraction of the largest feature variance that is added
            to every per-class variance. This keeps the Gaussian well defined
            when a feature is constant within a class (variance 0).
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, x, y):
        """Estimate per-class feature means, variances and class priors.

        Args:
            x: 2-D array-like of shape (n_samples, n_features); a pandas
               DataFrame is accepted and converted to a float array.
            y: 1-D array-like of n_samples class labels.

        Returns:
            self.

        Raises:
            ValueError: if ``x`` is not 2-D, is empty, or ``y`` has a
                different number of samples.
        """
        # Converting up front makes the statistics independent of the input
        # container: ndarray.var uses ddof=0 whereas DataFrame.var defaults to
        # ddof=1, so the same data used to give different variances.
        x = np.asarray(x, dtype=np.float64)
        y = np.asarray(y)
        if x.ndim != 2:
            raise ValueError("x must be 2-dimensional (n_samples, n_features)")
        n_samples, n_features = x.shape
        if n_samples == 0:
            raise ValueError("x must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError("x and y must contain the same number of samples")

        self._classes = np.unique(y)
        n_classes = len(self._classes)

        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Variance floor; falls back to var_smoothing itself when every
        # feature is constant over the whole data set.
        epsilon = self.var_smoothing * x.var(axis=0).max()
        if not epsilon > 0:
            epsilon = self.var_smoothing

        for idx, c in enumerate(self._classes):
            x_c = x[y == c]
            self._mean[idx, :] = x_c.mean(axis=0)
            self._var[idx, :] = x_c.var(axis=0) + epsilon
            self._priors[idx] = x_c.shape[0] / float(n_samples)
        return self

    def predict(self, x):
        """Predict a class label for every row of ``x``.

        Args:
            x: 2-D array-like of shape (n_samples, n_features). DataFrames are
               converted to arrays first (iterating a DataFrame directly would
               yield its column labels rather than its rows).

        Returns:
            A list with one predicted label per row.
        """
        x = np.asarray(x, dtype=np.float64)
        y_predict = [self._predict(row) for row in x]
        return y_predict

    def _predict(self, x):
        """Return the most probable class label for a single sample ``x``."""
        posteriors = []
        for idx in range(len(self._classes)):
            prior = np.log(self._priors[idx])
            # Summing log-densities computed directly in log space avoids the
            # underflow of exp(...) to 0 (and log(0) = -inf) for samples far
            # from a class mean.
            class_conditional = np.sum(self._log_pdf(idx, x))
            posteriors.append(prior + class_conditional)
        return self._classes[np.argmax(posteriors)]

    def _log_pdf(self, class_idx, x):
        """Per-feature log of the Gaussian density of ``x`` under one class."""
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        return -0.5 * np.log(2 * np.pi * var) - (x - mean) ** 2 / (2 * var)

    def _pdf(self, class_idx, x):
        """Per-feature Gaussian density of ``x`` under one class.

        Kept for callers that want the density itself; prediction uses
        ``_log_pdf`` for numerical stability.
        """
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-(x - mean) ** 2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [5]:
def accuracy_score(y_test, y_pred, normalize=True):
    """Fraction (or count) of predictions that equal the true labels.

    Args:
        y_test: 1-D array-like of true labels (list, NumPy array, Series).
        y_pred: 1-D array-like of predicted labels, same length as ``y_test``.
        normalize: if True return the fraction of correct predictions,
            otherwise return the number of correct predictions.

    Returns:
        float in [0, 1] when ``normalize`` is True, else an int count.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    # Compare positionally as arrays: `list == list` yields a single bool, and
    # two pandas Series with different indexes cannot be compared with `==`.
    y_true = np.asarray(y_test).ravel()
    y_hat = np.asarray(y_pred).ravel()
    if len(y_true) != len(y_hat):
        raise ValueError("y_test and y_pred must have the same length")
    if len(y_true) == 0:
        raise ValueError("y_test and y_pred must not be empty")
    correct = int(np.sum(y_true == y_hat))
    return correct / len(y_true) if normalize else correct


In [6]:
# Load the heart-disease table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until the path is adjusted.
df = pd.read_csv("/Users/swapnilsingh/Downloads/heart.csv")
# Display the frame. The rendered output shows an "Unnamed: 0" column next to
# the 13 feature columns and `target`.
# NOTE(review): "Unnamed: 0" looks like a saved row index -- confirm.
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [7]:
# Standardize the 13 feature columns in place and keep the fitted scaler `s`
# around so that later inputs can be transformed with the same statistics.
# NOTE(review): the columns of `df` are overwritten, so running this cell a
# second time re-fits the scaler on already-scaled data. The scaler is also
# fitted on the full table, i.e. before the train/test split further down.
feature_cols = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach','exang', 'oldpeak', 'slope', 'ca', 'thal']
s = StandardScaler()
df[feature_cols] = s.fit_transform(df[feature_cols])
# Show the learned per-column mean and standard deviation.
print(s.mean_)
print(s.scale_)

age          54.366337
sex           0.683168
cp            0.966997
trestbps    131.623762
chol        246.264026
fbs           0.148515
restecg       0.528053
thalach     149.646865
exang         0.326733
oldpeak       1.039604
slope         1.399340
ca            0.729373
thal          2.313531
dtype: float64
age          9.067102
sex          0.465241
cp           1.030348
trestbps    17.509178
chol        51.745151
fbs          0.355610
restecg      0.524991
thalach     22.867333
exang        0.469019
oldpeak      1.159157
slope        0.615208
ca           1.020918
thal         0.611265
dtype: float64


In [7]:
from sklearn.model_selection import train_test_split

In [9]:
# Separate the 13 predictors from the label column.
feature_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
                 'exang', 'oldpeak', 'slope', 'ca', 'thal']
x, y = df[feature_names], df['target']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable. This is the `train_test_split` imported from scikit-learn, not
# the `train_test_split_` helper defined earlier in this notebook.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=31)

# The test features are converted to a NumPy array before prediction.
x_test = np.array(x_test)

# Train the classifier on the training part, then score it on the held-out part.
model = NaiveBayes()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print('Accuracy: ',acc)

Accuracy:  0.9016393442622951


In [10]:
from tkinter import *
import tkinter.font as tkfont

def result(a): # displays the popup window for class
    """Open a popup window that reports the predicted class.

    Args:
        a: sequence of predicted labels; only ``a[0]`` is inspected
           (1 -> disease detected, 0 -> not detected).

    The "detected" message embeds the module-level ``acc`` value, so ``acc``
    must be defined before this function is called.
    """
    # A Toplevel is a child window of the already running application. Creating
    # a second Tk() root would start a separate Tcl interpreter, in which a
    # Font object created on the first root is not a known font.
    t = Toplevel()
    t.configure(background='LemonChiffon1')
    t.title('Result')
    if a[0] == 1:
        text = 'Heart disease detected, with the accuracy of '+str(acc*100)
    elif a[0] == 0:
        text = 'Heart disease not detected'
    else:
        # Previously `text` stayed unbound here and the Label call failed.
        text = 'Unexpected prediction: '+str(a[0])
    # A (family, size) tuple is resolved by the window's own interpreter and
    # needs no Font object to be kept alive.
    l1 = Label(t, text=text, compound=CENTER, font=('Consolas', 30), background='LemonChiffon1')
    l1.grid(row=1, column=0)
    
# Main application window, with the background colour and font shared by all
# widgets created below.
t = Tk()
t.configure(background='LemonChiffon1')
f = tkfont.Font(family='Consolas', size=10)
t.title('Cardio Disease Detector')
# One Tk variable per input field, in the same order as the model's feature
# columns. The drop-down fields use StringVar because they hold the displayed
# label; v10 is a StringVar because its text is parsed with float() by the
# Predict button's command.
v1 = IntVar(t)      # age
v2 = IntVar(t)      # sex (radio buttons: 1 = Male, 0 = Female)
v3 = StringVar(t)   # chest pain type (label from `cpt`)
v4 = IntVar(t)      # resting blood pressure
v5 = IntVar(t)      # serum cholesterol
v6 = IntVar(t)      # fasting blood sugar > 120 mg/dl (1 = Yes, 0 = No)
v7 = StringVar(t)   # resting ECG result (label from `ecg`)
v8 = IntVar(t)      # maximum heart rate achieved
v9 = IntVar(t)      # exercise induced angina (1 = Yes, 0 = No)
v10 = StringVar(t)  # ST depression (oldpeak), entered as text
v11 = StringVar(t)  # slope of the peak exercise ST segment (label from `st`)
v12 = IntVar(t)     # number of major vessels (value from `fl`)
v13 = StringVar(t)  # thal (label from `thal`)

# Lookup tables for the drop-down menus. In each dict the values are the
# labels offered in the menu and the keys are the numeric codes that the
# Predict button's command looks up from the selected label.
cpt = {0:'Typical Angina', 1:'Atypical Angina', 2:'Non-anginal', 3:'Asymptomatic'}  # chest pain type
ecg = {0:'Normal', 1:'St-T wave abnormality', 2:'Probable or definite left ventricular hypertropy'}  # resting ECG
st = {0:'Upsloping', 1:'Flat', 2:'Downward Sloping'}  # ST segment slope
fl = [0,1,2,3]  # choices for the number of major vessels
# NOTE(review): there is no entry for code 0 here -- confirm against the
# values that actually occur in the dataset's `thal` column.
thal = {1:'Normal',2:'Fixed Defect',3:'Reversible Defect'}

# Caption labels, one per input field (text1 .. text13 follow the same field
# order as v1 .. v13). They are only created here; their position is set by
# the grid() calls further down.
text1 = Label(t,font=f,justify=RIGHT, text="Age",background='LemonChiffon1')
text2 = Label(t,font=f,justify=RIGHT, text="Sex",background='LemonChiffon1')
text3 = Label(t,font=f,justify=RIGHT, text="Chest Pain Type",background='LemonChiffon1')
text4 = Label(t,font=f,justify=RIGHT, text="Resting Blood Pressure",background='LemonChiffon1')
text5 = Label(t,font=f,justify=RIGHT, text="Serum Cholestrol in mg/dl",background='LemonChiffon1')
text6 = Label(t,font=f,justify=RIGHT, text="Fasting blood Sugar > 120 mg/dl",background='LemonChiffon1')
text7 = Label(t,font=f,justify=RIGHT, text="Resting Electrocardiographic Results",background='LemonChiffon1')
text8 = Label(t,font=f,justify=RIGHT, text="Maximum Heart Rate Achieved",background='LemonChiffon1')
text9 = Label(t,font=f,justify=RIGHT, text="Exercise Induced Angina",background='LemonChiffon1')
text10 = Label(t,font=f,justify=RIGHT, text="ST depression induced by exercise relative to rest",background='LemonChiffon1')
text11 = Label(t,font=f,justify=RIGHT, text="The slope of the peak exercise ST segment",background='LemonChiffon1')
text12 = Label(t,font=f,justify=RIGHT, text="Number of major vessels (0-3) colored by Flourosopy",background='LemonChiffon1')
text13 = Label(t,font=f,justify=RIGHT, text="Thal",background='LemonChiffon1')

# Input widgets, each bound to one of the Tk variables v1 .. v13:
# free-text Entry boxes for numeric values, Radiobutton pairs for the 0/1
# fields and OptionMenu drop-downs for the categorical fields. Every
# drop-down is given an initial selection with .set() right after creation.
entry1 = Entry(t,font=f,textvariable=v1)  # age
# sex
r1 = Radiobutton(t, text="Male", font=f, background='LemonChiffon1', variable=v2, value=1)
r2 = Radiobutton(t, text="Female", font=f, background='LemonChiffon1', variable=v2, value=0)
# chest pain type
om1 = OptionMenu(t, v3, *cpt.values())
om1.config(font=f)
v3.set('Typical Angina')
entry2 = Entry(t,font=f,textvariable=v4)  # resting blood pressure
entry3 = Entry(t,font=f,textvariable=v5)  # serum cholesterol
# fasting blood sugar > 120 mg/dl
r3 = Radiobutton(t, text="Yes", font=f, background='LemonChiffon1', variable=v6, value=1)
r4 = Radiobutton(t, text="No", font=f, background='LemonChiffon1', variable=v6, value=0)
# resting ECG result
om2 = OptionMenu(t, v7, *ecg.values())
om2.config(font=f)
v7.set('Normal')
entry4 = Entry(t,font=f,textvariable=v8)  # maximum heart rate achieved
# exercise induced angina
r5 = Radiobutton(t, text="Yes", font=f, background='LemonChiffon1', variable=v9, value=1)
r6 = Radiobutton(t, text="No", font=f, background='LemonChiffon1', variable=v9, value=0)
entry5 = Entry(t,font=f,textvariable=v10)  # ST depression (text, parsed later)
# slope of the peak exercise ST segment
om3 = OptionMenu(t, v11, *st.values())
om3.config(font=f)
v11.set('Upsloping')
# number of major vessels
om4 = OptionMenu(t, v12, *fl)
om4.config(font=f)
v12.set(fl[0])
# thal
om5 = OptionMenu(t, v13, *thal.values())
om5.config(font=f)
v13.set('Normal')

# Grid layout: one row per field (rows 1-13). Caption labels go in column 0,
# the input widget in column 2, and the second radio button of each Yes/No or
# Male/Female pair in column 3.
text1.grid(row=1, column=0) 
text2.grid(row=2, column=0)
text3.grid(row=3, column=0)
text4.grid(row=4, column=0)
text5.grid(row=5, column=0)
text6.grid(row=6, column=0)
text7.grid(row=7, column=0)
text8.grid(row=8, column=0)
text9.grid(row=9, column=0)
text10.grid(row=10, column=0)
text11.grid(row=11, column=0)
text12.grid(row=12, column=0)
text13.grid(row=13, column=0)
# Input widgets, placed on the same row as their caption.
entry1.grid(row=1, column=2)
r1.grid(row=2,column=2)
r2.grid(row=2,column=3)
om1.grid(row=3,column=2)
entry2.grid(row=4, column=2)
entry3.grid(row=5, column=2)
r3.grid(row=6,column=2)
r4.grid(row=6,column=3)
om2.grid(row=7,column=2)
entry4.grid(row=8, column=2)
r5.grid(row=9,column=2)
r6.grid(row=9,column=3)
entry5.grid(row=10, column=2)
om3.grid(row=11,column=2)
om4.grid(row=12,column=2)
om5.grid(row=13,column=2)

# Parallel key / label lists for each lookup table. Position i of a *_value
# list holds the label whose numeric code sits at position i of the matching
# *_key list, which allows a label to be mapped back to its code by index.
cpt_key, cpt_value = list(cpt), list(cpt.values())
ecg_key, ecg_value = list(ecg), list(ecg.values())
st_key, st_value = list(st), list(st.values())
thal_key, thal_value = list(thal), list(thal.values())

# adding a button in the GUI
def _read_features():
    """Return the 13 form values as floats, in the model's column order.

    Drop-down selections are mapped from their displayed label back to the
    numeric code via the parallel *_key / *_value lists.

    Raises:
        TclError: if an integer field does not contain a valid integer.
        ValueError: if the ST-depression text cannot be parsed as a number.
    """
    return [
        float(v1.get()),
        float(v2.get()),
        float(cpt_key[cpt_value.index(v3.get())]),
        float(v4.get()),
        float(v5.get()),
        float(v6.get()),
        float(ecg_key[ecg_value.index(v7.get())]),
        float(v8.get()),
        float(v9.get()),
        float(v10.get()),
        float(st_key[st_value.index(v11.get())]),
        float(v12.get()),
        float(thal_key[thal_value.index(v13.get())]),
    ]


def _on_predict():
    """Read the form, scale it with the fitted scaler, predict and show the result."""
    try:
        features = _read_features()
    except (TclError, ValueError) as err:
        # Without this, a blank or non-numeric field only produced a traceback
        # from the Tk callback and the user saw nothing happen.
        from tkinter import messagebox
        messagebox.showerror('Invalid input', 'Please enter a numeric value in every field.\n\n' + str(err))
        return
    # The feature vector is built once and reused for both prediction and log.
    sample = pd.DataFrame([features], columns=['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach','exang', 'oldpeak', 'slope', 'ca', 'thal'])
    result(model.predict(np.array(s.transform(sample))))
    print([features])


b = Button(t, text='Predict',font=f, command=_on_predict)
b.grid(row=14, column=1)
# Blocks until the main window is closed.
t.mainloop()

[[42.0, 1.0, 3.0, 148.0, 244.0, 0.0, 0.0, 178.0, 0.0, 0.8, 2.0, 2.0, 2.0]]
[[63.0, 1.0, 0.0, 145.0, 233.0, 1.0, 0.0, 150.0, 0.0, 2.0, 0.0, 0.0, 1.0]]


In [27]:
[[42.0, 1.0, 3.0, 148.0, 244.0, 0.0, 0.0, 178.0, 0.0, 0.8, 2.0, 2.0, 2.0]]

[1]

In [28]:
[[63.0, 1.0, 0.0, 145.0, 233.0, 1.0, 0.0, 150.0, 0.0, 2.0, 0.0, 0.0, 1.0]]

[0]
