In [1]:
import random
import numpy as np
import pandas as pd
# importing the required libraries

In [2]:
# Reading the heart.csv file
heart = []

f = open('heart.csv') 
labels = f.readline().split(',')
temp = labels[-1]
labels[-1] = temp[0:-1]
for line in f.readlines():
    line = line[0:-1]
    data = [line.split(',')]
    for row in data:
        heart.append([float(item) for item in row])
f.close()

# shuffling the heart.csv file to keep things random
random.shuffle(heart)

In [3]:
# shifting the continuous features to the start of the dataframe and
# the discrete features to the later part of the dataframe
train = pd.DataFrame(heart, columns=labels)

train_pd = pd.DataFrame()
train_pd.insert(0, 'age', train['age'])
train_pd.insert(1, 'trestbps', train['trestbps'])
train_pd.insert(2, 'chol', train['chol'])
train_pd.insert(3, 'thalach', train['thalach'])
train_pd.insert(4, 'oldpeak', train['oldpeak'])
train_pd.insert(5, 'sex', train['sex'])
train_pd.insert(6, 'cp', train['cp'])
train_pd.insert(7, 'fbs', train['fbs'])
train_pd.insert(8, 'restecg', train['restecg'])
train_pd.insert(9, 'exang', train['exang'])
train_pd.insert(10, 'slope', train['slope'])
train_pd.insert(11, 'ca', train['ca'])
train_pd.insert(12, 'thal', train['thal'])
train_pd.insert(13, 'target', train['target'])

heart = np.array(train_pd)

In [4]:
# Splitting the dataset into train and test 70/30 split
train_heart = heart[0:int(0.7*len(heart))]
test_heart = heart[int(0.7*len(heart)): len(heart)]
train_heart = np.array(train_heart)
test_heart = np.array(test_heart)

In [5]:
# seperating the target from the features 
train_x = train_heart[: , 0:13]
train_y = train_heart[:,13]
test_x = test_heart[:,0:13]
test_y = test_heart[:,13]

In [6]:
# calculating the prior probabilities
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

In [7]:
# calculating likelihoods using a gaussian distribution
# keep a minimum value so we dont encounter zeros
def calculate_likelihood_gaussian(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    mean, std = df[feat_name].mean(), df[feat_name].std()
    p_x_given_y = (1 / (np.sqrt(2 * np.pi) * max(0.001,std))) *  max(0.0001,np.exp(-((feat_val-mean)**2 / (2 *std**2) )))
    return p_x_given_y

In [8]:
# calculating the likelihoods for discrete features
def discrete(df, feat_name,feat, Y, label):
#     for i in range(num_classes):
#         for i in range(len())
#         df[feat_name]==i and df[Y]==0
#         df[feat_name]==i and df[Y]==1
    df = df[df[Y]==label]
    a = np.array(pd.crosstab(df[feat_name], df[Y]))
    return float(a[int(feat)]/717)
        

In [9]:
# The main code that can parse multiple test samples at a time
def naive_bayes_gaussian(df, a, X, Y):
    # getting names of features using df dataframe column names
    features = list(df.columns)[:-1]
    # calculating prior
    prior = calculate_prior(df, Y)
    Y_pred = []
    # loop over every data sample
    count=0
    for x in X:
        # calculate likelihood
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        
        for j in range(len(labels)):
            for i in range(len(features)):
                if i<=4:
                    likelihood[j] *= calculate_likelihood_gaussian(df, features[i], x[i], Y, labels[j])
#                     print(likelihood[j])
                else:
                    m = discrete(df, features[i], x[i], Y, labels[j])
                    likelihood[j] *= m
#                     print(likelihood[j])
#                     print(likelihood[j])
        # calculating posterior probability by using bayes theorem  
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]
            
        Y_pred.append(np.argmax(post_prob))
        a.append(post_prob)
        # returning max values of all possible classes
    return np.array(Y_pred)

In [10]:
# creating the train dataframe with correct column order
train = pd.DataFrame(train_heart, columns=labels)

b = [train['age'], train['trestbps'], train['chol'], train['thalach'], train['oldpeak'],
                         train['sex'], train['cp'], train['fbs'], train['restecg'], train['exang'], train['slope'], train['ca'], train['thal'], train['target']]
train_pd = pd.DataFrame()
train_pd.insert(0, 'age', train['age'])
train_pd.insert(1, 'trestbps', train['trestbps'])
train_pd.insert(2, 'chol', train['chol'])
train_pd.insert(3, 'thalach', train['thalach'])
train_pd.insert(4, 'oldpeak', train['oldpeak'])
train_pd.insert(5, 'sex', train['sex'])
train_pd.insert(6, 'cp', train['cp'])
train_pd.insert(7, 'fbs', train['fbs'])
train_pd.insert(8, 'restecg', train['restecg'])
train_pd.insert(9, 'exang', train['exang'])
train_pd.insert(10, 'slope', train['slope'])
train_pd.insert(11, 'ca', train['ca'])
train_pd.insert(12, 'thal', train['thal'])
train_pd.insert(13, 'target', train['target'])

a = []
Y_pred = naive_bayes_gaussian(train_pd, a,X=test_x, Y="target")

#reporting metrics
from sklearn.metrics import confusion_matrix, f1_score
print("The confusion matrix for the given model is: ")
print(confusion_matrix(test_y, Y_pred))
print("The accuracy for the given model is: ")
print(f1_score(test_y, Y_pred, average='macro'))

The confusion matrix for the given model is: 
[[106  36]
 [ 56 110]]
The accuracy for the given model is: 
0.7012483130904184


In [11]:
# printing the posterior probabilities
print(f"These are the posterior probabilities for each test sample : ")
print(np.array(a))

These are the posterior probabilities for each test sample : 
[[8.95522705e-23 2.88697568e-22]
 [4.52827734e-22 2.95257384e-24]
 [2.35900520e-24 1.74490743e-26]
 [2.79340749e-23 4.47002591e-23]
 [3.06459221e-22 8.01409213e-22]
 [2.66481524e-23 5.02944906e-21]
 [5.24388444e-27 2.07802404e-26]
 [5.15869956e-23 4.79051446e-25]
 [1.19007704e-22 7.79192088e-24]
 [9.02469104e-23 7.97747568e-22]
 [7.40553144e-26 9.67101809e-23]
 [3.95606607e-23 3.53605754e-21]
 [6.99809906e-23 1.49134582e-25]
 [1.31324084e-22 6.55479360e-21]
 [3.08382085e-25 2.97990462e-27]
 [5.68302925e-27 1.00907487e-27]
 [9.10560848e-24 9.41486540e-26]
 [2.64049150e-23 2.28925333e-24]
 [8.99477039e-24 3.21794378e-24]
 [1.08342494e-23 4.17323648e-22]
 [1.08770148e-22 4.40637900e-23]
 [4.11358636e-24 1.94942241e-24]
 [1.50518015e-24 1.04485737e-21]
 [7.95143546e-26 3.04750660e-26]
 [6.83048183e-28 4.59014072e-26]
 [7.87856039e-25 1.75860145e-27]
 [1.13194311e-23 4.60437583e-25]
 [6.83048183e-28 4.59014072e-26]
 [5.84816661e-