**Loading Dataset and Preprocessing**

In [1]:
import numpy as np
import pandas as pd
from scipy import stats

data = (pd.read_csv(r'data.csv'))

data.head()

Unnamed: 0,id,age_days,age_year,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,50.391781,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,55.419178,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,51.663014,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,48.282192,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,47.873973,1,156,56.0,100,60,1,1,0,0,0,0


In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age_days     70000 non-null  int64  
 2   age_year     70000 non-null  float64
 3   gender       70000 non-null  int64  
 4   height       70000 non-null  int64  
 5   weight       70000 non-null  float64
 6   ap_hi        70000 non-null  int64  
 7   ap_lo        70000 non-null  int64  
 8   cholesterol  70000 non-null  int64  
 9   gluc         70000 non-null  int64  
 10  smoke        70000 non-null  int64  
 11  alco         70000 non-null  int64  
 12  active       70000 non-null  int64  
 13  cardio       70000 non-null  int64  
dtypes: float64(2), int64(12)
memory usage: 7.5 MB


In [3]:
#dropping unnecessary features
data = data.drop(columns=['id','age_days'])

**Splitting Dataset (dataSplit Function)**

In [4]:
def dataSplit(alldata, percentatgeOfTrain):
    Y = alldata.iloc[: , -1]
    targets = Y.unique()
    temp = []
    temp_2 = []
    
    #splitting based on target values
    for target in targets:
        df = alldata[alldata.iloc[:, -1] == target]
        temp.append(df)
    
    #splitting data based on percentage
    train = []
    test = []
    for df in temp:
        num_of_rows = int(len(df) * percentatgeOfTrain)
        train.append(df[:num_of_rows])
        test.append(df[num_of_rows:])
    
    #concatting dataframes in train and test lists
    train_data = pd.concat(train)
    test_data = pd.concat(test)
        
    return train_data, test_data
    
data = dataSplit(data, 0.8)

**Printing the Probablity Table (fitNaive and printNaive Functions)**

In [5]:
def fitNaive(trainData):
    Dict = {} #using a dictionary as probablity table
    X = trainData.iloc[: , :-1]
    Y = trainData.iloc[: , -1]
    targets = Y.unique()
    for target in targets:
        Dict[target] = {}
        for i in X.columns:
            Dict[target][i] = {}
            uniqueValues = X[i].unique()
            if(all(np.diff(uniqueValues) == np.diff(uniqueValues)[0])): #detecting nominal features
                for j in uniqueValues:
                    Dict[target][i]["Type"] = "Nominal"
                    #computing probablity for each nominal feature + Laplace Estimator
                    Dict[target][i][j] = (1+ (len(trainData[(trainData[i] == j) & 
                        (trainData.iloc[: , -1] == target)])))/(len(X.columns) + len(Y[(Y == target)]))
            else:
                #handling numeric features
                Dict[target][i]["Type"] = "Numeric"
                Dict[target][i]["Mean"] = X[i].mean()
                Dict[target][i]["Std"] = X.std()[i]                   
    return Dict

fitNaive(data[0])

{0: {'age_year': {'Type': 'Numeric',
   'Mean': 53.33369553810605,
   'Std': 6.766390938021898},
  'gender': {'Type': 'Nominal', 2: 0.3439540443144111, 1: 0.6557248367645485},
  'height': {'Type': 'Numeric',
   'Mean': 164.3517741388239,
   'Std': 8.195745566330928},
  'weight': {'Type': 'Numeric',
   'Mean': 74.21586456901017,
   'Std': 14.346868059143905},
  'ap_hi': {'Type': 'Numeric',
   'Mean': 129.23652208075146,
   'Std': 171.52513143676984},
  'ap_lo': {'Type': 'Numeric',
   'Mean': 96.63285058661762,
   'Std': 194.2709566658545},
  'cholesterol': {'Type': 'Nominal',
   1: 0.8385842223570129,
   2: 0.10839547579120135,
   3: 0.052734862810860955},
  'gluc': {'Type': 'Nominal',
   1: 0.8810789595746958,
   2: 0.060548756556177974,
   3: 0.058086844828201374},
  'smoke': {'Type': 'Nominal', 0: 0.906483034217005, 1: 0.09319584686195455},
  'alco': {'Type': 'Nominal', 0: 0.9438755485781568, 1: 0.0558033325008028},
  'active': {'Type': 'Nominal',
   1: 0.8175687729689228,
   0: 0.18

**Naive Bayes Prediction (predictNaive function)**

In [6]:
import math
def predictNaive(instance, table):
    prob = {} #a dictionary to save probability values for each class
    for target in table:
        score = 1 #considering probability as score
        features = list(table[target].keys())
        dim = len(features)
        for i in range(dim): #this loop will compute score for each class
            if table[target][features[i]]["Type"] == "Numeric":
                mean = table[target][features[i]]["Mean"]
                std = table[target][features[i]]["Std"]
                
                #Gaussian Probability Distribution for numeric features
                temp = (1/(math.sqrt(2*math.pi)*std))*pow(math.e, -(pow(instance[i]-mean, 2)/(2*pow(std,2))))
                score *= temp
                
            elif table[target][features[i]]["Type"] == "Nominal": 
                score *= table[target][features[i]][instance[i]]
                
        prob[target] = score
    prediction = max(prob, key=prob.get) #finding the class with maximum score(probability)
    return prediction

**Testing Naive Bayes (testNaive Function)**

In [7]:
def testNaive(model, testData):
    predictions = []
    testData = testData.values.tolist()
    for instance in testData:
        predictions.append(predictNaive(instance, model))
    return predictions

**Average Error Function**

In [8]:
def avgErr(pred, actual):
    err = 0
    for i in range(len(pred)):
        if pred[i] != actual[i]:
            err += 1
        else:
            err += 0
    avg_err = (1/(len(pred)))*err
    return avg_err

**Evaluation for Naive Bayes from Scratch**

In [9]:
model = fitNaive(data[0])
scratch_pred = testNaive(model, data[1])
print("Average Error(From Scratch): ", avgErr(scratch_pred, data[1].to_numpy()[:, -1])*100)

Average Error(From Scratch):  40.897078780087135


**Naive Bayes Algorithm using sklearn library + Evaluation**

In [10]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
model = NB.fit(data[0].to_numpy()[:, :-1], data[0].to_numpy()[:, -1])
lib_pred = model.predict(data[1].to_numpy()[:, :-1])

print("Average Error(From Scratch): ", avgErr(lib_pred, data[1].to_numpy()[:, -1])*100)

Average Error(From Scratch):  40.61852724805371
