# Custom Implementation of Naive Bayes for Discrete Data

In [12]:
import numpy as np
class NaiveBayes:
    """Naive Bayes classifier for discrete features with add-one smoothing.

    After ``fit`` the public attribute ``dictionary`` holds the count table:

    * ``dictionary["Total_Data"]`` -- number of training samples.
    * ``dictionary[cls]["Total_Count"]`` -- number of samples of class ``cls``.
    * ``dictionary[cls][f][val]`` -- number of samples of class ``cls`` whose
      feature ``f`` (1-based index) equals ``val``.  Every value seen for that
      feature anywhere in the training set has an entry, possibly with count 0.
    """

    # Reserved keys used inside ``dictionary`` next to class labels and
    # 1-based feature indices.
    _TOTAL_KEY = "Total_Data"
    _COUNT_KEY = "Total_Count"

    def __init__(self):
        self.dictionary = {}

    def fit(self, x_train, y_train):
        """Build the count table from the training data.

        Args:
            x_train: 2-D array-like of discrete feature values, one row per
                sample and one column per feature.
            y_train: 1-D array-like of class labels, one per row of
                ``x_train``.
        """
        # Accept plain lists as well as numpy arrays; boolean masking below
        # needs real arrays.
        x_train = np.asarray(x_train)
        y_train = np.asarray(y_train)

        # Rebuild the table from scratch so that refitting never keeps
        # classes or counts from a previous call.
        self.dictionary = {self._TOTAL_KEY: len(y_train)}

        classes = np.unique(y_train)
        if classes.size == 0:
            # Nothing to count.
            return

        n_features = x_train.shape[1]
        # The distinct values of each feature are taken over the WHOLE
        # training set (not per class), so they are computed once up front.
        distinct_per_feature = [
            np.unique(x_train[:, j]) for j in range(n_features)
        ]

        for cls in classes:
            rows = x_train[y_train == cls]
            cls_table = {self._COUNT_KEY: len(rows)}
            for f, distinct_vals in enumerate(distinct_per_feature, start=1):
                column = rows[:, f - 1]
                cls_table[f] = {
                    val: np.sum(column == val) for val in distinct_vals
                }
            self.dictionary[cls] = cls_table

    def predict(self, x_test):
        """Return a list with the predicted class label for each row of ``x_test``."""
        return [self._getSinglePoint(x) for x in x_test]

    def _getProbability(self, x, cls):
        """Return the smoothed log-score of sample ``x`` under class ``cls``.

        The score is ``log P(cls) + sum_f log P(x_f | cls)`` where each
        conditional uses add-one smoothing:
        ``(count + 1) / (class_count + number_of_distinct_values)``.
        """
        cls_table = self.dictionary[cls]
        count_cls = cls_table[self._COUNT_KEY]

        # Log prior of the class.
        result = np.log(count_cls) - np.log(self.dictionary[self._TOTAL_KEY])

        # Every key except "Total_Count" is a feature index.
        n_features = len(cls_table) - 1
        for f in range(1, n_features + 1):
            value_counts = cls_table[f]
            # A value never seen in training has no entry; treat its count
            # as 0 so smoothing gives it a small non-zero probability
            # instead of raising KeyError.
            numerator = value_counts.get(x[f - 1], 0) + 1
            denominator = count_cls + len(value_counts)
            result += np.log(numerator) - np.log(denominator)

        return result

    def _getSinglePoint(self, x):
        """Return the class with the highest log-score for sample ``x``.

        Ties keep the class encountered first; ``None`` is returned when the
        model holds no classes.
        """
        best_class = None
        best_prob = float('-inf')
        for cls in self.dictionary:
            # Skip the bookkeeping entry.  The isinstance check avoids
            # comparing numeric numpy labels against a string.
            if isinstance(cls, str) and cls == self._TOTAL_KEY:
                continue
            prob = self._getProbability(x, cls)
            if best_class is None or prob > best_prob:
                best_prob = prob
                best_class = cls

        return best_class

In [26]:
from sklearn import datasets
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

In [17]:
def getLabelled(xCol):
    """Replace each entry of ``xCol`` in place with an ordinal label 0-3.

    The labels are derived from the mean of the column as it is on entry:
    below half the mean -> 0, below the mean -> 1, below one and a half
    times the mean -> 2, anything else -> 3.  The same (mutated) ``xCol``
    object is returned.
    """
    mean_value = np.mean(xCol)
    # Exclusive upper bounds for labels 0, 1 and 2, checked in this order.
    upper_bounds = (0.5 * mean_value, mean_value, 1.5 * mean_value)

    for position, value in enumerate(xCol):
        # First bound the value falls under decides the label; a value that
        # is under none of them gets the top label 3.
        xCol[position] = next(
            (label for label, bound in enumerate(upper_bounds) if value < bound),
            3,
        )
    return xCol

In [19]:
# Load the iris dataset and pull out the feature matrix and the labels.
iris = datasets.load_iris()

x = iris.data
y = iris.target

# Discretise every feature column into the labels 0-3 so the data suits the
# discrete NaiveBayes class. getLabelled rewrites the column it is given and
# returns it; the result is assigned back to the same column of x.
nFeatures = x.shape[1]
for f in range(nFeatures):
    x[:,f] = getLabelled(x[:,f])
    
# Split into train and test sets using train_test_split's default settings.
# NOTE(review): no random_state is passed, so the split (and the scores
# below) will presumably differ from run to run -- confirm if reproducibility
# matters.
x_train,x_test,y_train,y_test = train_test_split(x,y)

In [25]:
# Train the hand-written classifier on the training split, predict the test
# split and print the classification report for the predictions.
clf = NaiveBayes()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      0.93      0.96        14
           2       0.92      1.00      0.96        12

    accuracy                           0.97        38
   macro avg       0.97      0.98      0.97        38
weighted avg       0.98      0.97      0.97        38



In [27]:
# Print the confusion matrix of true test labels against the predictions.
print(confusion_matrix(y_test,y_pred))

[[12  0  0]
 [ 0 13  1]
 [ 0  0 12]]
