In [15]:
# importing the needed libraries/packages/modules

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
# fit function to train the model and return a corresponding dictionary

def fit(X, Y) :
    """Count class and per-feature value frequencies for a categorical naive Bayes model.

    X is indexed as a 2-D array (one row per data point, one column per feature)
    and Y as a 1-D array of class labels; rows are selected with a boolean mask
    built from Y.

    Returns a nested dictionary:
        counts["total_count"]      -> number of training points
        counts[c]["total_count"]   -> number of training points with class c
        counts[c][j][v]            -> number of class-c points whose feature j equals v
    """
    counts = {"total_count": Y.shape[0]}

    for label in set(Y) :
        # rows (and labels) that belong to the current class
        in_class = (Y == label)
        rows = X[in_class]

        per_class = {"total_count": Y[in_class].shape[0]}

        for j in range(rows.shape[1]) :
            column = rows[:, j]

            # one entry per distinct value this feature takes within the current class
            per_class[j] = {value: (column == value).sum() for value in set(column)}

        counts[label] = per_class

    return counts

In [17]:
# getProb returns the log probability that a given data point belongs to a given class.
# Log probabilities are used because multiplying many small probabilities can underflow to zero,
# which would wrongly suggest that x cannot belong to the class at all.

def getProb(dict, x, class_value) :
    """Return the log of the smoothed joint score of class_value and the data point x.

    Parameters
        dict        : nested count dictionary with the layout built by fit
                      (dict["total_count"], dict[c]["total_count"], dict[c][j][v]).
                      The parameter name shadows the builtin; it is kept so that
                      existing callers are unaffected.
        x           : one data point, indexable by feature position and with a length
        class_value : the class to score

    Returns
        log(count(class) / count(all)) plus, for every feature j of x,
        log((count(class, x[j]) + 1) / (count(class) + number of distinct values
        of feature j seen for this class)).

    Raises
        KeyError if class_value, or a feature index of x, is missing from dict.
    """
    class_counts = dict[class_value]

    # log prior of the class
    output = np.log(class_counts["total_count"]) - np.log(dict["total_count"])

    # Walk the features of the data point that was passed in. This used to read the
    # feature count from a notebook-level global X, which fails when that global does
    # not exist and is wrong when it has a different number of columns.
    for j in range(len(x)) :
        feature_counts = class_counts[j]

        # add-one smoothing: a value never seen for this class counts as 1, not 0,
        # which also avoids a KeyError for unseen values
        count_class_value_xj = feature_counts.get(x[j], 0) + 1

        # NOTE(review): the denominator adds the number of distinct values seen for
        # this class only, not across all classes - confirm this is the intended smoothing
        count_class_value = class_counts["total_count"] + len(feature_counts)

        output += np.log(count_class_value_xj) - np.log(count_class_value)

    return output

In [18]:
# get class function which returns the class which a particular data point belongs to

def getClass(dict, x) :
    """Return the class with the highest log probability for the data point x.

    Parameters
        dict : nested count dictionary with the layout built by fit; every key other
               than "total_count" is treated as a class (the parameter name shadows
               the builtin and is kept so that existing callers are unaffected)
        x    : one data point, passed through to getProb

    Returns
        The best-scoring class, or None when dict holds no classes. On a tie the
        class met first while iterating dict wins.
    """
    # None marks "nothing scored yet". A numeric sentinel such as -1 is itself a valid
    # log probability, so a best score of exactly -1 could be replaced by a worse class.
    best_prob = None
    output = None

    for class_value in dict :
        # the overall count lives next to the classes; it is not a class itself
        if class_value == "total_count" :
            continue

        prob_class_value = getProb(dict, x, class_value)

        if best_prob is None or prob_class_value > best_prob :
            best_prob = prob_class_value
            output = class_value

    return output

In [19]:
# predict function which returns the predictions for the test input

def predict(dict, X) :
    """Return a list with the predicted class of every data point in X, in order."""
    # each data point is classified independently against the same count dictionary
    return [getClass(dict, point) for point in X]

In [20]:
# loading the data and getting the corresponding input and output

data = datasets.load_iris()

# input matrix and output labels, taken straight from the loaded dataset object
X, Y = data.data, data.target

In [21]:
# label column function which labels a particular column and then returns it

def labelCol(col) :
    """Bucket the values of col, in place, into the labels 0, 1, 2 and 3.

    The three cut points are 0.5, 1 and 1.5 times the mean of col. A value gets
    label 0 if it is below the first cut, 1 if below the second, 2 if below the
    third, and 3 otherwise. col itself is overwritten with the labels and the
    same object is returned.
    """
    mean = col.mean()
    cuts = (mean*0.5, mean, mean*1.5)

    # conditions are tried in order, so the first cut a value falls below decides its label;
    # all labels are computed from the original values before anything is written back
    labels = np.select([col<cuts[0], col<cuts[1], col<cuts[2]], [0, 1, 2], default=3)

    col[:] = labels

    return col

In [22]:
# converting each input column into labelled data(needed to apply naive bayes on it, currently we're not using gaussian probability density function to predict the probability for continuous valued input)

# Build the labelled matrix from copies of the raw columns in data.data and bind it to X.
# Labelling X in place made this cell non-idempotent: running it a second time re-bucketed
# values that were already labels. Working on copies leaves data.data untouched, so the
# cell gives the same X however many times it is run.
X = np.column_stack([labelCol(data.data[:, j].copy()) for j in range(data.data.shape[1])])

In [23]:
# splitting the data into train and test

(X_train, X_test,
 Y_train, Y_test) = train_test_split(
    X,
    Y,
    random_state=1,
)

In [24]:
# fitting the naive bayes model on the training data and getting the corresponding dictionary

dict=fit(X_train, Y_train)

In [25]:
# getting the predictions for the testing data

Y_pred=predict(dict, X_test)

In [26]:
# score function which returns the mean accuracy of the model

def score(Y_true, Y_test) :
    """Return the fraction of positions at which Y_test agrees with Y_true.

    Despite its name, Y_test holds the values being scored (the notebook passes
    the predictions here). Its length decides both how many positions are
    compared and the divisor, so an empty Y_test raises ZeroDivisionError.
    """
    total = len(Y_test)

    # one hit for every index where the two sequences hold equal values
    matches = sum(1 for i in range(total) if Y_test[i]==Y_true[i])

    return matches/total

In [27]:
# mean accuracy of the naive bayes predictions on the testing data; the call is left as
# the last expression of the cell so that the notebook displays the returned value

score(Y_test, Y_pred)

1.0

In [28]:
# printing the classification report and the confusion matrix for better understanding

# both summaries go through one print call; sep="\n" puts each on its own line,
# which is the same text two consecutive prints would produce
print(
    classification_report(Y_test, Y_pred),
    confusion_matrix(Y_test, Y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        16
           2       1.00      1.00      1.00         9

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38

[[13  0  0]
 [ 0 16  0]
 [ 0  0  9]]
