In [1]:
import numpy as np
import pandas as pd

In [2]:
def fit(x_train,y_train):
    result={}
    class_values=set(y_train)#top level keys distinct values in y_train
    for current_class in class_values:
        result[current_class]={}
        result["total_data"]=len(y_train)
        current_class_rows=(y_train==current_class)
        x_train_current=x_train[current_class_rows]
        y_train_current=y_train[current_class_rows]
        
        num_features=x_train.shape[1]#number of features
        result[current_class]["total_count"]=len(y_train_current)
        for j in range(1,num_features+1):
            
            result[current_class][j]={}
            all_possible_values=set(x_train[:,j-1])
            for current_value in all_possible_values:
                result[current_class][j][current_value]=(x_train[:,j-1]==current_value).sum()
                
            
    return result

In [3]:
def probability(result,x,current_class):
    class_prob=np.log(result[current_class]["total_count"])-np.log(result["total_data"])
    output=class_prob
    num_features=len(result[current_class].keys())-1
    for j in range(1,num_features):
        xj=x[j-1]
        count_current_class_with_value_xj=result[current_class][j][xj]+1
        count_current_class=result[current_class]["total_count"]+len(result[current_class][j].keys())
        current_xj_prob=np.log(count_current_class_with_value_xj)-np.log(count_current_class)
        output+=current_xj_prob  #small data hence take log of probability
    return output
    

In [4]:
def predictSinglePoint(result,x):
    classes=result.keys()
    best_p=-1000
    best_class=-1
    first_run=True
    for current_class in classes:
        if(current_class=="total_data"):
            continue
        p_current_class=probability(result,x,current_class)
        if (first_run or p_current_class>best_p):
            best_p=p_current_class
            best_class=current_class
        first_run=False
    return best_class

In [5]:
def predict(result,x_test):
    y_pred=[]
    for x in x_test:
        x_class=predictSinglePoint(result,x)
        y_pred.append(x_class)
    return y_pred

In [6]:
 def makeLabelled(column): #changing continuous data to labelled data
        second_limit=column.mean()
        first_limit=0.5*second_limit
        third_limit=1.5*second_limit
        for i in range(0,len(column)):
            if(column[i]<first_limit):
                column[i]=0
            elif(column[i]<second_limit):
                column[i]=1
            elif(column[i]<third_limit):
                column[i]=2
            else:
                column[i]=3
        return column
        

In [7]:
from sklearn import datasets
iris=datasets.load_iris()
X=iris.data
Y=iris.target

In [8]:
for i in range(0,X.shape[-1]):
    X[:,i]=makeLabelled(X[:,i])

In [9]:
from sklearn import model_selection
X_train,X_test,Y_train,Y_test=model_selection.train_test_split(X,Y,test_size=.25,random_state=0)


In [10]:
dictionary=fit(X_train,Y_train)

In [11]:
y_pred=predict(dictionary,X_test)

In [13]:
from sklearn.metrics import classification_report,confusion_matrix


In [15]:
print(classification_report(Y_test,y_pred))
print(confusion_matrix(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        13
           1       0.42      1.00      0.59        16
           2       0.00      0.00      0.00         9

    accuracy                           0.42        38
   macro avg       0.14      0.33      0.20        38
weighted avg       0.18      0.42      0.25        38

[[ 0 13  0]
 [ 0 16  0]
 [ 0  9  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
