In [1]:
import numpy as np

In [2]:
def fit(X, Y):
    """Build the count tables for a categorical Naive Bayes classifier.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Discrete (already binned) feature values.
    Y : array-like, shape (n_samples,)
        Class label of every row of ``X``.

    Returns
    -------
    dict
        ``result["total_count"]`` is the number of training rows. For every
        class ``c``, ``result[c]["total_count"]`` is the number of rows of that
        class and ``result[c][j][v]`` is how many of those rows have value
        ``v`` in feature ``j``. Features are numbered from 1, and every value
        seen anywhere in column ``j`` gets an entry (possibly 0) under every
        class.
    """
    # Accept plain lists as well as arrays: the boolean-mask indexing below
    # needs ndarray semantics (``list == scalar`` is a single bool).
    X = np.asarray(X)
    Y = np.asarray(Y)

    # Set once, up front, so the key exists even when Y is empty.
    result = {"total_count": len(Y)}

    class_values = set(Y)
    if not class_values:
        return result

    # The distinct values of a feature are taken from the whole training set
    # and do not depend on the class, so compute them once rather than once
    # per class.
    num_features = X.shape[1]
    values_per_feature = [set(X[:, j]) for j in range(num_features)]

    for curr_class in class_values:
        X_curr = X[Y == curr_class]
        class_counts = {"total_count": len(X_curr)}
        for j, possible_values in enumerate(values_per_feature, start=1):
            feature_column = X_curr[:, j - 1]
            class_counts[j] = {
                value: (feature_column == value).sum() for value in possible_values
            }
        result[curr_class] = class_counts
    return result

In [3]:
# Compare log-probabilities instead of raw probabilities: a product of many small probabilities can become too small to compare reliably.
def probability(dictionary, x, curr_class):
    """Return the unnormalised log-posterior of ``curr_class`` for sample ``x``.

    Parameters
    ----------
    dictionary : dict
        Count tables with the layout ``dictionary["total_count"]``,
        ``dictionary[c]["total_count"]`` and ``dictionary[c][j][value]``
        (features ``j`` numbered from 1).
    x : sequence
        Feature values of one sample; ``x[j - 1]`` is feature ``j``.
    curr_class : hashable
        Class key in ``dictionary`` to score.

    Returns
    -------
    float
        ``log P(class) + sum_j log P(x_j | class)``, with add-one (Laplace)
        smoothing applied to every conditional term.
    """
    class_counts = dictionary[curr_class]
    class_total = class_counts["total_count"]

    # Log prior: log(rows of this class / all rows).
    out = np.log(class_total) - np.log(dictionary["total_count"])

    # Every key except "total_count" is a 1-based feature index.
    num_features = len(class_counts) - 1
    for j in range(1, num_features + 1):
        value_counts = class_counts[j]
        # A value that never occurred in training has no entry at all; count it
        # as zero so the smoothing below handles it instead of a KeyError.
        smoothed_count = value_counts.get(x[j - 1], 0) + 1
        # Add-one smoothing: one pseudo-count per known value of this feature.
        smoothed_total = class_total + len(value_counts)
        out += np.log(smoothed_count) - np.log(smoothed_total)
    return out

In [4]:
def predictSinglePoint(dictionary, x):
    """Pick the class with the highest log-score for one sample.

    Iterates over every key of ``dictionary`` except ``"total_count"``, scores
    it with ``probability`` and returns the key with the largest score. Ties
    go to the key encountered first. Returns ``-1`` when ``dictionary`` holds
    no class keys.
    """
    winner = -1
    winning_score = None  # None until the first class has been scored
    for label in dictionary:
        if label == "total_count":
            continue
        score = probability(dictionary, x, label)
        if winning_score is None or score > winning_score:
            winner, winning_score = label, score
    return winner

In [5]:
def predict(dictionary, x_test):
    """Classify every sample in ``x_test``.

    Returns a list with one ``predictSinglePoint`` result per element of
    ``x_test``, in iteration order.
    """
    return [predictSinglePoint(dictionary, sample) for sample in x_test]

In [6]:
def mark_labelled(column):
    """Bin a numeric column into the labels 0-3 relative to its own mean.

    With ``m = column.mean()`` each value ``v`` is mapped to

    * 0 if ``v < 0.5 * m``
    * 1 else if ``v < m``
    * 2 else if ``v < 1.5 * m``
    * 3 otherwise

    The conditions are tested in that order, exactly like an ``if``/``elif``
    chain.

    Parameters
    ----------
    column : numpy.ndarray
        1-D numeric array. It is overwritten IN PLACE with the labels; pass a
        copy if the original values must be kept. If a view is passed (for
        example ``X[:, i]``), the underlying array is modified.

    Returns
    -------
    numpy.ndarray
        The same ``column`` object, now holding the labels.
    """
    second = column.mean()
    first = 0.5 * second
    third = 1.5 * second

    # All labels are derived from the original values before anything is
    # written back, so the in-place assignment cannot affect the comparisons.
    labels = np.where(
        column < first,
        0,
        np.where(column < second, 1, np.where(column < third, 2, 3)),
    )
    column[...] = labels
    return column

In [7]:
from sklearn import datasets
# Load the Iris data set; X is its feature matrix and Y its target vector.
iris = datasets.load_iris()
X, Y = iris.data, iris.target

In [8]:
# Discretise every feature into the labels 0-3 produced by mark_labelled.
# Each column is copied from iris.data before binning, so X is rebuilt from the
# raw measurements on every run and re-running this cell is idempotent.
# Binning X in place would re-bin already-binned labels on a second run and
# would also overwrite iris.data, which X aliases.
X = np.column_stack(
    [mark_labelled(iris.data[:, i].copy()) for i in range(iris.data.shape[-1])]
)

In [9]:
from sklearn.model_selection import train_test_split
# Split the binned data into train and test parts; random_state=0 fixes the
# shuffle so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state = 0)

In [10]:
# Build the per-class / per-feature count tables from the training split.
dictionary = fit(x_train, y_train)

In [11]:
# Predict a class for every row of the test split (returns a Python list).
y_pred = predict(dictionary, x_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix
# Summarise test-set performance: per-class report first, then the confusion matrix.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]
