In [183]:
import numpy as np

## Own Functions For Fit,Predict,etc For Naive Bayes

In [196]:
def fit(x_train, y_train):
    result = {}
    class_values = set(y_train)
    for current_class in class_values:
        result[current_class] = {}
        result["Total_data"] = len(y_train)
        current_class_rows = (y_train == current_class)
        x_train_current = x_train[current_class_rows]
        y_train_current = y_train[current_class_rows]
        num_features = x_train.shape[1]
        result[current_class]["Total_count"] = len(y_train_current)
        for j in range(1, num_features + 1):
            result[current_class][j] = {}
            all_possible_values = set(x_train[:,j - 1])
            for current_value in all_possible_values:
                result[current_class][j][current_value] = (current_value == x_train_current[:j - 1]).sum()
                
    return result

In [197]:
def predictSingleLine(dictionary, x):
    classes = dictionary.keys()
    best_proba = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if(current_class == "Total_data"):
            continue
        proba_current_class = probability(dictionary, x, current_class)
        if(first_run or proba_current_class > best_proba):
            best_proba = proba_current_class
            best_class = current_class
        first_run = False
    return best_class

In [198]:
def predict(dictionary, x_test):
    y_pred = []
    for x in x_test:
        x_class = predictSingleLine(dictionary, x)
        y_pred.append(x_class)
    return y_pred

In [199]:
def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["Total_count"]/dictionary["Total_data"])
    num_features = len(dictionary[current_class].keys()) - 1
    for j in range(1, num_features + 1):
        xj = x[j - 1]
        count_current_class_with_value_xj = dictionary[current_class][j][xj] + 1
        count_current_class = dictionary[current_class]["Total_count"] + len(dictionary[current_class][j].keys())
        count_current_proba = np.log(count_current_class_with_value_xj/count_current_class)
        output = output + count_current_proba
    return output

In [200]:
def MakeLabelled(column):
    second_limit = column.mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5 * second_limit
    for i in range(0,len(column)):
        if(column[i] < first_limit):
            column[i] = 0
        elif(column[i] < second_limit):
            column[i] = 1
        elif(column[i] < third_limit):
            column[i] = 2
        else:
            column[i] = 3
    return column
            
        

## Data selection and Training Testing Split

In [201]:
from sklearn import datasets
iris = datasets.load_iris()
x = iris.data
y = iris.target

In [202]:
for i in range(0,x.shape[-1]):
    x[:,i] = MakeLabelled(x[:,i])

In [203]:
from sklearn import model_selection
x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size= 0.25, random_state= 0)
dictionary = fit(x_train,y_train)
y_pred = predict(dictionary, x_test)

### Classification Report and Confusion Matrix

In [205]:
from sklearn.metrics import confusion_matrix, classification_report
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       0.94      1.00      0.97        16
           2       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.96      0.97        38
weighted avg       0.98      0.97      0.97        38

[[13  0  0]
 [ 0 16  0]
 [ 0  1  8]]


### Comparing with Gaussian Naive Bayes(used for Finding Probabilities for Continuous Valued Features)

In [207]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92        13
           1       0.76      1.00      0.86        16
           2       1.00      0.67      0.80         9

    accuracy                           0.87        38
   macro avg       0.92      0.84      0.86        38
weighted avg       0.90      0.87      0.87        38

[[11  2  0]
 [ 0 16  0]
 [ 0  3  6]]
