In [1]:
import numpy as np

In [30]:
# Load train and test data
x_train = np.loadtxt("mnb_x_train.csv")
y_train = np.loadtxt("mnb_y_train.csv")
x_test = np.loadtxt("mnb_x_test.csv")
y_test = np.loadtxt("mnb_y_test.csv")

In [31]:

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(x_train, y_train)
y_pred_sk = clf.predict(x_test)

In [3]:
def fit(x_train, y_train):
    result = {}
    class_values = set(y_train)
    result['total_data'] = len(y_train)
    for current_class in class_values:
        result[current_class] = {}
        current_class_rows = (y_train == current_class)
        x_train_current = x_train[current_class_rows]
        num_features = x_train.shape[1]
        temp = 0
        for j in range(1, num_features+1):
            result[current_class][j] = x_train_current[:, j-1].sum()
            temp += result[current_class][j]
        result[current_class]['total_count'] = temp
        result[current_class]['num'] = current_class_rows.sum()
    return result

In [4]:
def probability(current_class, x, dictionary):
    output = np.log(dictionary[current_class]['num']) - np.log(dictionary['total_data'])
    num_features = len(dictionary[current_class].keys()) - 2
    for j in range(1, num_features+1):
        if x[j-1] > 0:
            count_with_xj = dictionary[current_class][j] + 1
            count_current_class = dictionary[current_class]['total_count'] + num_features
            current_xj_prob = (np.log(count_with_xj) - np.log(count_current_class)) + np.log(x[j-1])
            output += current_xj_prob
    return output

In [5]:
def predictSinglePoint(x, dictionary):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if current_class == 'total_data':
            continue
        p_current_class = probability(current_class, x, dictionary)
        if first_run or p_current_class > best_p:
            best_p = p_current_class
            best_class = current_class
        first_run = False
    return best_class

In [6]:
def predict(x_test, dictionary):
    y_pred = []
    for x in x_test:
        x_class = predictSinglePoint(x, dictionary)
        y_pred.append(x_class)
    return y_pred

In [32]:
dictionary = fit(x_train,y_train)

In [33]:
y_pred_self = predict(x_test, dictionary)

In [34]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred_sk))
print(confusion_matrix(y_test, y_pred_sk))

             precision    recall  f1-score   support

        0.0       0.71      0.70      0.70       100
        1.0       0.81      0.87      0.84       100
        2.0       0.90      0.91      0.91       100
        3.0       0.96      0.94      0.95       100
        4.0       0.95      0.97      0.96       100
        5.0       0.92      0.97      0.94       100
        6.0       0.97      0.88      0.92       100
        7.0       0.96      0.98      0.97       100
        8.0       0.99      0.97      0.98       100
        9.0       0.90      0.88      0.89       100
       10.0       0.98      0.98      0.98       100
       11.0       0.96      0.93      0.94       100
       12.0       0.85      0.90      0.87       100
       13.0       0.95      0.93      0.94       100
       14.0       0.80      0.82      0.81       100
       15.0       0.94      0.91      0.92       100
       16.0       0.84      0.94      0.89       100
       17.0       0.96      0.90      0.93   

In [35]:
print(confusion_matrix(y_test, y_pred_self))
print(classification_report(y_test, y_pred_self))

[[59  0  0  0  0  0  0  0  0  0  1  0  0  0 30  0  5  0  5  0]
 [ 0 89  1  0  1  0  1  0  0  2  0  0  6  0  0  0  0  0  0  0]
 [ 0  0 96  0  1  1  0  0  0  0  0  0  1  0  0  0  0  0  0  1]
 [ 0  0  0 99  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 98  1  0  0  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  3  0  1 96  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  9  2  0  2  0 77  0  0  4  0  0  0  0  0  5  0  0  0  1]
 [ 0  0  0  0  0  0  0 99  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  3  0  0  0  0  0 96  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  5  2  0  0  0  0  0  0 91  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  1  0  0  0  0  0  0  0 99  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0  0  0 97  0  0  0  0  0  0  1  0]
 [ 0  3  1  0  1  0  0  0  0  0  0  0 95  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  1  0  1  0  0  0 96  0  1  0  0  1  0]
 [ 9  0  0  0  0  0  0  0  0  0  1  0  0  1 89  0  0  0  0  0]
 [ 0  0  2  0  0  0  2  0  0  4  0  0  2  1  0 89  0  0