In [1]:
import numpy as np

In [17]:
def fit(X_train,y_train):
    result = {}
    y_classes = set(y_train)
    for y_class in y_classes:
        result[y_class] = {}
        result["total_data"] = len(y_train)
        y_class_rows = (y_train==y_class)
        X_train_y_class = X_train[y_class_rows]
        Y_train_y_class = y_train[y_class_rows]
        features = X_train.shape[1]
        result[y_class]["total_count"] = len(Y_train_y_class) 
        for ith_feature in range(features):
            result[y_class][ith_feature+1] = {}
            ith_feature_label = set(X_train[:,ith_feature])
            for current_label in ith_feature_label:
                result[y_class][ith_feature+1][current_label] = (X_train_y_class[:,ith_feature] == current_label).sum()
    return result

In [26]:
def probability(result,x,class_):
#     '''
#     changing probalitiy into log probability to prevent 0 after multipying output*current_prob because
#     multiplying these small value may leads to 0 and we don't want this.
#     so, a/b changes to loga-logb and a*b changes to loga + logb
#     '''
#     print("class is ",class_)
    out = np.log(result[class_]["total_count"])-np.log(result["total_data"])
    num_features = len(result[class_].keys())-1
    for ith_feature in range(num_features):
        xj = x[ith_feature] #xj is label of ith_feature
        cnt_current_class_with_value_xj = result[class_][ith_feature+1][xj]+1
        cnt_current_class = result[class_]["total_count"]+len(result[class_][ith_feature+1].keys()) # P(y==ai)
        current_prob = np.log(cnt_current_class_with_value_xj)-np.log(cnt_current_class) # P(Xj=xj/y=ai)
        out = out+current_prob
    return out

In [24]:
def step_prediction(result,X):
    classes = result.keys()
    best_prob = -1
    best_class = -1
    ok = True
    for class_ in classes:
        if class_ == "total_data":
            continue
        class_prob = probability(result,X,class_)
        if (class_prob > best_prob or ok):
            best_prob = class_prob
            best_class = class_
        ok = False
    return best_class

In [5]:
def predict(result,X_test):
    y_pred = []
    for each in X_test:
        each_predict = step_prediction(result,each)
        y_pred.append(each_predict)
    return y_pred

In [6]:
def make_labelled(column):
    second_limit = column.mean()
    first_limit = 0.5*second_limit
    third_limit = 1.5*second_limit
    for i in range(len(column)):
        if first_limit > column[i]:
            column[i] = 0
        elif column[i] < second_limit:
            column[i] = 1
        elif column[i] < third_limit:
            column[i] = 2
        else:
            column[i] = 3
    return column

In [7]:
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [21]:
print(X.shape)
print(y.shape)

(150, 4)
(150,)


In [8]:
for i in range(X.shape[1]):
    X[:,i] = make_labelled(X[:,i])

In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

In [29]:
result = fit(X_train,y_train)
print(result)
y_pred = predict(result,X_test)

{0: {'total_count': 35, 1: {1.0: 35, 2.0: 0}, 2: {1.0: 7, 2.0: 28}, 3: {0.0: 33, 1.0: 2, 2.0: 0, 3.0: 0}, 4: {0.0: 34, 1.0: 1, 2.0: 0, 3.0: 0}}, 'total_data': 112, 1: {'total_count': 40, 1: {1.0: 19, 2.0: 21}, 2: {1.0: 33, 2.0: 7}, 3: {0.0: 0, 1.0: 6, 2.0: 34, 3.0: 0}, 4: {0.0: 0, 1.0: 9, 2.0: 30, 3.0: 1}}, 2: {'total_count': 37, 1: {1.0: 6, 2.0: 31}, 2: {1.0: 28, 2.0: 9}, 3: {0.0: 0, 1.0: 0, 2.0: 23, 3.0: 14}, 4: {0.0: 0, 1.0: 0, 2.0: 2, 3.0: 35}}}


In [30]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       0.77      1.00      0.87        10
           2       1.00      0.77      0.87        13

    accuracy                           0.92        38
   macro avg       0.92      0.92      0.91        38
weighted avg       0.94      0.92      0.92        38

[[15  0  0]
 [ 0 10  0]
 [ 0  3 10]]
