In [233]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB
data = load_iris()
d_fr = pd.DataFrame(data.data,columns=data.feature_names)
d_fr_out = pd.DataFrame(data.target,columns=["Flower"])
def label(val, *bounds):
    if val < bounds[0]:
        return 'a'
    elif val < bounds[1]:
        return 'b'
    elif val < bounds[2]:
        return 'c'
    else:
        return 'd'

def to_label(df,old_f):
    mini = df[old_f].min()
    maxi = df[old_f].max()
    mean = df[old_f].mean()
    first = (mini+mean)/2 
    second = mean
    third = (mean+maxi)/2
    return df[old_f].apply(label,args=(first,second,third))

d_fr['sl_new'] = to_label(d_fr,'sepal length (cm)')
d_fr['sw_new'] = to_label(d_fr,'sepal width (cm)')
d_fr['pl_new'] = to_label(d_fr,'petal length (cm)')
d_fr['pw_new'] = to_label(d_fr,'petal width (cm)')

cols = ['sl_new','sw_new','pl_new','pw_new']
x = d_fr[cols].values
y = d_fr_out.values.reshape(-1)
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.75,random_state=45)

In [234]:
def fit(x_train,y_train):
    result = {}
    result["Total_data_points"] = len(y_train)
    diff_classes = set(y_train)
    for curr_class in diff_classes:
        result[curr_class] = {}
        x_train_curr_class = x_train[y_train == curr_class]
        y_train_curr_class = y_train[y_train == curr_class]
        result[curr_class]["Total_count_ccls"] = len(y_train_curr_class)
        features = x_train.shape[1]
        for fea in range(1,features+1):
            result[curr_class][fea] = {}
            distinct_values = set(x_train[:,fea-1])
            for curr_value in distinct_values:
                result[curr_class][fea][curr_value] = (x_train_curr_class[:,fea-1] == curr_value).sum()
    return result

In [235]:
def probablity(cls,x,dict):
    p_clss = np.log(dict[cls]["Total_count_ccls"])-np.log(dict["Total_data_points"])
    prob = p_clss
    num_fea = len(dict[cls].keys())-1
    for j in range(1,num_fea+1):
        count_clss_xj = dict[cls][j][x[j-1]]
        count_clss = dict[cls]["Total_count_ccls"]
        prob_xj_clss = np.log(count_clss_xj)-np.log(count_clss)
        prob = p_clss + prob_xj_clss
    return prob
        


In [236]:
def PredictSingleDataPoint(dict,x):
    classes = dict.keys()
    best_p = -100
    best_class = -1
    for curr_cls in classes:
        if(curr_cls == "Total_data_points"):
            continue
        curr_prob = probablity(curr_cls,x,dict)
        if(curr_prob >= best_p):
            best_p = curr_prob
            best_class = curr_cls
    return best_class

In [237]:
def predict(x_test,naive_dict):
    y_pred = []
    for x in x_test:
        y_pred.append(PredictSingleDataPoint(naive_dict,x))
    return y_pred

In [238]:
naive_dict = fit(x_train,y_train)
y_pred = predict(x_test,naive_dict)
mat = confusion_matrix(y_test,y_pred)
print(mat)


[[14  0  0]
 [ 0  9  0]
 [ 0  7  8]]


  prob_xj_clss = np.log(count_clss_xj)-np.log(count_clss)


In [239]:
clf = GaussianNB()
x_1 = data.data
y_1 = data.target
x_trn,x_tst,y_trn,y_tst = train_test_split(x_1,y_1,train_size = 0.75,random_state=45)
clf.fit(x_trn,y_trn)
y_pred = clf.predict(x_tst)
print(confusion_matrix(y_tst,y_pred))

[[14  0  0]
 [ 0  9  0]
 [ 0  2 13]]
