# import

In [27]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [34]:
df = pd.read_csv('Example01_Naive_Bayes.csv')
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,sunny,hot,high,False,NO
1,sunny,hot,high,True,NO
2,overcast,hot,high,False,YES
3,rainy,mild,high,False,YES
4,rainy,cool,normal,False,YES
5,rainy,cool,normal,True,NO
6,overcast,cool,normal,True,YES
7,sunny,mild,high,False,NO
8,sunny,cool,normal,False,YES
9,rainy,mild,normal,False,YES


In [35]:
df= df.astype('category')

df["Outlook"]= df["Outlook"].cat.codes
df["Temperature"]= df["Temperature"].cat.codes
df["Humidity"]= df["Humidity"].cat.codes
df["Windy"]= df["Windy"].cat.codes
df["Play"]= df["Play"].cat.codes
df.head(14)

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play
0,2,1,0,0,0
1,2,1,0,1,0
2,0,1,0,0,1
3,1,2,0,0,1
4,1,0,1,0,1
5,1,0,1,1,0
6,0,0,1,1,1
7,2,2,0,0,0
8,2,0,1,0,1
9,1,2,1,0,1


# Calculate P(Y=y) for all possible y

In [36]:
def calculate_prior(df, Y):
    classes = sorted(list(df[Y].unique()))
    prior = []
    for i in classes:
        prior.append(len(df[df[Y]==i])/len(df))
    return prior

# Calculate P(X=x|Y=y) 

In [37]:
def calculate_likelihood_categorical(df, feat_name, feat_val, Y, label):
    feat = list(df.columns)
    df = df[df[Y]==label]
    p_x_given_y = len(df[df[feat_name]==feat_val]) / len(df)
    return p_x_given_y

# Calculate P(X=x1|Y=y)P(X=x2|Y=y)...P(X=xn|Y=y) * P(Y=y) for all y and find the maximum

In [38]:
def naive_bayes_categorical(df, X, Y):
    # get feature names
    features = list(df.columns)[:-1]

    # calculate prior
    prior = calculate_prior(df, Y)

    Y_pred = []
    # loop over every data sample
    for x in X:
        # calculate likelihood
        labels = sorted(list(df[Y].unique()))
        likelihood = [1]*len(labels)
        for j in range(len(labels)):
            for i in range(len(features)):
                likelihood[j] *= calculate_likelihood_categorical(df, features[i], x[i], Y, labels[j])

        # calculate posterior probability (numerator only)
        post_prob = [1]*len(labels)
        for j in range(len(labels)):
            post_prob[j] = likelihood[j] * prior[j]

        Y_pred.append(np.argmax(post_prob))

    return np.array(Y_pred) 

# Test Categorical model

In [41]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=.2, random_state=35)

X_test = test.iloc[:,:-1].values
Y_test = test.iloc[:,-1].values
Y_pred = naive_bayes_categorical(train, X=X_test, Y="Play")

from sklearn.metrics import confusion_matrix, f1_score
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

[[3]]
1.0
