# Naive Bayes is an algorithm that estimates the probability of each class yk given a feature vector x = {x1, x2, x3, ..., xn}, where yk is one of the possible classes y = {y1, y2, y3, ..., yK}. For example, if three companies make products with different features, we can ask for the probability that a defective product came from the first company; here y is the set of three companies and x is the observed features. We use Bayes' theorem to solve this: P(yk|x) = P(x|yk)*P(yk)/P(x), where P(yk) is called the prior, P(yk|x) the posterior, P(x|yk) the likelihood, and P(x) the evidence. Under the "naive" assumption that the features are conditionally independent given the class, the likelihood factorizes as P(x|yk) = P(x1|yk)*P(x2|yk)*...*P(xn|yk).

In [None]:
# A problem arises if some feature value xi never occurs together with a class yk in the training data: its estimated probability P(xi|yk) is then 0, which zeroes out the whole product.
# That is not helpful to our model, so we use a method known as Laplace smoothing:
# P(xi|Y=yk) = (number of cases where xi occurs with Y=yk + alpha) / (N + alpha*K)
# Here alpha is the small smoothing constant, N is the total number of training cases with Y=yk, and K is the number of possible values of the feature xi (K = 2 for the binary features used below).

In [2]:
import numpy as np 
import pandas as pd 
from sklearn.naive_bayes import BernoulliNB

In [61]:
# Below is a small movie-based training and test set: the X features say whether a user liked movies 1, 2 and 3,
# and Y says whether the same user liked the target movie 4.
X_train = np.array([[0,1,1],[0,0,1],[0,0,0],[1,1,0]])
Y_train = ['Y','N','Y','Y'] # the target has only two classes ('Y' and 'N'), which keeps the example simple
X_test = np.array([[1,1,0],[1,1,1],[0,0,0]])

In [7]:
# To calculate the priors, we first group the indices of the training samples by their label
def get_labels_indices(labels):
    """
    Group sample positions by their class label
    @param labels: iterable of class labels, one per sample
    @return: defaultdict(list), with class label as key and the list of
             positions (in order of appearance) carrying that label
             as value
    """
    from collections import defaultdict

    positions_by_label = defaultdict(list)
    for position, current_label in enumerate(labels):
        bucket = positions_by_label[current_label]
        bucket.append(position)
    return positions_by_label

In [14]:
# Let's see the label -> sample indices grouping for our Y_train data
label_indices = get_labels_indices(Y_train)
label_indices

defaultdict(list, {'Y': [0, 2, 3], 'N': [1]})

In [10]:
# Now let's calculate the prior of each class
def get_prior(label_indices):
    """
    Compute the prior of each class from the grouped training indices
    @param label_indices: dictionary, with class label as key and the
                          list of sample indices of that class as value
    @return: dictionary, with class label as key and the fraction of
             all samples belonging to that class as value
    """
    counts = {}
    for label, indices in label_indices.items():
        counts[label] = len(indices)
    n_samples = sum(counts.values())
    # Relative frequency of each class among all samples
    return {label: count / n_samples for label, count in counts.items()}
    

In [11]:
# Prior for the toy labels: 3 of the 4 training samples are 'Y'
get_prior(label_indices)

{'Y': 0.75, 'N': 0.25}

In [22]:
def get_likelihood(X_train,label_indices,smoothing=0):
    """
    Compute the likelihood P(feature = 1 | class) from training samples
    @param X_train: array of binary (0/1) features, one row per sample;
                    must support X_train[rows, :] indexing (numpy array)
    @param label_indices: dictionary, with class label as key and the
                          list of row indices of that class as value
    @param smoothing: additive (Laplace) smoothing constant alpha,
                      0 means no smoothing
    @return: dictionary, with class label as key and the vector of
             per-feature conditional probabilities as value
    """
    likelihood = {}
    for label, indices in label_indices.items():
        # Smoothed count of samples of this class in which each feature is 1
        feature_counts = X_train[indices, :].sum(axis=0) + smoothing
        # Every feature is binary, so it has 2 possible values and the
        # smoothed denominator is N + 2*alpha. It must NOT depend on the
        # number of classes, otherwise P(x=1|y) + P(x=0|y) != 1 whenever
        # there are not exactly two classes.
        total_count = len(indices) + 2 * smoothing
        likelihood[label] = feature_counts / total_count
    return likelihood


In [23]:
# Likelihood of each feature being 1 per class, with Laplace smoothing (alpha = 1)
get_likelihood(X_train,label_indices,smoothing=1)

{'Y': array([0.4, 0.6, 0.4]), 'N': array([0.33333333, 0.33333333, 0.66666667])}

In [66]:
def get_posterior(X, prior, likelihood):
    """
    Compute posterior of testing samples, based on prior and
    likelihood
    @param X: testing samples, an iterable of binary feature vectors
    @param prior: dictionary, with class label as key,
                  corresponding prior as the value
    @param likelihood: dictionary, with class label as key,
                       corresponding conditional probability
                       vector (P(feature = 1 | class)) as value
    @return: list with one dictionary per testing sample, each with
             class label as key and the normalized posterior as value;
             if a sample has zero probability under every class, all
             of its posteriors are 0.0
    """
    import math

    def log_probability(p):
        # math.log(0) raises, so map an impossible event to -inf instead
        return math.log(p) if p > 0 else -math.inf

    posteriors = []
    for x in X:
        # posterior is proportional to prior * likelihood; accumulate it
        # as a sum of logs so that long feature vectors do not underflow
        # to 0 (which would turn the normalization into 0/0)
        log_posterior = {label: log_probability(p)
                         for label, p in prior.items()}
        for label, likelihood_label in likelihood.items():
            for index, bool_value in enumerate(x):
                p_on = likelihood_label[index]
                # feature present -> P(x=1|y), feature absent -> P(x=0|y)
                log_posterior[label] += log_probability(
                    p_on if bool_value else 1 - p_on)
        max_log = max(log_posterior.values(), default=-math.inf)
        if max_log == -math.inf:
            # no class can produce this sample; nothing to normalize
            posteriors.append({label: 0.0 for label in log_posterior})
            continue
        # shift by the largest log score before exponentiating so that
        # at least one term equals 1 and the sum cannot be 0
        unnormalized = {label: math.exp(score - max_log)
                        for label, score in log_posterior.items()}
        # normalize so that all sums up to 1
        sum_posterior = sum(unnormalized.values())
        posteriors.append({label: value / sum_posterior
                           for label, value in unnormalized.items()})
    return posteriors


In [67]:
# Put the pieces together: prior and smoothed likelihood from the training set,
# then the posterior of every test sample
prior = get_prior(label_indices)
likelihood = get_likelihood(X_train,label_indices,smoothing=1)
posterior = get_posterior(X_test,prior,likelihood)

In [68]:
# One dictionary of class posteriors per row of X_test, in the same order
posterior

[{'Y': 0.9210360075805433, 'N': 0.07896399241945673},
 {'Y': 0.795417348608838, 'N': 0.204582651391162},
 {'Y': 0.7446373850868232, 'N': 0.2553626149131768}]

In [69]:
# Compare against scikit-learn's BernoulliNB with the same smoothing (alpha = 1): the probabilities match ours.
# NOTE: the output columns appear to follow the sorted class labels ('N' first, then 'Y') — check post.classes_ to confirm.
post = BernoulliNB(alpha = 1.0,fit_prior=True)
post.fit(X_train,Y_train)
post.predict_proba(X_test)

array([[0.07896399, 0.92103601],
       [0.20458265, 0.79541735],
       [0.25536261, 0.74463739]])