<a href="https://colab.research.google.com/github/RaminParker/Text-Classification-with-Python/blob/master/Naive_Bayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

- Naive Bayes Classifier 
- Naive Bayes Classifier from scratch
- Naive Bayes Classifier optimized

In [0]:
import pandas as pd
import numpy as np
from sklearn import datasets

from sklearn.model_selection import train_test_split

from sklearn import metrics
from sklearn.naive_bayes import GaussianNB # for continuous features

# Load data

In [0]:
# Fetch the bundled iris dataset (feature matrix, class labels, feature names)
iris = datasets.load_iris()

# Put features and labels into one frame. Stacking them into a single array
# makes every column -- including 'target' -- float64.
d = pd.DataFrame(
    data=np.column_stack((iris['data'], iris['target'])),
    columns=iris['feature_names'] + ['target'],
)

In [0]:
d.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [0]:
d.columns # get columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

In [0]:
d.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target               float64
dtype: object

In [0]:
d['target'] = d['target'].astype('category') # change type of column: the class label is a category, not a measurement

In [0]:
d.dtypes

sepal length (cm)     float64
sepal width (cm)      float64
petal length (cm)     float64
petal width (cm)      float64
target               category
dtype: object

# Fit model

In [0]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    d.drop(columns='target'),   # feature columns only
    d['target'],                # class labels
    test_size=0.2,
    random_state=1526,
)

In [0]:
# Create a Gaussian Naive Bayes classifier (the variant for continuous features)
model = GaussianNB()

In [0]:
# Train the model on the training split (features X_train, labels y_train)
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [0]:
# Train the model using the training sets
# NOTE(review): this repeats the fit from the previous cell on the same data --
# looks like a leftover duplicate cell; confirm and consider removing it.
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [0]:
# Predict the class label for every row of the held-out test set
y_pred = model.predict(X_test)

# Interpretation

In [0]:
# Model accuracy: how often is the classifier correct on the test set?
# Compares the true labels (y_test) with the predicted labels (y_pred).
print("Accuracy: ",metrics.accuracy_score(y_test, y_pred))

Accuracy:  0.9


# Naive bayes from scratch

1. Calculate the prior probability for given class labels
2. Find Likelihood probability with each attribute for each class
3. Put these value in Bayes Formula and calculate posterior probability.
4. See which class has the highest posterior probability; the input is assigned to that class.



#p(y|X) = p(X|y)*p(y) / p(X) = p(X|y)*p(y) * 1/p(X)




In [0]:
from scipy.stats import norm

In [0]:
# Rebuild one frame from the training split. np.c_ returns a plain array, so the
# row index restarts at 0 and 'target' comes back as float (it is cast to int below).
train = pd.DataFrame(data = np.c_[X_train, y_train], columns= iris['feature_names'] + ['target'])

In [0]:
train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,4.4,2.9,1.4,0.2,0.0
1,6.4,3.2,4.5,1.5,1.0
2,4.8,3.0,1.4,0.3,0.0
3,5.7,2.8,4.5,1.3,1.0
4,7.7,3.8,6.7,2.2,2.0


In [0]:
train['target'] = train['target'].astype('int') # change type of column

In [0]:
train.dtypes

sepal length (cm)    float64
sepal width (cm)     float64
petal length (cm)    float64
petal width (cm)     float64
target                 int64
dtype: object

1) Calculate the prior probability for given class labels



In [0]:
# Prior probability of each class:
#   count of rows with a specific label / count of all rows
# P_y[k] holds p(y = k) for k = 0, 1, 2.
P_y = [
    len(train.loc[train['target'] == k]) / len(train['target'])
    for k in (0, 1, 2)
]

In [0]:
P_y

[0.3416666666666667, 0.325, 0.3333333333333333]

2) Find Likelihood probability with each attribute for each class

In [0]:
# Per-class parameters of the normal distributions, filled in the cells below:
# entry k of each list belongs to class y = k.
mu, sigma = [], []

In [0]:
# Estimate the parameters of the normal distributions used below:
# every feature gets its own mean and std under each condition (y = 0 / 1 / 2).
# Note: .std() uses the pandas default ddof=1, i.e. the sample standard deviation.

#...given y = 0
mu.append(
    train[train['target'] == 0]         # all rows where target = 0
    .drop(columns='target')             # keep only the feature columns
    .mean(axis = 0, skipna = True)      # mean of each feature (column)
)
sigma.append(
    train[train['target'] == 0]         # all rows where target = 0
    .drop(columns='target')             # keep only the feature columns
    .std(axis = 0, skipna = True)       # standard deviation of each feature (column)
)

In [0]:
#...given y = 1
mu.append(
    train[train['target'] == 1]         # all rows where target = 1
    .drop(columns='target')             # keep only the feature columns
    .mean(axis = 0, skipna = True)      # mean of each feature (column)
)
sigma.append(
    train[train['target'] == 1]         # all rows where target = 1
    .drop(columns='target')             # keep only the feature columns
    .std(axis = 0, skipna = True)       # standard deviation of each feature (column)
)

In [0]:
#...given y = 2
mu.append(
    train[train['target'] == 2]         # all rows where target = 2
    .drop(columns='target')             # keep only the feature columns
    .mean(axis = 0, skipna = True)      # mean of each feature (column)
)
sigma.append(
    train[train['target'] == 2]         # all rows where target = 2
    .drop(columns='target')             # keep only the feature columns
    .std(axis = 0, skipna = True)       # standard deviation of each feature (column)
)

The constant 1/p(X) is just a normalizing factor: it scales the results so that they become probabilities (values between 0 and 1 that sum to 1 over all classes). We do not need this factor, because multiplying the score of every group by the same positive constant does not change which group has the highest value, and we are only interested in which group has the highest value.

P(y|X) = p(X|y)*p(y)/p(X) = 1/p(X) * p(X|y) * p(y) = constant * likelihood * prior

![alt text](https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2019/05/Capture.jpg1-300x136.jpg)

In [0]:
def getGroup(values):
    """Return the index of the most probable class for one sample.

    For every class k the unnormalised posterior

        P_y[k] * pdf(values[0]; mu[k][0], sigma[k][0]) * ... * pdf(values[n]; ...)

    is computed and the position of the largest score is returned. The
    constant 1/p(X) is left out because it is the same for every class.

    Reads the notebook-level lists ``P_y``, ``mu`` and ``sigma``; entry k of
    each list describes class k. Any number of classes and features is
    supported (the lists decide how many there are).

    Parameters
    ----------
    values : sequence of float
        Feature values of one sample, in the same order as the entries of
        ``mu[k]`` and ``sigma[k]``.

    Returns
    -------
    int
        Position of the highest-scoring class (the first one wins on ties).
    """
    res = []
    # Walk over however many classes were fitted instead of a fixed [0, 1, 2].
    for prior, class_mu, class_sigma in zip(P_y, mu, sigma):
        # likelihood_0*likelihood_1*...*likelihood_n*prior_proba
        score = prior  # prior_proba
        # Iterating yields the parameters by position. This avoids integer
        # look-ups such as mu[k][0] on a Series whose index holds feature names.
        for x, m, s in zip(values, class_mu, class_sigma):
            score = score * norm.pdf(x, m, s)
        res.append(score)
    return res.index(max(res))  # return expected group label

In [0]:
results = pd.DataFrame(columns = ['prediction','true value']) # create empty df

In [0]:
# Classify every test row and store the prediction next to the true label.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so the rows are gathered in a list and the frame is built once.
rows = []
for i in range(len(y_test)):
    vals = list(X_test.iloc[i])
    rows.append({'prediction': getGroup(vals), 'true value': y_test.iloc[i]})
# Cast to float so both columns share one numeric dtype (labels show as 0.0 / 1.0 / 2.0)
results = pd.DataFrame(rows, columns=['prediction', 'true value']).astype(float)

In [0]:
results.head()

Unnamed: 0,prediction,true value
0,2.0,2.0
1,2.0,2.0
2,1.0,1.0
3,2.0,2.0
4,2.0,2.0


In [0]:
# Flag each row whose prediction equals the true label. One vectorised comparison
# replaces the row-by-row loop and yields a proper boolean column.
results['match'] = results['prediction'] == results['true value']

In [0]:
results.head()

Unnamed: 0,prediction,true value,match
0,2.0,2.0,True
1,2.0,2.0,True
2,1.0,1.0,True
3,2.0,2.0,True
4,2.0,2.0,True


In [0]:
# Accuracy = number of matching rows / number of all rows (True counts as 1 in the sum)
print('Accuracy: ',sum(results['match'])/len(results['match']))

Accuracy:  0.9


# Naive bayes from scratch - optimized

It works, but this method is not as exact as it could be because of floating-point underflow: a product of many small probabilities can become too small to be represented and is rounded towards zero.

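Therefore, instead of calculating

$$\hat{y} = \arg\max_y \; P(y) \prod_i P(x_i \mid y)$$

we calculate

$$\hat{y} = \arg\max_y \; \log P(y) + \sum_i \log P(x_i \mid y)$$

The logarithm is strictly increasing, so both expressions select the same class.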

In [0]:
from math import log

In [0]:
# P(y|X) = p(X|y)*p(y)/p(X) = 1/p(X) * p(X|y)*p(y) = constant * likelihood * prior
def getGroup(values):
    """Return the index of the most probable class for one sample (log-space version).

    For every class k the score

        log(P_y[k]) + logpdf(values[0]; mu[k][0], sigma[k][0]) + ... + logpdf(values[n]; ...)

    is computed and the position of the largest score is returned. This
    definition replaces the product-based ``getGroup`` defined earlier in the
    notebook.

    Reads the notebook-level lists ``P_y``, ``mu`` and ``sigma``; entry k of
    each list describes class k. Any number of classes and features is
    supported (the lists decide how many there are).

    Parameters
    ----------
    values : sequence of float
        Feature values of one sample, in the same order as the entries of
        ``mu[k]`` and ``sigma[k]``.

    Returns
    -------
    int
        Position of the highest-scoring class (the first one wins on ties).
    """
    res = []
    # Walk over however many classes were fitted instead of a fixed [0, 1, 2].
    for prior, class_mu, class_sigma in zip(P_y, mu, sigma):
        # log(prior_proba) + log(likelihood_0) + ... + log(likelihood_n)
        # note: we changed (*) symbol to (+) symbol because of log rules
        score = log(prior)
        # Iterating yields the parameters by position, so a Series indexed by
        # feature name works without integer look-ups.
        for x, m, s in zip(values, class_mu, class_sigma):
            # norm.logpdf evaluates the log-density directly; log(norm.pdf(...))
            # raises a math domain error once the density underflows to 0.0.
            score = score + norm.logpdf(x, m, s)
        res.append(score)
    return res.index(max(res))  # return expected group label
    
# Classify every test row and store the prediction next to the true label.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so the rows are gathered in a list and the frame is built once.
rows = []
for i in range(len(y_test)):
    vals = list(X_test.iloc[i])
    rows.append({'prediction': getGroup(vals), 'true value': y_test.iloc[i]})
# Cast to float so both columns share one numeric dtype (labels show as 0.0 / 1.0 / 2.0)
results = pd.DataFrame(rows, columns = ['prediction','true value']).astype(float)
    

# Flag each row whose prediction equals the true label. One vectorised comparison
# replaces the row-by-row loop and yields a proper boolean column.
results['match'] = results['prediction'] == results['true value']

# Accuracy = number of matching rows / number of all rows (True counts as 1 in the sum)
print('Accuracy: ',sum(results['match'])/len(results['match']))

Accuracy:  0.9



- When working with categorical features, use the multinomial or categorical distribution instead of the normal distribution
- scikit-learn's naive Bayes estimators already work with log-probabilities internally, so no manual log transformation is needed there
