# Shrey Viradiya
### 18BCE259

# Naive Bayes Classifiers


In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import *
import plotly.graph_objects as go
import plotly.express as px
from collections import Counter

In [2]:
np.random.seed(259)

## Data

In [3]:
iris = load_iris()

In [4]:
print("IRIS Dataset")
print(f"Feature names: {iris['feature_names']}")
print(f"Target names: {iris['target_names']}")

IRIS Dataset
Feature names: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Target names: ['setosa' 'versicolor' 'virginica']


In [5]:
X = iris['data']
y = iris['target']
X = pd.DataFrame(X, columns=iris['feature_names'])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=259)

## GaussianNB Naive Bayes

In [7]:
gauss = GaussianNB()
gauss.fit(X, y)

GaussianNB()

In [8]:
prediction = gauss.predict(X_test)

In [9]:
print("Report: ____________________________\n")
print(f"Accuracy: {accuracy_score(y_test, prediction)}")
print("Confisuion metrix:")
print(confusion_matrix(y_test, prediction))
print("Classification Report:")
print(classification_report(y_test, prediction))

Report: ____________________________

Accuracy: 0.9466666666666667
Confisuion metrix:
[[28  0  0]
 [ 0 18  3]
 [ 0  1 25]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       0.95      0.86      0.90        21
           2       0.89      0.96      0.93        26

    accuracy                           0.95        75
   macro avg       0.95      0.94      0.94        75
weighted avg       0.95      0.95      0.95        75



### GaussianNB without sklearn

In [10]:
class GNB:
    def __init__(self):
        self.means = None
        self.stds = None
        self.class_prab = None
        self.classes = None
    
    def fit(self,X_train, y_train):
        self.means = X_train.groupby(y_train).apply(np.mean)
        self.stds = X_train.groupby(y_train).apply(np.std)
        self.class_prob = X_train.groupby(y_train).apply(lambda x: len(x)/X_train.shape[0])
        self.classes = np.unique(y_train)
    
    def predict(self,X_test):
        y_pred = []
        for index in range(X_test.shape[0]):
            p = {}
            for clss in self.classes:
                p[clss] = self.class_prob[clss]
                for idx, value in enumerate(X_test.iloc[index]):
                    p[clss] *= ( 1/(np.sqrt(2 * np.pi)*self.stds.iloc[clss, idx]) )* np.exp( -(value - self.means.iloc[clss, idx])**2/(2* (self.stds.iloc[clss, idx])**2 ))
            y_pred.append(pd.Series(p).values.argmax())
        return y_pred

In [11]:
gnb = GNB()
gnb.fit(X_train, y_train)
prediction = gnb.predict(X_test)

In [12]:
print("Report: ____________________________\n")
print(f"Accuracy: {accuracy_score(y_test, prediction)}")
print("Confisuion metrix:")
print(confusion_matrix(y_test, prediction))
print("Classification Report:")
print(classification_report(y_test, prediction))

Report: ____________________________

Accuracy: 0.9333333333333333
Confisuion metrix:
[[28  0  0]
 [ 0 18  3]
 [ 0  2 24]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        28
           1       0.90      0.86      0.88        21
           2       0.89      0.92      0.91        26

    accuracy                           0.93        75
   macro avg       0.93      0.93      0.93        75
weighted avg       0.93      0.93      0.93        75



## Text Data

In [13]:
reviews = pd.read_csv('yelp_labelled.txt',delimiter='\t')

In [14]:
reviews

Unnamed: 0,Sentence,bool
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [15]:
vectorizer = CountVectorizer()

In [16]:
X = vectorizer.fit_transform(reviews['Sentence'])
X = np.array(X.toarray())
y = np.array(reviews['bool'])

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=259)

## Multivariate Bernoulli Naive Bayes

In [18]:
Bern = BernoulliNB()

In [19]:
Bern.fit(X_train, y_train)

BernoulliNB()

In [20]:
prediction = Bern.predict(X_test)

In [21]:
print("Report: ____________________________\n")
print(f"Accuracy: {accuracy_score(y_test, prediction)}")
print("Confisuion metrix:")
print(confusion_matrix(y_test, prediction))
print("Classification Report:")
print(classification_report(y_test, prediction))

Report: ____________________________

Accuracy: 0.79
Confisuion metrix:
[[73 32]
 [10 85]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.70      0.78       105
           1       0.73      0.89      0.80        95

    accuracy                           0.79       200
   macro avg       0.80      0.79      0.79       200
weighted avg       0.81      0.79      0.79       200



## Multinomial Naive Bayes

In [22]:
Mnom = MultinomialNB()

In [23]:
Mnom.fit(X_train, y_train)

MultinomialNB()

In [24]:
prediction = Mnom.predict(X_test)

In [25]:
print("Report: ____________________________\n")
print(f"Accuracy: {accuracy_score(y_test, prediction)}")
print("Confisuion metrix:")
print(confusion_matrix(y_test, prediction))
print("Classification Report:")
print(classification_report(y_test, prediction))

Report: ____________________________

Accuracy: 0.825
Confisuion metrix:
[[82 23]
 [12 83]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.78      0.82       105
           1       0.78      0.87      0.83        95

    accuracy                           0.82       200
   macro avg       0.83      0.83      0.82       200
weighted avg       0.83      0.82      0.82       200

