# **Naive Bayes Text Classifier**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
data = pd.read_csv("TextClassification.csv")

display(data)

Unnamed: 0,Text,Class
0,I love this sandwich,pos
1,this is an amazing place,pos
2,I feel very good about these beers,pos
3,this is my best work,pos
4,what an awesome view,pos
5,I do not like this restaurant,neg
6,I am tired of this stuff,neg
7,I can't deal with this,neg
8,he is my sworn enemy,neg
9,my boss is horrible,neg


In [4]:
X, Y = data["Text"], data["Class"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=False)

In [5]:
Vocabulary = []

for i in X_train:
    Vocabulary.extend(i.lower().split())

Vocabulary = sorted(list(set(Vocabulary)))

In [6]:
PositiveProbability = Y_train[Y_train == "pos"].count() / Y_train.count()
NegativeProbability = Y_train[Y_train == "neg"].count() / Y_train.count()

In [7]:
PositiveText = []

for text in X_train[Y_train == "pos"]:
    PositiveText.extend(text.lower().split())

PositiveText = sorted(PositiveText)
PositiveCount = len(set(PositiveText))

NegativeText = []

for text in X_train[Y_train == "neg"]:
    NegativeText.extend(text.lower().split())

NegativeText = sorted(NegativeText)
NegativeCount = len(set(NegativeText))

In [8]:
WordProbability = {}

for word in Vocabulary:
    positive = (PositiveText.count(word) + 1) / (PositiveCount + len(Vocabulary))
    negative = (NegativeText.count(word) + 1) / (NegativeCount + len(Vocabulary))

    WordProbability.update({word: [positive, negative]})

In [9]:
estimation = []

for row in range(X_test.shape[0]):
    rowPositive = PositiveProbability
    rowNegative = NegativeProbability

    for word in X_test.iloc[row].lower().split():
        if word not in Vocabulary:
            continue
        rowPositive *= WordProbability[word][0]
        rowNegative *= WordProbability[word][1]

    estimation.append("pos" if rowPositive >= rowNegative else "neg")

In [10]:
confusion_matrix = pd.DataFrame([[0, 0], [0, 0]], columns=['negative','positive'], index=['negative', 'positive'])

for i in range(Y_test.shape[0]):
    if Y_test.iloc[i] == "neg" and estimation[i] == "neg":
        confusion_matrix.loc['negative', 'negative'] += 1
    elif Y_test.iloc[i] == "neg" and estimation[i] == "pos":
        confusion_matrix.loc['negative', 'positive'] += 1
    elif Y_test.iloc[i] == "pos" and estimation[i] == "neg":
        confusion_matrix.loc['positive', 'negative'] += 1
    elif Y_test.iloc[i] == "pos" and estimation[i] == "pos":
        confusion_matrix.loc['positive', 'positive'] += 1

display(confusion_matrix)

Unnamed: 0,negative,positive
negative,1,2
positive,0,2


In [11]:
tn, fp, fn, tp = confusion_matrix.loc['negative', 'negative'], confusion_matrix.loc['negative', 'positive'], \
                confusion_matrix.loc['positive', 'negative'], confusion_matrix.loc['positive', 'positive']

precision = tp / (tp + fp)
recall = tn / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f1score = (2 * precision * recall) / (precision + recall)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1score}")

Precision: 0.5
Recall: 0.5
Accuracy: 0.6
F1 Score: 0.5
