# **Naive Bayes Text Classifier**

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
data = pd.read_csv("TextClassification.csv")

display(data)

Unnamed: 0,Text,Class
0,I love this sandwich,pos
1,this is an amazing place,pos
2,I feel very good about these beers,pos
3,this is my best work,pos
4,what an awesome view,pos
5,I do not like this restaurant,neg
6,I am tired of this stuff,neg
7,I can't deal with this,neg
8,he is my sworn enemy,neg
9,my boss is horrible,neg


In [5]:
X, Y = data["Text"], data["Class"]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, shuffle=False)

In [6]:
Vocabulary = []

for i in X_train:
    Vocabulary += i.lower().split()

Vocabulary.sort()

In [7]:
PositiveProbability = Y_train[Y_train == "pos"].count() / Y_train.count()
NegativeProbability = Y_train[Y_train == "neg"].count() / Y_train.count()

In [8]:
PositiveText = []
for text in X_train[Y_train == "pos"]:
    PositiveText += text.lower().split()
PositiveText.sort()
PositiveCount = len(set(PositiveText))

NegativeText = []
for text in X_train[Y_train == "neg"]:
    NegativeText += text.lower().split()
NegativeText.sort()
NegativeCount = len(set(NegativeText))

In [9]:
WordProbability = {}

for word in Vocabulary:
    positive = (PositiveText.count(word) + 1) / (PositiveCount + len(Vocabulary))
    negative = (NegativeText.count(word) + 1) / (NegativeCount + len(Vocabulary))

    WordProbability[word] = [positive, negative]

In [25]:
estimation = []

for row in range(X_test.shape[0]):
    rowPositive = PositiveProbability
    rowNegative = NegativeProbability

    for word in X_test.iloc[row].lower().split():
        if word not in Vocabulary:
            continue
        rowPositive *= WordProbability[word][0]
        rowNegative *= WordProbability[word][1]

    estimation += ["pos"] if rowPositive >= rowNegative else ["neg"]

    print(f'For test hypothesis: {X_test.iloc[row]}')
    print(f'Prediction: {estimation[-1]}\nActual value: {Y_test.iloc[row]}\n')

For test hypothesis: I am sick and tired of this place
Prediction: neg
Actual value: neg

For test hypothesis: what a great holiday
Prediction: pos
Actual value: pos

For test hypothesis: that is a bad locality to stay
Prediction: pos
Actual value: neg

For test hypothesis: we will have good fun tomorrow
Prediction: pos
Actual value: pos

For test hypothesis: I went to my enemy's house today
Prediction: pos
Actual value: neg



In [26]:
confusion_matrix = pd.DataFrame([[0, 0], [0, 0]],
                                columns=['neg','pos'],
                                index=['neg', 'pos'])

for i in range(Y_test.shape[0]):
    confusion_matrix.loc[Y_test.iloc[i], estimation[i]] += 1

print("Confusion Matrix:")
display(confusion_matrix)

Confusion Matrix:


Unnamed: 0,neg,pos
neg,1,2
pos,0,2


In [27]:
tn, fp, fn, tp = confusion_matrix.loc['neg', 'neg'],\
                 confusion_matrix.loc['neg', 'pos'],\
                 confusion_matrix.loc['pos', 'neg'],\
                 confusion_matrix.loc['pos', 'pos']

precision = tp / (tp + fp)
recall = tn / (tp + fn)
accuracy = (tp + tn) / (tp + tn + fp + fn)
f1score = (2 * precision * recall) / (precision + recall)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1score}")

Precision: 0.5
Recall: 0.5
Accuracy: 0.6
F1 Score: 0.5
