In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict


data = pd.read_csv('reviews.csv', header=None, names=["review", "label"])
data

Unnamed: 0,review,label
0,The product is amazing,positive
1,I love this item so much,positive
2,Terrible quality,negative
3,Fast shipping and great service,positive
4,Not worth the money,negative
...,...,...
97,Disgusted by this product,negative
98,Can’t live without it now,positive
99,The quality is subpar,negative
100,Best decision I made,positive


In [3]:
#split dataset in training and testing

X = data["review"] #feature set
y = data["label"] #class label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train

Unnamed: 0,review
53,Very practical and useful
22,Would not buy again
68,Highly efficient and effective
44,Misleading description
98,Can’t live without it now
...,...
60,Totally worth it
71,Worst thing I’ve bought
14,Feels cheap and flimsy
92,Love everything about it


In [4]:
#vectorize the test data --> convert the reviews into a bag of words

vectorizer = CountVectorizer(stop_words='english')
X_train_vector = vectorizer.fit_transform(X_train)
X_test_vector = vectorizer.transform(X_test)


X_test_vector

<21x140 sparse matrix of type '<class 'numpy.int64'>'
	with 27 stored elements in Compressed Sparse Row format>

In [5]:
#compute prior probabilty
class_counts = y_train.value_counts() #store the number of positive and negative reviews
class_prior = class_counts / len(y_train) #calculate the probability of positive and negative reviews
class_prior

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
positive,0.518519
negative,0.481481


In [6]:
# Compute likelihood probabilities

from collections import defaultdict

#assign a new value when a new key is encountered

#store count for each word within each class label
word_counts = defaultdict(lambda: defaultdict(int))

#stroe a single count for each label
class_totals = defaultdict(int)

for review, label in zip(X_train, y_train):
  words = review.split()
  for word in words:
    word_counts[label][word] += 1
    class_totals[label] += 1

vocab_size = len(vectorizer.vocabulary_) #store the total number of unique words

# Compute conditional probabilities
def calculate_word_probabilities(word_counts, class_totals, vocab_size):
  word_probs = {}
  for cls, words in word_counts.items():
    # stroe probability of individual words for each class
    word_probs[cls] = {
      word: (count + 1) / (class_totals[cls] + vocab_size)
      for word, count in words.items()
    }
  return word_probs

word_probs = calculate_word_probabilities(word_counts, class_totals, vocab_size)


In [7]:
#BaYES classifier pseudocode

# BayesClassifier(X, C, P_Ck, P_XgCk):
#   predClass = - infinity
#   for each C_k in C:
#     Compute likelihood P(X_gCk)
#     Compute prior:P(C_k)
#     Compute posterior prob:
#       likelihood * prior

#     if posterior > predClass prob:
#       predClass pron = posterior
#       predClassNum = C_k
#   return predClassNum, predClassProb

In [8]:
def bayes_classifier(X, C, P_Ck, P_XgCk):
    pred_class = None
    pred_prob = float('-inf')

    for C_k in C:
      likelihood = 1
      for word in X.split():
        likelihood *= P_XgCk[C_k].get(word, 1 / (class_totals[C_k] + vocab_size))
      posterior = likelihood * P_Ck[C_k]

      if posterior > pred_prob:
        pred_prob = posterior
        pred_class = C_k

    return pred_class, pred_prob

In [9]:
from sklearn.metrics import accuracy_score

def predict(review):
  return bayes_classifier(review, class_counts.keys(), class_prior, word_counts)[0]

# Evaluate accuracy
predictions = [predict(review) for review in X_test]
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy*100:.2f}%')

Accuracy: 71.43%


In [10]:
new_reviews = [
    "I had a terrible experience with this company",
    "This is a great company with excellent customer service",
    "I was really disappointed with this product",
    "The service is too expensive for what it offers"
]

for review in new_reviews:
  print(f'Review: "{review}" -> Prediction: {predict(review)}')

Review: "I had a terrible experience with this company" -> Prediction: negative
Review: "This is a great company with excellent customer service" -> Prediction: positive
Review: "I was really disappointed with this product" -> Prediction: positive
Review: "The service is too expensive for what it offers" -> Prediction: positive
