In [28]:
!pip install datasets



In [29]:
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

## Preparing and pre-processing the data

In [30]:
# Load the "default" configuration of the SST dataset and keep a handle on each
# of its three splits.
dataset = load_dataset("sst", "default")
trainDataset, validationDataset, testDataset = (
  dataset[splitName] for splitName in ("train", "validation", "test")
)

In [31]:
# Convert every split to a pandas DataFrame, then preview the first rows of each.
trainDf, testDf, validationDf = (
  dataset[splitName].to_pandas() for splitName in ("train", "test", "validation")
)
for frame in (trainDf, testDf, validationDf):
  print(frame.head())

                                            sentence    label  \
0  The Rock is destined to be the 21st Century 's...  0.69444   
1  The gorgeously elaborate continuation of `` Th...  0.83333   
2  Singer\/composer Bryan Adams contributes a sle...  0.62500   
3  You 'd think by now America would have had eno...  0.50000   
4               Yet the act is still charming here .  0.72222   

                                              tokens  \
0  The|Rock|is|destined|to|be|the|21st|Century|'s...   
1  The|gorgeously|elaborate|continuation|of|``|Th...   
2  Singer\/composer|Bryan|Adams|contributes|a|sle...   
3  You|'d|think|by|now|America|would|have|had|eno...   
4               Yet|the|act|is|still|charming|here|.   

                                                tree  
0  70|70|68|67|63|62|61|60|58|58|57|56|56|64|65|5...  
1  71|70|69|69|67|67|66|64|63|62|62|61|61|58|57|5...  
2  72|71|71|70|68|68|67|67|66|63|62|62|60|60|58|5...  
3  36|35|34|33|33|32|30|29|27|26|25|24|23|23|22|2...

In [32]:
def assignSentimentClass(score, upperBounds=(0.2, 0.4, 0.6, 0.8)):
  """Map a continuous sentiment score onto an ordinal class index.

  The class is the position of the first upper bound that is >= `score`, so
  with the default bounds: score <= 0.2 -> 0, (0.2, 0.4] -> 1, (0.4, 0.6] -> 2,
  (0.6, 0.8] -> 3 and everything above 0.8 -> 4.

  Out-of-range scores are clamped to the nearest end class: a score below the
  first bound (including negative values) yields 0 and a score above the last
  bound yields len(upperBounds).

  Args:
    score: numeric sentiment score.
    upperBounds: ascending, inclusive upper limits of all classes but the last.

  Returns:
    int class index in [0, len(upperBounds)].

  Raises:
    ValueError: if `score` is NaN, which cannot be ordered against the bounds.
  """
  # NaN is the only value that is not equal to itself; every comparison with it
  # is False, so it would otherwise silently land in the last class.
  if score != score:
    raise ValueError("sentiment score must not be NaN")

  for classIndex, bound in enumerate(upperBounds):
    if score <= bound:
      return classIndex
  return len(upperBounds)

In [33]:
def _splitOnPipe(tokens):
  """Split one "|"-delimited token string into a NumPy array of tokens."""
  return np.array(tokens.split("|"))

# Element-wise version: maps an array of token strings to an object array whose
# entries are the per-sentence token arrays.
tokenize = np.vectorize(_splitOnPipe, otypes=[object])

In [34]:
def _prepareSplit(splitDf):
  """Derive lower-cased sentences, raw labels, token arrays and class ids for one split."""
  sentences = splitDf['sentence'].str.lower()
  labels = splitDf['label']
  tokens = tokenize(np.array(splitDf['tokens'].str.lower()))
  classes = np.array(labels.apply(assignSentimentClass))
  return sentences, labels, tokens, classes

# The same preprocessing is applied to every split.
trainSentences, trainLabels, trainTokens, trainClasses = _prepareSplit(trainDf)
testSentences, testLabels, testTokens, testClasses = _prepareSplit(testDf)
validationSentences, validationLabels, validationTokens, validationClasses = _prepareSplit(validationDf)

## Naive Bayes

In [92]:
class NaiveBayes:
  """Multinomial Naive Bayes over token arrays with additive smoothing.

  Attributes (populated by `fit`):
    classes: sorted array of the distinct class labels seen in training.
    classesLogPrior: array of log P(class), aligned position-by-position with
      `classes` (for labels 0..K-1 the position equals the label).
    featureLogLikelihood: dict label -> dict word -> log P(word | label).
    vocab: sorted array of the distinct training tokens.
    bigDocs: dict label -> concatenation of all training tokens of that label.
  """

  def __init__(self, smoothingFactor = 1):
    """Create an unfitted model; `smoothingFactor` is the additive smoothing constant."""
    self.classes = None
    self.classesLogPrior = None
    # Filled per class in fit(); not pre-seeded so any number of classes works.
    self.featureLogLikelihood = {}
    self.smoothingFactor = smoothingFactor
    self.vocab = None
    self.bigDocs = {}

  def fit(self, features, classes):
    """Estimate priors and word likelihoods.

    Args:
      features: NumPy object array whose entries are per-document token arrays
        (it is indexed with a boolean mask, so a plain list will not work).
      classes: one class label per document.
    """
    classes = np.asarray(classes)
    # Start from a clean slate so refitting never keeps entries of a previous fit.
    self.bigDocs = {}
    self.featureLogLikelihood = {}

    self.vocab = np.unique(np.concatenate(features))
    self.classes = np.unique(classes)
    self.calculateClassesPrior(classes)
    self.createBigDoc(features, classes)
    self.calculateLogLikelihood(features, classes)

  def calculateClassesPrior(self, classes):
    """Store log(documents of class / all documents) for every distinct label.

    np.unique returns counts in sorted-label order, i.e. aligned with
    `self.classes`, and never contains a zero count (so no log(0)).
    """
    _, classFreq = np.unique(classes, return_counts=True)
    totalDocs = len(classes)
    self.classesLogPrior = np.log(classFreq / totalDocs)

  def createBigDoc(self, features, classes):
    """Concatenate, per class, the tokens of all documents carrying that label."""
    for c in self.classes:
      self.bigDocs[c] = np.concatenate(features[classes == c])

  def calculateLogLikelihood(self, features, classes):
    """Compute smoothed log P(word | class) for every vocabulary word.

    `features` and `classes` are unused here; the per-class documents built by
    `createBigDoc` are read instead.
    """
    vocabSize = len(self.vocab)
    for c in self.classes:
      classDoc = self.bigDocs[c]
      # Count every word of the class document in one pass instead of
      # rescanning the whole document once per vocabulary word.
      words, counts = np.unique(classDoc, return_counts=True)
      countByWord = dict(zip(words.tolist(), counts.tolist()))
      denominator = len(classDoc) + (self.smoothingFactor * vocabSize)
      self.featureLogLikelihood[c] = {
        w: np.log((countByWord.get(w, 0) + self.smoothingFactor) / denominator)
        for w in self.vocab
      }

  def predict(self, testDoc):
    """Return the most probable class label for one document.

    Words that were not seen during training are ignored. Ties go to the
    first (smallest) label.
    """
    scores = []
    for position, c in enumerate(self.classes):
      wordLogLikelihood = self.featureLogLikelihood[c]
      score = self.classesLogPrior[position]
      for w in testDoc:
        # Dict lookup doubles as the vocabulary membership test.
        logLikelihood = wordLogLikelihood.get(w)
        if logLikelihood is not None:
          score += logLikelihood
      scores.append(score)

    # Translate the winning position back to its label so labels need not be 0..K-1.
    return self.classes[np.argmax(scores)]

  def testModel(self, features):
    """Predict a label for every document in `features`; returns a list."""
    return [self.predict(features[i]) for i in range(len(features))]


In [93]:
# Train the hand-written classifier on the lower-cased training tokens.
naiveBayes = NaiveBayes(smoothingFactor=1)
naiveBayes.fit(features=trainTokens, classes=trainClasses)

In [94]:
# One predicted class per test document, in test-set order.
predictionNaiveBayes = naiveBayes.testModel(features=testTokens)

### scikit-learn baseline

In [85]:
# NOTE(review): ngram_range=(2, 2) makes this baseline count word bigrams of the
# raw sentences, while the hand-written NaiveBayes is fitted on the dataset's
# unigram tokens - presumably the two feature tables are not directly
# comparable; confirm whether bigrams are intended.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
scikitX, scikitY = vectorizer.fit_transform(trainSentences).astype(int), trainClasses

# fit() returns the estimator itself, so construction and training can be chained.
scikitNaiveBayes = MultinomialNB().fit(scikitX, scikitY)

In [89]:
# Class log-priors: scikit-learn first, hand-written model second.
print(scikitNaiveBayes.class_log_prior_, naiveBayes.classesLogPrior, sep="\n")

# Per-class feature log-likelihoods, in the same order.
print(scikitNaiveBayes.feature_log_prob_, naiveBayes.featureLogLikelihood, sep="\n")

[-2.0572184  -1.34862339 -1.66033704 -1.3028004  -1.89213865]
[-2.0572184  -1.34862339 -1.66033704 -1.3028004  -1.89213865]
[[-11.47047715 -11.47047715 -11.47047715 ... -11.47047715 -11.47047715
  -10.77732997]
 [-11.63595693 -10.94280975 -10.94280975 ... -11.63595693 -11.63595693
  -11.63595693]
 [-10.8486961  -11.54184329 -11.54184329 ... -10.8486961  -11.54184329
  -11.54184329]
 [-11.65872134 -11.65872134 -11.65872134 ... -11.65872134 -11.65872134
  -11.65872134]
 [-11.49646066 -11.49646066 -11.49646066 ... -11.49646066 -10.80331348
  -11.49646066]]


## Evaluations

In [39]:
def confusionMatrix(predictions, realValues, numClasses):
  """Count (predicted, actual) pairs in a numClasses x numClasses integer matrix.

  Rows are indexed by the predicted class, columns by the real class.
  """
  counts = np.zeros((numClasses, numClasses), dtype=int)
  for predictedClass, actualClass in zip(predictions, realValues):
    counts[predictedClass, actualClass] = counts[predictedClass, actualClass] + 1
  return counts

def computeMetrics(confMatrix):
    """Derive per-class and macro-averaged precision, recall and F1.

    Row i of `confMatrix` is read as "predicted class i" and column i as
    "real class i". Any ratio whose denominator is zero is reported as 0.

    Returns:
        (precision, recall, f1, macroPrecision, macroRecall, macroF1) where the
        first three are arrays with one entry per class and the last three are
        their means.
    """
    classCount = confMatrix.shape[0]
    truePositive = np.array([confMatrix[k, k] for k in range(classCount)])
    falsePositive = confMatrix.sum(axis=1) - truePositive  # rest of each row
    falseNegative = confMatrix.sum(axis=0) - truePositive  # rest of each column

    predictedTotal = truePositive + falsePositive
    actualTotal = truePositive + falseNegative

    # `out` pre-fills zeros; `where` leaves them in place for empty denominators.
    precision = np.divide(truePositive, predictedTotal,
                          out=np.zeros(classCount), where=predictedTotal != 0)
    recall = np.divide(truePositive, actualTotal,
                       out=np.zeros(classCount), where=actualTotal != 0)

    harmonicDenominator = precision + recall
    f1 = np.divide(2 * (precision * recall), harmonicDenominator,
                   out=np.zeros(classCount), where=harmonicDenominator != 0)

    # Macro averages weight every class equally.
    return precision, recall, f1, np.mean(precision), np.mean(recall), np.mean(f1)

def printMetrics(metricsTuple):
  """Print the six values of a metrics tuple, one labelled line each.

  The tuple is expected in the order (precision, recall, f1, macroPrecision,
  macroRecall, macroF1); unpacking fails if it does not hold exactly six items.
  """
  precision, recall, f1, macroPrecision, macroRecall, macroF1 = metricsTuple
  labelledValues = (
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1: ", f1),
    ("Macro Precision: ", macroPrecision),
    ("Macro Recall: ", macroRecall),
    ("Macro F1: ", macroF1),
  )
  for caption, value in labelledValues:
    print(caption, value)

### Evaluate Naive Bayes

In [95]:
# Size the matrix from the largest label seen in either the predictions or the
# ground truth. Counting only the distinct test labels would make the matrix too
# small (IndexError) whenever the model predicts a class that is absent from the
# test set or the labels present are not a gap-free 0..K-1 range.
numClasses = int(max(np.max(predictionNaiveBayes), np.max(testClasses))) + 1
naiveBayesConfMatrix = confusionMatrix(predictionNaiveBayes, testClasses, numClasses)
metrics = computeMetrics(naiveBayesConfMatrix)
printMetrics(metrics)

Precision:  [0.4        0.45394737 0.21848739 0.3527668  0.58267717]
Recall:  [0.05734767 0.65402844 0.06683805 0.7        0.18546366]
F1:  [0.10031348 0.53592233 0.1023622  0.46911958 0.28136882]
Macro Precision:  0.4015757454304678
Macro Recall:  0.33273556233804336
Macro F1:  0.2978172830477508
