# Importing libraries

In [1]:
import math
import os
import sys
from subprocess import call
from nltk import FreqDist
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
import sklearn as sk
import pickle
import json
from collections import Counter
import requests
import matplotlib.pyplot as plt
import numpy as np

# Importing movie reviews dataset

In [2]:
# !wget https://gist.githubusercontent.com/bastings/d47423301cca214e3930061a5a75e177/raw/5113687382919e22b1f09ce71a8fecd1687a5760/reviews.json

In [3]:
with open("reviews.json", mode="r", encoding="utf-8") as f:
  reviews = json.load(f)

# Lexicon based approach

## Importing lexicon

In [4]:
# !wget https://gist.githubusercontent.com/bastings/d6f99dcb6c82231b94b013031356ba05/raw/f80a0281eba8621b122012c89c8b5e2200b39fd6/sent_lexicon

## Binary classification

In [5]:
# creating lexicon dictionary with required features
lexicon_dict = {}

with open("sent_lexicon", mode="r", encoding="utf-8") as f:
  for i in f:
    i_list = i.strip().split()

    key = i_list[2].split("=")[1]

    if i_list[5].split("=")[1] == "positive":
      value_1 = 1
    elif i_list[5].split("=")[1] == "negative":
      value_1 = -1
    else:
      value_1 = 0

    MULTIPLIER_WEAK = 0.5
    MULTIPLIER_STRONG = 1
    value_2 = MULTIPLIER_WEAK * value_1 if i_list[0].split("=")[1] == "weaksubj" else MULTIPLIER_STRONG * value_1

    lexicon_dict[key] = [value_1, value_2]

# function to get binary_scores
def get_binary_score(review):
  binary_score = 0
  doc_length = 0

  for sentences in review["content"]:
    for word, _ in sentences:
      try:
        binary_score += lexicon_dict[word][0]
        # doc_length += 1
      except KeyError:
        binary_score += 0
      
      doc_length += 1
      
  return [binary_score, doc_length]

# function to classify reivew
def classify_review(parameters):
  score, doc_length = parameters
  THRESHOLD = 8

  if score > THRESHOLD:
    return "POS"
  else:
    return "NEG"

# calculating accuracy
classifications = [classify_review(get_binary_score(review)) for review in reviews]
token_results = [1 if classification == reviews[i]["sentiment"] else 0 for i, classification in enumerate(classifications)]
token_accuracy = token_results.count(1)/len(token_results)
print("Accuracy: %0.2f" % token_accuracy)

Accuracy: 0.68


## Weighted classification

In [6]:
# function to get weighted scores
def get_weighted_score(review):
  weighted_score = 0
  doc_length = 0

  for sentences in review["content"]:
    for word, _ in sentences:
      try:
        weighted_score += lexicon_dict[word][1]
        # doc_length += 1
      except KeyError:
        weighted_score += 0

      doc_length += 1

  return [weighted_score, doc_length]

# calculating accuracy
classifications_weighted = [classify_review(get_weighted_score(review)) for review in reviews]
magnitude_results = [1 if classification == reviews[i]["sentiment"] else 0 for i, classification in enumerate(classifications_weighted)]
magnitude_accuracy = magnitude_results.count(1)/len(magnitude_results)
print("Accuracy: %0.2f" % magnitude_accuracy)

Accuracy: 0.69


## Creating better threshold

In [7]:
# new threshold classification
def classify_review_better(parameters):
  score, doc_length = parameters
  THRESHOLD = 1.02 * math.log(doc_length)

  if score >= THRESHOLD:
    return "POS"
  else:
    return "NEG"

# calculating new weighted classification accuracy
classifications_weighted_new = [classify_review_better(get_weighted_score(review)) for review in reviews]
magnitude_results_2 = [1 if classification == reviews[i]["sentiment"] else 0 for i, classification in enumerate(classifications_weighted_new)]
magnitude_accuracy_2 = magnitude_results_2.count(1)/len(magnitude_results_2)
print("New accuracy of weighted classification: %0.2f" % magnitude_accuracy_2)

New accuracy of weighted classification: 0.70


# Naive Bayes approach

## Model definition

In [8]:
from collections import defaultdict

class NaiveBayes:
  # constructor
  def __init__(self, train_set, test_set, kappa=0, stemmer=False, bigram=False, trigram=False):
    self.train_set = train_set
    self.test_set = test_set
    self.kappa = kappa
    self.stemmer = stemmer
    self.bigram = bigram
    self.trigram = trigram

    # initialising empty conditional probability dictionaries
    self.conditional_bigram_positive = {}
    self.conditional_bigram_negative = {}

    self.conditional_trigram_positive = {}
    self.conditional_trigram_negative = {}

    # initialising empty vocabulary for applying classifier
    self.tokens_bigram = []
    self.tokens_trigram = []

    # initialising empty variable to count correct classifications
    self.correct_count = []


  # count reviews in class
  def count_reviews(self):
    review_count_positive = 0
    review_count_negative = 0

    for review in self.train_set:
      sentiment = review["sentiment"]

      if sentiment == "POS":
        review_count_positive += 1

      elif sentiment == "NEG":
        review_count_negative += 1

    return review_count_positive, review_count_negative


  # tokenisation
  def tokenise(self, word):
    if self.stemmer == False:
      return word.lower()
    else:
      return PorterStemmer().stem(word)


  # getting ngram features
  def get_ngram_features(self, n):
    vocabulary_positive = defaultdict(int)
    vocabulary_negative = defaultdict(int)

    for review in self.train_set:
      sentiment = review["sentiment"]

      for sentence in review["content"]:
        sentence_ngram = list(ngrams(sentence, n))

        for ngram in sentence_ngram:
          tokens = tuple(self.tokenise(word[0]) for word in ngram)

          if sentiment == "POS":
            vocabulary_positive[tokens] += 1
          elif sentiment == "NEG":
            vocabulary_negative[tokens] += 1

    return vocabulary_positive, vocabulary_negative


  # calculating condtional probabilities
  def calculate_conditional(self, vocabulary, vocabulary_count):
    denominator = sum(vocabulary.values()) + (vocabulary_count * self.kappa)
    conditional_vocabulary = {word: ((count + self.kappa) / denominator) for word, count in vocabulary.items()}

    return conditional_vocabulary


  # training bayes classifier
  def train_bayes_classifier(self):
    # review count per class
    review_count_positive, review_count_negative = self.count_reviews()

    # unigram feature extraction
    vocabulary_unigram_positive, vocabulary_unigram_negative = self.get_ngram_features(1)

    unigram_positive_count = len(set(vocabulary_unigram_positive))
    unigram_negative_count = len(set(vocabulary_unigram_negative))

    self.conditional_unigram_positive = self.calculate_conditional(vocabulary_unigram_positive, unigram_positive_count)
    self.conditional_unigram_negative = self.calculate_conditional(vocabulary_unigram_negative, unigram_negative_count)

    # bigram feature extraction
    if self.bigram != False:
      vocabulary_bigram_positive, vocabulary_bigram_negative = self.get_ngram_features(2)

      bigram_positive_count = len(set(vocabulary_bigram_positive))
      bigram_negative_count = len(set(vocabulary_bigram_negative))

      self.conditional_bigram_positive = self.calculate_conditional(vocabulary_bigram_positive, bigram_positive_count)
      self.conditional_bigram_negative = self.calculate_conditional(vocabulary_bigram_negative, bigram_negative_count)

    # trigram feature extraction
    if self.trigram != False:
      vocabulary_trigram_positive, vocabulary_trigram_negative = self.get_ngram_features(3)

      trigram_positive_count = len(set(vocabulary_trigram_positive))
      trigram_negative_count = len(set(vocabulary_trigram_negative))

      self.conditional_trigram_positive = self.calculate_conditional(vocabulary_trigram_positive, trigram_positive_count)
      self.conditional_trigram_negative = self.calculate_conditional(vocabulary_trigram_negative, trigram_negative_count)

    # conditional probability dictionary
    self.conditional_positive = self.conditional_unigram_positive | self.conditional_bigram_positive | self.conditional_trigram_positive
    self.conditional_negative = self.conditional_unigram_negative | self.conditional_bigram_negative | self.conditional_trigram_negative

    # total number of reviews
    reviews_count = len(self.train_set)

    # calculating prior
    self.prior_positive = review_count_positive / reviews_count
    self.prior_negative = review_count_negative / reviews_count


  # extract tokens from review
  def extract_tokens(self, review, n):
    tokens = []

    for sentence in review["content"]:
        sentence_ngram = list(ngrams(sentence, n))

        for ngram in sentence_ngram:
            tokens.append(tuple(self.tokenise(word[0]) for word in ngram))

    return tokens


  # calculating score
  def calculate_score(self, tokens_intersection):
    score_positive = math.log(self.prior_positive)
    score_negative = math.log(self.prior_negative)

    for token in tokens_intersection:
        score_positive += math.log(self.conditional_positive[token])
        score_negative += math.log(self.conditional_negative[token])

    return score_positive, score_negative


  # applying bayes theorem
  def apply_bayes(self):
    for review in self.test_set:
      # extracting tokens
      self.tokens_unigram = self.extract_tokens(review, 1)

      if self.bigram != False:
        self.tokens_bigram = self.extract_tokens(review, 2)

      if self.trigram != False:
        self.tokens_trigram = self.extract_tokens(review, 3)
      
      tokens = self.tokens_unigram + self.tokens_bigram + self.tokens_trigram

      # removing unseen words
      positive_words = self.conditional_positive.keys()
      negative_words = self.conditional_negative.keys()

      tokens_intersection = set(tokens).intersection(positive_words, negative_words)

      # calculating scores
      score_positive, score_negative = self.calculate_score(tokens_intersection)

      # classifying review
      sentiment = review["sentiment"]

      if score_positive > score_negative:
        if sentiment == "POS":
          self.correct_count.append(1)
        else:
          self.correct_count.append(0)

      else:
        if sentiment == "NEG":
          self.correct_count.append(1)
        else:
          self.correct_count.append(0)


  # calculating accuracy
  def classify_and_get_accuracy(self):
    self.train_bayes_classifier()
    self.apply_bayes()

    accuracy = sum(self.correct_count) / len(self.correct_count)

    return accuracy

## Default accuracy

In [9]:
# train/test split
train_set = [review for review in reviews if (review["cv"] >= 000 and review["cv"] <= 899)]
test_set = [review for review in reviews if (review["cv"] >= 900 and review["cv"] <= 999)]

# creating classifier
classifier = NaiveBayes(train_set, test_set)

# calculating accuracy
accuracy = classifier.classify_and_get_accuracy()

print(f"Classification accuracy with Naive Bayes: {accuracy * 100:.2f}%")

Classification accuracy with Naive Bayes: 86.50%


## Accuracy with smoothing

In [10]:
# train/test split
train_set = [review for review in reviews if (review["cv"] >= 000 and review["cv"] <= 899)]
test_set = [review for review in reviews if (review["cv"] >= 900 and review["cv"] <= 999)]

# setting kappa
kappa = 1

# creating classifier
classifier = NaiveBayes(train_set, test_set, kappa=kappa)

# calculating accuracy
accuracy = classifier.classify_and_get_accuracy()

print(f"Classification accuracy with Naive Bayes: {accuracy * 100:.2f}%")

Classification accuracy with Naive Bayes: 84.00%


## Round-robin cross validation

In [11]:
# getting k folds
def get_k_folds(k):
  dataset = {}

  for review in reviews:
    try:
      dataset[review["cv"] % k].append(review)
    except KeyError:
      dataset[review["cv"] % k] = []
      dataset[review["cv"] % k].append(review)

  return dataset


# getting round robin accuracy
def get_round_robin_accuracies(k_folds, kappa=0, stemmer=False, bigram=False, trigram=False):
  accuracies = []

  # train / test split
  for key_1 in k_folds:
    test_set = k_folds[key_1]
    train_set = []

    for key_2, value in k_folds.items():
      if key_1 != key_2:
        train_set.extend(value)

    # creating classifier
    classifier = NaiveBayes(train_set, test_set, kappa, stemmer, bigram, trigram)

    # calculating accuracy
    accuracy = classifier.classify_and_get_accuracy()

    accuracies.append(accuracy)

  return accuracies

## Accuracy with smoothing, cross-validation

In [12]:
# initialising number of folds
k = 10

# creating k folds
k_folds = get_k_folds(k)

# calculating accuracy
accuracies = get_round_robin_accuracies(k_folds, kappa)

# printing accuracies
average_accuracy = sum(accuracies) / len(accuracies)
print(f"Average accuracy per fold: {average_accuracy * 100:.2f}%")

Average accuracy per fold: 81.50%


## Accuracy with smoothing, cross-validation, stemming

In [13]:
# calculating accuracy
accuracies = get_round_robin_accuracies(k_folds, kappa, stemmer=True)

# printing accuracies
average_accuracy = sum(accuracies) / len(accuracies)
print(f"Average accuracy per fold: {average_accuracy * 100:.2f}%")

Average accuracy per fold: 81.45%


## Accuracy with smoothing, cross-validation, ngrams

In [14]:
# calculating accuracy
accuracies = get_round_robin_accuracies(k_folds, kappa, bigram=True, trigram=True)

# printing accuracies
average_accuracy = sum(accuracies) / len(accuracies)
print(f"Average accuracy per fold for unigram + bigram + trigram: {average_accuracy * 100:.2f}%")

Average accuracy per fold for unigram + bigram + trigram: 80.75%
