# Importing libraries

In [2]:
import math
import os
import sys
from subprocess import call
from nltk import FreqDist
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
import sklearn as sk
import pickle
import json
from collections import Counter
import requests
import matplotlib.pyplot as plt
import numpy as np

# Importing movie reviews dataset

In [3]:
# !wget https://gist.githubusercontent.com/bastings/d47423301cca214e3930061a5a75e177/raw/5113687382919e22b1f09ce71a8fecd1687a5760/reviews.json

In [4]:
with open("reviews.json", mode="r", encoding="utf-8") as f:
  reviews = json.load(f)

# Lexicon based approach

## Importing lexicon

In [5]:
# !wget https://gist.githubusercontent.com/bastings/d6f99dcb6c82231b94b013031356ba05/raw/f80a0281eba8621b122012c89c8b5e2200b39fd6/sent_lexicon

## Binary classification

In [6]:
# creating lexicon dictionary with required features
lexicon_dict = {}

with open("sent_lexicon", mode="r", encoding="utf-8") as f:
  for i in f:
    i_list = i.strip().split()

    key = i_list[2].split("=")[1]

    if i_list[5].split("=")[1] == "positive":
      value_1 = 1
    elif i_list[5].split("=")[1] == "negative":
      value_1 = -1
    else:
      value_1 = 0

    MULTIPLIER_WEAK = 0.5
    MULTIPLIER_STRONG = 1
    value_2 = MULTIPLIER_WEAK * value_1 if i_list[0].split("=")[1] == "weaksubj" else MULTIPLIER_STRONG * value_1

    lexicon_dict[key] = [value_1, value_2]

# function to get binary_scores
def get_binary_score(review):
  binary_score = 0
  doc_length = 0

  for sentences in review["content"]:
    for word, _ in sentences:
      try:
        binary_score += lexicon_dict[word][0]
        # doc_length += 1
      except KeyError:
        binary_score += 0
      
      doc_length += 1
      
  return [binary_score, doc_length]

# function to classify reivew
def classify_review(parameters):
  score, doc_length = parameters
  THRESHOLD = 8

  if score > THRESHOLD:
    return "POS"
  else:
    return "NEG"

# calculating accuracy
classifications = [classify_review(get_binary_score(review)) for review in reviews]
token_results = [1 if classification == reviews[i]["sentiment"] else 0 for i, classification in enumerate(classifications)]
token_accuracy = token_results.count(1)/len(token_results)
print("Accuracy: %0.2f" % token_accuracy)

Accuracy: 0.68


## Weighted classification

In [7]:
# function to get weighted scores
def get_weighted_score(review):
  weighted_score = 0
  doc_length = 0

  for sentences in review["content"]:
    for word, _ in sentences:
      try:
        weighted_score += lexicon_dict[word][1]
        # doc_length += 1
      except KeyError:
        weighted_score += 0

      doc_length += 1

  return [weighted_score, doc_length]

# calculating accuracy
classifications_weighted = [classify_review(get_weighted_score(review)) for review in reviews]
magnitude_results = [1 if classification == reviews[i]["sentiment"] else 0 for i, classification in enumerate(classifications_weighted)]
magnitude_accuracy = magnitude_results.count(1)/len(magnitude_results)
print("Accuracy: %0.2f" % magnitude_accuracy)

Accuracy: 0.69


## Creating better threshold

In [8]:
# new threshold classification
def classify_review_better(parameters):
  score, doc_length = parameters
  THRESHOLD = 1.02 * math.log(doc_length)

  if score >= THRESHOLD:
    return "POS"
  else:
    return "NEG"

# calculating new weighted classification accuracy
classifications_weighted_new = [classify_review_better(get_weighted_score(review)) for review in reviews]
magnitude_results_2 = [1 if classification == reviews[i]["sentiment"] else 0 for i, classification in enumerate(classifications_weighted_new)]
magnitude_accuracy_2 = magnitude_results_2.count(1)/len(magnitude_results_2)
print("New accuracy of weighted classification: %0.2f" % magnitude_accuracy_2)

New accuracy of weighted classification: 0.70


# Naive Bayes approach

## Model definition

In [11]:
def train_bayes_classifier(train_set, kappa=0):
  vocabulary_positive = {}
  vocabulary_negative = {}
  vocabulary = []

  reviews_count_positive = 0
  reviews_count_negative = 0

  for review in train_set:
    sentiment = review["sentiment"]

    # counting positive/negative reviews
    if sentiment == "POS":
      reviews_count_positive += 1
    elif sentiment == "NEG":
      reviews_count_negative += 1
    
    # feature extraction
    for sentences in review["content"]:
      for word, _ in sentences:
        token = word.lower()
        vocabulary.append(token)

        if sentiment == "POS":
          try:
            vocabulary_positive[token] += 1
          except KeyError:
            vocabulary_positive[token] = 1

        elif sentiment == "NEG":
          try:
            vocabulary_negative[token] += 1
          except KeyError:
            vocabulary_negative[token] = 1

  # vocabulary of training set
  vocabulary = list(set(vocabulary))

  # total reviews count
  reviews_count_total = len(train_set)

  # calculating prior
  prior_positive = reviews_count_positive / reviews_count_total
  prior_negative = reviews_count_negative / reviews_count_total

  # calculating conditional probability
  denominator_positive = sum(vocabulary_positive.values()) + (len(vocabulary) * kappa)
  denominator_negative = sum(vocabulary_negative.values()) + (len(vocabulary) * kappa)

  if kappa == 0:
    conditional_positive = {word.lower(): (count / denominator_positive) for word, count in vocabulary_positive.items()}
    conditional_negative = {word.lower(): (count / denominator_negative) for word, count in vocabulary_negative.items()}
    
  else:
    conditional_positive = {word.lower(): ((count + kappa) / denominator_positive) for word, count in vocabulary_positive.items()}
    conditional_negative = {word.lower(): ((count + kappa) / denominator_negative) for word, count in vocabulary_negative.items()}

  return vocabulary, prior_positive, prior_negative, conditional_positive, conditional_negative


def apply_bayes_classifier(review, voacabulary, prior_positive, prior_negative, conditional_positive, conditional_negative):
  # extracting tokens
  tokens = []

  for sentences in review["content"]:
    for word, _ in sentences:
      tokens.append(word.lower())

  positive_words = conditional_positive.keys()
  negative_words = conditional_negative.keys()

  tokens_intersection = set(vocabulary).intersection(tokens, positive_words, negative_words)

  # calculating scores
  score_positive = math.log(prior_positive)
  score_negative = math.log(prior_negative)

  for token in tokens_intersection:
      score_positive += math.log(conditional_positive[token])
      score_negative += math.log(conditional_negative[token])

  # classifying review
  if score_positive > score_negative:
    return "POS"
  else:
    return "NEG"

## Training / prediction

In [12]:
# train/test split
train_set = [review for review in reviews if (review["cv"] >= 000 and review["cv"] <= 899)]
test_set = [review for review in reviews if (review["cv"] >= 900 and review["cv"] <= 999)]

# training model
KAPPA = 2
vocabulary, prior_positive, prior_negative, conditional_positive, conditional_negative = train_bayes_classifier(train_set, KAPPA)

# predicting sentiment
predictions = []

for review in test_set:
  prediction = apply_bayes_classifier(review, vocabulary, prior_positive, prior_negative, conditional_positive, conditional_negative)
  
  if prediction == review["sentiment"]:
    predictions.append(1)
  else:
    predictions.append(0)

# calculating accuracy
accuracy = predictions.count(1) / len(predictions)
print(f"Classification accuracy with Naive Bayes: {accuracy:.2f}")

Classification accuracy with Naive Bayes: 0.88
