**Sentiment Analysis of Movie Reviews**

In [1]:
from __future__ import division
from collections import Counter # Counter() is a dict for counting
from collections import defaultdict
from numpy import mean
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import spacy
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Load train set

In [2]:
train = pd.read_csv("train.tsv", sep='\t', index_col=0)
train['Phrase'] = train['Phrase'].str.lower()  # change to lowercase
train['Phrase'] = train['Phrase'].str.replace('[,.:;\'`-]','', regex=True)

stop_words = set(stopwords.words('english'))
# train['Phrase']= train['Phrase'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [3]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return ' '.join([lemmatizer.lemmatize(w) for w in text.split(' ')])

train['Phrase'] = train.Phrase.apply(lemmatize_text)

Count class occurances

In [4]:
negative_count = train[train.Sentiment == 0].shape[0]
some_negative_count = train[train.Sentiment == 1].shape[0]
neutral_count = train[train.Sentiment == 2].shape[0]
some_positive_count = train[train.Sentiment == 3].shape[0]
positive_count = train[train.Sentiment == 4].shape[0]

Posterior probabilites for 3 sentiment

In [5]:
def posterior_probabilities():
  negative = negative_count/ train.shape[0]
  som_negative = some_negative_count/ train.shape[0]
  neutral = neutral_count / train.shape[0]
  som_positive = some_positive_count / train.shape[0]
  positive = positive_count / train.shape[0]
  return negative, som_negative, neutral, som_positive, positive

posterior_negative, posterior_som_negative, posterior_neutral, posterior_som_positive, posterior_positive  = posterior_probabilities()

Create features dictionary

In [6]:
# Dictionaries to keep word occurances for each class
like_neg = Counter(); 
like_som_neg = Counter();
like_neu = Counter(); 
like_som_pos = Counter(); 
like_pos = Counter();

# Word counters for each class
neg_count = 0
som_neg_count = 0
neu_count = 0
som_pos_count = 0
pos_count = 0

vocabulary = set()

for tweet, sentiment in train.itertuples(index=False):
    words = tweet.strip().split()
    if sentiment == 0:
      occ_counts = like_neg
      neg_count += len(words)
    elif sentiment == 1:
      occ_counts = like_som_neg
      som_neg_count += len(words)
    elif sentiment == 2:
      occ_counts = like_neu
      neu_count += len(words)
    elif sentiment == 3:
      occ_counts = like_som_pos
      som_pos_count += len(words)
    elif sentiment == 4:
      occ_counts = like_pos
      pos_count += len(words)

    for word in words:
      occ_counts[word] += 1 # Store occurence counts for all words
      vocabulary.add(word)

Likelihoods for features (dont run multiple times)

In [7]:
vocab_len = len(vocabulary)
def calculate_likelihoods(ar, n):
  for word in ar:
    ar[word] = (ar[word] + 1)/ (n + vocab_len)

calculate_likelihoods(like_neg, neg_count)
calculate_likelihoods(like_som_neg, som_neg_count)
calculate_likelihoods(like_neu, neu_count)
calculate_likelihoods(like_som_pos, som_pos_count)
calculate_likelihoods(like_pos, pos_count)

def calculate_default_likelihoods(n):
    return 1/(n + vocab_len)

default_like_neg = calculate_default_likelihoods(neg_count)
default_like_som_neg = calculate_default_likelihoods(som_neg_count)
default_like_neu = calculate_default_likelihoods(neu_count)
default_like_som_pos = calculate_default_likelihoods(som_pos_count)
default_like_pos = calculate_default_likelihoods(pos_count)

Load dev set

In [8]:
dev = pd.read_csv("dev.tsv", sep='\t', index_col=0)
dev['Phrase'] = dev['Phrase'].str.lower()  # change to lowercase
dev["Phrase"] = dev['Phrase'].str.replace('[,.:;\'`-]','', regex=True)

Use model on dev set

In [9]:
result = []
for id, tweet, s in dev.itertuples():
  words = tweet.strip().split()
  result_neg = posterior_negative
  result_som_neg = posterior_som_negative  
  result_neu = posterior_neutral
  result_pos = posterior_positive
  result_som_pos = posterior_som_positive

  def calculate_final_posterior(ar, token, default):
    if token in ar:
      return ar[token]
    else:
      return default

  for token in words:

    result_pos *= calculate_final_posterior(like_pos, token, default_like_pos)
    result_som_pos *= calculate_final_posterior(like_som_pos, token, default_like_som_pos)
    result_neu *= calculate_final_posterior(like_neu, token, default_like_neu)
    result_som_neg *= calculate_final_posterior(like_som_neg, token, default_like_som_neg)
    result_neg *= calculate_final_posterior(like_neg, token, default_like_neg)

  score_max = max(result_neg, result_som_neg, result_neu, result_som_pos, result_pos)

  if (score_max == result_neg):
    result.append([id, 0])
  elif (score_max == result_som_neg):
    result.append([id, 1])
  elif (score_max == result_neu):
    result.append([id, 2])
  elif (score_max == result_som_pos):
    result.append([id, 3])
  elif (score_max == result_pos):
    result.append([id, 4])

Write to result file

In [10]:
result_df = pd.DataFrame(result, columns=['SentenceId', 'Sentiment'])
result_df.set_index('SentenceId', inplace=True)
result_df.to_csv('result.tsv', sep='\t')

Evaluate

In [11]:
dev_real = []
for id, tweet, s in dev.itertuples():
  dev_real.append([id, s])

dev_df = pd.DataFrame(dev_real, columns=['SentenceId', 'Sentiment'])
dev_df.set_index('SentenceId', inplace=True)

Macro-F1

In [12]:
def calculate_tp_fp_fn(sentiment):
  tp = fp = fn = 0 
  for id, s in dev_df[dev_df.Sentiment == sentiment].itertuples():
      if s == result_df.loc[id, 'Sentiment']:
        tp += 1
      else:
        fp += 1
  fn = result_df[result_df.Sentiment == sentiment].shape[0] - tp
  return tp, fp, fn

def calculate_f1(sentiment):
  tp, fp, fn = calculate_tp_fp_fn(sentiment)
  f1 = (2 * tp) / (2 * tp + fp + fn)
  return f1

def calculate_macro_f1():
  m_f1 = 1
  for i in range (0,5):
    m_f1 += calculate_f1(i)
  
  return m_f1/5

calculate_macro_f1()

0.5039077608357471

In [13]:
correct = 0
incorrect = 0
for id, s in dev_df.itertuples():
  if s == result_df.loc[id, 'Sentiment']:
    correct += 1
  else:
    incorrect += 1
print(incorrect)

690


In [14]:
# https://www.researchgate.net/publication/306364792_On_stopwords_filtering_and_data_sparsity_for_sentiment_analysis_of_twitter