**Sentiment Analysis of Movie Reviews**

In [None]:
from __future__ import division
from collections import Counter # Counter() is a dict for counting
from collections import defaultdict
from numpy import mean
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Load train set

In [None]:
# Load the training phrases; the first TSV column becomes the index.
train = pd.read_csv("train.tsv", sep='\t', index_col=0)
train['Phrase'] = train['Phrase'].str.lower()  # change to lowercase
# Strip punctuation. The pattern is a regex character class, so regex=True must
# be explicit: newer pandas defaults to literal matching, which would leave
# every phrase unchanged.
train["Phrase"] = train['Phrase'].str.replace('[,.:;\'`-]', '', regex=True)

Collapse the five sentiment labels into three classes (negative, neutral, positive) and count the occurrences of each class

In [None]:
def threeValueSentiment(ar):
  """Map a row's five-point 'Sentiment' label to a three-class label.

  The function is pure: it performs no counting, so it is safe to pass to
  DataFrame.apply regardless of how many times a row is visited.

  Args:
    ar: a row-like object indexable by 'Sentiment'.

  Returns:
    0 (negative) for labels 0 and 1, 1 (neutral) for label 2,
    2 (positive) for labels 3 and 4, and -1 for any other value.
  """
  sentiment = ar['Sentiment']
  if sentiment in (0, 1):
    return 0
  if sentiment == 2:
    return 1
  if sentiment in (3, 4):
    return 2
  return -1

train['Sentiment'] = train.apply(threeValueSentiment, axis=1)

# Derive the class counts from the mapped column instead of incrementing
# globals inside apply(): side effects there are fragile (older pandas called
# the function twice on the first row, which inflated one counter).
# Rows mapped to -1 are deliberately not counted in any class.
negative_count = int((train['Sentiment'] == 0).sum())
neutral_count = int((train['Sentiment'] == 1).sum())
positive_count = int((train['Sentiment'] == 2).sum())

Prior probabilities of the three sentiment classes (named `posterior_*` in the code; they are the class frequencies in the training set)

In [None]:
def posterior_probabilities():
  """Return each class's share of the training rows.

  Reads the module-level class counters and `train`. Despite the name, the
  values are relative class frequencies in the training set.

  Returns:
    A (positive, neutral, negative) tuple of fractions.
  """
  total_rows = train.shape[0]
  class_counts = (positive_count, neutral_count, negative_count)
  return tuple(count / total_rows for count in class_counts)

posterior_positive, posterior_neutral, posterior_negative = posterior_probabilities()

Create features dictionary

In [None]:
# Per-class word occurrence counts (word -> number of occurrences).
like_pos = Counter()
like_neu = Counter()
like_neg = Counter()

N = len(train)
# Total number of word tokens seen in each class.
pos_count = 0
neu_count = 0
neg_count = 0

# Every distinct word seen in a row that belongs to one of the three classes.
vocabulary = set()

for tweet, sentiment in train.itertuples(index=False):
  words = tweet.strip().split()
  if sentiment == 2:
    occ_counts = like_pos
    pos_count += len(words)
  elif sentiment == 1:
    occ_counts = like_neu
    neu_count += len(words)
  elif sentiment == 0:
    occ_counts = like_neg
    neg_count += len(words)
  else:
    # Unmapped label (e.g. -1): skip the row. Falling through would add its
    # words to whichever counter the previous row selected.
    continue

  occ_counts.update(words)  # Store occurrence counts for all words
  vocabulary.update(words)

Likelihoods for features

In [None]:
# Vocabulary size, used in the add-one smoothing denominator.
vocab_len = len(vocabulary)

def calculate_likelihoods(ar, n):
  """Overwrite each count in `ar` with its add-one smoothed likelihood.

  Args:
    ar: mapping of word -> occurrence count for one class; modified in place.
    n: total number of word tokens counted for that class.
  """
  denominator = n + vocab_len
  # Only values are reassigned, so the key set does not change while iterating.
  for word, count in ar.items():
    ar[word] = (count + 1) / denominator

def calculate_default_likelihoods(n):
    """Return the smoothed likelihood of a word never counted for a class
    whose total token count is `n`."""
    return 1 / (n + vocab_len)

for class_counts, class_total in (
    (like_pos, pos_count),
    (like_neu, neu_count),
    (like_neg, neg_count),
):
  calculate_likelihoods(class_counts, class_total)

default_like_pos = calculate_default_likelihoods(pos_count)
default_like_neu = calculate_default_likelihoods(neu_count)
default_like_neg = calculate_default_likelihoods(neg_count)


Load dev set

In [None]:
# Load the dev phrases; the first TSV column becomes the index.
dev = pd.read_csv("dev.tsv", sep='\t', index_col=0)
dev['Phrase'] = dev['Phrase'].str.lower()  # change to lowercase
# Strip punctuation. The pattern is a regex character class, so regex=True must
# be explicit: newer pandas defaults to literal matching, which would leave
# every phrase unchanged.
dev["Phrase"] = dev['Phrase'].str.replace('[,.:;\'`-]', '', regex=True)

Classify each dev phrase with the Naive Bayes model

In [None]:
# Predict a three-class label for every dev phrase.
# Scores are accumulated as sums of logs rather than products of
# probabilities: multiplying many small likelihoods underflows to 0.0 for
# longer phrases, which makes the class scores indistinguishable.
# Note: np.log(0) yields -inf (with a RuntimeWarning) for a class whose
# prior is 0, so such a class is never selected.
log_prior_pos = np.log(posterior_positive)
log_prior_neu = np.log(posterior_neutral)
log_prior_neg = np.log(posterior_negative)

result = []
for sentence_id, tweet, _ in dev.itertuples():
  words = tweet.strip().split()
  # result_* hold log-scores (log prior + sum of log likelihoods).
  result_pos = log_prior_pos
  result_neu = log_prior_neu
  result_neg = log_prior_neg
  for token in words:
    # Words not counted for a class fall back to that class's default
    # smoothed likelihood. .get() does not insert missing keys.
    result_pos += np.log(like_pos.get(token, default_like_pos))
    result_neu += np.log(like_neu.get(token, default_like_neu))
    result_neg += np.log(like_neg.get(token, default_like_neg))

  # Pick the highest-scoring of all three classes. The previous comparison
  # ignored the neutral score entirely. max() keeps the first maximal entry,
  # so ties resolve in the order neutral, positive, negative.
  scores = [(1, result_neu), (2, result_pos), (0, result_neg)]
  predicted = max(scores, key=lambda pair: pair[1])[0]
  result.append([sentence_id, predicted])

In [None]:
# Assemble the predictions into a frame indexed by SentenceId and write it
# out as a tab-separated file named 'result'.
result_df = pd.DataFrame(result, columns=['SentenceId', 'Sentiment']).set_index('SentenceId')
result_df.to_csv('result', sep='\t')

Evaluate

In [None]:
# Collect [index value, sentiment] pairs from the dev frame; the phrase text
# is not needed for evaluation.
dev_real = [
  [sentence_id, sentiment]
  for sentence_id, _, sentiment in dev.itertuples()
]

# Gold labels in the same layout as the prediction frame.
dev_df = pd.DataFrame(dev_real, columns=['SentenceId', 'Sentiment']).set_index('SentenceId')

In [None]:
def threeValueSentimentDev(ar):
  """Map a row's five-point 'Sentiment' label to a three-class label.

  Args:
    ar: a row-like object indexable by 'Sentiment'.

  Returns:
    0 for labels 0 and 1, 1 for label 2, 2 for labels 3 and 4,
    and -1 for any other value.
  """
  label = ar['Sentiment']
  if label in (0, 1):
    return 0
  if label == 2:
    return 1
  if label in (3, 4):
    return 2
  return -1
# Collapse the dev labels to three classes, one row at a time (axis=1).
# NOTE: the column is overwritten, so re-running this cell re-maps the
# already collapsed labels (1 -> 0, 2 -> 1).
dev_df['Sentiment'] = dev_df.apply(threeValueSentimentDev, axis=1)

In [None]:
# Compare the gold and predicted label for every id in the dev index.
matches = [
  dev_df.loc[sentence_id, 'Sentiment'] == result_df.loc[sentence_id, 'Sentiment']
  for sentence_id in dev_df.index
]
correct = sum(1 for is_match in matches if is_match)
incorrect = len(matches) - correct
correct

659