## Download the dataset from Kaggle (Google Colab only)

In [None]:
"""
import os
from google.colab import drive

drive.mount('/content/drive')
os.environ['KAGGLE_CONFIG_DIR'] = "/content/drive/My Drive/kaggle"
%cd /content/drive/My Drive/colab_data/datasets
os.mkdir('sentiment-analysis-on-movie-reviews')
%cd sentiment-analysis-on-movie-reviews
!kaggle competitions download -c sentiment-analysis-on-movie-reviews
!ls
"""

## Sentiment Analysis Example (Rotten Tomatoes Dataset) <br>
Labels:

*   0 = negative
*   1 = somewhat negative
*   2 = neutral
*   3 = somewhat positive
*   4 = positive


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline

# Do not truncate long text columns when displaying data frames.
pd.set_option('display.max_colwidth', None)

from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Directory the dataset files are read from (a Google Drive path as mounted in Colab).
ROOT_DIR = '/content/drive/My Drive/colab_data/datasets/sentiment-analysis-on-movie-reviews'
# Human-readable class names; the list position is the numeric sentiment label (0-4).
LABELS = ['negative','somewhat negative', 'neutral', 'somewhat positive', 'positive']

In [None]:
# Use light-coloured labels, ticks and titles so plots stay readable when the
# notebook is shown on a dark background.
default_color = 'white'
mpl.rcParams.update({
    'axes.labelcolor': default_color,
    'xtick.color': default_color,
    'ytick.color': default_color,
    'axes.titlecolor': default_color,
})

### Analyse Dataset

In [None]:
# Load the tab-separated train and test archives (in that order), then the
# comma-separated sample submission file.
train, test = (
    pd.read_csv(os.path.join(ROOT_DIR, archive), sep="\t")
    for archive in ('train.tsv.zip', 'test.tsv.zip')
)
sampleSub = pd.read_csv(os.path.join(ROOT_DIR, 'sampleSubmission.csv'))

In [None]:
# Preview the first five rows of the training data.
train.head(5)

In [None]:
# Preview the first five rows of the test data.
test.head(5)

In [None]:
# Preview the first five rows of the sample submission file.
sampleSub.head(5)

In [None]:
# Bar chart of how many training phrases carry each sentiment label.
dist = train['Sentiment'].value_counts()

fig, ax = plt.subplots()
ax.set(xlabel='Sentiment', ylabel='Freq')

# Labels are set before plotting, exactly as before, so anything pandas
# chooses to overwrite while drawing is unchanged.
dist.plot.bar(ax=ax)

### Examine the word frequency

In [None]:
def display_wordCloud(label, ax, stopwords=None):
    """Draw a word cloud of the training phrases that carry one sentiment label.

    Reads the module-level ``train`` frame and ``LABELS`` list.

    Args:
        label: Integer sentiment class; selects rows where ``Sentiment`` equals
            it and indexes ``LABELS`` for the subplot title.
        ax: Matplotlib axes the cloud is drawn on.
        stopwords: Optional collection of words to leave out of the cloud;
            passed straight through to ``WordCloud``.
    """
    phrases = train.loc[train["Sentiment"] == label, "Phrase"]
    # A missing phrase is a float NaN and would make str.join raise TypeError,
    # so drop missing entries and make sure everything left is a string.
    text = ' '.join(phrases.dropna().astype(str))
    wordcloud = WordCloud(max_words=100, stopwords=stopwords, background_color="black").generate(text)

    ax.imshow(wordcloud, interpolation='bilinear', cmap='viridis')
    ax.set_title(LABELS[label])
    ax.axis('off')

In [None]:
# 3x2 grid of word clouds: negative / positive on the first row, the two
# "somewhat" classes on the second, neutral on the third. The last cell is unused.
f, axs = plt.subplots(3, 2, figsize=(24, 16))
for sentiment, axis in zip((0, 4, 1, 3, 2), axs.flat):
    display_wordCloud(sentiment, axis)
axs[2, 1].axis('off')

In [None]:
# Same grid as before, but with WordCloud's default stop words extended by
# terms that show up regardless of sentiment (presumably 'RRB'/'LRB' are
# bracket tokens left in the raw phrases - confirm against the data).
# The set is called custom_stopwords so it does not share a name with the
# `stopwords` corpus that is imported from nltk.corpus further down.
custom_stopwords = set(STOPWORDS)
custom_stopwords.update(['movie', 'film', 'one', 'RRB', 'LRB', 'character', 'characters', 'make', 'story'])
f, axs = plt.subplots(3,2,figsize=(24,16))
display_wordCloud(0,axs[0][0], custom_stopwords)
display_wordCloud(4,axs[0][1], custom_stopwords)
display_wordCloud(1,axs[1][0], custom_stopwords)
display_wordCloud(3,axs[1][1], custom_stopwords)
display_wordCloud(2,axs[2][0], custom_stopwords)
axs[2][1].axis('off')

## Data Preprocessing

In [None]:
from nltk.tokenize import word_tokenize
from nltk import FreqDist
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# English Snowball stemmer instance used by preprocess() to stem each token.
snowStem=SnowballStemmer('english')

from string import punctuation
import re

In [None]:
# Download the tokenizer data needed by word_tokenize if not yet downloaded
import nltk
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' rather than
# the pickled 'punkt' models, so fetch that resource as well.
nltk.download('punkt_tab')

In [None]:
def preprocess(phrases):
    """Turn raw phrases into lower-cased, stemmed, space-joined token strings.

    Each phrase is converted with ``str`` (so non-string values do not raise),
    lower-cased, has every character outside ``a-z`` replaced by a space, is
    tokenised with ``word_tokenize`` and stemmed with the module-level
    ``snowStem`` stemmer.

    Args:
        phrases: Iterable of raw phrases. Items are visited in iteration
            order, so any iterable works, not only index-able sequences.

    Returns:
        A list with one cleaned string per input phrase, in the same order.
    """
    # Compile once instead of looking the pattern up for every phrase
    non_letters = re.compile(r'[^a-z]')

    corpus = []
    for raw in phrases:
        cleaned = non_letters.sub(' ', str(raw).lower())
        stems = (snowStem.stem(token) for token in word_tokenize(cleaned))
        corpus.append(' '.join(stems))
    return corpus

In [None]:
# Store the cleaned text next to the raw phrase on both frames (train first,
# then test) and preview the result.
for frame in (train, test):
    frame['phrase_preprocess'] = preprocess(frame['Phrase'].values)
train.head()

# Split Training Set into Training and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the labelled phrases for validation; the fixed random_state
# keeps the split reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    train["phrase_preprocess"],
    train['Sentiment'].values,
    test_size=0.20,
    random_state=111,
)
x_test = test['phrase_preprocess'].values

# Encoding: Bag-of-Words (BoW)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the vocabulary from the training split only. Fitting on
# train + validation + test lets held-out text decide which tokens pass the
# min_df threshold, which leaks information into the features.
vectorizer = CountVectorizer(min_df = 5)
x_train = vectorizer.fit_transform(x_train)

# Held-out splits are only projected onto the training vocabulary;
# tokens unseen during fitting are ignored.
x_val = vectorizer.transform(x_val)
x_test = vectorizer.transform(x_test)

# Inspect the shape and the non-zero entries of the first encoded phrase
print(x_train[0].shape)
print(x_train[0])

# Neural Network

In [None]:
import torch
import numpy as np
from torch import nn
from torch.autograd import Variable
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F

In [None]:
class MLP(nn.Module):
    """Fully connected ReLU network that outputs one raw score (logit) per class.

    Args:
        input_dim: Size of each input feature vector.
        neurons: Sequence of hidden-layer widths, in order. May be empty, in
            which case the model is a single linear layer.
        num_classes: Number of output scores; defaults to 5.
    """

    def __init__(self, input_dim, neurons, num_classes=5):
        super().__init__()
        self.neurons = neurons
        self.nb_layers = len(neurons)

        # Consecutive pairs of widths define the hidden layers; the output
        # layer is appended last so layer indices match the hidden-layer count.
        widths = [input_dim, *neurons]
        self.layers = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths, widths[1:])]
        )
        self.layers.append(nn.Linear(widths[-1], num_classes))

    def forward(self, x):
        """Return logits with ``num_classes`` entries along the last dimension."""
        for i in range(self.nb_layers):
            x = F.relu(self.layers[i](x))
        # Return unnormalised logits: nn.CrossEntropyLoss applies log-softmax
        # itself, and the previous softmax over dim=0 normalised across the
        # batch instead of across classes. Use F.softmax(logits, dim=-1) when
        # probabilities are needed.
        return self.layers[self.nb_layers](x)

In [None]:
class ReviewDataset(Dataset):
    """Dataset pairing rows of a sparse feature matrix with their labels."""

    def __init__(self, features, labels):
        # features: 2-D matrix whose rows support .toarray() (used in __getitem__)
        # labels: index-able collection aligned with the rows of features
        self.features, self.labels = features, labels

    def __len__(self):
        """Number of samples, i.e. rows in the feature matrix."""
        n_samples = self.features.shape[0]
        return n_samples

    def __getitem__(self, idx):
        """Return the idx-th sample as a dict with a dense 1-D 'input' and its 'label'."""
        dense_row = self.features[idx].toarray().flatten()
        return dict(input=dense_row, label=self.labels[idx])

In [None]:
# Pick the device to run on: the current CUDA device index when a GPU is
# available (its name is printed), otherwise the CPU.
use_cuda = torch.cuda.is_available()
print(use_cuda)

if not use_cuda:
    current_device = torch.device("cpu")
else:
    current_device = torch.cuda.current_device()
    print(torch.cuda.get_device_name(current_device))


In [None]:
# Four hidden layers on top of the bag-of-words features, moved to the
# selected device before the optimizer is created.
model = MLP(x_train.shape[1], [800, 500, 300, 200]).to(current_device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()
epochs = 100


def print_(loss):
    """Print the given loss value behind a fixed prefix."""
    print("The loss calculated: ", loss)


print(model)

In [None]:
# Wrap both splits as datasets and serve them in shuffled batches of 100
# using four worker processes each.
train_dataset, val_dataset = ReviewDataset(x_train, y_train), ReviewDataset(x_val, y_val)
train_dataloader = DataLoader(
    dataset=train_dataset, batch_size=100, shuffle=True, num_workers=4
)
val_dataloader = DataLoader(
    dataset=val_dataset, batch_size=100, shuffle=True, num_workers=4
)

# Batch counts, used to average the accumulated loss
nb_train_batchs, nb_val_batchs = len(train_dataloader), len(val_dataloader)

In [None]:
# One pass over the training data: forward, backward and an optimizer step
# per batch, accumulating the batch losses along the way.
running_loss = 0.0
with torch.enable_grad():
    for i_batch, sample_batch in enumerate(train_dataloader):
        # Move the batch to the model's device with the dtypes the model/loss expect
        inputs = sample_batch['input'].to(current_device, dtype=torch.float)
        labels = sample_batch['label'].to(current_device, dtype=torch.long)

        optimizer.zero_grad()
        _y = model(inputs)
        loss = loss_fn(_y, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss per batch for this pass
    print_(running_loss / nb_train_batchs)

In [None]:
# One pass over the validation data: forward only, no gradients and no
# optimizer involvement, accumulating the batch losses.
running_loss = 0.0

# Evaluate in inference mode, then put the model back into whatever mode it
# was in so a following training pass is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    for sample_batch in val_dataloader:
        inputs = sample_batch['input'].to(current_device, dtype=torch.float)
        labels = sample_batch['label'].to(current_device, dtype=torch.long)

        _y = model(inputs)
        loss = loss_fn(_y, labels)

        running_loss += loss.item()
model.train(was_training)

# Mean loss per batch on the validation split
print_(running_loss/nb_val_batchs)

# Exercises:



1.   Try balancing the dataset and rerunning the same models
2.   Try Term Frequency-Inverse Document Frequency (tf-idf)
3.   Try incorporating additional preprocessing
4.   Try adjusting the network architecture (e.g. different activation functions, number of neurons)
5.   Try including an accuracy evaluation metric
6.   Try running for multiple epochs



