In [None]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules before each
# cell executes, so edits to the project modules imported below take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# 7. Sentiment Analysis - Elman RNN

## Setup

First, set up the paths to the preprocessed dataset and the pre-trained GloVe embeddings.

In [None]:
# Locate the preprocessed dataset and the GloVe vectors relative to the notebook folder
import os

# `__file__` is not defined inside a notebook, so the quoted placeholder is resolved
# against the current working directory; its dirname serves as the base folder.
fileDir = os.path.dirname(os.path.realpath('__file__'))

# Joined paths (these still contain the '..' segment)
absFilePathToPreprocessedDataset = os.path.join(
    fileDir, '../db_processed/db_sequences_words_split_short.csv'
)
absFilePathToGloVe = os.path.join(fileDir, '../GloVe/glove.6B.50d.txt')

# Canonical absolute paths: symlinks resolved and '..' segments collapsed
pathToPreprocessedDataset, pathToGloveEmbeddings = (
    os.path.abspath(os.path.realpath(joinedPath))
    for joinedPath in (absFilePathToPreprocessedDataset, absFilePathToGloVe)
)

# Show both resolved paths, one per line, so a wrong location is spotted immediately
print(pathToPreprocessedDataset, pathToGloveEmbeddings, sep='\n')

Choose the device to run the training on:

In [None]:
# Device identifier used later by model.to(...), the trainer's train(...) call and evaluate_model(...).
# NOTE(review): presumably any torch device string (e.g. "cuda") would work here — confirm.
device = "cpu"

Set the learning rate parameter:

In [None]:
# Step size handed to the Adam optimizer when it is constructed further down
learningRate = 1e-4

## Initialization

In [None]:
from Common.TwitterDataset import TwitterDataset

# Step #1: Instantiate the dataset
# Build the dataset (and its vectorizer) from the preprocessed CSV located above.
# NOTE(review): representation="indices" presumably encodes each text as a sequence of
# vocabulary indices, as an embedding layer expects — confirm in TwitterDataset.
dataset = TwitterDataset.load_dataset_and_make_vectorizer(pathToPreprocessedDataset, representation="indices")
# Keep a handle on the vectorizer: its vocabularies size the model below and it is
# passed to the inference helpers at the end of the notebook.
vectorizer = dataset.get_vectorizer()

### Option B: Use pre-trained embeddings
To use pre-trained embeddings, there are three steps:

1. Load the pretrained embeddings
2. Select only the subset of embeddings for the words that are actually present in the data
3. Set the Embedding Layer's weight matrix to the loaded subset

In [None]:
# Step #1.B.1: Load the pre-trained embeddings
from Common.PreTrainedEmbeddings import PreTrainedEmbeddings

embeddings = PreTrainedEmbeddings.from_embeddings_file(pathToGloveEmbeddings)

# Step #1.B.2: Initialize the embedding matrix

# The serialized text vocabulary exposes a "token_to_idx" mapping; its keys are the
# vocabulary words. Note that `word_list` is a keys view rather than an actual list.
serialized_text_vocabulary = vectorizer.text_vocabulary.to_serializable()
word_list = serialized_text_vocabulary["token_to_idx"].keys()

# Build the matrix of pre-trained vectors restricted to the vocabulary words
embeddings_matrix = embeddings.make_embeddings_matrix(word_list)

In [None]:
# Display the matrix dimensions as a sanity check.
# NOTE(review): presumably (vocabulary size, embedding dimension) — confirm.
embeddings_matrix.shape

In [None]:
import torch.nn as nn
import torch.optim as optim
from Models.ModelElmanRNN import SentimentClassifierElmanRNN

# Step #2: Instantiate the model
# Hyper-parameters are gathered in one place and unpacked into the constructor.
model_config = dict(
    # NOTE(review): 50 presumably matches the 50-dimensional GloVe file loaded earlier — confirm
    embedding_size=50,
    # one embedding row per token of the text vocabulary
    num_embeddings=len(vectorizer.text_vocabulary),
    rnn_hidden_dim=10,
    # one output per entry of the target vocabulary
    output_dim=len(vectorizer.target_vocabulary),
    padding_idx=vectorizer.text_vocabulary.mask_index,
    batch_first=True,
    # Step #1.B.3: set the loaded subset as a weight matrix
    pretrained_embedding_matrix=embeddings_matrix,
)
# Build the classifier and send it to the chosen device in one go
model = SentimentClassifierElmanRNN(**model_config).to(device)

# Step #3: Instantiate the loss function
loss_func = nn.CrossEntropyLoss()

# Step #4: Instantiate the optimizer
# Created after the model has been moved, so it tracks the parameters on `device`.
optimizer = optim.Adam(params=model.parameters(), lr=learningRate)

In [None]:
# Inspect the text vocabulary: first its number of tokens, then the object's printed form
print(len(vectorizer.text_vocabulary))
print(vectorizer.text_vocabulary)

## Training Loop

In [None]:
from Common.Trainer import Trainer

# Bundle the dataset, model, loss function and optimizer created above into one trainer
# object; it is used below both to run training and to evaluate the result.
sentiment_analysis_trainer = Trainer(
    dataset=dataset,
    model=model,
    loss_func=loss_func,
    optimizer=optimizer
)

In [None]:
# setup the chosen number of epochs
num_epochs = 100
# setup the chosen batch size (reused by the evaluation cell further down)
batch_size = 20

# Run training on the selected device and keep whatever train(...) returns in `report`.
# NOTE(review): no random seed is set in the visible cells, so results may differ
# between runs — consider seeding before the model is created.
report = sentiment_analysis_trainer.train(num_epochs=num_epochs, batch_size=batch_size, device=device)

## Evaluate the results

In [None]:
from RunHelper import evaluate_model

# set the model in eval state
# NOTE(review): assuming the model is a torch nn.Module, this switches layers such as
# dropout to inference behaviour — confirm against the model definition.
model.eval()

# Evaluate through the trainer, on the same device and with the same batch size as training
evaluate_model(sentiment_analysis_trainer, device, batch_size)

## Inference and classifying new data points

Let's run inference on new data. This is another evaluation method: it lets us make a qualitative judgement about whether the model is working.

Let's try the model on some examples:

In [None]:
from RunHelper import run_examples

# Free-text inputs to classify; add more strings to this list to try other examples
examples = [
    "mr and mrs dursley"
]

# Run the examples through the trained model, passing the vectorizer built from the dataset
run_examples(examples, model, vectorizer)

### More detailed evaluation on the Test Set

In [None]:
from RunHelper import model_run_and_evaluate

# Run the trained model over the dataset and report the detailed evaluation.
# NOTE(review): the heading says "Test Set"; presumably the helper selects the test
# split of `dataset` itself — confirm in RunHelper.
model_run_and_evaluate(dataset, vectorizer, model)