In [89]:
import numpy as np
import re
import nltk
from datasets import load_dataset
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

import matplotlib.pyplot as plt

In [None]:
# nltk.download('all')

# Part 0. Dataset Preparation

In [91]:
# loading the dataset from the library
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset ['train']
validation_dataset = dataset ['validation']
test_dataset = dataset ['test']

In [92]:
# check the sizes of each dataset
train_size = len(train_dataset)
validation_size = len(validation_dataset)
test_size = len(test_dataset)

print(f"Training dataset size: {train_size}")
print(f"Validation dataset size: {validation_size}")
print(f"Test dataset size: {test_size}")

Training dataset size: 8530
Validation dataset size: 1066
Test dataset size: 1066


In [93]:
# view an example from each dataset
print("Train Dataset")
print(train_dataset.features)
print(train_dataset[0]) 

print("Test Dataset")
print(test_dataset.features)
print(test_dataset[0]) 

print("Validation Dataset")
print(validation_dataset.features)
print(validation_dataset[0])

Train Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
Test Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .', 'label': 1}
Validation Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', 'label': 1}


# Part 1. Preparing Word Embeddings

### Preprocessing

In [94]:
def preprocessing(text):

    # remove any other special characters but keep the general ones for potential sentiment usage
    text = re.sub(r'[^a-zA-Z0-9\'\!\?\.]', ' ', text)

    # replace multiple spaces with one space only
    text = re.sub(r'\s+', ' ', text)

    # remove leading and trailing whitespace to avoid unnecessary inconsistency
    text = text.strip()

    # convert the text to lowercase
    text = text.lower()


    return text

# apply the preprocessing function to the 'text' column of each dataset
train_dataset = train_dataset.map(lambda x: {'text': preprocessing(x['text'])})
validation_dataset = validation_dataset.map(lambda x: {'text': preprocessing(x['text'])})
test_dataset = test_dataset.map(lambda x: {'text': preprocessing(x['text'])})

# an example of the processed text
print("Train Dataset Example:")
print(train_dataset[0])

Train Dataset Example:
{'text': "the rock is destined to be the 21st century's new conan and that he's going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal .", 'label': 1}


In [95]:
# tokenization
# empty list to store the resulting sentences
tokenized_sentences = []

for text in train_dataset['text']:
    # Tokenize the text and append the tokenized sentence to the list
    tokenized_sentences.append(word_tokenize(text))

### (a) Size of vocabulary in training data

In [96]:
# empty set for storing unique words
original_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the words set
        original_vocab.add(word)

print(f"(a) The size of vocabulary formed in the training data is {len(original_vocab)}")

(a) The size of vocabulary formed in the training data is 16683


### (b) Number of OOV in the training data

In [97]:
# adjust the parameters for word2vec
vector_size = 100 # Dimensionality of the word vectors
window = 3 # Maximum distance between the current and predicted word within a sentence
min_count = 2 # Ignores all words with total frequency lower than this
workers = 4 # CPU cores
sg = 1 # 1 for skip-gram, 0 for CBOW
epochs = 5 

# train the word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences, 
    vector_size = vector_size, 
    window = window, 
    min_count = min_count, 
    workers = workers,
    epochs = epochs)

# variable to store model's vocab list 
word2vec_vocab = set(word2vec_model.wv.key_to_index)

# Calculate OOV words by comparing the original vocab and Word2Vec vocab
oov_words = original_vocab - word2vec_vocab

print(f"(b) Number of OOV words in the training data is {len(oov_words)} when the minimum threshold for each word is {min_count}")

(b) Number of OOV words in the training data is 7866 when the minimum threshold for each word is 2


### (c) Mitigating OOV - code snippet

In [117]:
# define the UNK and PAD token
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'

def replace_oov_words(tokenized_sentences,vocab):
    # Replace all OOV words with <UNK>
    # process each sentence in the tokenized_sentences list
    for i, sentence in enumerate(tokenized_sentences):
        # empty list to store the current processed sentence
        processed_sentence = []
        for word in sentence:
            if word in vocab:
                # if the current word is in the model's vocab, keep it as it is
                processed_sentence.append(word)  
            else:
                # otherwise, replace the word with UNK
                processed_sentence.append(UNK_TOKEN) 

        # update the sentence in the original tokenized_sentences list
        tokenized_sentences[i] = processed_sentence

    return tokenized_sentences

### Embedding matrix

In [99]:
# empty set for storing unique words
final_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the final_vocab set
        final_vocab.add(word)

# add 'UNK' and '<PAD>' to the vocabulary
final_vocab.add(UNK_TOKEN)
final_vocab.add(PAD_TOKEN)

# create the dictionary that maps each word in final_vocab to a unique index
word_to_index = {word: i for i, word in enumerate(final_vocab)}

embedding_dim = word2vec_model.vector_size 

# initialize embedding matrix with number of vocab and embedding dimension
embedding_matrix = np.zeros((len(word_to_index), embedding_dim))

# fill the embedding matrix with the corresponding word vectors
for word, i in word_to_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
    else:
        # (option 1) random initialization for unknown words 
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
        # (option 2) use average vector for unknown words 
        # embedding_matrix[i] = np.mean(word2vec_model.wv.vectors, axis=0)

print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Shape of embedding matrix: (16685, 100)


In [100]:
# convert word to indices 
def words_to_indices(sentence, word_to_index):
    return [word_to_index.get(word, word_to_index[UNK_TOKEN]) for word in sentence.split()]

train_X = [words_to_indices(sentence, word_to_index) for sentence in train_dataset['text']]
train_y = train_dataset['label']
val_X = [words_to_indices(sentence, word_to_index) for sentence in validation_dataset['text']]
val_y = validation_dataset['label']
test_X = [words_to_indices(sentence, word_to_index) for sentence in test_dataset['text']]
test_y = test_dataset['label']

def create_dataloader(X, y, batch_size=16, shuffle=True):
    X_tensor = [torch.tensor(seq, dtype=torch.long) for seq in X]
    X_padded = pad_sequence(X_tensor, batch_first=True, padding_value=word_to_index[PAD_TOKEN])
    y_tensor = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X_padded, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(train_X, train_y, shuffle=True)
val_dataloader = create_dataloader(val_X, val_y, shuffle=False)
test_dataloader = create_dataloader(test_X, test_y, shuffle=False)

# convert embedding_matrix to tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)

# Part 3. Enhancement


## Part 3.1 Update the word embeddings during the training process



In [108]:
class RNNUpdateEmbedding(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_dim,
        output_dim,
        pad_idx,
        embedding_matrix,
        freeze_embeddings=False,
        aggregation_method="max_pooling",
        dropout_rate=0.2,
    ):
        super().__init__()

        # convert numpy embedding matrix to tensor
        embedding_tensor = torch.FloatTensor(embedding_matrix)

        self.embedding = nn.Embedding.from_pretrained(
            embedding_tensor, padding_idx=pad_idx, freeze=freeze_embeddings
        )

        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        # attention layer for attention aggregation method
        self.attention = nn.Linear(hidden_dim, 1)

        self.aggregation_method = aggregation_method

    def forward(self, text):
        # embedded = self.embedding(text)
        embedded = self.dropout(self.embedding(text))
        output, hidden = self.rnn(embedded)

        if self.aggregation_method == "last_hidden":
            # use the last hidden state
            sentence_repr = hidden.squeeze(0)
        elif self.aggregation_method == "last_output":
            # use the last output
            sentence_repr = output[:, -1, :]
        elif self.aggregation_method == "mean_pooling":
            # average all outputs
            sentence_repr = torch.mean(output, dim=1)
        elif self.aggregation_method == "max_pooling":
            # max pooling over the sequence
            sentence_repr, _ = torch.max(output, dim=1)
        elif self.aggregation_method == "attention":
            # Attention mechanism
            attention_weights = F.softmax(self.attention(output), dim=1)
            sentence_repr = torch.sum(attention_weights * output, dim=1)
        sentence_repr = self.dropout(sentence_repr)
        return self.fc(sentence_repr)

In [109]:
vocab_size = len(word_to_index)
embedding_dim = embedding_matrix.shape[1]  # match word2vec vector size
hidden_dim = 256
output_dim = 2
pad_idx = word_to_index["<PAD>"] if "<PAD>" in word_to_index else 0

model = RNNUpdateEmbedding(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    pad_idx,
    embedding_matrix,
    freeze_embeddings=False,
    aggregation_method="max_pooling",
)

# verify embedding layer
print(f"Embedding layer shape: {model.embedding.weight.shape}")
print(f"Embedding layer requires gradient: {model.embedding.weight.requires_grad}")

Embedding layer shape: torch.Size([16685, 100])
Embedding layer requires gradient: True


In [110]:
model.apply(lambda m: m.reset_parameters() if hasattr(m, "reset_parameters") else None)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(), lr=best_params["lr"], weight_decay=best_params["weight_decay"]
)

final_val_loss, final_val_acc = train_and_evaluate(
    model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    best_params["num_epochs"],
    best_params["patience"],
)

print(f"Final validation loss: {final_val_loss:.4f}")
print(f"Final validation accuracy: {final_val_acc:.4f}")

# Save the best model
torch.save(model.state_dict(), "./saved_models/part_3_1.pth")

Final validation loss: 0.5835
Final validation accuracy: 0.7250


In [112]:
best_params_enhanced_rnn, best_val_loss_enhanced_rnn, best_val_acc_enhanced_rnn = (
    hyperparameter_tuning(model, train_dataloader, val_dataloader)
)
print(f"Best parameters: {best_params_enhanced_rnn}")
print(f"Best validation loss: {best_val_loss_enhanced_rnn:.4f}")
print(f"Best validation accuracy: {best_val_acc_enhanced_rnn:.4f}")

# train using the best parameters
model.apply(lambda m: m.reset_parameters() if hasattr(m, "reset_parameters") else None)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=best_params_enhanced_rnn["lr"],
    weight_decay=best_params_enhanced_rnn["weight_decay"],
)

final_val_loss_enhanced_rnn, final_val_acc_enhanced_rnn = train_and_evaluate(
    model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    best_params_enhanced_rnn["num_epochs"],
    best_params_enhanced_rnn["patience"],
)

print(f"Final validation loss: {final_val_loss_enhanced_rnn:.4f}")
print(f"Final validation accuracy: {final_val_acc_enhanced_rnn:.4f}")

# Save the best model
torch.save(model.state_dict(), "./saved_models/part_3_1.pth")

Best parameters: {'batch_size': 32, 'lr': 0.0001, 'num_epochs': 50, 'patience': 5, 'weight_decay': 1e-05}
Best validation loss: 0.5514
Best validation accuracy: 0.7203
Final validation loss: 0.5504
Final validation accuracy: 0.7424


In [113]:
model.load_state_dict(torch.load("./saved_models/part_3_1.pth"))

model.eval()
test_loss = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, label in test_dataloader:
        outputs = model(inputs)
        loss = criterion(outputs, label)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)

        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

test_loss /= len(test_dataloader)
test_acc = accuracy_score(all_labels, all_predictions)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

Test Loss: 0.5417
Test Accuracy: 0.7439


  model.load_state_dict(torch.load("./saved_models/part_3_1.pth"))


## Part 3.2 Mitigating OOV


### Mitigating OOV using \<UNK\> token and fasttext

In [145]:
from gensim.models import FastText

corpus = tokenized_sentences
fasttext_model = FastText(
    sentences=corpus, vector_size=100, window=3, min_count=2, workers=4, sg=1
)

fasttext_vocab = set(fasttext_model.wv.key_to_index)

ft_oov_words = original_vocab - fasttext_vocab

print(f"Number of OOV words", len(ft_oov_words))

tokenized_sentences = replace_oov_words(tokenized_sentences, fasttext_vocab)

# empty set for storing unique words
final_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the final_vocab set
        final_vocab.add(word)

# add 'UNK' and '<PAD>' to the vocabulary
final_vocab.add(UNK_TOKEN)
final_vocab.add(PAD_TOKEN)

# create the dictionary that maps each word in final_vocab to a unique index
word_to_index = {word: i for i, word in enumerate(final_vocab)}

embedding_dim = fasttext_model.vector_size

# initialize embedding matrix with number of vocab and embedding dimension
embedding_matrix = np.zeros((len(word_to_index), embedding_dim))

# fill the embedding matrix with the corresponding word vectors
#since fasttext can generate vectors for OOV words, we can directly use the vectors
for word, i in word_to_index.items():
    embedding_matrix[i] = fasttext_model.wv[word]


print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Number of OOV words 7866
Shape of embedding matrix: (8819, 100)


In [146]:

train_X = [
    words_to_indices(sentence, word_to_index) for sentence in train_dataset["text"]
]
train_y = train_dataset["label"]
val_X = [
    words_to_indices(sentence, word_to_index) for sentence in validation_dataset["text"]
]
val_y = validation_dataset["label"]
test_X = [
    words_to_indices(sentence, word_to_index) for sentence in test_dataset["text"]
]
test_y = test_dataset["label"]



train_dataloader = create_dataloader(train_X, train_y, shuffle=True)
val_dataloader = create_dataloader(val_X, val_y, shuffle=False)
test_dataloader = create_dataloader(test_X, test_y, shuffle=False)

# convert embedding_matrix to tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)

### Train on RNN model with OOV elimination methods

In [151]:
vocab_size = len(word_to_index)
embedding_dim = embedding_matrix.shape[1]  # match fasttext vector size
hidden_dim = 256
output_dim = 2
pad_idx = word_to_index["<PAD>"] if "<PAD>" in word_to_index else 0

model = RNNUpdateEmbedding(
    vocab_size,
    embedding_dim,
    hidden_dim,
    output_dim,
    pad_idx,
    embedding_matrix,
    freeze_embeddings=False,
    aggregation_method="max_pooling",
)

# verify embedding layer
print(f"Embedding layer shape: {model.embedding.weight.shape}")
print(f"Embedding layer requires gradient: {model.embedding.weight.requires_grad}")

Embedding layer shape: torch.Size([8819, 100])
Embedding layer requires gradient: True


In [148]:
model.apply(lambda m: m.reset_parameters() if hasattr(m, "reset_parameters") else None)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(), lr=best_params["lr"], weight_decay=best_params["weight_decay"]
)

final_val_loss, final_val_acc = train_and_evaluate(
    model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    best_params["num_epochs"],
    best_params["patience"],
)

print(f"Final validation loss: {final_val_loss:.4f}")
print(f"Final validation accuracy: {final_val_acc:.4f}")

# Save the best model
torch.save(model.state_dict(), "./saved_models/part_3_2.pth")

Final validation loss: 0.5823
Final validation accuracy: 0.7110


In [149]:
best_params_enhanced_rnn, best_val_loss_enhanced_rnn, best_val_acc_enhanced_rnn = (
    hyperparameter_tuning(model, train_dataloader, val_dataloader)
)
print(f"Best parameters: {best_params_enhanced_rnn}")
print(f"Best validation loss: {best_val_loss_enhanced_rnn:.4f}")
print(f"Best validation accuracy: {best_val_acc_enhanced_rnn:.4f}")

# train using the best parameters
model.apply(lambda m: m.reset_parameters() if hasattr(m, "reset_parameters") else None)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(),
    lr=best_params_enhanced_rnn["lr"],
    weight_decay=best_params_enhanced_rnn["weight_decay"],
)

final_val_loss_enhanced_rnn, final_val_acc_enhanced_rnn = train_and_evaluate(
    model,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    best_params_enhanced_rnn["num_epochs"],
    best_params_enhanced_rnn["patience"],
)

print(f"Final validation loss: {final_val_loss_enhanced_rnn:.4f}")
print(f"Final validation accuracy: {final_val_acc_enhanced_rnn:.4f}")

# Save the best model
torch.save(model.state_dict(), "./saved_models/part_3_2.pth")

Best parameters: {'batch_size': 64, 'lr': 0.001, 'num_epochs': 50, 'patience': 5, 'weight_decay': 1e-05}
Best validation loss: 0.5507
Best validation accuracy: 0.7545
Final validation loss: 0.5422
Final validation accuracy: 0.7382


In [150]:
model.load_state_dict(torch.load("./saved_models/part_3_2.pth"))

model.eval()
test_loss = 0
all_predictions = []
all_labels = []

with torch.no_grad():
    for inputs, label in test_dataloader:
        outputs = model(inputs)
        loss = criterion(outputs, label)

        test_loss += loss.item()
        _, predicted = torch.max(outputs, 1)

        all_predictions.extend(predicted.cpu().numpy())
        all_labels.extend(label.cpu().numpy())

test_loss /= len(test_dataloader)
test_acc = accuracy_score(all_labels, all_predictions)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")

Test Loss: 0.6206
Test Accuracy: 0.7598


  model.load_state_dict(torch.load("./saved_models/part_3_2.pth"))
