In [38]:
%%capture
!pip install nltk datasets gensim numpy optuna

In [39]:
import numpy as np
import re
import nltk
from datasets import load_dataset
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

import optuna
from typing import List
import torch.optim as optim
from optuna.trial import Trial
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score

import matplotlib.pyplot as plt

In [40]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

# Part 0. Dataset Preparation

In [41]:
# loading the dataset from the library
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset ['train']
validation_dataset = dataset ['validation']
test_dataset = dataset ['test']

In [42]:
# check the sizes of each dataset
train_size = len(train_dataset)
validation_size = len(validation_dataset)
test_size = len(test_dataset)

print(f"Training dataset size: {train_size}")
print(f"Validation dataset size: {validation_size}")
print(f"Test dataset size: {test_size}")

Training dataset size: 8530
Validation dataset size: 1066
Test dataset size: 1066


In [43]:
# view an example from each dataset
print("Train Dataset")
print(train_dataset.features)
print(train_dataset[0])

print("Test Dataset")
print(test_dataset.features)
print(test_dataset[0])

print("Validation Dataset")
print(validation_dataset.features)
print(validation_dataset[0])

Train Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
Test Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .', 'label': 1}
Validation Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', 'label': 1}


# Part 1. Preparing Word Embeddings

### Preprocessing

In [44]:
def preprocessing(text):

    # remove any other special characters but keep the general ones for potential sentiment usage
    text = re.sub(r'[^a-zA-Z0-9\'\!\?\.]', ' ', text)

    # replace multiple spaces with one space only
    text = re.sub(r'\s+', ' ', text)

    # remove leading and trailing whitespace to avoid unnecessary inconsistency
    text = text.strip()

    return text

# apply the preprocessing function to the 'text' column of each dataset
train_dataset = train_dataset.map(lambda x: {'text': preprocessing(x['text'])})
validation_dataset = validation_dataset.map(lambda x: {'text': preprocessing(x['text'])})
test_dataset = test_dataset.map(lambda x: {'text': preprocessing(x['text'])})

# an example of the processed text
print("Train Dataset Example:")
print(train_dataset[0])

Train Dataset Example:
{'text': "the rock is destined to be the 21st century's new conan and that he's going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal .", 'label': 1}


In [45]:
# tokenization
# empty list to store the resulting sentences
tokenized_sentences = []

for text in train_dataset['text']:
    # Tokenize the text and append the tokenized sentence to the list
    tokenized_sentences.append(word_tokenize(text))

### (a) Size of vocabulary in training data

In [46]:
# empty set for storing unique words
original_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the words set
        original_vocab.add(word)

print(f"(a) The size of vocabulary formed in the training data is {len(original_vocab)}")

(a) The size of vocabulary formed in the training data is 16683


### (b) Number of OOV in the training data

In [47]:
# adjust the parameters for word2vec
vector_size = 100 # Dimensionality of the word vectors
window = 3 # Maximum distance between the current and predicted word within a sentence
min_count = 2 # Ignores all words with total frequency lower than this
workers = 4 # CPU cores
sg = 1 # 1 for skip-gram, 0 for CBOW
epochs = 5

# train the word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences,
    vector_size = vector_size,
    window = window,
    min_count = min_count,
    workers = workers,
    epochs = epochs)

# variable to store model's vocab list
word2vec_vocab = set(word2vec_model.wv.key_to_index)

# Calculate OOV words by comparing the original vocab and Word2Vec vocab
oov_words = original_vocab - word2vec_vocab

print(f"(b) Number of OOV words in the training data is {len(oov_words)} when the minimum threshold for each word is {min_count}")

(b) Number of OOV words in the training data is 7866 when the minimum threshold for each word is 2


### (c) Mitigating OOV

In [48]:
# Replace all OOV words with <UNK>

# define the UNK and PAD token
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'

# process each sentence in the tokenized_sentences list
for i, sentence in enumerate(tokenized_sentences):
    # empty list to store the current processed sentence
    processed_sentence = []
    for word in sentence:
        if word in word2vec_vocab:
            # if the current word is in the model's vocab, keep it as it is
            processed_sentence.append(word)
        else:
            # otherwise, replace the word with UNK
            processed_sentence.append(UNK_TOKEN)

    # update the sentence in the original tokenized_sentences list
    tokenized_sentences[i] = processed_sentence

### Embedding matrix

In [49]:
# empty set for storing unique words
final_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the final_vocab set
        final_vocab.add(word)

# add 'UNK' and '<PAD>' to the vocabulary
final_vocab.add(UNK_TOKEN)
final_vocab.add(PAD_TOKEN)

# create the dictionary that maps each word in final_vocab to a unique index
word_to_index = {word: i for i, word in enumerate(final_vocab)}

embedding_dim = word2vec_model.vector_size

# initialize embedding matrix with number of vocab and embedding dimension
embedding_matrix = np.zeros((len(word_to_index), embedding_dim))

# fill the embedding matrix with the corresponding word vectors
for word, i in word_to_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
    else:
        # (option 1) random initialization for unknown words
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
        # (option 2) use average vector for unknown words
        # embedding_matrix[i] = np.mean(word2vec_model.wv.vectors, axis=0)

print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Shape of embedding matrix: (8819, 100)


In [61]:
# convert word to indices
def words_to_indices(sentence, word_to_index):
    return [word_to_index.get(word, word_to_index[UNK_TOKEN]) for word in sentence.split()]

train_X = [words_to_indices(sentence, word_to_index) for sentence in train_dataset['text']]
train_y = train_dataset['label']
val_X = [words_to_indices(sentence, word_to_index) for sentence in validation_dataset['text']]
val_y = validation_dataset['label']
test_X = [words_to_indices(sentence, word_to_index) for sentence in test_dataset['text']]
test_y = test_dataset['label']

def create_dataloader(X, y, batch_size=16, shuffle=True):
    X_tensor = [torch.tensor(seq, dtype=torch.long) for seq in X]
    X_padded = pad_sequence(X_tensor, batch_first=True, padding_value=word_to_index[PAD_TOKEN])
    y_tensor = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X_padded, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(train_X, train_y, shuffle=True)
val_dataloader = create_dataloader(val_X, val_y, shuffle=False)
test_dataloader = create_dataloader(test_X, test_y, shuffle=False)

# convert embedding_matrix to tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)
print(embedding_matrix.shape)

torch.Size([8819, 100])


In [84]:
print(train_X)

[[3976, 2701, 3645, 1566, 5640, 5996, 3976, 6010, 6400, 5077, 3279, 3859, 5938, 6400, 1376, 5640, 3929, 8132, 1046, 2895, 4722, 5613, 1238, 1313, 7510, 6400, 3501, 6663, 2735, 4070, 6400, 3353], [3976, 2804, 3849, 1379, 3471, 3976, 2046, 3471, 3976, 5922, 1575, 3645, 1414, 6864, 5938, 8132, 3595, 3471, 2987, 6400, 2304, 4243, 1068, 6803, 3280, 6289, 6400, 1687, 1454, 3471, 2224, 3353, 6303, 3353, 6303, 3353, 6400, 8469, 7159, 3353], [1480, 7317, 7138, 3148, 5824], [8498, 2320, 1364, 1618, 5640, 3066, 5640, 3976, 149, 5640, 4658, 4053, 474, 3645, 8132, 5111, 2191, 5640, 3300, 3353], [3276, 6826, 5710, 8082, 4266, 2067, 5157, 6400, 1414, 8630, 3859, 3683, 99, 5938, 213, 6400, 8344, 1618, 1471, 3353], [3976, 1121, 4462, 2918, 8427, 927, 7482, 3976, 8290, 8376, 3471, 8436, 641, 2895, 2729, 1923, 4658, 5354, 3976, 4099, 4572, 3471, 3976, 6768, 3353], [5244, 5938, 8082, 1710, 3471, 6347, 3859, 5676, 3353], [3046, 1941, 8149, 8088, 5084, 8711, 3072, 4096, 6400, 5938, 3976, 2741, 5640, 8813, 3

# Part 3. Model Training & Evaluation - CNN

In [51]:
torch.manual_seed(42)

<torch._C.Generator at 0x7c1edc3db290>

In [79]:
class SentimentCNN(nn.Module):
  def __init__(self,
               embedding_dim: int,
               embedding_matrix: torch.FloatTensor,
               pad_idx: int,
               num_classes: int,
               num_filters: int,
               filter_sizes: List[int],
               dropout: float = 0.5,
               freeze_embeddings: bool = True,
               hidden_dim: int = 128):
    super(SentimentCNN, self).__init__()

    # Load pretrained embeddings
    embedding_tensor = torch.FloatTensor(embedding_matrix)
    self.embedding = nn.Embedding.from_pretrained(embedding_tensor, padding_idx=pad_idx, freeze=freeze_embeddings)

    self.convs = nn.ModuleList([
        nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
    ])

    self.dropout = nn.Dropout(dropout)
    self.fc1 = nn.Linear(len(filter_sizes) * num_filters, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, num_classes)

  def forward(self, x):
    x = self.embedding(x)
    x = x.unsqueeze(1)

    conv_outputs = []
    for conv in self.convs:
        conv_out = torch.relu(conv(x)).squeeze(3)
        pool_out = torch.max_pool1d(conv_out, conv_out.shape[2]).squeeze(2)
        conv_outputs.append(pool_out)


    x = torch.cat(conv_outputs, dim=1)
    x = self.dropout(x)
    x = self.fc1(x)
    x = torch.relu(x)
    x = self.dropout(x)
    x = self.fc2(x)

    return x

In [80]:
def train_epoch(model, train_dataloader, optimizer, criterion):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for batch in train_dataloader:
      texts, labels = batch
      texts, labels = texts.to(device), labels.to(device)

      optimizer.zero_grad()
      predictions = model(texts)
      # print(f"Predictions: {predictions}")
      # print(f"Labels: {labels}")

      loss = criterion(predictions, labels)
      acc = (predictions.argmax(1) == labels).float().mean()

      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      epoch_acc += acc.item()

    return epoch_loss / len(train_dataloader), epoch_acc / len(train_dataloader)

In [71]:
def evaluate(model, val_dataloader, criterion):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  epoch_loss = 0
  epoch_acc = 0

  with torch.no_grad():
    for batch in val_dataloader:
      texts, labels = batch
      texts, labels = texts.to(device), labels.to(device)

      predictions = model(texts)

      loss = criterion(predictions, labels)
      acc = (predictions.argmax(1) == labels).float().mean()

      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss / len(val_dataloader), epoch_acc / len(val_dataloader)

In [77]:
def test_model(model, test_loader):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  correct = 0
  total = 0

  with torch.no_grad():
      for texts, labels in test_loader:
          texts, labels = texts.to(device), labels.to(device)

          # Get predictions
          outputs = model(texts)
          predictions = outputs.argmax(dim=1)

          # Calculate accuracy
          correct += (predictions == labels).sum().item()
          total += labels.size(0)

          # Print batch predictions vs actual
          print("\nBatch results:")
          print(f"Predictions: {predictions}")
          print(f"Actual labels: {labels}")
          print(f"Accuracy: {correct/total:.4f}")

  final_accuracy = correct / total
  print(f"\nFinal Test Accuracy: {final_accuracy:.4f}")

  return final_accuracy

In [72]:
def objective(trial: Trial, train_dataloader, val_dataloader):
    # Define hyperparameters to optimize
    params = {
        'embedding_dim': trial.suggest_int('embedding_dim', 50, 300),
        'num_filters': trial.suggest_int('num_filters', 64, 256),
        'filter_sizes': [2, 3, 4],  # Could also be optimized if desired
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'batch_size': trial.suggest_int('batch_size', 16, 128),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        'hidden_dim': trial.suggest_int('hidden_dim', 64, 512)
    }

    # Initialize model, criterion, and optimizer
    model = SentimentCNN(
        embedding_dim=embedding_matrix.shape[1],
        pad_idx=word_to_index[PAD_TOKEN],
        embedding_matrix=embedding_matrix,
        num_classes=2,
        num_filters=params['num_filters'],
        filter_sizes=params['filter_sizes'],
        dropout=params['dropout'],
        # hidden_dim=params['hidden_dim']
        hidden_dim=128,
        freeze_embeddings=False,
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

    # Training loop
    n_epochs = 10
    best_val_acc = 0

    for epoch in range(n_epochs):
        train_loss, train_acc = train_epoch(model, train_dataloader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, val_dataloader, criterion)

        # Report intermediate value
        trial.report(val_acc, epoch)

        # Handle pruning based on the intermediate value
        if trial.should_prune():
            raise optuna.TrialPruned()

        if val_acc > best_val_acc:
            best_val_acc = val_acc

    return best_val_acc

In [73]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, train_dataloader, val_dataloader), n_trials=100)

best_params = study.best_params
print(f"Best parameters: {best_params}")
print(f"Best validation accuracy: {study.best_value}")

[I 2024-11-01 06:41:56,645] A new study created in memory with name: no-name-4f69aa45-cdb5-4fea-ac1f-59e42b2c85ec
[I 2024-11-01 06:42:13,893] Trial 0 finished with value: 0.5692164177325234 and parameters: {'embedding_dim': 78, 'num_filters': 184, 'dropout': 0.337893825507256, 'batch_size': 49, 'learning_rate': 0.00022099986544271398, 'hidden_dim': 365}. Best is trial 0 with value: 0.5692164177325234.
[I 2024-11-01 06:42:31,739] Trial 1 finished with value: 0.5027985074626866 and parameters: {'embedding_dim': 121, 'num_filters': 212, 'dropout': 0.24008393283027551, 'batch_size': 42, 'learning_rate': 0.008321326851969683, 'hidden_dim': 263}. Best is trial 0 with value: 0.5692164177325234.
[I 2024-11-01 06:42:57,196] Trial 2 finished with value: 0.5832089550459563 and parameters: {'embedding_dim': 146, 'num_filters': 74, 'dropout': 0.19400326895344608, 'batch_size': 104, 'learning_rate': 0.0002645434271049529, 'hidden_dim': 345}. Best is trial 2 with value: 0.5832089550459563.
[I 2024-11

Best parameters: {'embedding_dim': 123, 'num_filters': 249, 'dropout': 0.1566414199762078, 'batch_size': 85, 'learning_rate': 0.0002903800771018002, 'hidden_dim': 361}
Best validation accuracy: 0.5947761197588337


KeyError: 'filter_sizes'

In [76]:
final_model = SentimentCNN(
    embedding_dim=embedding_matrix.shape[1],
    pad_idx=word_to_index[PAD_TOKEN],
    embedding_matrix=embedding_matrix,
    num_classes=2,
    num_filters=best_params['num_filters'],
    filter_sizes=[2, 3, 4],
    dropout=best_params['dropout'],
    freeze_embeddings=False,
    hidden_dim=128
).to("cuda" if torch.cuda.is_available() else "cpu")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(final_model.parameters(), lr=best_params['learning_rate'])

# Train final model
n_epochs = 15
for epoch in range(n_epochs):
    train_loss, train_acc = train_epoch(final_model, train_dataloader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}')

# Evaluate final model
test_loss, test_acc = evaluate(final_model, test_dataloader, criterion)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')

Epoch: 01, Train Loss: 0.695, Train Acc: 0.507
Epoch: 02, Train Loss: 0.688, Train Acc: 0.540
Epoch: 03, Train Loss: 0.629, Train Acc: 0.645
Epoch: 04, Train Loss: 0.338, Train Acc: 0.855
Epoch: 05, Train Loss: 0.178, Train Acc: 0.930
Epoch: 06, Train Loss: 0.103, Train Acc: 0.962
Epoch: 07, Train Loss: 0.062, Train Acc: 0.980
Epoch: 08, Train Loss: 0.038, Train Acc: 0.988
Epoch: 09, Train Loss: 0.024, Train Acc: 0.993
Epoch: 10, Train Loss: 0.018, Train Acc: 0.995
Epoch: 11, Train Loss: 0.014, Train Acc: 0.996
Epoch: 12, Train Loss: 0.009, Train Acc: 0.996
Epoch: 13, Train Loss: 0.006, Train Acc: 0.998
Epoch: 14, Train Loss: 0.010, Train Acc: 0.996
Epoch: 15, Train Loss: 0.008, Train Acc: 0.997
Test Loss: 2.286, Test Acc: 0.715


In [None]:
# Test the model
test_model(final_model, test_dataloader)

In [111]:
sentences = list(["This movie is great!", "This movie is fucking terrible"])
sentences = [words_to_indices(sentence, word_to_index) for sentence in sentences]
labels = [0] * len(sentences)

manual_test_dataset = create_dataloader(sentences, labels, shuffle=False)

test_model(final_model, manual_test_dataset) # Ignore printed message, I don't set the labels correctly here.


Batch results:
Predictions: tensor([1, 0], device='cuda:0')
Actual labels: tensor([0, 0], device='cuda:0')
Accuracy: 0.5000

Final Test Accuracy: 0.5000


0.5