<a href="https://colab.research.google.com/github/Ruin9999/sentiment-analysis/blob/main/sentiment_analysis_part_3_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install nltk datasets gensim numpy optuna

In [2]:
import numpy as np
import re
import nltk
from datasets import load_dataset
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
import torch.optim as optim
import torch.nn.functional as F

from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

import optuna
from typing import List
import torch.optim as optim
from optuna.trial import Trial
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score

import matplotlib.pyplot as plt

In [3]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

# Part 0. Dataset Preparation

In [4]:
# loading the dataset from the library
dataset = load_dataset("rotten_tomatoes")
train_dataset = dataset ['train']
validation_dataset = dataset ['validation']
test_dataset = dataset ['test']

README.md:   0%|          | 0.00/7.46k [00:00<?, ?B/s]

train.parquet:   0%|          | 0.00/699k [00:00<?, ?B/s]

validation.parquet:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

test.parquet:   0%|          | 0.00/92.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

In [5]:
# check the sizes of each dataset
train_size = len(train_dataset)
validation_size = len(validation_dataset)
test_size = len(test_dataset)

print(f"Training dataset size: {train_size}")
print(f"Validation dataset size: {validation_size}")
print(f"Test dataset size: {test_size}")

Training dataset size: 8530
Validation dataset size: 1066
Test dataset size: 1066


In [6]:
# view an example from each dataset
print("Train Dataset")
print(train_dataset.features)
print(train_dataset[0])

print("Test Dataset")
print(test_dataset.features)
print(test_dataset[0])

print("Validation Dataset")
print(validation_dataset.features)
print(validation_dataset[0])

Train Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .', 'label': 1}
Test Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'lovingly photographed in the manner of a golden book sprung to life , stuart little 2 manages sweetness largely without stickiness .', 'label': 1}
Validation Dataset
{'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}
{'text': 'compassionately explores the seemingly irreconcilable situation between conservative christian parents and their estranged gay and lesbian children .', 'label': 1}


# Part 1. Preparing Word Embeddings

### Preprocessing

In [7]:
def preprocessing(text):

    # remove any other special characters but keep the general ones for potential sentiment usage
    text = re.sub(r'[^a-zA-Z0-9\'\!\?\.]', ' ', text)

    # replace multiple spaces with one space only
    text = re.sub(r'\s+', ' ', text)

    # remove leading and trailing whitespace to avoid unnecessary inconsistency
    text = text.strip()

    return text

# apply the preprocessing function to the 'text' column of each dataset
train_dataset = train_dataset.map(lambda x: {'text': preprocessing(x['text'])})
validation_dataset = validation_dataset.map(lambda x: {'text': preprocessing(x['text'])})
test_dataset = test_dataset.map(lambda x: {'text': preprocessing(x['text'])})

# an example of the processed text
print("Train Dataset Example:")
print(train_dataset[0])

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1066 [00:00<?, ? examples/s]

Train Dataset Example:
{'text': "the rock is destined to be the 21st century's new conan and that he's going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal .", 'label': 1}


In [8]:
# tokenization
# empty list to store the resulting sentences
tokenized_sentences = []

for text in train_dataset['text']:
    # Tokenize the text and append the tokenized sentence to the list
    tokenized_sentences.append(word_tokenize(text))

### (a) Size of vocabulary in training data

In [9]:
# empty set for storing unique words
original_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the words set
        original_vocab.add(word)

print(f"(a) The size of vocabulary formed in the training data is {len(original_vocab)}")

(a) The size of vocabulary formed in the training data is 16683


### (b) Number of OOV in the training data

In [10]:
# adjust the parameters for word2vec
vector_size = 100 # Dimensionality of the word vectors
window = 3 # Maximum distance between the current and predicted word within a sentence
min_count = 2 # Ignores all words with total frequency lower than this
workers = 4 # CPU cores
sg = 1 # 1 for skip-gram, 0 for CBOW
epochs = 5

# train the word2vec model
word2vec_model = Word2Vec(
    sentences = tokenized_sentences,
    vector_size = vector_size,
    window = window,
    min_count = min_count,
    workers = workers,
    epochs = epochs)

# variable to store model's vocab list
word2vec_vocab = set(word2vec_model.wv.key_to_index)

# Calculate OOV words by comparing the original vocab and Word2Vec vocab
oov_words = original_vocab - word2vec_vocab

print(f"(b) Number of OOV words in the training data is {len(oov_words)} when the minimum threshold for each word is {min_count}")

(b) Number of OOV words in the training data is 7866 when the minimum threshold for each word is 2


### (c) Mitigating OOV

In [11]:
# Replace all OOV words with <UNK>

# define the UNK and PAD token
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'

# process each sentence in the tokenized_sentences list
for i, sentence in enumerate(tokenized_sentences):
    # empty list to store the current processed sentence
    processed_sentence = []
    for word in sentence:
        if word in word2vec_vocab:
            # if the current word is in the model's vocab, keep it as it is
            processed_sentence.append(word)
        else:
            # otherwise, replace the word with UNK
            processed_sentence.append(UNK_TOKEN)

    # update the sentence in the original tokenized_sentences list
    tokenized_sentences[i] = processed_sentence

### Embedding matrix

In [12]:
# empty set for storing unique words
final_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the final_vocab set
        final_vocab.add(word)

# add 'UNK' and '<PAD>' to the vocabulary
final_vocab.add(UNK_TOKEN)
final_vocab.add(PAD_TOKEN)

# create the dictionary that maps each word in final_vocab to a unique index
word_to_index = {word: i for i, word in enumerate(final_vocab)}

embedding_dim = word2vec_model.vector_size

# initialize embedding matrix with number of vocab and embedding dimension
embedding_matrix = np.zeros((len(word_to_index), embedding_dim))

# fill the embedding matrix with the corresponding word vectors
for word, i in word_to_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
    else:
        # (option 1) random initialization for unknown words
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
        # (option 2) use average vector for unknown words
        # embedding_matrix[i] = np.mean(word2vec_model.wv.vectors, axis=0)

print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Shape of embedding matrix: (8819, 100)


In [13]:
# convert word to indices
def words_to_indices(sentence, word_to_index):
    return [word_to_index.get(word, word_to_index[UNK_TOKEN]) for word in sentence.split()]

train_X = [words_to_indices(sentence, word_to_index) for sentence in train_dataset['text']]
train_y = train_dataset['label']
val_X = [words_to_indices(sentence, word_to_index) for sentence in validation_dataset['text']]
val_y = validation_dataset['label']
test_X = [words_to_indices(sentence, word_to_index) for sentence in test_dataset['text']]
test_y = test_dataset['label']

def create_dataloader(X, y, batch_size=16, shuffle=True):
    X_tensor = [torch.tensor(seq, dtype=torch.long) for seq in X]
    X_padded = pad_sequence(X_tensor, batch_first=True, padding_value=word_to_index[PAD_TOKEN])
    y_tensor = torch.tensor(y, dtype=torch.long)
    dataset = TensorDataset(X_padded, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_dataloader = create_dataloader(train_X, train_y, shuffle=True)
val_dataloader = create_dataloader(val_X, val_y, shuffle=False)
test_dataloader = create_dataloader(test_X, test_y, shuffle=False)

# convert embedding_matrix to tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)
print(embedding_matrix.shape)

torch.Size([8819, 100])


In [14]:
print(train_X)

[[8699, 248, 3336, 3422, 7506, 753, 8699, 6286, 1910, 3863, 4603, 4887, 8084, 1910, 7866, 7506, 5200, 6593, 8665, 4918, 165, 3420, 5214, 439, 4275, 1910, 2185, 8776, 2210, 8719, 1910, 4356], [8699, 2411, 8808, 4956, 6068, 8699, 4883, 6068, 8699, 7945, 1439, 3336, 4882, 4626, 8084, 6593, 1776, 6068, 5616, 1910, 591, 4886, 1273, 793, 2664, 8629, 1910, 5263, 1049, 6068, 1595, 4356, 3782, 4356, 3782, 4356, 1910, 4173, 2391, 4356], [7375, 1662, 5254, 839, 4777], [4576, 8766, 5780, 3480, 7506, 8117, 7506, 8699, 5224, 7506, 2556, 4171, 497, 3336, 6593, 7162, 521, 7506, 6824, 4356], [6780, 525, 4147, 2693, 6217, 3169, 3434, 1910, 4882, 6915, 4887, 3029, 4283, 8084, 2842, 1910, 1455, 3480, 1578, 4356], [8699, 7445, 4606, 2269, 7869, 4379, 3787, 8699, 6043, 1989, 6068, 130, 2988, 4918, 5730, 8502, 2556, 8684, 8699, 6397, 507, 6068, 8699, 962, 4356], [128, 8084, 2693, 517, 6068, 1459, 4887, 5116, 4356], [66, 5265, 7478, 8722, 7741, 7167, 329, 1942, 1910, 8084, 8699, 6303, 7506, 6458, 3336, 4876, 

# **Part 3.4 Model Training & Evaluation - CNN**

In [15]:
torch.manual_seed(42)

<torch._C.Generator at 0x7adb66f3cf10>

## Creating a CNN architecture

In [16]:
class SentimentCNN(nn.Module):
  def __init__(self,
               embedding_dim: int,
               embedding_matrix: torch.FloatTensor,
               pad_idx: int,
               num_classes: int,
               num_filters: int,
               filter_sizes: List[int],
               dropout: float = 0.5,
               freeze_embeddings: bool = True,
               hidden_dim: int = 128):
    super(SentimentCNN, self).__init__()

    # Load pretrained embeddings
    embedding_tensor = torch.FloatTensor(embedding_matrix)
    self.embedding = nn.Embedding.from_pretrained(embedding_tensor, padding_idx=pad_idx, freeze=freeze_embeddings)

    self.convs = nn.ModuleList([
        nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
    ])

    self.dropout = nn.Dropout(dropout)
    self.fc1 = nn.Linear(len(filter_sizes) * num_filters, hidden_dim)
    self.fc2 = nn.Linear(hidden_dim, num_classes)

  def forward(self, x):
    x = self.embedding(x)
    x = x.unsqueeze(1)

    conv_outputs = []
    for conv in self.convs:
        conv_out = torch.relu(conv(x)).squeeze(3)
        pool_out = torch.max_pool1d(conv_out, conv_out.shape[2]).squeeze(2)
        conv_outputs.append(pool_out)


    x = torch.cat(conv_outputs, dim=1)
    x = self.dropout(x)
    x = self.fc1(x)
    x = torch.relu(x)
    x = self.dropout(x)
    x = self.fc2(x)

    return x

In [76]:
def train_epoch(model, train_dataloader, optimizer, criterion):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.train()
    epoch_loss = 0
    epoch_acc = 0

    for batch in train_dataloader:
      texts, labels = batch
      texts, labels = texts.to(device), labels.to(device)

      optimizer.zero_grad()
      predictions = model(texts)
      # print(f"Predictions: {predictions}")
      # print(f"Labels: {labels}")

      loss = criterion(predictions, labels)
      acc = (predictions.argmax(1) == labels).float().mean()

      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      epoch_acc += acc.item()

    val_loss, val_acc = evaluate(model, val_dataloader, criterion)

    return (epoch_loss / len(train_dataloader), epoch_acc / len(train_dataloader),
            val_loss, val_acc)

In [77]:
def evaluate(model, val_dataloader, criterion):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  epoch_loss = 0
  epoch_acc = 0

  with torch.no_grad():
    for batch in val_dataloader:
      texts, labels = batch
      texts, labels = texts.to(device), labels.to(device)

      predictions = model(texts)

      loss = criterion(predictions, labels)
      acc = (predictions.argmax(1) == labels).float().mean()

      epoch_loss += loss.item()
      epoch_acc += acc.item()

  return epoch_loss / len(val_dataloader), epoch_acc / len(val_dataloader)

In [81]:
def test_model(model, test_loader):
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  total_loss = 0
  total_acc = 0

  with torch.no_grad():
      for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)

        outputs = model(texts)

        loss = criterion(outputs, labels)
        acc = (outputs.argmax(1) == labels).float().mean()

        total_loss += loss.item()
        total_acc += acc.item()

  return total_loss / len(test_loader), total_acc / len(test_loader)

In [20]:
def objective(trial: Trial, train_dataloader, val_dataloader, freeze_embeddings):
    # Define hyperparameters to optimize
    params = {
        'num_filters': trial.suggest_int('num_filters', 64, 256),
        'filter_sizes': [2, 3, 4],  # Could also be optimized if desired
        'dropout': trial.suggest_float('dropout', 0.1, 0.5),
        'batch_size': trial.suggest_int('batch_size', 16, 128),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True),
        'hidden_dim': trial.suggest_int('hidden_dim', 64, 512)
    }

    # Initialize model, criterion, and optimizer
    model = SentimentCNN(
        embedding_dim=embedding_matrix.shape[1],
        pad_idx=word_to_index[PAD_TOKEN],
        embedding_matrix=embedding_matrix,
        num_classes=2,
        num_filters=params['num_filters'],
        filter_sizes=params['filter_sizes'],
        dropout=params['dropout'],
        # hidden_dim=params['hidden_dim']
        hidden_dim=128,
        freeze_embeddings=freeze_embeddings,
    ).to("cuda" if torch.cuda.is_available() else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=params['learning_rate'])

    # Training loop
    n_epochs = 10
    best_val_acc = 0

    for epoch in range(n_epochs):
        train_loss, train_acc = train_epoch(model, train_dataloader, optimizer, criterion)
        val_loss, val_acc = evaluate(model, val_dataloader, criterion)

        # Report intermediate value
        trial.report(val_acc, epoch)

        # Handle pruning based on the intermediate value
        if trial.should_prune():
            raise optuna.TrialPruned()

        if val_acc > best_val_acc:
            best_val_acc = val_acc

    return best_val_acc

## Hyperparameter searching

In [21]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, train_dataloader, val_dataloader, freeze_embeddings=True), n_trials=100)

frozen_best_params = study.best_params
print(f"Best parameters for frozen model: {frozen_best_params}")
print(f"Best validation accuracy frozen model: {study.best_value}")

[I 2024-11-02 13:00:11,045] A new study created in memory with name: no-name-c3d25e39-d197-4044-b2c6-75eb833563db
[I 2024-11-02 13:00:35,619] Trial 0 finished with value: 0.5074626865671642 and parameters: {'num_filters': 206, 'dropout': 0.39019289959942705, 'batch_size': 69, 'learning_rate': 0.0026368475229062815, 'hidden_dim': 440}. Best is trial 0 with value: 0.5074626865671642.
[I 2024-11-02 13:00:52,541] Trial 1 finished with value: 0.5686567167737591 and parameters: {'num_filters': 138, 'dropout': 0.18743072741208647, 'batch_size': 101, 'learning_rate': 0.00013213082909350102, 'hidden_dim': 427}. Best is trial 1 with value: 0.5686567167737591.
[I 2024-11-02 13:01:09,810] Trial 2 finished with value: 0.5761194033409233 and parameters: {'num_filters': 195, 'dropout': 0.41669219239350996, 'batch_size': 101, 'learning_rate': 0.00022878724419913792, 'hidden_dim': 107}. Best is trial 2 with value: 0.5761194033409233.
[I 2024-11-02 13:01:31,402] Trial 3 finished with value: 0.5486940296

Best parameters for frozen model: {'num_filters': 198, 'dropout': 0.239265170989996, 'batch_size': 73, 'learning_rate': 0.00022338248514807002, 'hidden_dim': 484}
Best validation accuracy frozen model: 0.5817164182662964


In [22]:
study = optuna.create_study(direction="maximize")
study.optimize(lambda trial: objective(trial, train_dataloader, val_dataloader, freeze_embeddings=False), n_trials=100)

unfrozen_best_params = study.best_params
print(f"Best parameters for unfrozen model: {unfrozen_best_params}")
print(f"Best validation accuracy unfrozen model: {study.best_value}")

[I 2024-11-02 13:07:06,253] A new study created in memory with name: no-name-eb96b3e9-634a-492d-91d7-2dba242543f9
[I 2024-11-02 13:07:34,310] Trial 0 finished with value: 0.7417910445981951 and parameters: {'num_filters': 73, 'dropout': 0.35787690214753043, 'batch_size': 24, 'learning_rate': 0.00014346910130994643, 'hidden_dim': 352}. Best is trial 0 with value: 0.7417910445981951.
[I 2024-11-02 13:07:52,304] Trial 1 finished with value: 0.7367537315212079 and parameters: {'num_filters': 110, 'dropout': 0.2441408144195715, 'batch_size': 95, 'learning_rate': 0.0029353712521772565, 'hidden_dim': 458}. Best is trial 0 with value: 0.7417910445981951.
[I 2024-11-02 13:08:12,866] Trial 2 finished with value: 0.7427238804190907 and parameters: {'num_filters': 238, 'dropout': 0.2518201610885534, 'batch_size': 102, 'learning_rate': 0.00017517791084431164, 'hidden_dim': 357}. Best is trial 2 with value: 0.7427238804190907.
[I 2024-11-02 13:08:35,286] Trial 3 finished with value: 0.73376865707226

Best parameters for frozen model: {'num_filters': 101, 'dropout': 0.14269765966044753, 'batch_size': 67, 'learning_rate': 0.0018403741252235538, 'hidden_dim': 328}
Best validation accuracy frozen model: 0.7628731345062825


## Testing the *optimized* models

In [91]:
frozen_embeddings_model = SentimentCNN(
    embedding_dim=embedding_matrix.shape[1],
    pad_idx=word_to_index[PAD_TOKEN],
    embedding_matrix=embedding_matrix,
    num_classes=2,
    num_filters=frozen_best_params['num_filters'],
    filter_sizes=[2, 3, 4],
    dropout=frozen_best_params['dropout'],
    freeze_embeddings=True,
    hidden_dim=128
).to("cuda" if torch.cuda.is_available() else "cpu")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(frozen_embeddings_model.parameters(), lr=frozen_best_params['learning_rate'])

# Train final model
n_epochs = 3
for epoch in range(n_epochs):
    train_loss, train_acc, val_loss, val_acc = train_epoch(frozen_embeddings_model, train_dataloader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Val Loss: {val_loss}, Val Acc: {val_acc}')

# Evaluate final model
test_loss, test_acc = test_model(frozen_embeddings_model, test_dataloader)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')

Epoch: 01, Train Loss: 0.690, Train Acc: 0.532, Val Loss: 0.6811808721343083, Val Acc: 0.5787313433725443
Epoch: 02, Train Loss: 0.676, Train Acc: 0.573, Val Loss: 0.6693356401884734, Val Acc: 0.5830223880597015
Epoch: 03, Train Loss: 0.665, Train Acc: 0.593, Val Loss: 0.6642578831359521, Val Acc: 0.5908582090441861
Test Loss: 0.659, Test Acc: 0.613


In [92]:
unfrozen_embeddings_model = SentimentCNN(
    embedding_dim=embedding_matrix.shape[1],
    pad_idx=word_to_index[PAD_TOKEN],
    embedding_matrix=embedding_matrix,
    num_classes=2,
    num_filters=unfrozen_best_params['num_filters'],
    filter_sizes=[2, 3, 4],
    dropout=unfrozen_best_params['dropout'],
    freeze_embeddings=False,
    hidden_dim=128
).to("cuda" if torch.cuda.is_available() else "cpu")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(unfrozen_embeddings_model.parameters(), lr=unfrozen_best_params['learning_rate'])

# Train final model
n_epochs = 3
for epoch in range(n_epochs):
    train_loss, train_acc, val_loss, val_acc = train_epoch(unfrozen_embeddings_model, train_dataloader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Val Loss: {val_loss}, Val Acc: {val_acc}')

# Evaluate final model
test_loss, test_acc = test_model(unfrozen_embeddings_model, test_dataloader)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')

Epoch: 01, Train Loss: 0.635, Train Acc: 0.622, Val Loss: 0.5703225496099956, Val Acc: 0.7121268662054148
Epoch: 02, Train Loss: 0.426, Train Acc: 0.808, Val Loss: 0.515268223943995, Val Acc: 0.742910448294967
Epoch: 03, Train Loss: 0.256, Train Acc: 0.895, Val Loss: 0.5875158552358399, Val Acc: 0.7367537315212079
Test Loss: 0.554, Test Acc: 0.755


In [94]:
sentences = list(["This movie is great!", "This movie is fucking terrible"])
sentences = [words_to_indices(sentence, word_to_index) for sentence in sentences]
labels = [1, 0]

manual_test_dataset = create_dataloader(sentences, labels, shuffle=False)

test_model(unfrozen_embeddings_model, manual_test_dataset) # Ignore printed message, I don't set the labels correctly here.
test_model(frozen_embeddings_model, manual_test_dataset) # Ignore printed message, I don't set the labels correctly here.

(0.6986857652664185, 0.5)

In [100]:
# Test the model
frozen_test_loss, frozen_test_acc = test_model(frozen_embeddings_model, test_dataloader)
unfrozen_test_loss, unfrozen_test_acc = test_model(unfrozen_embeddings_model, test_dataloader)

print(f'Frozen loss: {frozen_test_loss}, Frozen acc: {frozen_test_acc}')
print(f'Unfrozen loss: {unfrozen_test_loss}, Unfrozen acc: {unfrozen_test_acc}')

Frozen loss: 0.6593021528044744, Frozen acc: 0.6126865673421035
Unfrozen loss: 0.5542561914493789, Unfrozen acc: 0.7550373139666088


# **Using unfrozen embeddings + updated OOV handling from part 3.2**

In [27]:
# Same code as before but just put into a function.

# define the UNK and PAD token
UNK_TOKEN = '<UNK>'
PAD_TOKEN = '<PAD>'

def replace_oov_words(tokenized_sentences,vocab):
    # Replace all OOV words with <UNK>
    # process each sentence in the tokenized_sentences list
    for i, sentence in enumerate(tokenized_sentences):
        # empty list to store the current processed sentence
        processed_sentence = []
        for word in sentence:
            if word in vocab:
                # if the current word is in the model's vocab, keep it as it is
                processed_sentence.append(word)
            else:
                # otherwise, replace the word with UNK
                processed_sentence.append(UNK_TOKEN)

        # update the sentence in the original tokenized_sentences list
        tokenized_sentences[i] = processed_sentence

    return tokenized_sentences

In [28]:
from gensim.models import FastText

corpus = tokenized_sentences
fasttext_model = FastText(
    sentences=corpus, vector_size=100, window=3, min_count=2, workers=4, sg=1
)

fasttext_vocab = set(fasttext_model.wv.key_to_index)

ft_oov_words = original_vocab - fasttext_vocab

print(f"Number of OOV words", len(ft_oov_words))

tokenized_sentences = replace_oov_words(tokenized_sentences, fasttext_vocab)

# empty set for storing unique words
final_vocab = set()

for sentence in tokenized_sentences:
    for word in sentence:
        # add each word in the sentence to the final_vocab set
        final_vocab.add(word)

# add 'UNK' and '<PAD>' to the vocabulary
final_vocab.add(UNK_TOKEN)
final_vocab.add(PAD_TOKEN)

# create the dictionary that maps each word in final_vocab to a unique index
word_to_index = {word: i for i, word in enumerate(final_vocab)}

embedding_dim = fasttext_model.vector_size

# initialize embedding matrix with number of vocab and embedding dimension
embedding_matrix = np.zeros((len(word_to_index), embedding_dim))

# fill the embedding matrix with the corresponding word vectors
#since fasttext can generate vectors for OOV words, we can directly use the vectors
for word, i in word_to_index.items():
    embedding_matrix[i] = fasttext_model.wv[word]


print(f"Shape of embedding matrix: {embedding_matrix.shape}")

Number of OOV words 7866
Shape of embedding matrix: (8819, 100)


In [29]:

train_X = [
    words_to_indices(sentence, word_to_index) for sentence in train_dataset["text"]
]
train_y = train_dataset["label"]
val_X = [
    words_to_indices(sentence, word_to_index) for sentence in validation_dataset["text"]
]
val_y = validation_dataset["label"]
test_X = [
    words_to_indices(sentence, word_to_index) for sentence in test_dataset["text"]
]
test_y = test_dataset["label"]



train_dataloader = create_dataloader(train_X, train_y, shuffle=True)
val_dataloader = create_dataloader(val_X, val_y, shuffle=False)
test_dataloader = create_dataloader(test_X, test_y, shuffle=False)

# convert embedding_matrix to tensor
embedding_matrix = torch.FloatTensor(embedding_matrix)

In [30]:
final_unfrozen_embeddings_model = SentimentCNN(
    embedding_dim=embedding_matrix.shape[1],
    pad_idx=word_to_index[PAD_TOKEN],
    embedding_matrix=embedding_matrix,
    num_classes=2,
    num_filters=unfrozen_best_params['num_filters'],
    filter_sizes=[2, 3, 4],
    dropout=unfrozen_best_params['dropout'],
    freeze_embeddings=False,
    hidden_dim=128
).to("cuda" if torch.cuda.is_available() else "cpu")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(final_unfrozen_embeddings_model.parameters(), lr=unfrozen_best_params['learning_rate'])

# Train final model
n_epochs = 15
for epoch in range(n_epochs):
    train_loss, train_acc = train_epoch(final_unfrozen_embeddings_model, train_dataloader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}')

# Evaluate final model
test_loss, test_acc = evaluate(final_unfrozen_embeddings_model, test_dataloader, criterion)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')

Epoch: 01, Train Loss: 0.647, Train Acc: 0.603
Epoch: 02, Train Loss: 0.448, Train Acc: 0.793
Epoch: 03, Train Loss: 0.273, Train Acc: 0.888
Epoch: 04, Train Loss: 0.147, Train Acc: 0.945
Epoch: 05, Train Loss: 0.089, Train Acc: 0.967
Epoch: 06, Train Loss: 0.050, Train Acc: 0.982
Epoch: 07, Train Loss: 0.040, Train Acc: 0.986
Epoch: 08, Train Loss: 0.032, Train Acc: 0.989
Epoch: 09, Train Loss: 0.021, Train Acc: 0.992
Epoch: 10, Train Loss: 0.023, Train Acc: 0.992
Epoch: 11, Train Loss: 0.015, Train Acc: 0.995
Epoch: 12, Train Loss: 0.017, Train Acc: 0.994
Epoch: 13, Train Loss: 0.019, Train Acc: 0.995
Epoch: 14, Train Loss: 0.011, Train Acc: 0.996
Epoch: 15, Train Loss: 0.017, Train Acc: 0.994
Test Loss: 2.350, Test Acc: 0.729


# **Trying to improve the model**

In [56]:
class ImprovedSentimentCNN(nn.Module):
  def __init__(self,
               embedding_dim: int,
               embedding_matrix: torch.FloatTensor,
               pad_idx: int,
               num_classes: int,
               num_filters: int,
               filter_sizes: List[int],
               dropout: float = 0.5,
               freeze_embeddings: bool = True,
               hidden_dim1: int = 128,
               hidden_dim2: int = 64,
               hidden_dim3: int = 32):
    super(ImprovedSentimentCNN, self).__init__()

    # Load pretrained embeddings
    embedding_tensor = torch.FloatTensor(embedding_matrix)
    self.embedding = nn.Embedding.from_pretrained(embedding_tensor, padding_idx=pad_idx, freeze=freeze_embeddings)

    self.convs = nn.ModuleList([
        nn.Conv2d(1, num_filters, (fs, embedding_dim)) for fs in filter_sizes
    ])

    # Batch Normalization
    self.bn = nn.BatchNorm1d(num_filters)

    # Self Attention
    self.attention = nn.MultiheadAttention(embed_dim=num_filters * len(filter_sizes), num_heads=2, dropout=dropout)

    self.dropout = nn.Dropout(dropout)
    self.fc1 = nn.Linear(len(filter_sizes) * num_filters, hidden_dim1)
    self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
    self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
    self.fc4 = nn.Linear(hidden_dim3, num_classes)

  def forward(self, x):
    x = self.embedding(x)
    x = x.unsqueeze(1)

    conv_outputs = []
    for conv in self.convs:
        conv_out = torch.relu(conv(x)).squeeze(3)
        # conv_out = self.bn(conv_out) # (Reduced Performance)
        pool_out = torch.max_pool1d(conv_out, conv_out.shape[2]).squeeze(2)
        conv_outputs.append(pool_out)

    x = torch.cat(conv_outputs, dim=1)

    # Applying attention (Reduced performance)
    # x = x.unsqueeze(1)
    # x, _ = self.attention(x, x, x)
    # x = x.squeeze(1)

    # Fully connected layers
    x = self.dropout(x)
    x = self.fc1(x)
    x = torch.relu(x)
    x = self.dropout(x)
    x = self.fc2(x)
    x = torch.relu(x)
    x = self.dropout(x)
    x = self.fc3(x)
    x = torch.relu(x)
    x = self.dropout(x)
    x = self.fc4(x)

    return x

In [111]:
attention_model = ImprovedSentimentCNN(
    embedding_dim=embedding_matrix.shape[1],
    pad_idx=word_to_index[PAD_TOKEN],
    embedding_matrix=embedding_matrix,
    num_classes=2,
    num_filters=200,
    filter_sizes=[2, 3, 4],
    dropout=0.17269878819772821,
    freeze_embeddings=False,
).to("cuda" if torch.cuda.is_available() else "cpu")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(attention_model.parameters(), lr=0.003259879759370297)

# Train final model
n_epochs = 5 # We decrease the number of epochs because we have so many fully connected layers. Also observed to overfit when around 7.
for epoch in range(n_epochs):
    train_loss, train_acc, val_loss, val_acc = train_epoch(attention_model, train_dataloader, optimizer, criterion)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.3f}, Val Loss: {val_loss}, Val Acc: {val_acc}')

# Evaluate final model
test_loss, test_acc = test_model(attention_model, test_dataloader)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')

Epoch: 01, Train Loss: 0.659, Train Acc: 0.586, Val Loss: 0.5959313729805733, Val Acc: 0.6826492539092676
Epoch: 02, Train Loss: 0.491, Train Acc: 0.778, Val Loss: 0.5647396907877567, Val Acc: 0.7121268662054148
Epoch: 03, Train Loss: 0.341, Train Acc: 0.863, Val Loss: 0.5380431420322674, Val Acc: 0.7317164184442208
Epoch: 04, Train Loss: 0.223, Train Acc: 0.921, Val Loss: 0.5740454154672907, Val Acc: 0.7345149259069073
Epoch: 05, Train Loss: 0.139, Train Acc: 0.953, Val Loss: 0.7417777985779207, Val Acc: 0.7371268654937175
Test Loss: 0.675, Test Acc: 0.730


In [110]:
sentences = list([
    "A masterpiece! The storyline was captivating, and the acting was top-notch. Definitely worth a watch.",
    "Loved every minute! The visuals and soundtrack were absolutely stunning.",
    "A fresh take on a classic genre. Brilliantly executed and highly entertaining.",
    "Exceeded my expectations! The plot twists kept me on the edge of my seat.",
    "Incredible character development and a powerful message. One of the best films of the year.",
    "A true cinematic experience. Beautifully shot and wonderfully acted.",
    "Great balance of humor and drama, with a cast that brings it to life perfectly.",
    "So much fun to watch! The pacing was just right, and I was thoroughly entertained.",
    "An inspiring story with unforgettable performances. Left me feeling emotional and fulfilled.",
    "It’s rare to find a movie that hits every mark, but this one does it effortlessly.",
    "A complete waste of time. The plot was all over the place, and the characters were flat.",
    "So disappointing. I had high hopes, but it was just poorly executed from start to finish.",
    "The acting was cringe-worthy, and the dialogue was painfully unrealistic.",
    "The storyline was predictable, and it felt like every scene was dragging on.",
    "I struggled to stay awake. The pacing was terrible, and there was zero suspense.",
    "Couldn’t finish it. The movie was boring, with no real plot development.",
    "This was one of the worst movies I’ve ever seen. Nothing made sense.",
    "An absolute disaster. Poor writing, poor acting, and poor everything else.",
    "Felt like a low-budget student film. The special effects were laughable.",
    "Completely overrated. I don’t get the hype; it was dull and uninspired.",
])
sentences = [words_to_indices(sentence, word_to_index) for sentence in sentences]
labels = [1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0]

manual_test_dataset = create_dataloader(sentences, labels, shuffle=False)

test_loss, test_acc = test_model(attention_model, manual_test_dataset)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.3f}')

Test Loss: 0.411, Test Acc: 0.875


# **Loading and Saving models**

In [112]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [113]:
import torch

torch.save(frozen_embeddings_model.state_dict(), '/content/drive/MyDrive/frozen_embeddings_model.pth')
torch.save(unfrozen_embeddings_model.state_dict(), '/content/drive/MyDrive/unfrozen_embeddings_model.pth')
torch.save(attention_model.state_dict(), '/content/drive/MyDrive/attention_model.pth')

In [115]:
frozen_embeddings_model = torch.load("/content/drive/MyDrive/frozen_embeddings_model.pth")
unfrozen_embeddings_model = torch.load("/content/drive/MyDrive/unfrozen_embeddings_model.pth")
attention_model = torch.load("/content/drive/MyDrive/attention_model.pth")

  frozen_embeddings_model = torch.load("/content/drive/MyDrive/frozen_embeddings_model.pth")
  unfrozen_embeddings_model = torch.load("/content/drive/MyDrive/unfrozen_embeddings_model.pth")
  attention_model = torch.load("/content/drive/MyDrive/attention_model.pth")
