In [1]:
import sys
sys.path.append('../src')

import torch
import torch.nn as nn
import torch.optim as optim
import pickle

from data_loader import load_data
from embeddings import load_pretrained_embeddings
from models import RNNModel, BiLSTMModel, BiGRUModel, CNNModel
from utils import train_model, evaluate_model


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/haoyangpang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
device = torch.device('mps' if torch.mps.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: mps


In [3]:
import os
print(os.getcwd())  
os.chdir('/Users/haoyangpang/Desktop/Y3S1/SC4002 Natural Language Processing/assignment')

/Users/haoyangpang/Desktop/Y3S1/SC4002 Natural Language Processing/assignment/notebooks


In [4]:
# Load the saved vocabulary
with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

batch_size = 32
max_len = 100

train_loader, val_loader, test_loader, _ = load_data(batch_size, max_len)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/haoyangpang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
embedding_dim = 300
embedding_file = 'glove.6B.300d.txt'

# Load embedding matrix
embedding_matrix = load_pretrained_embeddings(vocab, embedding_dim, embedding_file)


### Enhancement 1: Updating Word Embeddings, modified RNN model

In [6]:
hidden_size = 256
output_size = 2  # Positive or Negative sentiment

# Initialize the RNN model with trainable embeddings
model_update_emb = RNNModel(embedding_matrix, hidden_size, output_size, freeze=False)
model_update_emb.to(device)


RNNModel(
  (embedding): Embedding(15813, 300)
  (rnn): RNN(300, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [7]:
# Define loss and optimizer
num_epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_update_emb.parameters(), lr=0.001)

# Train the model
train_losses, val_accuracies = train_model(
    model_update_emb, train_loader, val_loader, criterion, optimizer, num_epochs, device
)

# Evaluate on test set
model_update_emb.load_state_dict(torch.load('best_model.pt'))
test_accuracy_update_emb = evaluate_model(model_update_emb, test_loader, device)
print(f"Test Accuracy with Updated Embeddings: {test_accuracy_update_emb:.4f}")


Epoch 1/10: 100%|██████████| 267/267 [00:20<00:00, 13.18it/s]


Epoch 1/10, Loss: 0.7135, Val Acc: 0.4737


Epoch 2/10: 100%|██████████| 267/267 [00:18<00:00, 14.55it/s]


Epoch 2/10, Loss: 0.6994, Val Acc: 0.4728


Epoch 3/10: 100%|██████████| 267/267 [00:18<00:00, 14.61it/s]


Epoch 3/10, Loss: 0.6997, Val Acc: 0.4709


Epoch 4/10: 100%|██████████| 267/267 [00:18<00:00, 14.63it/s]


Epoch 4/10, Loss: 0.7051, Val Acc: 0.4700


Epoch 5/10: 100%|██████████| 267/267 [00:18<00:00, 14.66it/s]


Epoch 5/10, Loss: 0.7014, Val Acc: 0.5291


Epoch 6/10: 100%|██████████| 267/267 [00:18<00:00, 14.62it/s]


Epoch 6/10, Loss: 0.6988, Val Acc: 0.5291


Epoch 7/10: 100%|██████████| 267/267 [00:18<00:00, 14.59it/s]


Epoch 7/10, Loss: 0.7047, Val Acc: 0.4700


Epoch 8/10: 100%|██████████| 267/267 [00:18<00:00, 14.45it/s]


Epoch 8/10, Loss: 0.6989, Val Acc: 0.5000


Epoch 9/10: 100%|██████████| 267/267 [00:18<00:00, 14.52it/s]


Epoch 9/10, Loss: 0.6990, Val Acc: 0.4709


Epoch 10/10: 100%|██████████| 267/267 [00:18<00:00, 14.27it/s]


Epoch 10/10, Loss: 0.6964, Val Acc: 0.5291
Best Validation Accuracy: 0.5291


  model_update_emb.load_state_dict(torch.load('best_model.pt'))


Test Accuracy with Updated Embeddings: 0.5141


## Question 3(a): Report the accuracy score when updating word embeddings

Ans: The test accuracy when the word embeddings are updated during training is 0.5141

## Question 3(b): Report the accuracy score when applying OOV handling

### Enhancement 2

Ans: Since we have already initialized OOV embeddings and allowed them to be updated, the model from Enhancement 1 includes this change. So the Test Accuracty is also 0.5141

### Enhancement 3: Using BiLSTM and BiGRU

BiLSTM

In [8]:
hidden_size = 128
output_size = 2

model_bilstm = BiLSTMModel(embedding_matrix, hidden_size, output_size)
model_bilstm.to(device)


BiLSTMModel(
  (embedding): Embedding(15813, 300)
  (lstm): LSTM(300, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [9]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_bilstm.parameters(), lr=0.001)

train_losses, val_accuracies = train_model(
    model_bilstm, train_loader, val_loader, criterion, optimizer, num_epochs, device
)

model_bilstm.load_state_dict(torch.load('best_model.pt'))
test_accuracy_bilstm = evaluate_model(model_bilstm, test_loader, device)
print(f"Test Accuracy with BiLSTM: {test_accuracy_bilstm:.4f}")


Epoch 1/10: 100%|██████████| 267/267 [00:04<00:00, 56.18it/s]


Epoch 1/10, Loss: 0.6650, Val Acc: 0.6285


Epoch 2/10: 100%|██████████| 267/267 [00:03<00:00, 77.18it/s]


Epoch 2/10, Loss: 0.5987, Val Acc: 0.6567


Epoch 3/10: 100%|██████████| 267/267 [00:03<00:00, 77.20it/s]


Epoch 3/10, Loss: 0.5442, Val Acc: 0.6745


Epoch 4/10: 100%|██████████| 267/267 [00:03<00:00, 77.55it/s]


Epoch 4/10, Loss: 0.4648, Val Acc: 0.6735


Epoch 5/10: 100%|██████████| 267/267 [00:03<00:00, 77.41it/s]


Epoch 5/10, Loss: 0.3720, Val Acc: 0.6876


Epoch 6/10: 100%|██████████| 267/267 [00:03<00:00, 77.21it/s]


Epoch 6/10, Loss: 0.2530, Val Acc: 0.6773


Epoch 7/10: 100%|██████████| 267/267 [00:03<00:00, 76.75it/s]


Epoch 7/10, Loss: 0.1482, Val Acc: 0.6707


Epoch 8/10: 100%|██████████| 267/267 [00:03<00:00, 77.08it/s]


Epoch 8/10, Loss: 0.0851, Val Acc: 0.6651


Epoch 9/10: 100%|██████████| 267/267 [00:03<00:00, 76.87it/s]


Epoch 9/10, Loss: 0.0424, Val Acc: 0.6820


Epoch 10/10: 100%|██████████| 267/267 [00:03<00:00, 77.05it/s]


Epoch 10/10, Loss: 0.0293, Val Acc: 0.6670
Best Validation Accuracy: 0.6876


  model_bilstm.load_state_dict(torch.load('best_model.pt'))


Test Accuracy with BiLSTM: 0.6717


BiGRU

In [10]:
model_bigru = BiGRUModel(embedding_matrix, hidden_size, output_size)
model_bigru.to(device)


BiGRUModel(
  (embedding): Embedding(15813, 300)
  (gru): GRU(300, 128, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [11]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_bigru.parameters(), lr=0.001)

train_losses, val_accuracies = train_model(
    model_bigru, train_loader, val_loader, criterion, optimizer, num_epochs, device
)

model_bigru.load_state_dict(torch.load('best_model.pt'))
test_accuracy_bigru = evaluate_model(model_bigru, test_loader, device)
print(f"Test Accuracy with BiGRU: {test_accuracy_bigru:.4f}")


Epoch 1/10: 100%|██████████| 267/267 [00:39<00:00,  6.76it/s]


Epoch 1/10, Loss: 0.6566, Val Acc: 0.6276


Epoch 2/10: 100%|██████████| 267/267 [00:39<00:00,  6.77it/s]


Epoch 2/10, Loss: 0.5877, Val Acc: 0.6773


Epoch 3/10: 100%|██████████| 267/267 [00:39<00:00,  6.75it/s]


Epoch 3/10, Loss: 0.5233, Val Acc: 0.6811


Epoch 4/10: 100%|██████████| 267/267 [00:39<00:00,  6.76it/s]


Epoch 4/10, Loss: 0.4419, Val Acc: 0.7017


Epoch 5/10: 100%|██████████| 267/267 [00:39<00:00,  6.76it/s]


Epoch 5/10, Loss: 0.3311, Val Acc: 0.7054


Epoch 6/10: 100%|██████████| 267/267 [03:20<00:00,  1.33it/s]


Epoch 6/10, Loss: 0.2029, Val Acc: 0.7083


Epoch 7/10: 100%|██████████| 267/267 [18:40<00:00,  4.20s/it]   


Epoch 7/10, Loss: 0.1127, Val Acc: 0.6923


Epoch 8/10: 100%|██████████| 267/267 [02:21<00:00,  1.89it/s] 


Epoch 8/10, Loss: 0.0541, Val Acc: 0.6942


Epoch 9/10: 100%|██████████| 267/267 [00:38<00:00,  6.86it/s]


Epoch 9/10, Loss: 0.0290, Val Acc: 0.6867


Epoch 10/10: 100%|██████████| 267/267 [00:39<00:00,  6.70it/s]


Epoch 10/10, Loss: 0.0326, Val Acc: 0.6867
Best Validation Accuracy: 0.7083


  model_bigru.load_state_dict(torch.load('best_model.pt'))


Test Accuracy with BiGRU: 0.6811


## Question 3(c): Report the accuracy scores of BiLSTM and BiGRU

Ans: Test Accuracy with BiLSTM: 0.6717
Test Accuracy with BiGRU: 0.6811

Enhancement 4: Using CNN

In [12]:
model_cnn = CNNModel(embedding_matrix, output_size)
model_cnn.to(device)


CNNModel(
  (embedding): Embedding(15813, 300)
  (convs): ModuleList(
    (0): Conv2d(1, 100, kernel_size=(3, 300), stride=(1, 1))
    (1): Conv2d(1, 100, kernel_size=(4, 300), stride=(1, 1))
    (2): Conv2d(1, 100, kernel_size=(5, 300), stride=(1, 1))
  )
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=300, out_features=2, bias=True)
)

In [13]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_cnn.parameters(), lr=0.001)

train_losses, val_accuracies = train_model(
    model_cnn, train_loader, val_loader, criterion, optimizer, num_epochs, device
)

model_cnn.load_state_dict(torch.load('best_model.pt'))
test_accuracy_cnn = evaluate_model(model_cnn, test_loader, device)
print(f"Test Accuracy with CNN: {test_accuracy_cnn:.4f}")


Epoch 1/10: 100%|██████████| 267/267 [00:05<00:00, 49.71it/s]


Epoch 1/10, Loss: 0.6590, Val Acc: 0.6323


Epoch 2/10: 100%|██████████| 267/267 [00:04<00:00, 65.71it/s]


Epoch 2/10, Loss: 0.5581, Val Acc: 0.6632


Epoch 3/10: 100%|██████████| 267/267 [00:04<00:00, 64.95it/s]


Epoch 3/10, Loss: 0.4462, Val Acc: 0.6726


Epoch 4/10: 100%|██████████| 267/267 [00:04<00:00, 65.07it/s]


Epoch 4/10, Loss: 0.3477, Val Acc: 0.6773


Epoch 5/10: 100%|██████████| 267/267 [00:04<00:00, 64.47it/s]


Epoch 5/10, Loss: 0.2581, Val Acc: 0.6632


Epoch 6/10: 100%|██████████| 267/267 [00:04<00:00, 65.41it/s]


Epoch 6/10, Loss: 0.2021, Val Acc: 0.6632


Epoch 7/10: 100%|██████████| 267/267 [00:04<00:00, 65.93it/s]


Epoch 7/10, Loss: 0.1599, Val Acc: 0.6670


Epoch 8/10: 100%|██████████| 267/267 [00:04<00:00, 65.33it/s]


Epoch 8/10, Loss: 0.1376, Val Acc: 0.6670


Epoch 9/10: 100%|██████████| 267/267 [00:04<00:00, 65.61it/s]


Epoch 9/10, Loss: 0.1229, Val Acc: 0.6707


Epoch 10/10: 100%|██████████| 267/267 [00:04<00:00, 65.60it/s]
  model_cnn.load_state_dict(torch.load('best_model.pt'))


Epoch 10/10, Loss: 0.1005, Val Acc: 0.6764
Best Validation Accuracy: 0.6773
Test Accuracy with CNN: 0.6745


### Question 3(d): Report the accuracy scores of CNN

The test accuracy with CNN is 0.6745

Enhancement 5: Further Improvement

In [14]:
# In your notebook or training script
from models.bilstm_attention import BiLSTMAttentionModel

hidden_size = 128
output_size = 2

model_bilstm_attn = BiLSTMAttentionModel(
    embedding_matrix, hidden_size, output_size, freeze_embeddings=False
)
model_bilstm_attn.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_bilstm_attn.parameters(), lr=0.001)

# Train the model
num_epochs = 10
train_losses, val_accuracies = train_model(
    model_bilstm_attn, train_loader, val_loader, criterion, optimizer, num_epochs, device
)

# Evaluate on test set
model_bilstm_attn.load_state_dict(torch.load('best_model.pt'))
test_accuracy_bilstm_attn = evaluate_model(model_bilstm_attn, test_loader, device)
print(f"Test Accuracy with BiLSTM + Attention: {test_accuracy_bilstm_attn:.4f}")


Epoch 1/10: 100%|██████████| 267/267 [00:04<00:00, 56.90it/s]


Epoch 1/10, Loss: 0.6212, Val Acc: 0.6951


Epoch 2/10: 100%|██████████| 267/267 [00:03<00:00, 78.30it/s]


Epoch 2/10, Loss: 0.3564, Val Acc: 0.7505


Epoch 3/10: 100%|██████████| 267/267 [00:03<00:00, 78.45it/s]


Epoch 3/10, Loss: 0.1659, Val Acc: 0.7430


Epoch 4/10: 100%|██████████| 267/267 [00:03<00:00, 77.97it/s]


Epoch 4/10, Loss: 0.0650, Val Acc: 0.7270


Epoch 5/10: 100%|██████████| 267/267 [00:03<00:00, 77.60it/s]


Epoch 5/10, Loss: 0.0283, Val Acc: 0.7298


Epoch 6/10: 100%|██████████| 267/267 [00:03<00:00, 77.24it/s]


Epoch 6/10, Loss: 0.0074, Val Acc: 0.7233


Epoch 7/10: 100%|██████████| 267/267 [00:03<00:00, 77.07it/s]


Epoch 7/10, Loss: 0.0025, Val Acc: 0.7195


Epoch 8/10: 100%|██████████| 267/267 [00:03<00:00, 76.52it/s]


Epoch 8/10, Loss: 0.0011, Val Acc: 0.7261


Epoch 9/10: 100%|██████████| 267/267 [00:03<00:00, 76.67it/s]


Epoch 9/10, Loss: 0.0007, Val Acc: 0.7251


Epoch 10/10: 100%|██████████| 267/267 [00:03<00:00, 76.40it/s]


Epoch 10/10, Loss: 0.0005, Val Acc: 0.7233
Best Validation Accuracy: 0.7505


  model_bilstm_attn.load_state_dict(torch.load('best_model.pt'))


Test Accuracy with BiLSTM + Attention: 0.7749


### Question 3(e): Describe Your Final Improvement Strategy and Report Accuracy

Ans: Answer:

Improvement Strategy:

We implemented an attention mechanism on top of the BiLSTM model to allow the model to focus on the most informative words in the sentence. The attention mechanism computes a weighted sum of the LSTM outputs, where the weights are learned to emphasize important features. This lead to the increase in accuracy from 0.6717 to 0.7749.

The attention mechanism helps the model to capture long-range dependencies and focus on key parts of the input that are most relevant for the classification task.

### Question 3(f): Compare Results and Observations

Summary of Test Accuracies:

Model	Test Accuracy
RNN (static embeddings)	0.5000
RNN (updated embeddings)	0.5141
BiGRU	0.6811
BiLSTM	0.6717
CNN	0.6745
BiLSTM + Attention	0.7749
Observations:

Minimal Improvement from Updating Embeddings:

The RNN model with static embeddings achieved a test accuracy of 50.00%, which is equivalent to random guessing in a binary classification task.
Updating the word embeddings during training led to a slight improvement, increasing the test accuracy to 51.41%.
Interpretation: The minimal improvement suggests that the simple RNN architecture struggles to capture the complexity of the sentiment classification task, even when the embeddings are fine-tuned.

Significant Performance Boost with Advanced Architectures:

BiGRU, BiLSTM, and CNN models show substantial improvements over the simple RNN:
BiGRU: 68.11% test accuracy.
BiLSTM: 67.17% test accuracy.
CNN: 67.45% test accuracy.
Interpretation: These architectures are better at capturing sequential dependencies and extracting relevant features from the text, leading to higher accuracy.

Best Performance with BiLSTM + Attention:

The BiLSTM model enhanced with an attention mechanism achieved the highest test accuracy of 77.49%.
Interpretation: The attention mechanism allows the model to focus on the most informative words in each sentence, effectively capturing the nuances necessary for sentiment classification.

Comparison Between BiGRU and BiLSTM:

The BiGRU slightly outperformed the BiLSTM model:
BiGRU: 68.11%
BiLSTM: 67.17%
Interpretation: While both models are effective, the GRU's simpler architecture may prevent overfitting and result in better generalization in this case.

CNN Model Performance:

The CNN model achieved a test accuracy of 67.45%, comparable to the BiLSTM and BiGRU models.
Interpretation: CNNs are effective at capturing local patterns and n-gram features, which are valuable for sentiment analysis.

Conclusions:

Effectiveness of Advanced Models:

Transitioning from a simple RNN to more sophisticated architectures like BiGRU, BiLSTM, and CNN significantly enhances model performance.
These models are better suited for capturing complex patterns in textual data.

Impact of Attention Mechanism:

Incorporating an attention mechanism into the BiLSTM model leads to a substantial performance boost, achieving the highest accuracy.
Reason: Attention allows the model to weigh the importance of different words, focusing on those most relevant to the sentiment classification task.

Limited Benefit from Updating Embeddings Alone:

Simply updating the embeddings in the RNN model does not lead to a meaningful improvement.
Implication: The model's capacity plays a crucial role, and without a suitable architecture, fine-tuning embeddings is insufficient.
Recommendation for Future Work:

Model Complexity: Employ models with greater capacity and ability to capture sequential dependencies and contextual information.
Attention Mechanisms: Explore attention mechanisms further, as they have demonstrated significant benefits.
Hyperparameter Tuning: Experiment with different hyperparameters (e.g., learning rate, batch size, hidden units) to optimize performance.
Data Augmentation: Consider augmenting the dataset to provide the model with more varied training examples.

Overall Assessment:

The results clearly demonstrate that the choice of model architecture has a profound impact on performance in sentiment classification tasks.
Incorporating advanced models and techniques like bidirectional layers and attention mechanisms is essential for capturing the complexities of natural language.