# Sentiment Analysis on Reviews

In this project, we will use machine learning algorithms and deep learning models with different word embedding approaches for predicting the sentiment of reviews. We will use the following models and compare their performance on different datasets: LSTM, CNN, T5, LR, SVM, and RFT.

Data sources
* IMBD dataset
  * https://huggingface.co/datasets/imdb
* Word embedding
  * GloVe - https://nlp.stanford.edu/projects/glove

## Configuration

In this section, we will set up all the necessary parameters and import the required libraries.

In [None]:
import sys
sys.path.append('../src/')

In [None]:
import numpy as np
import nltk
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import accuracy_score
from transformers import T5Tokenizer

from lstm import LSTMSentiment
from cnn import CNNSentiment
from t5_model import T5Sentiment
from lr import LRSentiment
from svm import SVMSentiment
from rft import RFTSentiment
from utils import load_data, preprocess_data, create_dataloader, load_glove_embeddings, build_label_encoder, glove_to_tensor, split_dataset
from data_augmentation import DataAugmentation

# Parameters
DATASET_NAME = "imdb"
#pd.read_csv('/Users/notebook/Documents/nlp-training-REL_20230706/notebooks/data/IMDB Dataset.csv')

EPOCHS = 10
EMBEDDING_DIM = 50
HIDDEN_DIM = 128
OUTPUT_DIM = 1
TRAIN_SPLIT_RATIO = 0.8
VAL_SPLIT_RATIO = 0.1
TEST_SPLIT_RATIO = 0.1


In [None]:
from datasets import disable_caching

disable_caching()

## Word Embeddings

We will use the pre-trained GloVe embeddings for this project.

In [None]:
# Load GloVe embeddings
embedding = load_glove_embeddings("data/embeddings/glove.6B.50d.txt")
label_encoder = build_label_encoder(vocabs=list(embedding.keys()))
embedding_tensor = glove_to_tensor(label_encoder, embedding, embedding_dim=EMBEDDING_DIM)

# Load tokenizer for T5
pretrained_name = "t5-small"
tokenizer_t5 = T5Tokenizer.from_pretrained(pretrained_name)

## Data Loading and Preprocessing

In [5]:
print(DATASET_NAME.head())

AttributeError: 'str' object has no attribute 'head'

In [6]:
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

In [7]:
# Load the IMDB dataset
data = load_data(name=DATASET_NAME, split="train[:50]+train[-50:]")
data.cleanup_cache_files()

# Split data into training, validation, and test sets
data = split_dataset(data, train_ratio=TRAIN_SPLIT_RATIO, validation_ratio=VAL_SPLIT_RATIO, test_ratio=TEST_SPLIT_RATIO)

# Preprocess data
data = preprocess_data(data, label_encoder, embedding_tensor, tokenizer_t5)

# Create dataloaders
train_dataloader = create_dataloader(data["train"], batch_size=16)
val_dataloader = create_dataloader(data["val"], batch_size=16)
test_dataloader = create_dataloader(data["test"], batch_size=16)

Found cached dataset imdb (C:/Users/notebook/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

## Models

### LSTM

In [8]:
# Initialize LSTM model
lstm = LSTMSentiment(
    embedding_tensor=embedding_tensor,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

# Train LSTM
optimizer = optim.Adam(lstm.parameters(), lr=1e-3)
criterion = nn.BCELoss()

lstm.train_loop(train_dataloader, val_dataloader, optimizer, criterion, epochs=EPOCHS)

# Evaluate LSTM
lstm_result = lstm.evaluate_loop(test_dataloader, criterion)
print(f"LSTM accuracy: {lstm_result['accuracy']}")


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1 | Train loss: 0.690 | Train acc: 56.25% | Validation loss: 0.761 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2 | Train loss: 0.682 | Train acc: 56.25% | Validation loss: 0.805 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 3 | Train loss: 0.672 | Train acc: 56.25% | Validation loss: 0.824 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 4 | Train loss: 0.662 | Train acc: 57.50% | Validation loss: 0.823 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 5 | Train loss: 0.660 | Train acc: 58.75% | Validation loss: 0.765 | Validation acc: 20.00%
Epoch: 6 | Train loss: 0.631 | Train acc: 76.25% | Validation loss: 0.809 | Validation acc: 30.00%
Epoch: 7 | Train loss: 0.533 | Train acc: 72.50% | Validation loss: 0.667 | Validation acc: 90.00%
Epoch: 8 | Train loss: 0.509 | Train acc: 77.50% | Validation loss: 0.606 | Validation acc: 70.00%
Epoch: 9 | Train loss: 0.457 | Train acc: 76.25% | Validation loss: 0.799 | Validation acc: 60.00%
Epoch: 10 | Train loss: 0.371 | Train acc: 85.00% | Validation loss: 0.466 | Validation acc: 80.00%
LSTM accuracy: 0.7


### CNN

In [10]:
NUM_FILTERS = 3
FILTER_SIZES = [3, 4, 5]

# Initialize CNN model
cnn = CNNSentiment(
    embedding_tensor=embedding_tensor,
    num_filters=NUM_FILTERS,
    filter_sizes=FILTER_SIZES,
    output_dim=OUTPUT_DIM,
)

# Train CNN
optimizer = optim.Adam(cnn.parameters(), lr=1e-3)
criterion = nn.BCELoss()

cnn.train_loop(train_dataloader, val_dataloader, optimizer, criterion, epochs=EPOCHS)

# Evaluate CNN
cnn_result = cnn.evaluate_loop(test_dataloader, criterion)
print(f"CNN accuracy: {cnn_result['accuracy']}")


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 1 | Train loss: 0.682 | Train acc: 55.00% | Validation loss: 0.776 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 2 | Train loss: 0.661 | Train acc: 57.50% | Validation loss: 0.773 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 3 | Train loss: 0.654 | Train acc: 60.00% | Validation loss: 0.771 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 4 | Train loss: 0.632 | Train acc: 60.00% | Validation loss: 0.769 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 5 | Train loss: 0.676 | Train acc: 56.25% | Validation loss: 0.771 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 6 | Train loss: 0.623 | Train acc: 66.25% | Validation loss: 0.771 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 7 | Train loss: 0.629 | Train acc: 62.50% | Validation loss: 0.766 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 8 | Train loss: 0.634 | Train acc: 71.25% | Validation loss: 0.765 | Validation acc: 20.00%


  _warn_prf(average, modifier, msg_start, len(result))


Epoch: 9 | Train loss: 0.629 | Train acc: 63.75% | Validation loss: 0.769 | Validation acc: 20.00%
Epoch: 10 | Train loss: 0.620 | Train acc: 65.00% | Validation loss: 0.775 | Validation acc: 20.00%
CNN accuracy: 0.3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### T5

In [None]:
# Initialize T5 model according to your tokenizer and pre-trained model
t5 = T5Sentiment(pretrained_name)
optimizer = optim.AdamW(t5.parameters(), lr=5e-5)

# Train T5
t5.train_loop(train_dataloader, val_dataloader, optimizer, epochs=EPOCHS)

# Evaluate T5
t5_result = t5.evaluate_loop(test_dataloader)
print(f"T5 accuracy: {t5_result['accuracy']}")

### LR

In [None]:
# Initialize LR model
lr = LRSentiment()
data = data.with_format("np")

# Train LR
lr.train(data["train"]["input_glove_vectors"], data["train"]["label"])

# Evaluate LR
lr_result = lr.evaluate(data["test"]["input_glove_vectors"], data["test"]["label"])

### SVM

In [None]:
# Initialize SVM model
svm = SVMSentiment()

# Train SVM
svm.train(data["train"]["input_glove_vectors"], data["train"]["label"])

# Evaluate SVM
svm_result = svm.evaluate(data["test"]["input_glove_vectors"], data["test"]["label"])


### RFT

In [None]:
# Initialize RFT model
rft = RFTSentiment()

# Train RFT
rft.train(data["train"]["input_glove_vectors"], data["train"]["label"])

# Evaluate RFT
rft_result = rft.evaluate(data["test"]["input_glove_vectors"], data["test"]["label"])


## Summary

In [None]:
cols = ["model", "accuracy", "f1", "precision", "recall"]

results = [lstm_result, cnn_result, t5_result, lr_result, svm_result, rft_result]
final_result = pd.DataFrame(results)

final_result["train_ratio"] = TRAIN_SPLIT_RATIO
final_result["val_ratio"] = VAL_SPLIT_RATIO
final_result["train_ratio"] = TRAIN_SPLIT_RATIO

final_result[cols]