# Sentiment Analysis on Reviews

In this project, we will use machine learning algorithms and deep learning models with different word embedding approaches for predicting the sentiment of reviews. We will use the following models and compare their performance on different datasets: LSTM, CNN, T5, LR, SVM, and RFT.

Data sources
* IMDb dataset
  * https://huggingface.co/datasets/imdb
* Word embedding
  * GloVe - https://nlp.stanford.edu/projects/glove

## Configuration

In this section, we will set up all the necessary parameters and import the required libraries.

In [1]:
import sys
# Extend the module search path with '../src/' (resolved against the current working
# directory) so the project-local imports in the next cell can be found — presumably
# lstm, cnn, utils, etc. live in that folder; confirm against the repository layout.
sys.path.append('../src/')

In [2]:
import numpy as np
import nltk
import pandas as pd
import torch.nn as nn
import torch.optim as optim

from sklearn.metrics import accuracy_score
from transformers import T5Tokenizer

from lstm import LSTMSentiment
from cnn import CNNSentiment
from t5_model import T5Sentiment
from lr import LRSentiment
from svm import SVMSentiment
from rft import RFTSentiment
from utils import load_data, preprocess_data, create_dataloader, load_glove_embeddings, build_label_encoder, glove_to_tensor, split_dataset
from data_augmentation import DataAugmentation

# Experiment configuration shared by the cells below.

# Dataset identifier handed to load_data: "imdb" or "sentiment_data_custom".
DATASET_NAME = "imdb"

# Training-loop settings: mini-batch size and number of epochs.
BATCH_SIZE, EPOCHS = 32, 10

# Model dimensions. EMBEDDING_DIM presumably has to match the width of the GloVe
# vectors that are loaded later (the 50d file) — confirm if the embedding file changes.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 50, 128, 1

# Fractions of the data assigned to the train / validation / test splits.
TRAIN_SPLIT_RATIO, VAL_SPLIT_RATIO, TEST_SPLIT_RATIO = 0.8, 0.1, 0.1

In [3]:
from datasets import disable_caching

# Switch off the `datasets` library's caching for this session. Presumably this makes
# data transformations run again on every execution instead of reusing cached
# results — see the library documentation for the exact effect.
disable_caching()

## Word Embeddings

We will use the pre-trained GloVe embeddings for this project.

In [4]:
# Load GloVe embeddings.
# The path must be a raw string: in a regular string literal "\U" (as in "C:\Users")
# starts a Unicode escape and raises a SyntaxError, and "\n" (as in "\notebook")
# would be read as a newline, corrupting the Windows path.
GLOVE_PATH = r"C:\Users\notebook\Documents\nlp-training-REL_20230706\notebooks\data\embeddings\glove.6B.50d.txt"
embedding = load_glove_embeddings(GLOVE_PATH)

# Build the encoder from the GloVe vocabulary (the keys of `embedding`), then build the
# embedding tensor from that encoder and the loaded vectors.
label_encoder = build_label_encoder(vocabs=list(embedding.keys()))
embedding_tensor = glove_to_tensor(label_encoder, embedding, embedding_dim=EMBEDDING_DIM)

# Load tokenizer for T5; `pretrained_name` is reused later to instantiate the T5 model.
pretrained_name = "t5-small"
tokenizer_t5 = T5Tokenizer.from_pretrained(pretrained_name)

## Data Loading and Preprocessing

In [5]:
def _ensure_nltk_resource(resource_path, package):
    """Download *package* with nltk unless *resource_path* can already be found."""
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # nltk.data.find signals a missing resource with LookupError.
        nltk.download(package)


# The Punkt tokenizer data is only fetched when nltk cannot locate it.
_ensure_nltk_resource('tokenizers/punkt', 'punkt')

## Data Augmentation with EDA (Easy Data Augmentation)

In [6]:
# Load a small sample of the dataset: the split expression selects the first 10 and
# the last 10 rows of the "train" split.
raw_data = load_data(name=DATASET_NAME, split="train[:10]+train[-10:]")
raw_data.cleanup_cache_files()

# Partition the sample into train / val / test according to the configured ratios.
data = split_dataset(
    raw_data,
    train_ratio=TRAIN_SPLIT_RATIO,
    validation_ratio=VAL_SPLIT_RATIO,
    test_ratio=TEST_SPLIT_RATIO,
)

# EDA settings, applied identically to every split.
eda_params = {"alpha_sr": 0.1, "alpha_ri": 0.1, "alpha_rs": 0.1, "p_rd": 0.1, "num_aug": 1}
split_names = ("train", "val", "test")

# Augment each split with EDA, then preprocess the augmented data.
# NOTE(review): augmentation is applied to the val and test splits as well as train —
# confirm this is intended, since it changes the data the models are evaluated on.
for split_name in split_names:
    augmented = DataAugmentation(data[split_name]).eda(**eda_params)
    data[split_name] = preprocess_data(augmented, label_encoder, embedding_tensor, tokenizer_t5)

# One dataloader per split, created in train / val / test order.
train_dataloader, val_dataloader, test_dataloader = (
    create_dataloader(data[split_name], batch_size=BATCH_SIZE) for split_name in split_names
)


## Retrain and Evaluate Models on EDA Augmented Data

### LSTM

In [7]:
# Build the LSTM classifier from the GloVe embedding tensor and the configured sizes.
lstm_config = {
    "embedding_tensor": embedding_tensor,
    "embedding_dim": EMBEDDING_DIM,
    "hidden_dim": HIDDEN_DIM,
    "output_dim": OUTPUT_DIM,
}
lstm = LSTMSentiment(**lstm_config)

# Loss function and optimizer used for training.
criterion = nn.BCELoss()
optimizer = optim.Adam(lstm.parameters(), lr=1e-3)

# Train on the training dataloader; the validation dataloader is passed along as well.
lstm.train_loop(train_dataloader, val_dataloader, optimizer, criterion, epochs=EPOCHS)

# Score the trained model on the test dataloader and report its accuracy.
lstm_result = lstm.evaluate_loop(test_dataloader, criterion)
print(f"LSTM accuracy: {lstm_result['accuracy']}")

### CNN

In [8]:
# CNN-specific hyperparameters: number of filters and the list of filter sizes.
NUM_FILTERS, FILTER_SIZES = 3, [3, 4, 5]

# Build the CNN classifier from the GloVe embedding tensor.
cnn_config = {
    "embedding_tensor": embedding_tensor,
    "num_filters": NUM_FILTERS,
    "filter_sizes": FILTER_SIZES,
    "output_dim": OUTPUT_DIM,
}
cnn = CNNSentiment(**cnn_config)

# Loss function and optimizer used for training.
criterion = nn.BCELoss()
optimizer = optim.Adam(cnn.parameters(), lr=1e-3)

# Train on the training dataloader; the validation dataloader is passed along as well.
cnn.train_loop(train_dataloader, val_dataloader, optimizer, criterion, epochs=EPOCHS)

# Score the trained model on the test dataloader and report its accuracy.
cnn_result = cnn.evaluate_loop(test_dataloader, criterion)
print(f"CNN accuracy: {cnn_result['accuracy']}")


### T5

In [9]:
# Instantiate T5 from `pretrained_name`, the same checkpoint name the tokenizer was
# loaded with, so model and tokenizer stay in sync.
t5 = T5Sentiment(pretrained_name)

# AdamW over all model parameters.
t5_parameters = t5.parameters()
optimizer = optim.AdamW(t5_parameters, lr=5e-5)

# Train T5 (no separate criterion is passed, unlike the LSTM/CNN cells).
t5.train_loop(train_dataloader, val_dataloader, optimizer, epochs=EPOCHS)

# Score the trained model on the test dataloader and report its accuracy.
t5_result = t5.evaluate_loop(test_dataloader)
print(f"T5 accuracy: {t5_result['accuracy']}")

### LR

In [None]:
# Rebind `data` to its "np"-formatted version; the cells that follow read from this
# rebound object.
data = data.with_format("np")
lr = LRSentiment()

# Fit LR on the GloVe feature vectors and labels of the training split.
train_split = data["train"]
lr.train(train_split["input_glove_vectors"], train_split["label"])

# Evaluate LR on the same two columns of the test split.
test_split = data["test"]
lr_result = lr.evaluate(test_split["input_glove_vectors"], test_split["label"])

### SVM

In [None]:
# Create the SVM classifier.
# NOTE(review): `data` is used as-is here, so this cell presumably relies on the LR
# cell having already converted it with with_format("np") — confirm run order.
svm = SVMSentiment()

# Fit SVM on the GloVe feature vectors and labels of the training split.
train_split = data["train"]
svm.train(train_split["input_glove_vectors"], train_split["label"])

# Evaluate SVM on the same two columns of the test split.
test_split = data["test"]
svm_result = svm.evaluate(test_split["input_glove_vectors"], test_split["label"])


### RFT

In [None]:
# Create the RFT classifier.
# NOTE(review): `data` is used as-is here, so this cell presumably relies on the LR
# cell having already converted it with with_format("np") — confirm run order.
rft = RFTSentiment()

# Fit RFT on the GloVe feature vectors and labels of the training split.
train_split = data["train"]
rft.train(train_split["input_glove_vectors"], train_split["label"])

# Evaluate RFT on the same two columns of the test split.
test_split = data["test"]
rft_result = rft.evaluate(test_split["input_glove_vectors"], test_split["label"])


## Summary

In [None]:
# Columns shown in the final comparison table, in display order.
cols = ["model", "accuracy", "f1", "precision", "recall"]

# One row per model, built from the result objects returned by the evaluation calls
# in the cells above.
results = [lstm_result, cnn_result, t5_result, lr_result, svm_result, rft_result]
final_result = pd.DataFrame(results)

# Record the split configuration next to the metrics. The third assignment used to
# repeat "train_ratio", so the test ratio was never stored.
final_result["train_ratio"] = TRAIN_SPLIT_RATIO
final_result["val_ratio"] = VAL_SPLIT_RATIO
final_result["test_ratio"] = TEST_SPLIT_RATIO

# Last expression of the cell: the notebook displays the selected metric columns.
final_result[cols]