# Workspace Setup

## Some Imports

In [None]:
import os
import time
from collections import Counter

import kagglehub
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm.notebook import tqdm
from transformers import RobertaTokenizer, RobertaForSequenceClassification, EarlyStoppingCallback, TrainingArguments, Trainer
from wordcloud import WordCloud

# Fetch the NLTK resources used later in the notebook: the Punkt tokenizer
# data (for word_tokenize) and the English stop-word list.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

## Some Constants

In [None]:
# Reviews with more RoBERTa tokens than this are dropped; also the fixed
# sequence length used when encoding reviews for the LSTM.
MAX_LEN = 256
# Seed passed to every train_test_split call so the splits are reproducible.
RANDOM_SEED = 42
# Fraction of the data remaining after the test split that is held out for validation.
VAL_SIZE = 0.15
# Fraction of the filtered dataset held out for testing.
TEST_SIZE = 0.15
# Root directory for checkpoints, saved models and plots.
RESULTS_DIR = './results'

## Useful functions

In [None]:
def stars_to_sentiment(stars):
    """Collapse a numeric rating label into a coarse sentiment class.

    Values less than or equal to 1 map to "Negative", the value 2 maps to
    "Neutral" and every other value maps to "Positive". In this notebook the
    function is applied to the 0-indexed rating labels (0-4), so the
    original 1-2 star reviews are negative, 3 stars is neutral and 4-5
    stars are positive.
    """
    if stars <= 1:
        return "Negative"
    return "Neutral" if stars == 2 else "Positive"

################### transformers ######################
# Tokenizer 
def tokenize_reviews(df, tokenizer, max_length=512):
    """Tokenize the "Review" column and pair it with the "Rating" labels.

    Args:
        df: DataFrame with a "Review" column (texts) and a "Rating" column.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A ``(encodings, labels)`` tuple: whatever the tokenizer returned, and
        a tensor built from the "Rating" values.
    """
    review_texts = df["Review"].tolist()
    tokenizer_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": 'pt',
    }
    encodings = tokenizer(review_texts, **tokenizer_options)
    labels = torch.tensor(df["Rating"].values)
    return encodings, labels

# Unpack the encodings (input_ids and attention_mask)
class EncodedDataset(Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    Each item is a dict with the keys "input_ids", "attention_mask" and
    "labels", taken at the requested index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        item["labels"] = self.labels[idx]
        return item
    
def compute_metrics(eval_pred):
    """Compute evaluation metrics from a ``(logits, labels)`` pair.

    Predictions are the argmax of the logits along axis 1. Precision, recall
    and F1 are weighted averages over the classes.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    # zero_division=0 keeps the score of a never-predicted class at 0 (the
    # default outcome) without emitting a warning on every evaluation.
    prec, rec, f_score, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": prec,
        "recall": rec,
        "f1": f_score,
    }
    
################### LSTM ######################

# Encode sentences with padding and OOV handling
def encode_sentence(tokens, vocab, max_len):
    """Map tokens to vocabulary ids and force the result to ``max_len`` entries.

    Tokens missing from ``vocab`` get the "<OOV>" id. Sequences longer than
    ``max_len`` are cut; shorter ones are right-padded with the "<PAD>" id.
    """
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, vocab["<OOV>"]))
    n_missing = max(0, max_len - len(ids))
    return ids[:max_len] + [vocab["<PAD>"]] * n_missing

class ReviewDataset(Dataset):
    """Dataset of encoded reviews (``X``) and their labels (``y``).

    Both inputs are copied into ``torch.long`` tensors on construction; each
    item is the ``(encoded_review, label)`` pair at the given index.
    """

    def __init__(self, X, y):
        self.X, self.y = (
            torch.tensor(values, dtype=torch.long) for values in (X, y)
        )

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

    def __len__(self):
        return len(self.X)

## Check whether the GPU is available

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
print(cuda_available)
print(torch.cuda.device_count())
# get_device_name() raises on machines without a CUDA device, which would
# defeat the CPU fallback above, so only query it when a GPU is present.
if cuda_available:
    print(torch.cuda.get_device_name(0))
print("Using device:", device)

## Dataset

### Download

In [None]:
# Download the TripAdvisor hotel reviews dataset through kagglehub; the
# returned value is used below as the directory containing the CSV file.
dataset_path = kagglehub.dataset_download("andrewmvd/trip-advisor-hotel-reviews")

print("Path to dataset files:", dataset_path)

### Loading the data with pandas + preprocessing

In [None]:
# Load the dataset CSV file
df = pd.read_csv(f"{dataset_path}/tripadvisor_hotel_reviews.csv")
df["Rating"] = df["Rating"] - 1 # Shift the 1–5 star ratings to 0–4 so they can be used directly as class indices

# English stop words from NLTK, stored in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Lowercase the text, strip HTML tags, then drop stop words
# NOTE(review): the stop-word list presumably contains negations such as "not" and
# "no"; confirm that removing them is intended for a sentiment task.
df["Review"] = df["Review"].str.lower()
df["Review"] = df["Review"].str.replace(r"<.*?>", "", regex=True)
df["Review"] = df["Review"].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Add a column with the number of tokens per review
# (no padding and no truncation, so the true tokenized length is measured)
token_counts = tokenizer(
    df["Review"].tolist(),
    padding=False,
    truncation=False,
    return_length=True
)["length"]
df["token_count"] = token_counts

# Drop reviews with token_count > MAX_LEN so later tokenization never has to truncate a review
df_filtered = df[df["token_count"] <= MAX_LEN].reset_index(drop=True)

# # Oversample classes 0, 1, 2, 3 to match class 4's count
# max_count = df_filtered[df_filtered["Rating"] == 4].shape[0]
# dfs = []
# for label in range(5):
#     df_class = df_filtered[df_filtered["Rating"] == label]
#     if label < 4:
#         df_class_upsampled = resample(
#             df_class,
#             replace=True,
#             n_samples=max_count,
#             random_state=RANDOM_SEED
#         )
#         dfs.append(df_class_upsampled)
#     else:
#         dfs.append(df_class)
# df_filtered = pd.concat(dfs).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# # Undersample classes 1, 2, 3, 4 to match class 0's count
# min_count = df_filtered[df_filtered["Rating"] == 0].shape[0]
# dfs = []
# for label in range(5):
#     df_class = df_filtered[df_filtered["Rating"] == label]
#     if label > 0:
#         df_class_downsampled = resample(
#             df_class,
#             replace=False,
#             n_samples=min_count,
#             random_state=RANDOM_SEED
#         )
#         dfs.append(df_class_downsampled)
#     else:
#         dfs.append(df_class)
# df_filtered = pd.concat(dfs).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# Check new distribution
print("New dataset size:", len(df_filtered))
print(df_filtered["token_count"].describe())

# Number of reviews per (0-indexed) rating class
print(df_filtered["Rating"].value_counts())

# Split off the test set (TEST_SIZE of the filtered data)
df_temp, df_test = train_test_split(df_filtered, test_size=TEST_SIZE, random_state=RANDOM_SEED, shuffle=True)

# Split the remaining into train and validation.
# VAL_SIZE is applied to what is left after the test split, so the validation
# set is VAL_SIZE * (1 - TEST_SIZE) of the filtered data.
df_train, df_val = train_test_split(df_temp, test_size=VAL_SIZE, random_state=RANDOM_SEED, shuffle=True)

# Give each split a fresh 0..n-1 index
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

print(f"Train size: {len(df_train)}")
print(f"Validation size: {len(df_val)}")
print(f"Test size: {len(df_test)}")

print(df_filtered.head())


### Visualization

In [None]:
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('report/images', exist_ok=True)

# Label distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
rating_counts = df_filtered['Rating'].value_counts().sort_index()
# Offset labels by 1 for display (since ratings are 0–4, but original scale is 1–5)
labels_offset = [i + 1 for i in rating_counts.index]
plt.bar(labels_offset, rating_counts.values, color='skyblue', edgecolor='black')
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Distribution of Ratings in Filtered Dataset')
plt.xticks(labels_offset)
# Write the count above each bar
for i, count in zip(labels_offset, rating_counts.values):
    plt.text(i, count + 50, str(count), ha='center', va='bottom')

# Sentiment distribution
plt.subplot(1, 2, 2)
df_filtered['Sentiment'] = df_filtered['Rating'].apply(stars_to_sentiment)
sentiment_counts = df_filtered['Sentiment'].value_counts()
# value_counts() orders the sentiments by frequency, so colours must be looked
# up by label; a fixed positional list would pair colours with the wrong slices.
sentiment_colors = {'Negative': 'red', 'Neutral': 'orange', 'Positive': 'green'}
colors = [sentiment_colors[sentiment] for sentiment in sentiment_counts.index]
plt.pie(sentiment_counts.values, labels=sentiment_counts.index, autopct='%1.1f%%', colors=colors)
plt.title('Sentiment Distribution in Filtered Dataset')

plt.tight_layout()
plt.savefig('report/images/label_distribution.png')
plt.show()

# Word Cloud
# Combine all reviews into one text
all_text = ' '.join(df_filtered['Review'].values)

# Generate word cloud
plt.figure(figsize=(12, 8))
wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(all_text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Hotel Reviews', fontsize=16, fontweight='bold')
plt.savefig('report/images/wordcloud.png')
plt.show()

### Torch Dataloaders

#### RoBERTa dataloaders

In [None]:
# Tokenize train, validation, and test sets.
# Every review kept in the dataset has at most MAX_LEN tokens, so padding to
# MAX_LEN loses nothing; the tokenize_reviews default of 512 would only add
# padding and increase memory use and compute.
# Each result is an (encodings, labels) tuple.
train_encodings = tokenize_reviews(df_train, tokenizer, max_length=MAX_LEN)
val_encodings = tokenize_reviews(df_val, tokenizer, max_length=MAX_LEN)
test_encodings = tokenize_reviews(df_test, tokenizer, max_length=MAX_LEN)

# Build datasets
train_dataset = EncodedDataset(*train_encodings)
val_dataset = EncodedDataset(*val_encodings)
test_dataset = EncodedDataset(*test_encodings)

# Small mock dataset for quick experiments: the first 100 training entries
mock_dataset = EncodedDataset(
    {k: v[:100] for k, v in train_encodings[0].items()},
    train_encodings[1][:100]
)

# Create DataLoaders
train_loader_bert = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader_bert = DataLoader(val_dataset, batch_size=32)
test_loader_bert = DataLoader(test_dataset, batch_size=32)
mock_loader_bert = DataLoader(mock_dataset, batch_size=8)

#### LSTM dataloaders

In [None]:
texts = df_filtered["Review"].values
print("Number of reviews:", len(texts))
labels = np.array(df_filtered["Rating"].values)

# Tokenize every review once
tokenized_texts = [word_tokenize(text.lower()) for text in texts]

# Decide the train/val/test partition on row indices first. The split depends
# only on the number of rows and the seed, so these are the same rows that
# splitting the encoded arrays directly would select.
all_indices = np.arange(len(tokenized_texts))
idx_temp, idx_test = train_test_split(
    all_indices, test_size=TEST_SIZE, random_state=RANDOM_SEED, shuffle=True
)
idx_train, idx_val = train_test_split(
    idx_temp, test_size=VAL_SIZE, random_state=RANDOM_SEED, shuffle=True
)

# Build the vocabulary from the training reviews only, so no information from
# the validation/test reviews leaks into the model's input representation.
# Ids 0 and 1 are reserved for padding and out-of-vocabulary tokens.
word_counts = Counter(word for i in idx_train for word in tokenized_texts[i])
vocab = {word: i + 2 for i, (word, _) in enumerate(word_counts.most_common(10000))}
vocab["<PAD>"] = 0
vocab["<OOV>"] = 1

# Encode all reviews to fixed-length id sequences
encoded_texts = np.array([encode_sentence(tokens, vocab, MAX_LEN) for tokens in tokenized_texts])

# Materialise the splits from the index partition
X_temp, y_temp = encoded_texts[idx_temp], labels[idx_temp]
X_train, y_train = encoded_texts[idx_train], labels[idx_train]
X_val, y_val = encoded_texts[idx_val], labels[idx_val]
X_test, y_test = encoded_texts[idx_test], labels[idx_test]

print(f"LSTM splits - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# Create Datasets
train_dataset_lstm = ReviewDataset(X_train, y_train)
val_dataset_lstm = ReviewDataset(X_val, y_val)
test_dataset_lstm = ReviewDataset(X_test, y_test)

# Create DataLoaders
train_loader_lstm = DataLoader(train_dataset_lstm, batch_size=16, shuffle=True)
val_loader_lstm = DataLoader(val_dataset_lstm, batch_size=32)
test_loader_lstm = DataLoader(test_dataset_lstm, batch_size=32)

# Transformer based Model

## Model definition

In [None]:
class SA_Model():
    """Wrapper around a RoBERTa sequence classifier with optional LoRA adapters.

    The wrapped model is stored in ``self.model`` and is replaced by the
    PEFT-wrapped model once ``apply_lora`` has been called.
    """

    def __init__(self, model_name="roberta-base", num_labels=5):
        """Load ``model_name`` with a classification head of ``num_labels`` outputs."""
        self.model_name = model_name
        self.num_labels = num_labels
        self.model = RobertaForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_labels)

    def apply_lora(self, rank=16, lora_alpha=32, lora_dropout=0.1, target_modules=("query", "value")):
        """Wrap the model with LoRA adapters.

        Args:
            rank: LoRA rank (``r``).
            lora_alpha: LoRA scaling parameter.
            lora_dropout: Dropout applied inside the LoRA layers.
            target_modules: Names of the modules that receive adapters.
        """
        lora_config = LoraConfig(
            r=rank,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            bias="none",
            task_type="SEQ_CLS",  # For sequence classification
            target_modules=list(target_modules),
        )
        self.model = get_peft_model(self.model, lora_config)

    def freeze_all_except_lora(self):
        """Freeze every parameter whose name does not contain "lora"."""
        for name, param in self.model.named_parameters():
            if "lora" not in name:
                param.requires_grad = False

    def unfreeze_classifier(self):
        """Make every parameter whose name contains "classifier" trainable."""
        for name, param in self.model.named_parameters():
            if "classifier" in name:
                param.requires_grad = True

    def get_model(self):
        """Return the wrapped model."""
        return self.model

    def to_device(self, device):
        """Move the model to the specified device (CPU or GPU) and return ``self``."""
        self.model.to(device)
        return self

    def get_trainable_parameters(self):
        """Print and return the trainable and total parameter counts.

        Returns:
            Tuple ``(trainable_params, total_params)``.
        """
        trainable_params = 0
        total_params = 0
        for param in self.model.parameters():
            count = param.numel()
            total_params += count
            if param.requires_grad:
                trainable_params += count
        # Guard the percentage against a model without parameters.
        percentage = 100 * trainable_params / total_params if total_params else 0.0
        print(f"Trainable parameters: {trainable_params:,} / {total_params:,} ({percentage:.2f}%)")
        return trainable_params, total_params

## Model evaluation function

In [None]:
def _transformer_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and return ``(predictions, labels)`` lists.

    Each batch must be a dict of tensors containing a "labels" entry; all
    other entries are moved to ``device`` and passed to the model as keyword
    arguments.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}
            outputs = model(**inputs)
            # Accept both output objects with a .logits attribute and plain tuples.
            logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch["labels"].cpu().numpy())
    return all_preds, all_labels


def _transformer_report(all_labels, all_preds, output_name, class_names=None):
    """Print the metrics, plot/save the confusion matrix and return the scores.

    When ``class_names`` is given it fixes both the row/column order of the
    confusion matrix and the tick labels, so the two can never disagree even
    if a class is absent from the evaluated data.
    """
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')
    cm = confusion_matrix(all_labels, all_preds, labels=class_names)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
    print("Classification Report:")
    print(classification_report(all_labels, all_preds))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(cmap='Blues', ax=ax)
    # savefig does not create missing directories.
    output_dir = RESULTS_DIR + "/transformer"
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_dir + "/" + output_name)
    plt.show()
    return accuracy, precision, recall, f1, cm


def evaluate_model(model, data_loader):
    """Evaluate the transformer classifier on the rating classes.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, confusion_matrix)`` with
        weighted precision/recall/F1. The confusion matrix plot is saved under
        ``RESULTS_DIR/transformer``.
    """
    all_preds, all_labels = _transformer_predictions(model, data_loader)
    return _transformer_report(all_labels, all_preds, "confusion_matrix_transformer.png")


def evaluate_model_as_sentiment(model, data_loader):
    """Evaluate the transformer classifier after collapsing ratings to sentiments.

    Predictions and labels are both mapped through ``stars_to_sentiment``
    before the metrics are computed.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, confusion_matrix)``.
    """
    all_preds, all_labels = _transformer_predictions(model, data_loader)
    all_preds = [stars_to_sentiment(pred) for pred in all_preds]
    all_labels = [stars_to_sentiment(label) for label in all_labels]
    return _transformer_report(
        all_labels,
        all_preds,
        "confusion_matrix_transformer_sentiment.png",
        class_names=["Negative", "Neutral", "Positive"],
    )

## Training

In [None]:
# Build the RoBERTa classifier, attach rank-32 LoRA adapters, then leave only
# the LoRA weights and the classifier parameters trainable.
model = SA_Model()
model.apply_lora(rank=32)
model.freeze_all_except_lora() 
model.unfreeze_classifier()
# Report how many parameters remain trainable
model.get_trainable_parameters()

In [None]:
training_args = TrainingArguments(
    output_dir=RESULTS_DIR + "/transformer",    # Where checkpoints/logs go
    per_device_train_batch_size=16,             # Batch size per GPU
    per_device_eval_batch_size=16,
    num_train_epochs=10,                        # Total number of training epochs
    learning_rate=3e-5,
    warmup_ratio=0.1,                           # 10% of training steps for warmup
    eval_strategy="epoch",                      # Run eval at end of every epoch
    save_strategy="epoch",                      # Save checkpoint every epoch
    logging_strategy="steps",
    logging_steps=100,
    fp16=torch.cuda.is_available(),             # Mixed precision needs a GPU; stay in fp32 on CPU
    save_total_limit=5,                         # Only keep the last 5 checkpoints
    load_best_model_at_end=True,                # Use best checkpoint (based on metric)
    metric_for_best_model="f1",                 # Choose best model by weighted F1 (key from compute_metrics)
    greater_is_better=True,
    weight_decay=0.01,                          # Weight decay for regularization
)

trainer = Trainer(
    model=model.get_model(),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 2 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

In [None]:
# Persist the fine-tuned model and the tokenizer side by side so that the
# directory can be loaded again later.
model.model.save_pretrained(f"{RESULTS_DIR}/transformer/")
tokenizer.save_pretrained(f"{RESULTS_DIR}/transformer/")

In [None]:
# Load the model from the saved directory
model = SA_Model(f"{RESULTS_DIR}/transformer/", num_labels=5)
model.freeze_all_except_lora()
model.to_device(device)

evaluate_model(model.get_model(), test_loader_bert)
evaluate_model_as_sentiment(model.get_model(), test_loader_bert)

# LSTM based Model

## Model definition

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> masked mean pooling -> dropout -> linear classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size.
        hidden_dim: LSTM hidden size.
        output_dim: Number of output classes.
        dropout: Dropout probability applied to the pooled representation.
        padding_idx: Token id used for padding; these positions are excluded
            from the pooling.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, dropout=0.3, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return class logits of shape (batch, output_dim) for token ids ``x`` of shape (batch, seq)."""
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # Average only over real tokens. A plain mean over the sequence
        # dimension would also average the hidden states produced at padding
        # positions, making the output depend on how much padding was added.
        mask = (x != self.padding_idx).unsqueeze(-1).to(lstm_out.dtype)
        # clamp avoids a division by zero for rows that contain only padding
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        mean_hidden = (lstm_out * mask).sum(dim=1) / token_counts
        out = self.dropout(mean_hidden)
        return self.fc(out)

## Evaluation and training functions

In [None]:
def _lstm_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and return ``(predictions, labels)`` lists.

    Each batch must be an ``(inputs, labels)`` pair of tensors.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, desc="Evaluating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    return all_preds, all_labels


def _lstm_report(all_labels, all_preds, output_name, class_names=None):
    """Print the metrics, plot/save the confusion matrix and return the scores.

    When ``class_names`` is given it fixes both the row/column order of the
    confusion matrix and the tick labels, so the two can never disagree even
    if a class is absent from the evaluated data.
    """
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='weighted')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

    print("Classification Report:")
    print(classification_report(all_labels, all_preds))

    cm = confusion_matrix(all_labels, all_preds, labels=class_names)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(8, 6))
    disp.plot(cmap='Blues', ax=ax)
    # savefig does not create missing directories.
    output_dir = RESULTS_DIR + "/lstm"
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(output_dir + "/" + output_name)
    # Display the figure (a bare plt.plot() draws nothing and shows nothing).
    plt.show()

    return accuracy, precision, recall, f1


def evaluate_model(model, data_loader):
    """Evaluate the LSTM classifier on the rating classes.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)`` with weighted
        precision/recall/F1. The confusion matrix plot is saved under
        ``RESULTS_DIR/lstm``.
    """
    all_preds, all_labels = _lstm_predictions(model, data_loader)
    return _lstm_report(all_labels, all_preds, "confusion_matrix.png")


def evaluate_model_as_sentiment(model, data_loader):
    """Evaluate the LSTM classifier after collapsing ratings to sentiments.

    Predictions and labels are both mapped through ``stars_to_sentiment``
    before the metrics are computed.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    all_preds, all_labels = _lstm_predictions(model, data_loader)
    all_preds = [stars_to_sentiment(pred) for pred in all_preds]
    all_labels = [stars_to_sentiment(label) for label in all_labels]
    return _lstm_report(
        all_labels,
        all_preds,
        "confusion_matrix_sentiment.png",
        class_names=["Negative", "Neutral", "Positive"],
    )

def train_model(model, train_loader, val_loader, num_epochs=100, plot=False, patience=10, min_epochs=0,
                learning_rate=0.0003, weight_decay=1e-5):
    """Train ``model`` with Adam and early stopping on the validation F1 score.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    weighted F1 improves, the weights are checkpointed to
    ``RESULTS_DIR/lstm/best_model.pth``. Training stops once the F1 has not
    improved for ``patience`` consecutive epochs (and at least ``min_epochs``
    epochs have elapsed). The best checkpoint is loaded back into ``model``
    before returning, so the caller evaluates the best weights rather than
    those of the last epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        num_epochs: Maximum number of epochs.
        plot: When true, plot and save the loss/F1 curves.
        patience: Epochs without F1 improvement tolerated before stopping.
        min_epochs: Early stopping is not triggered before this epoch index.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.

    Returns:
        Dict with the per-epoch lists "train_loss", "val_loss", "val_f1" and
        the scalar "best_f1" (``None`` if no epoch was run).
    """
    # torch.save / savefig do not create missing directories.
    output_dir = RESULTS_DIR + '/lstm'
    os.makedirs(output_dir, exist_ok=True)
    best_model_path = output_dir + '/best_model.pth'

    train_losses = []
    val_losses = []
    f_scores = []
    best_f_score = None
    epochs_without_improvement = 0

    model.train()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    for epoch in range(num_epochs):
        total_loss = 0
        for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        train_loss_avg = total_loss / len(train_loader)
        print(f"Training Loss: {train_loss_avg}")
        train_losses.append(train_loss_avg)

        # Evaluate on validation set every epoch
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0
        labels_list = []
        outputs_list = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                labels_list.extend(labels.cpu().numpy())
                outputs_list.extend(predicted.cpu().numpy())
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_accuracy = val_correct / val_total
        val_loss_avg = val_loss / len(val_loader)
        val_losses.append(val_loss_avg)
        # zero_division=0 silences the warning for classes never predicted
        # (common in early epochs) while keeping their score at 0.
        prec, rec, f_score, _ = precision_recall_fscore_support(
            labels_list, outputs_list, average='weighted', zero_division=0
        )
        f_scores.append(f_score)
        print(f"Validation Loss: {val_loss_avg:.4f},")
        print(f"Validation Accuracy: {val_accuracy:.4f},")
        print(f"Precision: {prec:.4f},")
        print(f"Recall: {rec:.4f},")
        print(f"F1 Score: {f_score:.4f}")
        model.train()

        # Early stopping. The first epoch always counts as an improvement, so
        # a checkpoint exists even if no later epoch beats it.
        if best_f_score is None or f_score > best_f_score:
            best_f_score = f_score
            epochs_without_improvement = 0
            # Save best model
            torch.save(model.state_dict(), best_model_path)
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience and epoch >= min_epochs:
                print(f"Early stopping triggered after {epoch + 1} epochs")
                break

    # Restore the best weights so later evaluation does not use the
    # (possibly worse) weights of the final epoch.
    if best_f_score is not None:
        model.load_state_dict(torch.load(best_model_path, map_location=device))

    # Plot training curves
    if plot:
        # Derive the x axis from the recorded history instead of the loop
        # variable, which is undefined when no epoch was run.
        epochs_run = range(1, len(train_losses) + 1)
        plt.figure()
        plt.plot(epochs_run, train_losses, label='Training Loss')
        plt.plot(epochs_run, val_losses, label='Validation Loss')
        plt.plot(epochs_run, f_scores, label='Validation F1 Score')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training Loss Curve')
        plt.legend()
        plt.savefig(output_dir + "/loss.png")
        plt.show()

    return {
        "train_loss": train_losses,
        "val_loss": val_losses,
        "val_f1": f_scores,
        "best_f1": best_f_score,
    }

## Training

In [None]:
# Build the LSTM classifier: the embedding table has one row per vocabulary
# entry and the output layer has one unit per rating class (0-4).
model = LSTMClassifier(vocab_size=len(vocab), embed_dim=128, hidden_dim=128, output_dim=5)
model.to(device)

# Train the LSTM model
# (up to 100 epochs, stopping early after 10 epochs without validation F1 improvement)
train_model(model, train_loader_lstm, val_loader_lstm, 
            num_epochs=100, plot=True, learning_rate=0.0001,
            patience=10, min_epochs=0, weight_decay=1e-5)

In [None]:
# Evaluate the trained LSTM on the held-out test split: first on the five
# rating classes, then with ratings collapsed to Negative/Neutral/Positive.
evaluate_model(model, test_loader_lstm)
evaluate_model_as_sentiment(model, test_loader_lstm)