# Model Sentiment

### Model: huawei-noah/TinyBERT_General_4L_312D

In [None]:
# rating
# 5    33373
# 4     6304
# 1     4638
# 3     3432
# 2     2035
# Name: count, dtype: int64

# The dataset shows significant class imbalance.
# To address this, upsampling and downsampling are applied to balance the classes.
# A TinyBERT model is fine-tuned as a REGRESSION model (single output) for the classification task,
# since the ordinal labels (1-5) map to three outcomes: Negative, Neutral, Positive sentiment.

In [None]:
# Imports

import torch
import torch.nn as nn
import pandas as pd
import numpy as np

from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score

In [None]:
# Select the compute device: CUDA when torch can see a GPU, CPU otherwise

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f"Using device: {device}")

### Load and balance dataset

In [None]:
# Read the cleaned reviews, keep only the review text and its rating,
# and expose the rating under the column name 'labels'
reviews = pd.read_csv('../data/reviews_cleaned.csv')
data = reviews.loc[:, ['text', 'rating']].rename(columns = {'rating': 'labels'})

# Discard rows that have no review text
data = data.dropna(subset = ['text'])

data.shape

In [None]:
# Show how many rows carry each rating value
data['labels'].value_counts()

In [None]:
# Split FIRST, then balance each split on its own. Resampling before the split
# lets copies of the same upsampled review land in both train and test, which
# leaks training rows into the evaluation set.
train_data, test_data = train_test_split(
    data, test_size = 0.2, random_state = 42, stratify = data['labels']
)


def _balance_by_rating(frame, reference_label = 4, seed = 42):
    """Resample every rating class in *frame* to the size of one reference class.

    Classes larger than the reference are downsampled without replacement,
    smaller ones are upsampled with replacement. The combined rows are then
    shuffled and given a fresh 0..n-1 index.

    Args:
        frame: DataFrame with a 'labels' column holding the ratings.
        reference_label: rating whose row count becomes the per-class size.
        seed: random_state used for every sampling step.

    Returns:
        A new, balanced and shuffled DataFrame.

    Raises:
        ValueError: if *frame* contains no row with *reference_label*.
    """
    target_size = int((frame['labels'] == reference_label).sum())
    if target_size == 0:
        raise ValueError(f"no rows with label {reference_label} to balance against")

    resampled = [
        group.sample(n = target_size, replace = len(group) < target_size, random_state = seed)
        for _, group in frame.groupby('labels')
    ]

    # Shuffle combined rows
    return pd.concat(resampled).sample(frac = 1, random_state = seed).reset_index(drop = True)


train_data = _balance_by_rating(train_data)
test_data = _balance_by_rating(test_data)

### Data Preparation

In [None]:
# Wrap the train/test DataFrames as Hugging Face Datasets
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_data, test_data)
)

# Tokenizer that belongs to the TinyBERT checkpoint
tokenizer = BertTokenizer.from_pretrained('huawei-noah/TinyBERT_General_4L_312D')

def tokenize(batch):
    """Tokenize the 'text' entries of *batch*, padded and truncated to 128 tokens."""
    encode_options = {'padding': 'max_length', 'truncation': True, 'max_length': 128}
    return tokenizer(batch['text'], **encode_options)

def _divide_label_by_five(example):
    """Return the example's 'labels' value divided by 5."""
    return {'labels': example['labels'] / 5}


# Tokenize each split in batches, then normalize its 'labels' column
train_dataset = train_dataset.map(tokenize, batched = True).map(_divide_label_by_five)
test_dataset = test_dataset.map(tokenize, batched = True).map(_divide_label_by_five)

### Define the Model

In [None]:
# Load TinyBERT with a single output (regression task) and place it on the
# selected device (GPU if available)
model = BertForSequenceClassification.from_pretrained(
    'huawei-noah/TinyBERT_General_4L_312D',
    num_labels = 1,
).to(device)

model_device = next(model.parameters()).device
print(f"Model is on: {model_device}")

### Custom Trainer with MSE Loss

In [None]:
# Custom Trainer to use MSE loss for regression
class CustomTrainer(Trainer):
    """Trainer whose loss is the mean-squared error of the model's output."""

    def compute_loss(self, model, inputs, return_outputs = False, **kwargs):
        """Return the MSE between the model's logits and ``inputs['labels']``.

        Args:
            model: model to run; its output must expose ``.logits``.
            inputs: batch mapping; 'labels' is removed from it here and is
                therefore not forwarded to the model.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                the loss alone.
            **kwargs: accepted and ignored, so extra arguments passed by the
                caller do not break the override.
        """
        labels = inputs.pop('labels').float().reshape(-1)
        outputs = model(**inputs)

        # Flatten explicitly instead of a bare squeeze(): with a batch of one,
        # squeeze() yields a 0-d tensor whose shape no longer matches the
        # (1,)-shaped labels.
        logits = outputs.logits.reshape(-1)

        # Compute MSE Loss
        loss_fn = nn.MSELoss()
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

### Define Metrics

In [None]:
# Compute metrics for evaluation
def compute_metrics(eval_pred):
    """Score regression outputs as whole 1-5 ratings.

    Predictions and labels both arrive on the normalized training scale
    (rating / 5, i.e. 0.2 .. 1.0). They are multiplied back by 5 and rounded
    to the nearest whole rating before being compared; predictions are
    additionally clipped to the valid 1-5 range.

    Args:
        eval_pred: pair ``(predictions, labels)``. Predictions may be shaped
            (n, 1) or (n,); both are flattened.

    Returns:
        dict with 'mse', 'mae' and 'accuracy', all computed on the rounded
        ratings.
    """
    predictions, truth = eval_pred

    rating_scale = 5  # labels are divided by 5 before training

    predicted = np.asarray(predictions, dtype = float).reshape(-1) * rating_scale
    predicted = np.clip(np.rint(predicted), 1, rating_scale)
    actual = np.rint(np.asarray(truth, dtype = float).reshape(-1) * rating_scale)

    # Compute metrics
    errors = predicted - actual

    return {
        'mse': float(np.mean(errors ** 2)),
        'mae': float(np.mean(np.abs(errors))),
        'accuracy': float(np.mean(predicted == actual)),
    }

### Training Setup

In [None]:
# Settings for the fine-tuning run, collected in one mapping
training_options = {
    'output_dir': '../models/sentiment/results',
    'eval_strategy': 'epoch',
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    'num_train_epochs': 3,
    'max_grad_norm': 1.0,  # Gradient clipping
    'learning_rate': 1e-5,  # 3e-6
    'weight_decay': 0.01,
    'logging_steps': 500,
}

training_args = TrainingArguments(**training_options)

### Initialize and Train

In [None]:
# Wire model, data, tokenizer and metric function into the MSE-based trainer
trainer_options = dict(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    processing_class = tokenizer,
    compute_metrics = compute_metrics,
)
trainer = CustomTrainer(**trainer_options)

# Run the fine-tuning loop
trainer.train()

### Evaluate Model

In [None]:
# Run the trainer's evaluation and keep the returned metrics in `results`
results = trainer.evaluate()

# Show the metrics
print(results)

### Save Model and Tokenizer

In [None]:
output_dir = './../models/sentiment'

# Saving is opt-in so that re-running the notebook does not overwrite an
# existing checkpoint by accident. Set to True to write the current model.
SAVE_MODEL = False

if SAVE_MODEL:
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved to {output_dir}")
else:
    # Do not claim a save happened when nothing was written
    print(f"Saving skipped (SAVE_MODEL is False); {output_dir} left unchanged")

### Load Model and Tokenizer

In [None]:
model_path = '../models/sentiment'

# Restore the tokenizer and the model from the saved directory, placing the
# model on the selected device
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path).to(device)

print(f"Model loaded and moved to device: {device}")

### Predict Sentiment Function

In [None]:
def predict_sentiment(texts, model, tokenizer):
    """Return the model's regression score for each input text.

    Args:
        texts: a single string or a sequence of strings.
        model: model producing one logit per input, exposed as ``.logits``.
        tokenizer: tokenizer matching *model*.

    Returns:
        A list of floats, one per input text; an empty list for an empty
        sequence. Scores are on the scale the model was trained on. This
        notebook trains on rating / 5, so multiply by 5 to get a 1-5 rating.
    """
    # Nothing to score; avoid handing an empty batch to the tokenizer
    if not isinstance(texts, str) and len(texts) == 0:
        return []

    # Tokenize the input texts
    inputs = tokenizer(texts, return_tensors = 'pt', padding = True, truncation = True, max_length = 128)

    # Move inputs to wherever the model's weights live, rather than relying on
    # a module-level `device` that may not match the model passed in
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {key: value.to(first_param.device) for key, value in inputs.items()}

    # Perform inference
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    # Drop only the trailing size-1 output dimension so that a single text
    # still yields a one-element list
    return outputs.logits.squeeze(-1).cpu().tolist()

### Prediction Example

In [None]:
examples = [
    "This game is a total disappointment. The controls are clunky, the graphics feel outdated, and the story is bland and uninspired. Definitely not worth the price.",  # Negative
    "The game is okay. It has some fun moments, but nothing stands out. The graphics are decent, and the gameplay is smooth, but it lacks depth and replay value.",  # Neutral
    "Absolutely loved this game! The story is engaging, the characters are well-developed, and the visuals are stunning. The gameplay is smooth and immersive—definitely a masterpiece!"  # Positive
]

# The training labels in this notebook are rating / 5, so the raw scores are on
# a 0.2-1.0 scale; multiply by 5 to show them as 1-5 ratings.
# NOTE(review): assumes the checkpoint in model_path was trained with that
# normalization - confirm before relying on the printed values.
RATING_SCALE = 5

ratings = [score * RATING_SCALE for score in predict_sentiment(examples, model, tokenizer)]

# Print results
for text, rating in zip(examples, ratings):
    print(f"Review: {text}\nPredicted Rating: {rating:.2f}\n")