# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datasets

# Dataset

In [None]:
# Fetch the raw (not preprocessed) "All_Beauty" review configuration from the Hugging Face Hub.
# trust_remote_code=True allows the dataset repository's own loading script to run.
from datasets import load_dataset
dataset = load_dataset(
    path="McAuley-Lab/Amazon-Reviews-2023",
    name="raw_review_All_Beauty",
    trust_remote_code=True,
)
print(dataset)

In [None]:
# Peek at the first five raw reviews of the "full" split
print(dataset["full"][:5])

# Defining a function to map ratings to sentiment labels and adding it to the dataset

In [5]:
"""
    Maps the 'rating' value to a sentiment label:
    - Ratings 4–5 stars → 2 (Positive)
    - Ratings 1–2 stars → 0 (Negative)
    - Ratings 3 stars → 1 (Neutral)
"""
def encode_labels(labels):
    rating = labels['rating']
    if rating in [4, 5]:
        return {'labels': 2} # Positive review
    elif rating in [1, 2]:
        return {'labels': 0} # Negative review
    else:
        return {'labels': 1} # Neutral review

In [None]:
# Run encode_labels over the dataset so each example gains a 'labels' entry,
# then show the first five rows to confirm the new column is present.
dataset = dataset.map(function=encode_labels)
print(dataset["full"][:5])

# Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Convert to a pandas DataFrame so scikit-learn can do the splitting
df = pd.DataFrame(dataset["full"])
print(df.head())

# 80% goes to training; the remaining 20% is held out ...
df_train, df_holdout = train_test_split(df, test_size = 0.2, random_state = 42)
# ... and halved into validation and test sets (10% of the data each)
df_valid, df_test = train_test_split(df_holdout, test_size = 0.5, random_state = 42)

# Convert back to Hugging Face datasets.
# preserve_index=False stops the shuffled pandas index from being stored as
# an extra "__index_level_0__" column in every split.
train_dataset = Dataset.from_pandas(df_train, preserve_index = False)
valid_dataset = Dataset.from_pandas(df_valid, preserve_index = False)
test_dataset = Dataset.from_pandas(df_test, preserve_index = False)

# Tokenization

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "bert-base-uncased" checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of reviews and carry their labels along.

    Args:
        examples: A batch of examples as passed by ``Dataset.map`` with
            ``batched=True``; must provide the "text" and "labels" columns.

    Returns:
        The tokenizer output for ``examples["text"]`` (padded with
        ``padding="max_length"`` and truncated) with the batch's "labels"
        stored under the "labels" key.
    """
    tokenized = tokenizer(examples["text"], padding = "max_length", truncation = True)
    tokenized["labels"] = examples["labels"]
    return tokenized


# Backward-compatible alias for the original, misspelled function name
tokenze_function = tokenize_function

# Apply the tokenization function to every split
train_dataset = train_dataset.map(tokenize_function, batched = True)
valid_dataset = valid_dataset.map(tokenize_function, batched = True)
test_dataset = test_dataset.map(tokenize_function, batched = True)

In [None]:
# Sanity check: look at one tokenized training example ...
first_example = train_dataset[0]
print("First example after tokenization:", first_example)

# ... and list the columns it carries
print("Keys in tokenized dataset:", first_example.keys())

# Loading the model

In [None]:
from transformers import AutoModelForSequenceClassification

# Load BERT with a 3-class sequence-classification head.
# The class ids follow encode_labels (0 = Negative, 1 = Neutral, 2 = Positive);
# storing the names in the model config makes the saved model report readable
# labels instead of LABEL_0 / LABEL_1 / LABEL_2.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 3,
    id2label = {0: "Negative", 1: "Neutral", 2: "Positive"},
    label2id = {"Negative": 0, "Neutral": 1, "Positive": 2},
)

# Defining the evaluation metrics (accuracy, precision, recall, F1)

In [11]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import precision_recall_fscore_support


def compute_metrics(p):
    """Compute accuracy and weighted precision / recall / F1 for an evaluation pass.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with the class dimension last; ``labels`` holds
            the true class ids.

    Returns:
        dict: The keys "accuracy", "precision", "recall" and "f1".
    """
    predictions, labels = p
    # NOTE(review): some models hand back a tuple of outputs; the class scores
    # are assumed to be the first element in that case - confirm if the model changes.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = predictions.argmax(axis=-1)

    # One pass yields all three weighted scores. zero_division=0 keeps the
    # score at 0 for a class that is never predicted (the value the default
    # setting also returns) without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average = 'weighted', zero_division = 0
    )
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# Defining the arguments

In [None]:
import inspect

from transformers import TrainingArguments

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and no longer accept the old spelling; older releases only know the old one.
# Pick whichever name the installed version supports so the cell runs on both.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

# Define the training arguments
training_args = TrainingArguments(
    output_dir = "./results",           # where checkpoints are written
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 64,
    num_train_epochs = 3,
    weight_decay = 0.01,
    # NOTE(review): ".logs" is a hidden directory; "./logs" may have been intended - confirm.
    logging_dir = ".logs",
    logging_steps = 10,
    **{_eval_strategy_arg: "epoch"},    # evaluate at the end of every epoch
)

# Trainer

In [None]:
import inspect

from transformers import Trainer

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate (later drop) the `tokenizer` argument; older releases only
# know `tokenizer`. Use whichever name the installed Trainer accepts.
_tokenizer_arg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Set up the Trainer: training runs on train_dataset, per-epoch evaluation on
# valid_dataset, and compute_metrics supplies the reported scores.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = valid_dataset,
    compute_metrics = compute_metrics,
    **{_tokenizer_arg: tokenizer},
)

# Training the model

In [None]:
# Run the training loop configured above (model, training_args, train_dataset,
# with valid_dataset as the evaluation set passed to the Trainer).
trainer.train()

# Evaluating the model

In [None]:
# Score the model on the test split; the metrics are kept in `results`
results = trainer.evaluate(eval_dataset=test_dataset)

# Saving

In [None]:
# Write the model and its tokenizer into the same directory
model.save_pretrained(save_directory="./sentiment_model")
tokenizer.save_pretrained(save_directory="./sentiment_model")

# Print the evaluation results

In [None]:
# Report the test-set metrics, one per line
print(
    f"Accuracy: {results['eval_accuracy']}",
    f"Precision: {results['eval_precision']}",
    f"Recall: {results['eval_recall']}",
    f"F1: {results['eval_f1']}",
    sep="\n",
)

# Predicting

In [None]:
# Run the trainer's prediction pass over the test split and print its `predictions` field
pred = trainer.predict(test_dataset=test_dataset)
print(pred.predictions)