# Using the Yelp reviews dataset to fine-tune a DistilBERT model for sentiment analysis

- [Information about the base model](https://huggingface.co/docs/transformers/model_doc/distilbert)
- [Information about Ray use cases](https://docs.ray.io/en/latest/ray-overview/use-cases.html)
- [Ray tuning example](https://docs.ray.io/en/latest/train/examples/transformers/huggingface_text_classification.html)

## Installing required packages

In [None]:
! export RAY_PICKLE_VERBOSE_DEBUG='2'
! export RAY_AIR_NEW_OUTPUT='1'

In [None]:
! pip install -U datasets "transformers[torch]" "ray[tune]" evaluate scikit-learn seaborn

## Importing required packages

In [None]:
%%time
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    AdamW,
    get_linear_schedule_with_warmup,
)
import numpy as np
from huggingface_hub import notebook_login
from ray import tune
import ray
import evaluate
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Identifiers: the base checkpoint to fine-tune, the dataset to load, and the
# name used as the training output directory.
model_checkpoint = "distilbert-base-uncased"
dataset_checkpoint = "noahnsimbe/yelp-dataset"
repo_name = "DistilBERT-yelp-sentiment-analysis"

# Run-size settings: fraction of each split kept for the quick experiments,
# per-device batch size, and number of epochs for the baseline run.
data_fraction = 0.001
batch_size = 8
num_train_epochs = 20

## Data Loading

In [None]:
%%time
# Load the dataset identified by `dataset_checkpoint`; its "train", "test"
# and "eval" splits are accessed by name in the following cells.
data = load_dataset(dataset_checkpoint)

### Using a fraction of the data

In [None]:
def _subsample(split):
    """Split off a `data_fraction`-sized portion of `split` and return it."""
    return split.train_test_split(train_size=data_fraction)["train"]


# Apply the same reduction to each of the three splits.
train_data = _subsample(data["train"])
test_data = _subsample(data["test"])
eval_data = _subsample(data["eval"])

## Data Preprocessing

### Creating tokenizer from the pre-trained DistilBERT model

In [None]:
# Load the tokenizer published with the `model_checkpoint` checkpoint; it is
# used by `preprocess_function`, the data collator and the trainers below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def preprocess_function(record):
    """
    Run the notebook-level tokenizer over the "text" field of a record.

    Truncation is switched on; no padding is requested here.

    Args:
        record (dict): Mapping that holds the raw text under the key "text".

    Returns:
        The tokenizer's output for that text.
    """
    raw_text = record["text"]
    encoded = tokenizer(raw_text, truncation=True)
    return encoded

### Tokenize the text using the tokenizer with truncation

In [None]:
%%time
tokenized_train_data = train_data.map(preprocess_function, batched=True)
tokenized_test_data = test_data.map(preprocess_function, batched=True)
tokenized_eval_data = eval_data.map(preprocess_function, batched=True)

train_dataset = Dataset.from_list(tokenized_train_data)
test_dataset = Dataset.from_list(tokenized_test_data)
eval_dataset = Dataset.from_list(tokenized_eval_data)

### Creating the DataCollatorWithPadding object

In [None]:
# Converts the training samples to PyTorch tensors and concatenate them with the correct amount of padding to speed up training

%%time
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Model Development

### Load pre-trained model for sequence classification

In [None]:
%%time
# Load the base checkpoint for sequence classification with three output
# labels (reported later as Negative / Neutral / Positive).
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=3
)

### Define the metrics to evaluate the model

In [None]:
def compute_metrics(eval_pred):
    """
    Score a batch of predictions with accuracy, F1, precision and recall.

    F1, precision and recall are macro-averaged. Precision is computed with
    `zero_division=0`.

    Args:
        eval_pred (tuple): Pair of (logits, labels); the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        dict: Metric values keyed by "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Extra keyword arguments passed to each metric's `compute` call.
    compute_kwargs = {
        "accuracy": {},
        "f1": {"average": "macro"},
        "precision": {"average": "macro", "zero_division": 0},
        "recall": {"average": "macro"},
    }

    scores = {}
    for metric_name, extra in compute_kwargs.items():
        # The metric is loaded inside the function on every call, as before.
        metric = evaluate.load(metric_name, trust_remote_code=True)
        result = metric.compute(predictions=predictions, references=labels, **extra)
        scores[metric_name] = result[metric_name]
    return scores

### Define training arguments for the model training process

In [None]:
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub for this configuration.
    output_dir=repo_name,
    push_to_hub=False,
    # Evaluate and save once per epoch, and reload the best model at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Run on CPU.
    use_cpu=True,
)

### Define the optimizer and learning rate scheduler

In [None]:
# The scheduler advances once per optimizer update (one per batch), not once
# per sample. Its length is therefore derived from the number of batches per
# epoch; using len(train_dataset) directly would stretch the linear decay over
# roughly batch_size times too many steps. `training_args` sets use_cpu=True,
# so a single device is assumed here.
batches_per_epoch = (
    len(train_dataset) + training_args.per_device_train_batch_size - 1
) // training_args.per_device_train_batch_size
updates_per_epoch = max(
    1, batches_per_epoch // training_args.gradient_accumulation_steps
)

# NOTE(review): because this optimizer is handed to the Trainer explicitly, its
# lr (5e-5) is what is used, not `training_args.learning_rate` (2e-5). Confirm
# which value is intended.
optimizer = AdamW(model.parameters(), lr=5e-5)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=int(updates_per_epoch * training_args.num_train_epochs),
)

### Define the Trainer for model training

In [None]:
# Baseline trainer: trains on the subsampled train split, evaluates on the
# subsampled test split, and uses the hand-built optimizer/scheduler pair.
model_trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optimizer, lr_scheduler),
    compute_metrics=compute_metrics,
)

### Train the model and measure the execution time

In [None]:
%%time
# Baseline fine-tuning run with the datasets and settings given to
# `model_trainer`.
model_trainer.train()

## Model Evaluation

### Evaluate the model and measure the execution time

In [None]:
%%time
# Score the trained model on `eval_dataset`, a split separate from the
# `test_dataset` the trainer was constructed with.
model_trainer.evaluate(eval_dataset=eval_dataset)

In [None]:
# Rebind `model` to the model object currently held by the trainer.
model = model_trainer.model

In [None]:
# Take the first ten rows once, then pull the text and label columns from them.
first_rows = eval_dataset[:10]
test_text, y_test = first_rows["text"], first_rows["label"]

In [None]:
# `model` is an AutoModelForSequenceClassification instance, which has no
# `predict` method. A text-classification pipeline tokenizes the raw strings
# and returns one {"label": "LABEL_n", "score": ...} dict per input, which is
# the shape the label mapping further down expects. Truncation keeps long
# reviews within the model's maximum input length.
predictions = pipeline("text-classification", model=model, tokenizer=tokenizer)(
    test_text, truncation=True
)

In [None]:
# Display names for the three classes, indexed by class id.
class_labels = ["Negative", "Neutral", "Positive"]

# Maps label strings of the form "LABEL_<id>" back to the integer class id.
label_mapping = {f"LABEL_{index}": index for index in range(len(class_labels))}

### Generate predictions and evaluation metrics.

In [None]:
# Translate the "LABEL_n" strings in the predictions into integer class ids.
y_pred = [label_mapping[prediction["label"]] for prediction in predictions]

# Pass the full set of class ids explicitly. With a small sample a class can be
# missing from both y_test and y_pred; without `labels`, the confusion matrix
# would then be smaller than 3x3 and `classification_report` would raise
# because `target_names` no longer matches the number of classes found.
class_ids = list(range(len(class_labels)))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
classification_rep = classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=class_labels,
    zero_division=0,  # report 0 for an absent class instead of warning
)

In [None]:
# Show the per-class text report built in the previous cell.
print(f"Classification Report:\n{classification_rep}")

In [None]:
# Draw the confusion matrix as an annotated heatmap with the class names on
# both axes (rows: actual class, columns: predicted class).
sns.set(font_scale=1.2)
heatmap_axes = sns.heatmap(
    conf_matrix,
    cmap="Blues",
    annot=True,
    fmt="d",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_axes.set_xlabel("Predicted")
heatmap_axes.set_ylabel("Actual")
heatmap_axes.set_title("Confusion Matrix")
plt.show()

### Observation

Based on this analysis, the model performs well overall, with high precision, recall, and F1-score for Negative and Positive sentiments. However, the performance for Neutral sentiment is relatively lower, with lower precision, recall, and F1-score.

## Hyperparameter Tuning

In [None]:
def model_init():
    """Load a new three-label classification model from `model_checkpoint`.

    Passed to the hyperparameter-search trainer as its `model_init` callable.
    """
    load_options = {"num_labels": 3, "return_dict": True}
    return AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, **load_options
    )

In [None]:
# Trainer used only for the hyperparameter search: it receives `model_init`
# instead of a model instance and works on the tokenized train/eval subsets.
hp_trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
def ray_hp_space(_):
    """Return the Ray Tune search space for the hyperparameter search.

    The single positional argument is ignored.

    Returns:
        dict: Training-argument names mapped to Ray Tune sampling definitions.
    """
    return dict(
        learning_rate=tune.loguniform(1e-6, 1e-4),
        per_device_train_batch_size=tune.choice([4, 8, 16]),
        per_device_eval_batch_size=tune.choice([4, 8, 16]),
        weight_decay=tune.choice([0.001, 0.01, 0.1]),
        num_train_epochs=tune.choice([2, 4, 6, 8, 10]),
    )

In [None]:
%%time
ray.init()
best_trial = hp_trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    hp_space=ray_hp_space,
    resources_per_trial={"cpu": 6},
)
ray.shutdown()

In [None]:
# Display the result object returned by the hyperparameter search.
best_trial

## Training using best hyperparameters

In [None]:
# Copy every tuned hyperparameter onto the baseline trainer's arguments.
# NOTE(review): `model_trainer` was constructed with an explicit
# optimizer/scheduler pair, so optimizer-related values set here
# (e.g. learning_rate, weight_decay) may not reach that optimizer. Confirm.
for hp_name, hp_value in best_trial.hyperparameters.items():
    setattr(model_trainer.args, hp_name, hp_value)

In [None]:
%%time
train_data = data["train"]
test_data = data["test"]
eval_data = data["eval"]

tokenized_train_data = train_data.map(preprocess_function, batched=True)
tokenized_test_data = test_data.map(preprocess_function, batched=True)
tokenized_eval_data = eval_data.map(preprocess_function, batched=True)

train_dataset = Dataset.from_list(tokenized_train_data)
test_dataset = Dataset.from_list(tokenized_test_data)
eval_dataset = Dataset.from_list(tokenized_eval_data)

In [None]:
# Authenticate with the Hugging Face Hub; the model is pushed there later in
# the notebook.
notebook_login()

In [None]:
# The datasets belong to the Trainer, not to its TrainingArguments. Setting
# `train_dataset` / `test_dataset` on `model_trainer.args` only adds unused
# attributes and leaves the trainer on the small subsampled splits. The test
# split goes into `eval_dataset`, mirroring how the trainer was constructed.
model_trainer.train_dataset = train_dataset
model_trainer.eval_dataset = test_dataset

# NOTE(review): push_to_hub is switched on after the Trainer was constructed;
# any Hub repository setup performed at construction time may need to be
# triggered explicitly. Confirm against the installed transformers version.
model_trainer.args.push_to_hub = True

In [None]:
%%time
# Second training run on the same `model_trainer`, after its arguments were
# modified in the preceding cells.
model_trainer.train()

In [None]:
%%time
# Evaluate the retrained model on the full, tokenized "eval" split.
model_trainer.evaluate(eval_dataset=eval_dataset)

### Uploading the model to Hub

In [None]:
%%time
# Upload the trainer's model to the Hugging Face Hub.
model_trainer.push_to_hub()

## Analysing Model Performance

In [None]:
# Rebind `model` to a text-classification pipeline loaded from `repo_name`.
# NOTE(review): `repo_name` is also the training output directory, so this
# presumably resolves to the locally saved model. Confirm.
model = pipeline("text-classification", model=repo_name)

In [None]:
# Character-count threshold separating "short" from "long" reviews.
max_length = 200


def _is_short(example):
    """Return True when the example's text has at most `max_length` characters."""
    return len(example['text']) <= max_length


# Partition the evaluation split by review length.
short_text_dataset = eval_dataset.filter(_is_short)
long_text_dataset = eval_dataset.filter(lambda example: not _is_short(example))

In [None]:
# Plain Python lists of the review texts, split at the `max_length` threshold.
long_reviews = [text for text in eval_dataset["text"] if max_length < len(text)]
short_reviews = [text for text in eval_dataset["text"] if max_length >= len(text)]