In [None]:
# !pip install mlflow pyngrok
import subprocess
import time
import json
import pickle
import re
import string
import warnings
import os
import threading
from typing import Dict
import shutil

from google.colab import files, drive

import mlflow
import mlflow.pytorch
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from pyngrok import ngrok
import optuna

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Resources needed by load_and_clean_data: the Punkt sentence/word tokenizer
# models and the English stopword list.
nltk.download("punkt")
# Recent NLTK releases look up word_tokenize's models under "punkt_tab"; without
# this resource word_tokenize raises LookupError. On older releases that do not
# know the resource, the download simply reports a failure and returns False.
nltk.download("punkt_tab")
nltk.download("stopwords")
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

## MLflow and ngrok

In [None]:
# ngrok auth token. It is read from the NGROK_AUTHTOKEN environment variable so
# that the credential never has to be pasted into (and saved with) the notebook.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "")
if NGROK_TOKEN:
    ngrok.set_auth_token(NGROK_TOKEN)
else:
    # Do not register an empty token; the tunnel request below will surface the
    # authentication problem if ngrok requires one.
    print("NGROK_AUTHTOKEN is not set; the ngrok tunnel may fail to start.")


def start_mlflow_server():
    """
    Launch `mlflow server` as a background child process.

    The server is started with a SQLite backend store (mlflow.db), ./mlruns as
    the default artifact root, and listens on 0.0.0.0:5000. The process handle
    is not kept and nothing is returned.
    """
    server_options = {
        "--backend-store-uri": "sqlite:///mlflow.db",
        "--default-artifact-root": "./mlruns",
        "--host": "0.0.0.0",
        "--port": "5000",
    }

    # Flatten the options into the argv list: flag, value, flag, value, ...
    command = ["mlflow", "server"]
    for flag, value in server_options.items():
        command += [flag, value]

    subprocess.Popen(command)

# Launch the server from a helper thread. subprocess.Popen does not wait for the
# child process, so the thread itself finishes almost immediately.
server_thread = threading.Thread(target=start_mlflow_server)
server_thread.start()
# Fixed pause before using the server; no readiness check is performed.
time.sleep(5)

# Open an ngrok HTTP tunnel to local port 5000 and print the tunnel object.
public_url = ngrok.connect(5000, "http")
print("MLflow Tracking UI is available at:", public_url)
# The MLflow client in this notebook talks to the server over localhost, and all
# runs below are recorded under this experiment name.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("sentiment_analysis_experiment")

## Hyperparameters

In [None]:
# Baseline hyperparameters used for the model-comparison run.
LEARNING_RATE = 2e-5
BATCH_SIZE_TRAIN = 4  # per-device training batch size
BATCH_SIZE_EVAL = 8  # per-device evaluation batch size
EPOCHS = 1
WEIGHT_DECAY = 0.01
MAX_LENGTH = 256  # token length used by prepare_dataset for padding/truncation
WARMUP_STEPS = 50
LR_SCHEDULER = "linear"
GRADIENT_CHECKPOINTING = True

# Template arguments; train_single_model copies them and overrides output_dir
# per model.
TRAINING_ARGS = TrainingArguments(
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE_TRAIN,
    per_device_eval_batch_size=BATCH_SIZE_EVAL,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    eval_strategy="epoch", # named `evaluation_strategy` in older transformers releases
    save_strategy="no",
    logging_steps=50,
    fp16=torch.cuda.is_available(),
    report_to="none",
    disable_tqdm=False,
    seed=42,
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_steps=WARMUP_STEPS,
)

# Hugging Face model identifiers compared in the main execution loop.
MODELS = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
]

## Helper Functions

In [None]:
def load_and_clean_data(filepath: str) -> pd.DataFrame:
    """
    Loads a CSV file, cleans the text data, and balances the dataset.

    The review text is built from the 'title' and 'text' columns (a missing
    value or a missing column counts as an empty string), then cleaned. Rows are
    kept only if the cleaned text is longer than 10 characters and
    'star_sentiment' is one of Negative / Neutral / Positive. Finally up to
    1000 rows per class are sampled with a fixed seed.

    Args:
        filepath (str): The path to the CSV file. It must contain a
            'star_sentiment' column.

    Returns:
        pd.DataFrame: The balanced rows with the original columns plus
        'full_review', 'cleaned_review' and an integer 'label' column
        (Negative=0, Neutral=1, Positive=2).
    """
    df = pd.read_csv(filepath)
    stop_words = set(stopwords.words("english"))
    # Built once here instead of on every clean_text call.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def clean_text(text: str) -> str:
        """
        Cleans a single string of text by lowercasing, removing punctuation,
        numbers, extra spaces, and stopwords.

        Args:
            text (str): The input text string.

        Returns:
            str: The cleaned text string ("" for non-string input).
        """
        if not isinstance(text, str):
            return ""
        text = text.lower().translate(punctuation_table)
        text = re.sub(r"\d+", "", text)
        text = re.sub(r"\s+", " ", text)
        words = word_tokenize(text)
        # Drop stopwords and single-character tokens.
        return " ".join([w for w in words if w not in stop_words and len(w) > 1])

    def text_column(name: str) -> pd.Series:
        """Return column `name` as strings; NaN values or an absent column become ''."""
        if name not in df.columns:
            return pd.Series("", index=df.index, dtype=object)
        # str(NaN) would put the literal word "nan" into the review text.
        return df[name].map(lambda value: "" if pd.isna(value) else str(value))

    df["full_review"] = (text_column("title") + " " + text_column("text")).str.strip()
    df["cleaned_review"] = df["full_review"].apply(clean_text)

    sentiment_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
    keep = (df["cleaned_review"].str.len() > 10) & df["star_sentiment"].isin(
        list(sentiment_map)
    )
    # Copy so that adding 'label' does not write into a slice of the raw frame.
    df = df[keep].copy()
    # Every remaining sentiment is a key of sentiment_map, so the cast to int is
    # safe and keeps the labels from degrading to floats.
    df["label"] = df["star_sentiment"].map(sentiment_map).astype(int)

    # Balance the dataset by sampling an equal number of reviews from each class
    samples_per_class = {"Negative": 1000, "Neutral": 1000, "Positive": 1000}
    balanced_dfs = []
    for sentiment, class_id in sentiment_map.items():
        class_df = df[df["label"] == class_id]
        # A class with fewer rows than requested contributes all of its rows.
        n_samples = min(samples_per_class[sentiment], len(class_df))
        balanced_dfs.append(class_df.sample(n=n_samples, random_state=42))

    return pd.concat(balanced_dfs, ignore_index=True)

In [None]:
def prepare_dataset(df: pd.DataFrame, tokenizer) -> Dataset:
    """
    Build a tokenized, torch-formatted Hugging Face Dataset from a DataFrame.

    Args:
        df (pd.DataFrame): Frame containing 'cleaned_review' and 'label' columns.
        tokenizer: Hugging Face tokenizer applied to 'cleaned_review'.

    Returns:
        Dataset: The dataset with tokenizer outputs added, exposing
        'input_ids', 'attention_mask' and 'label' as torch tensors.
    """
    # Every review is padded/truncated to exactly MAX_LENGTH tokens.
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }

    def encode_batch(batch):
        # Dataset.map(batched=True) passes a dict of column name -> list of values.
        return tokenizer(batch["cleaned_review"], **tokenizer_options)

    model_columns = ["input_ids", "attention_mask", "label"]
    tokenized = Dataset.from_pandas(df[["cleaned_review", "label"]]).map(
        encode_batch, batched=True
    )
    tokenized.set_format(type="torch", columns=model_columns)
    return tokenized

## Model Training

In [None]:
def train_single_model(model_name: str, train_dataset, eval_dataset) -> Dict:
    """
    Fine-tune one transformer for 3-class sentiment and record the run in MLflow.

    Inside a single MLflow run (named after the model, with "/" replaced by "_")
    the model is trained with a copy of TRAINING_ARGS, its state dict is logged
    as an artifact, and weighted F1 / accuracy on the evaluation set are logged
    together with the main hyperparameters.

    Args:
        model_name (str): The name of the Hugging Face model to train.
        train_dataset (Dataset): The training dataset.
        eval_dataset (Dataset): The evaluation dataset.

    Returns:
        Dict: {"model_name", "f1_score", "accuracy"} for this model.
    """
    print(f" Training {model_name}...")

    # "/" is replaced so the name can be used as a run name and a folder name.
    safe_name = model_name.replace("/", "_")
    class_names = ["Negative", "Neutral", "Positive"]
    id2label = dict(enumerate(class_names))
    label2id = {name: idx for idx, name in id2label.items()}

    with mlflow.start_run(run_name=safe_name):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=len(class_names),
            id2label=id2label,
            label2id=label2id,
        )

        # Per-model copy of the shared arguments with its own output folder.
        training_args = TrainingArguments(**TRAINING_ARGS.to_dict())
        training_args.output_dir = f"./results/{safe_name}"

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
        )
        trainer.train()

        mlflow.pytorch.log_state_dict(trainer.model.state_dict(), artifact_path="model")

        # Score the trained model on the evaluation split.
        eval_output = trainer.predict(eval_dataset)
        predicted = eval_output.predictions.argmax(axis=1)
        expected = eval_output.label_ids
        scores = {
            "f1_score": f1_score(expected, predicted, average="weighted"),
            "accuracy": accuracy_score(expected, predicted),
        }

        mlflow.log_params(
            {
                "model_name": model_name,
                "learning_rate": LEARNING_RATE,
                "batch_size_train": BATCH_SIZE_TRAIN,
                "batch_size_eval": BATCH_SIZE_EVAL,
                "epochs": EPOCHS,
            }
        )
        mlflow.log_metrics(scores)

        return {"model_name": model_name, **scores}

## Main Execution

In [None]:
# Path of the reviews CSV; also read by the Optuna objective further down.
FILE_PATH = "/content/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_cleaned.csv"
df = load_and_clean_data(FILE_PATH)

# Stratified 80/20 split so both parts keep the same class proportions.
train_df, eval_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# model name -> result dict returned by train_single_model; read by the summary cell.
results = {}
for model_name in MODELS:
    print(f"Preparing dataset for model: {model_name}")

    # Each model needs its own tokenization of the data.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    train_dataset = prepare_dataset(train_df, tokenizer)
    eval_dataset = prepare_dataset(eval_df, tokenizer)

    results[model_name] = train_single_model(
        model_name, train_dataset, eval_dataset
    )

print("\n Training completed. Check MLflow UI for results!")

## First Results

In [None]:
banner = "=" * 70
print("\n" + banner)
print("SUMMARY OF ALL MODEL RESULTS")
print(banner)
print(f"{'Model':<50} {'F1 Score':<10} {'Accuracy':<10}")
print("-" * 70)

# One table row per model; a result carrying an "error" key is shown as ERROR.
for model_name, result in results.items():
    if "error" in result:
        f1_cell = accuracy_cell = f"{'ERROR':<10}"
    else:
        f1_cell = f"{result['f1_score']:<10.4f}"
        accuracy_cell = f"{result['accuracy']:<10.4f}"
    print(f"{model_name:<50} {f1_cell} {accuracy_cell}")

# Report the error-free run with the highest F1 score, if there is one.
valid_results = {name: res for name, res in results.items() if "error" not in res}
if valid_results:
    best_model = max(valid_results.items(), key=lambda item: item[1]["f1_score"])
    best_name, best_scores = best_model
    print(
        f"\nBest Model: {best_name} (F1: {best_scores['f1_score']:.4f}, Accuracy: {best_scores['accuracy']:.4f})"
    )

## Optuna objective

In [None]:
def objective(trial):
    """
    Defines the Optuna objective function for hyperparameter optimization.

    This function trains MODEL_NAME with a set of hyperparameters sampled from
    the Optuna trial, logs the trial to MLflow, saves the best model of the
    trial under `<output_dir>/best_model`, and returns the evaluation F1 score.

    Args:
        trial (optuna.trial.Trial): The trial used to sample hyperparameters.

    Returns:
        float: Weighted F1 on the evaluation split (the value Optuna maximizes).
    """
    # Start a new MLflow run for each trial
    with mlflow.start_run():
        # Define the hyperparameter search space using trial suggestions
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
        per_device_train_batch_size = trial.suggest_categorical(
            "train_batch_size", [4, 8, 16]
        )
        num_train_epochs = trial.suggest_int("epochs", 2, 5)
        weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
        lr_scheduler_type = trial.suggest_categorical("scheduler", ["linear", "cosine"])

        # Prepare the dataset, ensuring the data loading and splitting are within the objective
        df = load_and_clean_data(FILE_PATH)
        train_df, eval_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df["label"]
        )

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        train_dataset = prepare_dataset(train_df, tokenizer)
        eval_dataset = prepare_dataset(eval_df, tokenizer)

        # Define the model with the appropriate labels
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=3,
            id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
            label2id={"Negative": 0, "Neutral": 1, "Positive": 2},
        )

        # One output folder per trial; load_best_model() identifies trials by
        # the "optuna_trial_<n>" suffix.
        output_dir = (
            f"./results/{MODEL_NAME.replace('/', '_')}_optuna_trial_{trial.number}"
        )

        # Define training arguments for the current trial
        # NOTE(review): with at most 3000 balanced rows (2400 for training) the
        # smaller configurations run fewer than 500 optimizer steps in total, so
        # warmup_steps=500 may never finish warming up - confirm this is intended.
        training_args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            learning_rate=learning_rate,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=16,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            lr_scheduler_type=lr_scheduler_type,
            warmup_steps=500,
            logging_steps=100,
            fp16=torch.cuda.is_available(),
            report_to="none",
            disable_tqdm=True,
            seed=42,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
        )

        # Define the compute metrics function
        def compute_metrics(eval_pred):
            """Computes accuracy and weighted F1 score from predictions."""
            logits, labels = eval_pred
            preds = logits.argmax(axis=-1)
            return {
                "accuracy": accuracy_score(labels, preds),
                "f1": f1_score(labels, preds, average="weighted"),
            }

        # Initialize the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        # Train and evaluate the model. With load_best_model_at_end=True the
        # trainer holds the best-F1 checkpoint once train() returns.
        trainer.train()
        metrics = trainer.evaluate()

        # Persist that best model where load_best_model() searches for it.
        # Without this, no "best_model" folder is ever created and
        # load_best_model() cannot find any trial.
        trainer.save_model(os.path.join(output_dir, "best_model"))

        # Log hyperparameters and metrics to MLflow for the current trial
        mlflow.log_params(
            {
                "trial": trial.number,
                "learning_rate": learning_rate,
                "train_batch_size": per_device_train_batch_size,
                "epochs": num_train_epochs,
                "weight_decay": weight_decay,
                "scheduler": lr_scheduler_type,
            }
        )
        mlflow.log_metrics(
            {"f1_score": metrics["eval_f1"], "accuracy": metrics["eval_accuracy"]}
        )

        # Return the F1 score to Optuna
        return metrics["eval_f1"]

## Run Study

In [None]:
# Model tuned by the Optuna objective (read there as a global).
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# End any existing active run before starting the study
mlflow.end_run() 
# Create and run the Optuna study to find the best hyperparameters
# (direction="maximize" because the objective returns the evaluation F1 score).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Print the best trial's parameters
print("Best trial:")
trial = study.best_trial
print(trial.params)

In [None]:
def _read_best_checkpoint(trial_path):
    """Return (best_metric, best_checkpoint_dir) from a trial's newest trainer_state.json.

    Either element is None when it cannot be determined.
    """
    newest_step, newest_state = -1, None
    for entry in os.listdir(trial_path):
        state_file = os.path.join(trial_path, entry, "trainer_state.json")
        if not entry.startswith("checkpoint-") or not os.path.isfile(state_file):
            continue
        try:
            with open(state_file, encoding="utf-8") as handle:
                state = json.load(handle)
        except (OSError, ValueError):
            # Unreadable or corrupt state file: ignore this checkpoint.
            continue
        if not isinstance(state, dict):
            continue
        step = state.get("global_step") or 0
        if step > newest_step:
            newest_step, newest_state = step, state

    if newest_state is None:
        return None, None

    best_metric = newest_state.get("best_metric")
    if isinstance(best_metric, bool) or not isinstance(best_metric, (int, float)):
        best_metric = None

    checkpoint = newest_state.get("best_model_checkpoint")
    if checkpoint and not os.path.isdir(checkpoint):
        # The recorded path depends on the working directory used for training;
        # fall back to the folder of the same name inside this trial directory.
        checkpoint = os.path.join(
            trial_path, os.path.basename(os.path.normpath(checkpoint))
        )
    if not checkpoint or not os.path.isdir(checkpoint):
        checkpoint = None
    return best_metric, checkpoint


def load_best_model():
    """Find and load the best model from results directory.

    Every `*optuna_trial_<n>` folder under ./results is inspected. A trial is
    usable if it has a `best_model` folder or, failing that, a best checkpoint
    recorded in a `checkpoint-*/trainer_state.json`. Trials are ranked by the
    recorded best metric (assumed higher-is-better, as with the F1 metric used
    by the Optuna objective); trials without a recorded metric rank below those
    with one, and ties are broken by the highest trial number.

    Returns:
        tuple: (model, tokenizer) for the selected trial.

    Raises:
        ValueError: If no trial folder with a loadable model is found.
    """
    results_dir = "./results"

    print(f"Scanning {results_dir}...")

    # (has_metric, metric, trial_num, model_path) for each usable trial.
    candidates = []
    trial_dirs = os.listdir(results_dir) if os.path.isdir(results_dir) else []
    for trial_dir in trial_dirs:
        trial_path = os.path.join(results_dir, trial_dir)
        match = re.search(r"optuna_trial_(\d+)$", trial_dir)
        # Skip unrelated entries instead of crashing on a non-numeric suffix.
        if match is None or not os.path.isdir(trial_path):
            continue
        trial_num = int(match.group(1))

        best_metric, best_checkpoint = _read_best_checkpoint(trial_path)
        best_model_dir = os.path.join(trial_path, "best_model")
        if os.path.isdir(best_model_dir):
            model_path = best_model_dir
        elif best_checkpoint is not None:
            model_path = best_checkpoint
        else:
            continue

        print(f"Found trial {trial_num} with best_model")
        candidates.append(
            (
                best_metric is not None,
                best_metric if best_metric is not None else 0.0,
                trial_num,
                model_path,
            )
        )

    if not candidates:
        raise ValueError("No best_model directories found!")

    _, _, best_trial_num, best_model_path = max(candidates, key=lambda c: c[:3])

    print(f"Loading from trial {best_trial_num}: {best_model_path}")

    # Load model and tokenizer; prefer tokenizer files saved next to the model.
    tokenizer_source = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    if os.path.isfile(os.path.join(best_model_path, "tokenizer_config.json")):
        tokenizer_source = best_model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
    model = AutoModelForSequenceClassification.from_pretrained(best_model_path)

    return model, tokenizer

## Predict

In [None]:
def predict(model, tokenizer, texts):
    """Predict sentiment for given texts.

    Args:
        model: Sequence-classification model; called with the tokenizer output
            as keyword arguments and expected to return an object with `.logits`.
        tokenizer: Tokenizer called with the list of texts.
        texts: A single string or an iterable of strings.

    Returns:
        list[str]: "Negative", "Neutral" or "Positive" for each text, in input
        order. An empty input yields an empty list.
    """
    # Accept a bare string as a batch of one.
    if isinstance(texts, str):
        texts = [texts]
    texts = list(texts)
    if not texts:
        return []

    inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

    # Put the inputs on the model's device when the model exposes one, so a
    # model that was moved to GPU does not fail with a device mismatch.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference mode: disables dropout, and no gradients are tracked.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1)

    labels = ["Negative", "Neutral", "Positive"]
    return [labels[index] for index in predictions.tolist()]

# Load the tuned model and its tokenizer from ./results for the cells that follow.
model, tokenizer = load_best_model()

## Save & Download model

In [None]:
# Save the model and tokenizer returned by load_best_model(). No `trainer`
# object exists at notebook level at this point (the earlier trainers are local
# to train_single_model / objective), so the loaded model is saved directly.
model.save_pretrained("./saved_roberta_model")
tokenizer.save_pretrained("./saved_roberta_model")

# Zip the folder and hand it to the browser for download (Colab only).
shutil.make_archive('saved_roberta_model', 'zip', './saved_roberta_model')
files.download('saved_roberta_model.zip')

## Back up for MLflow

In [None]:
# 1. Archive the ./mlruns folder (if present) and download the zip
mlruns_present = os.path.exists("./mlruns")
if mlruns_present:
    shutil.make_archive('mlflow_complete', 'zip', './', 'mlruns')
    files.download('mlflow_complete.zip')
    print(" MLflow data downloaded!")

# 2. Download the SQLite tracking database as well, when there is one
db_present = os.path.exists("mlflow.db")
if db_present:
    files.download("mlflow.db")
    print(" MLflow database downloaded!")

# 3. Export every run of the experiment to CSV and list the useful columns
hidden_prefixes = ('tags.', 'artifact_uri')

try:
    experiment = mlflow.get_experiment_by_name("sentiment_analysis_experiment")

    if not experiment:
        print(" Experiment 'sentiment_analysis_experiment' not found")
    else:
        runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

        if runs_df.empty:
            print(" No runs found in MLflow")
        else:
            print("\n MLflow Runs Summary:")
            print(f"Total runs: {len(runs_df)}")

            # Tag and artifact-URI columns are left out of the listing only;
            # the CSV below still contains every column.
            print("\nAvailable columns:")
            visible_columns = [
                name
                for name in sorted(runs_df.columns)
                if not name.startswith(hidden_prefixes)
            ]
            for col in visible_columns:
                print(f"  - {col}")

            runs_df.to_csv('mlflow_runs_complete.csv', index=False)
            files.download('mlflow_runs_complete.csv')
            print(" Complete MLflow runs data downloaded!")

except Exception as e:
    # Best effort: a tracking-server problem must not abort the backup cell.
    print(f" Error accessing MLflow: {e}")

# Instructions for restoring the dashboard on a local machine.
closing_lines = (
    "\n Backup complete! You can now:",
    "1. Extract mlflow_complete.zip locally",
    "2. Run 'mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns'",
    "3. Open http://localhost:5000 to see your dashboard",
)
for line in closing_lines:
    print(line)

## Load best config and make final Predictions

In [None]:
# NOTE(review): absolute path on a local Windows machine; it will not exist on
# Colab or on any other computer - point it at the downloaded model folder.
model_directory = r"C:\Users\nicol\Desktop\Ironhack\week6\Project_NLP\models\roberta-latest"

try:
    # Attempt to load the tokenizer and model.
    # The checkpoint is a fine-tuned sentiment classifier, so it must be loaded
    # with the sequence-classification head. Loading it as RobertaForMaskedLM
    # discards the classifier and creates a randomly initialized LM head. The
    # Auto* classes are also the ones this notebook actually imports.
    tokenizer = AutoTokenizer.from_pretrained(model_directory)
    model = AutoModelForSequenceClassification.from_pretrained(model_directory)

    print("Model and tokenizer loaded successfully!")
    print("---------------------------------------")

except Exception as e:
    print(f"An error occurred while loading the model: {e}")

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at C:\Users\nicol\Desktop\Ironhack\week6\Project_NLP\models\roberta-latest and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer loaded successfully!
---------------------------------------


In [None]:
# --- Load your dataset ---
df = pd.read_csv("/content/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_cleaned.csv")
# Missing titles/texts are treated as empty strings; astype(str) alone would
# insert the literal word "nan" into the review.
df["combined_review"] = (
    df["title"].fillna("").astype(str) + " " + df["text"].fillna("").astype(str)
).str.strip()

# Map labels to integers: Negative=0, Neutral=1, Positive=2
label_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
df["labels"] = df["star_sentiment"].map(label_map)
# Rows with a missing or unknown sentiment map to NaN, which would turn the
# label column into floats; drop them and store the labels as integers.
df = (
    df.dropna(subset=["labels"])
    .astype({"labels": int})
    .reset_index(drop=True)
)

# Convert to HuggingFace dataset
dataset = Dataset.from_pandas(df)

# --- Load model + tokenizer ---
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# --- Tokenization function ---
def tokenize(batch):
    """Tokenize a batch of reviews, padded/truncated to 256 tokens."""
    return tokenizer(batch["combined_review"], padding="max_length", truncation=True, max_length=256)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Split dataset (80% train / 20% test, fixed seed)
dataset = dataset.train_test_split(test_size=0.2, seed=42)

# --- Training arguments with best parameters ---
# The numeric values are hard-coded here rather than read from the study object.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2.9443750826822793e-05,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.09960856196239078,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    load_best_model_at_end=True,
)

# --- Define Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
)

In [None]:
# --- Train ---
trainer.train()

# --- Save final model ---
# NOTE(review): the target is a relative path; it only lands on Google Drive if
# Drive is mounted at ./drive. No drive.mount(...) call is visible in this
# notebook - confirm Drive is mounted before running, otherwise the files are
# written to a plain local folder.
trainer.save_model("./drive/MyDrive/finetuned_roberta_sentiment")
tokenizer.save_pretrained("./drive/MyDrive/finetuned_roberta_sentiment")