# Sentiment Classification and Model Selection with MLflow

In [None]:
#!pip install mlflow pyngrok
#!pip install optuna
import subprocess
import time
import json
import re
import string
import warnings
import os
import shutil
import threading
from typing import Dict

import mlflow
from pyngrok import ngrok
import optuna
import torch
import nltk
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

# MLflow
import mlflow.pytorch

# Fetch the NLTK data this notebook relies on: the English stop-word list read
# in load_and_clean_data, and 'punkt_tab' (presumably the tokenizer data
# behind word_tokenize - confirm against the installed NLTK version).
nltk.download('punkt_tab')
nltk.download('stopwords')
# NOTE(review): this silences every warning for the rest of the session, not
# just the noisy ones, so library deprecation notices will not be shown.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## MLflow database creation

In [None]:
# Your ngrok token.
# It is read from the NGROK_TOKEN environment variable so the secret does not
# have to be pasted into (and saved with) the notebook. When the variable is
# not set, the value falls back to the empty string the notebook used before.
NGROK_TOKEN = os.environ.get("NGROK_TOKEN", "")
ngrok.set_auth_token(NGROK_TOKEN)


def start_mlflow_server():
    """
    Starts the MLflow tracking server in a background subprocess.

    The server is launched with ``sqlite:///mlflow.db`` as the backend store,
    ``./mlruns`` as the default artifact root, and listens on 0.0.0.0:5000.

    Returns:
        subprocess.Popen: Handle to the server process, so a caller can check
        it with ``poll()`` or stop it with ``terminate()``. ``Popen`` returns
        as soon as the process is spawned; it does not wait for the server to
        accept connections.
    """
    command = [
        "mlflow",
        "server",
        "--backend-store-uri",
        "sqlite:///mlflow.db",
        "--default-artifact-root",
        "./mlruns",
        "--host",
        "0.0.0.0",
        "--port",
        "5000",
    ]
    # Keep the handle instead of discarding it; callers that ignore the return
    # value (e.g. when this is used as a thread target) are unaffected.
    return subprocess.Popen(command)


# Launch the MLflow server from a helper thread.
# NOTE(review): start_mlflow_server only calls subprocess.Popen, which does
# not block, so the extra thread looks unnecessary - confirm before removing.
server_thread = threading.Thread(target=start_mlflow_server)
server_thread.start()

# Give the server a moment to start up.
# This is a fixed wait; nothing here checks that the server is actually
# accepting connections on port 5000 before the next steps run.
time.sleep(5)

# Create an ngrok tunnel to expose the MLflow UI (port 5000) to the internet
public_url = ngrok.connect(5000, "http")
print("MLflow Tracking UI is available at:", public_url)

# Point the MLflow client at the local server address (not the public tunnel)
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# Select the experiment that all later runs in this notebook are recorded under
mlflow.set_experiment("sentiment_analysis_experiment")



MLflow Tracking UI is available at: NgrokTunnel: "https://ea972af6202c.ngrok-free.app" -> "http://localhost:5000"




<Experiment: artifact_location='/content/mlruns/1', creation_time=1756478982874, experiment_id='1', last_update_time=1756478982874, lifecycle_stage='active', name='sentiment_analysis_experiment', tags={}>

## Hyperparameters

In [None]:
# Baseline fine-tuning hyperparameters shared by every model in MODELS.
# Several of these are also logged to MLflow by train_single_model.
LEARNING_RATE = 2e-5
BATCH_SIZE_TRAIN = 4
BATCH_SIZE_EVAL = 8
EPOCHS = 1
WEIGHT_DECAY = 0.01
MAX_LENGTH = 256  # padding/truncation length used by prepare_dataset
WARMUP_STEPS = 50
LR_SCHEDULER = "linear"
GRADIENT_CHECKPOINTING = True

# Template training arguments. train_single_model builds a per-model copy of
# these and changes only output_dir.
TRAINING_ARGS = TrainingArguments(
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE_TRAIN,
    per_device_eval_batch_size=BATCH_SIZE_EVAL,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=50,
    fp16=torch.cuda.is_available(),  # enabled only when a CUDA device is present
    report_to="none",  # metrics are logged to MLflow by hand in train_single_model
    disable_tqdm=False,
    seed=42,
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_steps=WARMUP_STEPS,
)

# Hugging Face checkpoints compared by the main training loop.
MODELS = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
]

## Helper Functions

In [None]:
def load_and_clean_data(filepath: str) -> pd.DataFrame:
    """
    Loads a CSV of reviews, cleans the text, and samples a capped number of rows per class.

    The 'title' and 'text' columns are joined into 'full_review' (a missing
    column or a missing value contributes an empty string rather than the
    literal text "nan"), cleaned into 'cleaned_review', and 'star_sentiment'
    is mapped to an integer 'label' (Negative=0, Neutral=1, Positive=2).
    Rows whose cleaned text is 10 characters or shorter, or whose sentiment is
    missing or not one of the three known values, are dropped.

    Note that the result is not evenly balanced: at most 2000 Negative,
    2000 Neutral and 4000 Positive rows are sampled (fewer when a class has
    fewer rows), using a fixed random seed.

    Args:
        filepath (str): The path to the CSV file. It must contain a
            'star_sentiment' column.

    Returns:
        pd.DataFrame: The sampled rows, concatenated class by class with a
        fresh index, including the 'cleaned_review' and integer 'label' columns.

    Raises:
        KeyError: If the CSV has no 'star_sentiment' column.
    """
    df = pd.read_csv(filepath)
    stop_words = set(stopwords.words("english"))
    # Build the punctuation-stripping table once instead of once per review.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def clean_text(text: str) -> str:
        """
        Cleans a single string of text by lowercasing, removing punctuation,
        numbers, extra spaces, stopwords, and single-character tokens.

        Args:
            text (str): The input text string.

        Returns:
            str: The cleaned text string ("" for any non-string input).
        """
        if not isinstance(text, str):
            return ""
        text = text.lower().translate(punctuation_table)
        text = re.sub(r"\d+", "", text)
        text = re.sub(r"\s+", " ", text)
        words = word_tokenize(text)
        return " ".join(w for w in words if w not in stop_words and len(w) > 1)

    def text_column(name: str) -> pd.Series:
        """Returns column `name` as strings, with missing values/columns as ""."""
        if name not in df.columns:
            return pd.Series("", index=df.index, dtype="object")
        # fillna first: str(NaN) would otherwise inject a "nan" token that
        # survives cleaning and ends up in the model input.
        return df[name].fillna("").astype(str)

    # Combine title and text, then clean the new column
    df["full_review"] = (text_column("title") + " " + text_column("text")).str.strip()
    df["cleaned_review"] = df["full_review"].apply(clean_text)

    # Map sentiment strings to numerical labels (unknown/missing values -> NaN)
    sentiment_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
    labels = df["star_sentiment"].map(sentiment_map)

    # Keep rows with enough cleaned text and a recognised sentiment. Copy the
    # slice before adding a column, and cast to int so the label dtype never
    # degrades to float because of unmapped values.
    keep = (df["cleaned_review"].str.len() > 10) & labels.notna()
    df = df.loc[keep].copy()
    df["label"] = labels.loc[keep].astype(int)

    # Sample up to a fixed cap per class (the caps are intentionally unequal)
    samples_per_class = {"Negative": 2000, "Neutral": 2000, "Positive": 4000}
    sampled_dfs = []
    for sentiment, class_id in sentiment_map.items():
        class_df = df[df["label"] == class_id]
        n_samples = min(samples_per_class[sentiment], len(class_df))
        sampled_dfs.append(class_df.sample(n=n_samples, random_state=42))

    return pd.concat(sampled_dfs, ignore_index=True)

In [None]:
def prepare_dataset(df: pd.DataFrame, tokenizer) -> Dataset:
    """
    Builds a tokenized, torch-formatted Hugging Face Dataset from a DataFrame.

    Only the 'cleaned_review' and 'label' columns are carried over. Every
    review is padded or truncated to MAX_LENGTH tokens.

    Args:
        df (pd.DataFrame): Frame holding 'cleaned_review' and 'label' columns.
        tokenizer: A Hugging Face tokenizer, called on batches of review text.

    Returns:
        Dataset: The tokenized dataset, formatted to yield torch tensors for
        'input_ids', 'attention_mask' and 'label'.
    """
    model_columns = ["input_ids", "attention_mask", "label"]
    raw_dataset = Dataset.from_pandas(df[["cleaned_review", "label"]])

    def _encode_batch(batch):
        # Called with a batch of rows because map() is used with batched=True.
        reviews = batch["cleaned_review"]
        return tokenizer(
            reviews,
            padding="max_length",
            truncation=True,
            max_length=MAX_LENGTH,
        )

    tokenized_dataset = raw_dataset.map(_encode_batch, batched=True)
    tokenized_dataset.set_format(type="torch", columns=model_columns)
    return tokenized_dataset

In [None]:
def train_single_model(model_name: str, train_dataset, eval_dataset) -> Dict:
    """
    Trains a single transformer model, logs metrics and artifacts with MLflow.

    The whole procedure runs inside one MLflow run named after the model
    ("/" replaced by "_"). Hyperparameters are logged before training starts,
    so a run that fails part-way still records its configuration. After
    training, the model's state dict is logged under the "model" artifact
    path, and weighted F1 and accuracy on `eval_dataset` are logged as metrics.

    Args:
        model_name (str): The name of the Hugging Face model to train.
        train_dataset (Dataset): The training dataset.
        eval_dataset (Dataset): The evaluation dataset.

    Returns:
        Dict: A dictionary containing the model name, F1 score, and accuracy
        under the keys "model_name", "f1_score" and "accuracy".

    Raises:
        Exception: Anything raised while loading, training or logging is
        propagated to the caller; nothing is caught here.
    """
    print(f" Training {model_name}...")
    safe_name = model_name.replace("/", "_")

    with mlflow.start_run(run_name=safe_name):
        # Log the configuration up front so failed runs are still traceable.
        mlflow.log_params(
            {
                "model_name": model_name,
                "learning_rate": LEARNING_RATE,
                "batch_size_train": BATCH_SIZE_TRAIN,
                "batch_size_eval": BATCH_SIZE_EVAL,
                "epochs": EPOCHS,
                "weight_decay": WEIGHT_DECAY,
                "max_length": MAX_LENGTH,
                "warmup_steps": WARMUP_STEPS,
                "lr_scheduler": LR_SCHEDULER,
            }
        )

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=3,
            id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
            label2id={"Negative": 0, "Neutral": 1, "Positive": 2},
        )

        # Copy the shared template, giving the per-model output directory to
        # the constructor instead of patching the attribute afterwards.
        training_args = TrainingArguments(
            **{**TRAINING_ARGS.to_dict(), "output_dir": f"./results/{safe_name}"}
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
        )

        trainer.train()

        # Log the model's state dictionary as an MLflow artifact
        mlflow.pytorch.log_state_dict(trainer.model.state_dict(), artifact_path="model")

        # Make predictions and calculate evaluation metrics
        predictions = trainer.predict(eval_dataset)
        pred_labels = predictions.predictions.argmax(axis=1)
        true_labels = predictions.label_ids

        f1 = f1_score(true_labels, pred_labels, average="weighted")
        accuracy = accuracy_score(true_labels, pred_labels)

        mlflow.log_metrics({"f1_score": f1, "accuracy": accuracy})

        return {"model_name": model_name, "f1_score": f1, "accuracy": accuracy}

## Main Execution

In [None]:
# Define the file path for the dataset
FILE_PATH = "/content/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_cleaned.csv"

# Load and clean the data from the specified file path
df = load_and_clean_data(FILE_PATH)

# Split the cleaned DataFrame into training and evaluation sets, ensuring stratification
train_df, eval_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# Train every candidate model. Each entry of `results` is either the metrics
# dict returned by train_single_model or {"model_name": ..., "error": ...}.
results = {}
for model_name in MODELS:
    print(f"Preparing dataset for model: {model_name}")

    try:
        # Initialize tokenizer for the current model
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Prepare the datasets for training and evaluation
        train_dataset = prepare_dataset(train_df, tokenizer)
        eval_dataset = prepare_dataset(eval_df, tokenizer)

        # Train the model and store the results
        results[model_name] = train_single_model(
            model_name, train_dataset, eval_dataset
        )
    except Exception as exc:
        # One failing model must not abort the whole comparison. The results
        # summary cell already checks each entry for an "error" key, so record
        # the failure in that shape and move on to the next model.
        print(f" Failed to train {model_name}: {exc}")
        results[model_name] = {"model_name": model_name, "error": str(exc)}

print("\n Training completed. Check MLflow UI for results!")

Preparing dataset for model: distilbert-base-uncased


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

 Training distilbert-base-uncased...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6123,0.52394


🏃 View run distilbert-base-uncased at: http://127.0.0.1:5000/#/experiments/1/runs/c40b5d502be24160b82a7766d9e5af00
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Preparing dataset for model: bert-base-uncased


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

 Training bert-base-uncased...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6348,0.541205


🏃 View run bert-base-uncased at: http://127.0.0.1:5000/#/experiments/1/runs/9ae6769afdf64142bf5517762ad6ff72
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Preparing dataset for model: roberta-base


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

 Training roberta-base...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.6977,0.597068


🏃 View run roberta-base at: http://127.0.0.1:5000/#/experiments/1/runs/890efcb2f53449b0838b085ef75f377f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Preparing dataset for model: cardiffnlp/twitter-roberta-base-sentiment-latest


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

 Training cardiffnlp/twitter-roberta-base-sentiment-latest...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch,Training Loss,Validation Loss
1,0.6226,0.566744


🏃 View run cardiffnlp_twitter-roberta-base-sentiment-latest at: http://127.0.0.1:5000/#/experiments/1/runs/70d4596ad70d4975ab65812e48ccc70a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

 Training completed. Check MLflow UI for results!


## Base Model Results

In [None]:
# Render a fixed-width table with one row per trained model
heavy_rule = "=" * 70
print("\n" + heavy_rule)
print("SUMMARY OF ALL MODEL RESULTS")
print(heavy_rule)
print(f"{'Model':<50} {'F1 Score':<10} {'Accuracy':<10}")
print("-" * 70)

for model_name, result in results.items():
    # Entries carrying an "error" key have no metrics to show
    if "error" in result:
        print(f"{model_name:<50} {'ERROR':<10} {'ERROR':<10}")
        continue
    print(
        f"{model_name:<50} {result['f1_score']:<10.4f} {result['accuracy']:<10.4f}"
    )

# Pick the winner by F1 score, considering only entries without an error
valid_results = {
    name: scores for name, scores in results.items() if "error" not in scores
}
if valid_results:
    best_model = max(valid_results.items(), key=lambda entry: entry[1]["f1_score"])
    print(
        f"\nBest Model: {best_model[0]} (F1: {best_model[1]['f1_score']:.4f}, Accuracy: {best_model[1]['accuracy']:.4f})"
    )


SUMMARY OF ALL MODEL RESULTS
Model                                              F1 Score   Accuracy  
----------------------------------------------------------------------
distilbert-base-uncased                            0.8067     0.8069    
bert-base-uncased                                  0.8095     0.8100    
roberta-base                                       0.8016     0.8019    
cardiffnlp/twitter-roberta-base-sentiment-latest   0.8288     0.8281    

Best Model: cardiffnlp/twitter-roberta-base-sentiment-latest (F1: 0.8288, Accuracy: 0.8281)


## Optuna Objective

In [None]:
def objective(trial):
    """
    Defines the Optuna objective function for hyperparameter optimization.

    Fine-tunes the model named by the global MODEL_NAME with the
    hyperparameters suggested by the trial, inside its own named MLflow run,
    and returns the evaluation F1 score to be maximized.

    Searched: learning rate (1e-5 to 5e-5, log scale), training batch size
    (8 or 16) and number of epochs (2 or 3). Fixed: weight decay 0.1 and the
    cosine scheduler. The trial's parameters are logged to MLflow before
    training starts, so a trial that fails part-way still records them.

    Args:
        trial (optuna.trial.Trial): The trial supplying hyperparameter suggestions.

    Returns:
        float: The weighted F1 score ("eval_f1") reported by trainer.evaluate().
    """
    run_label = f"{MODEL_NAME.replace('/', '_')}_optuna_trial_{trial.number}"

    # Start a new, recognisably named MLflow run for each trial
    with mlflow.start_run(run_name=run_label):
        # Define the hyperparameter search space using trial suggestions
        learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
        per_device_train_batch_size = trial.suggest_categorical(
            "train_batch_size", [8, 16]
        )
        num_train_epochs = trial.suggest_int("epochs", 2, 3)
        weight_decay = 0.1
        lr_scheduler_type = "cosine"

        # Log hyperparameters before training so failed trials keep their config
        mlflow.log_params(
            {
                "model_name": MODEL_NAME,
                "trial": trial.number,
                "learning_rate": learning_rate,
                "train_batch_size": per_device_train_batch_size,
                "epochs": num_train_epochs,
                "weight_decay": weight_decay,
                "scheduler": lr_scheduler_type,
            }
        )

        # Prepare the dataset, ensuring the data loading and splitting are within the objective.
        # The fixed random_state keeps the split identical across trials.
        df = load_and_clean_data(FILE_PATH)
        train_df, eval_df = train_test_split(
            df, test_size=0.2, random_state=42, stratify=df["label"]
        )

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        train_dataset = prepare_dataset(train_df, tokenizer)
        eval_dataset = prepare_dataset(eval_df, tokenizer)

        # Define the model with the appropriate labels
        model = AutoModelForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=3,
            id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
            label2id={"Negative": 0, "Neutral": 1, "Positive": 2},
        )

        # Define training arguments for the current trial
        training_args = TrainingArguments(
            output_dir=f"./results/{run_label}",
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            learning_rate=learning_rate,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=16,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            lr_scheduler_type=lr_scheduler_type,
            warmup_steps=500,
            logging_steps=100,
            fp16=torch.cuda.is_available(),
            report_to="none",
            disable_tqdm=True,
            seed=42,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
        )

        # Define the compute metrics function
        def compute_metrics(eval_pred):
            """Computes accuracy and weighted F1 score from predictions."""
            logits, labels = eval_pred
            preds = logits.argmax(axis=-1)
            return {
                "accuracy": accuracy_score(labels, preds),
                "f1": f1_score(labels, preds, average="weighted"),
            }

        # Initialize the Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
        )

        # Train and evaluate the model
        trainer.train()
        metrics = trainer.evaluate()

        # Log the final metrics to MLflow for the current trial
        mlflow.log_metrics(
            {"f1_score": metrics["eval_f1"], "accuracy": metrics["eval_accuracy"]}
        )

        # Return the F1 score to Optuna
        return metrics["eval_f1"]



[I 2025-08-29 15:37:13,907] A new study created in memory with name: no-name-129f5499-96c3-40d9-a842-31a26163d223


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'loss': 0.7809, 'grad_norm': 15.665146827697754, 'learning_rate': 5.5988124573987385e-06, 'epoch': 0.125}
{'loss': 0.6849, 'grad_norm': 17.46649932861328, 'learning_rate': 1.1254178575983322e-05, 'epoch': 0.25}
{'loss': 0.6462, 'grad_norm': 6.3594183921813965, 'learning_rate': 1.6909544694567906e-05, 'epoch': 0.375}
{'loss': 0.6488, 'grad_norm': 34.556156158447266, 'learning_rate': 2.2564910813152492e-05, 'epoch': 0.5}
{'loss': 0.6207, 'grad_norm': 9.447734832763672, 'learning_rate': 2.8220276931737075e-05, 'epoch': 0.625}
{'loss': 0.6129, 'grad_norm': 36.30864334106445, 'learning_rate': 2.7715446231131705e-05, 'epoch': 0.75}
{'loss': 0.5564, 'grad_norm': 10.621430397033691, 'learning_rate': 2.6054189230077193e-05, 'epoch': 0.875}
{'loss': 0.5691, 'grad_norm': 4.141660690307617, 'learning_rate': 2.3427587090757365e-05, 'epoch': 1.0}
{'eval_loss': 0.5357930660247803, 'eval_accuracy': 0.78875, 'eval_f1': 0.7693643488657378, 'eval_runtime': 5.0705, 'eval_samples_per_second': 315.553, 'ev

[I 2025-08-29 15:42:10,222] Trial 0 finished with value: 0.846324220821115 and parameters: {'learning_rate': 2.827683059292292e-05, 'train_batch_size': 8, 'epochs': 2}. Best is trial 0 with value: 0.846324220821115.


🏃 View run adorable-mare-631 at: http://127.0.0.1:5000/#/experiments/1/runs/5133995349534bfb93cb5c4782306a23
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'loss': 0.7746, 'grad_norm': 14.644993782043457, 'learning_rate': 8.107458517366635e-06, 'epoch': 0.125}
{'loss': 0.6817, 'grad_norm': 19.677730560302734, 'learning_rate': 1.6296810555110713e-05, 'epoch': 0.25}
{'loss': 0.6512, 'grad_norm': 7.774045944213867, 'learning_rate': 2.4486162592854788e-05, 'epoch': 0.375}
{'loss': 0.6491, 'grad_norm': 32.97530746459961, 'learning_rate': 3.2675514630598866e-05, 'epoch': 0.5}
{'loss': 0.6254, 'grad_norm': 11.091157913208008, 'learning_rate': 4.086486666834294e-05, 'epoch': 0.625}
{'loss': 0.6075, 'grad_norm': 31.337221145629883, 'learning_rate': 4.067307423690815e-05, 'epoch': 0.75}
{'loss': 0.5959, 'grad_norm': 12.304574966430664, 'learning_rate': 3.984842127227959e-05, 'epoch': 0.875}
{'loss': 0.6144, 'grad_norm': 3.445432662963867, 'learning_rate': 3.849526769128021e-05, 'epoch': 1.0}
{'eval_loss': 0.5448405742645264, 'eval_accuracy': 0.789375, 'eval_f1': 0.7746462034118652, 'eval_runtime': 5.0691, 'eval_samples_per_second': 315.638, 'eval_

[I 2025-08-29 15:50:09,989] Trial 1 finished with value: 0.8622753268057656 and parameters: {'learning_rate': 4.094676018872038e-05, 'train_batch_size': 8, 'epochs': 3}. Best is trial 1 with value: 0.8622753268057656.


🏃 View run popular-tern-409 at: http://127.0.0.1:5000/#/experiments/1/runs/00038f32698647f4b2e0e7fe417c11c6
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'loss': 0.7331, 'grad_norm': 8.328893661499023, 'learning_rate': 9.770819704286293e-06, 'epoch': 0.25}
{'loss': 0.6316, 'grad_norm': 24.298494338989258, 'learning_rate': 1.964033455710073e-05, 'epoch': 0.5}
{'loss': 0.6113, 'grad_norm': 21.42095375061035, 'learning_rate': 2.9509849409915165e-05, 'epoch': 0.75}
{'loss': 0.5972, 'grad_norm': 6.601912021636963, 'learning_rate': 3.93793642627296e-05, 'epoch': 1.0}
{'eval_loss': 0.5215785503387451, 'eval_accuracy': 0.808125, 'eval_f1': 0.7978361048528544, 'eval_runtime': 5.0962, 'eval_samples_per_second': 313.958, 'eval_steps_per_second': 19.622, 'epoch': 1.0}
{'loss': 0.4823, 'grad_norm': 16.456130981445312, 'learning_rate': 4.924887911554404e-05, 'epoch': 1.25}
{'loss': 0.4885, 'grad_norm': 15.031041145324707, 'learning_rate': 3.7233766665651036e-05, 'epoch': 1.5}
{'loss': 0.4602, 'grad_norm': 11.677794456481934, 'learning_rate': 1.2561332413018614e-05, 'epoch': 1.75}
{'loss': 0.4146, 'grad_norm': 6.587332725524902, 'learning_rate': 1.35

[I 2025-08-29 15:55:20,797] Trial 2 finished with value: 0.8398067908743837 and parameters: {'learning_rate': 4.934757426407218e-05, 'train_batch_size': 16, 'epochs': 2}. Best is trial 1 with value: 0.8622753268057656.


🏃 View run upset-cod-11 at: http://127.0.0.1:5000/#/experiments/1/runs/6faed21cd75d4e14848a1633593d199a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'loss': 0.7953, 'grad_norm': 16.80375099182129, 'learning_rate': 2.667260941244112e-06, 'epoch': 0.125}
{'loss': 0.7004, 'grad_norm': 16.322507858276367, 'learning_rate': 5.361463912197761e-06, 'epoch': 0.25}
{'loss': 0.6472, 'grad_norm': 9.977849006652832, 'learning_rate': 8.055666883151408e-06, 'epoch': 0.375}
{'loss': 0.6427, 'grad_norm': 29.920042037963867, 'learning_rate': 1.0749869854105058e-05, 'epoch': 0.5}
{'loss': 0.6429, 'grad_norm': 24.571582794189453, 'learning_rate': 1.3444072825058706e-05, 'epoch': 0.625}
{'loss': 0.6035, 'grad_norm': 39.56353759765625, 'learning_rate': 1.3203572679731243e-05, 'epoch': 0.75}
{'loss': 0.5512, 'grad_norm': 16.045482635498047, 'learning_rate': 1.2412153794745103e-05, 'epoch': 0.875}
{'loss': 0.5811, 'grad_norm': 5.3529839515686035, 'learning_rate': 1.1160846781391242e-05, 'epoch': 1.0}
{'eval_loss': 0.5007236003875732, 'eval_accuracy': 0.801875, 'eval_f1': 0.7924242888218567, 'eval_runtime': 5.0638, 'eval_samples_per_second': 315.97, 'eval

[I 2025-08-29 16:00:48,990] Trial 3 finished with value: 0.8403440453620215 and parameters: {'learning_rate': 1.3471014854768242e-05, 'train_batch_size': 8, 'epochs': 2}. Best is trial 1 with value: 0.8622753268057656.


🏃 View run thundering-crab-26 at: http://127.0.0.1:5000/#/experiments/1/runs/9516d23b229e4f6e8aa9fa7217e86814
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Map:   0%|          | 0/6400 [00:00<?, ? examples/s]

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'loss': 0.7699, 'grad_norm': 12.723413467407227, 'learning_rate': 2.0551239390118484e-06, 'epoch': 0.25}
{'loss': 0.6653, 'grad_norm': 15.147976875305176, 'learning_rate': 4.1310067056904835e-06, 'epoch': 0.5}
{'loss': 0.6678, 'grad_norm': 11.465372085571289, 'learning_rate': 6.206889472369118e-06, 'epoch': 0.75}
{'loss': 0.6394, 'grad_norm': 11.344589233398438, 'learning_rate': 8.282772239047753e-06, 'epoch': 1.0}
{'eval_loss': 0.521134614944458, 'eval_accuracy': 0.789375, 'eval_f1': 0.7817265877026071, 'eval_runtime': 5.0647, 'eval_samples_per_second': 315.913, 'eval_steps_per_second': 19.745, 'epoch': 1.0}
{'loss': 0.5426, 'grad_norm': 23.783918380737305, 'learning_rate': 1.0358655005726388e-05, 'epoch': 1.25}
{'loss': 0.5248, 'grad_norm': 19.233898162841797, 'learning_rate': 9.875529896696562e-06, 'epoch': 1.5}
{'loss': 0.5082, 'grad_norm': 10.909320831298828, 'learning_rate': 8.443613516543285e-06, 'epoch': 1.75}
{'loss': 0.489, 'grad_norm': 10.43337631225586, 'learning_rate': 6.

[I 2025-08-29 16:08:47,698] Trial 4 finished with value: 0.8366278615351791 and parameters: {'learning_rate': 1.0379413833393174e-05, 'train_batch_size': 16, 'epochs': 3}. Best is trial 1 with value: 0.8622753268057656.


🏃 View run casual-crow-927 at: http://127.0.0.1:5000/#/experiments/1/runs/8c8ef84bed784614b1e1aee4ee410e8f
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1
Best trial:
{'learning_rate': 4.094676018872038e-05, 'train_batch_size': 8, 'epochs': 3}


## Run Study

In [None]:
# Define the model name for the optimization study.
# objective() reads this global, so it must be set before study.optimize runs.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# End any existing active run before starting the study
# (objective() opens its own MLflow run for every trial)
mlflow.end_run()
# Create and run the Optuna study to find the best hyperparameters.
# direction="maximize" because objective() returns the evaluation F1 score.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

# Print the best trial's parameters
print("Best trial:")
trial = study.best_trial
print(trial.params)

In [None]:
def load_model_from_mlflow(run_id: str, model_name: str):
    """
    Loads a model's state_dict from an MLflow run and applies it to a new model instance.

    The run must have logged its weights with
    ``mlflow.pytorch.log_state_dict(..., artifact_path="model")``, as
    train_single_model does.

    Args:
        run_id (str): The MLflow run ID containing the saved model.
        model_name (str): The name of the Hugging Face model architecture the
            weights were trained with.

    Returns:
        A tuple containing the loaded model and its tokenizer.
    """
    print(f"Loading model from MLflow run ID: {run_id}")

    # Step 1: Initialize the model architecture
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
        label2id={"Negative": 0, "Neutral": 1, "Positive": 2},
    )

    # Step 2: Load the state_dict from the MLflow artifact.
    # load_state_dict expects the URI of the artifact *directory* passed to
    # log_state_dict ("model") and looks up state_dict.pth inside it itself;
    # pointing the URI at the .pth file makes the lookup fail.
    state_dict_uri = f"runs:/{run_id}/model"

    # Extra keyword arguments are forwarded to torch.load. Mapping to CPU lets
    # weights saved from a GPU run load on a machine without CUDA; the freshly
    # created model above lives on the CPU as well.
    state_dict = mlflow.pytorch.load_state_dict(state_dict_uri, map_location="cpu")

    # Step 3: Apply the loaded state dict to the model
    model.load_state_dict(state_dict)

    print("Model loaded successfully!")
    return model, tokenizer

# --- Example Usage ---
# You would need to get the run_id from a previously completed MLflow run.
# You can get this programmatically, from the MLflow UI, or by logging it.
# For example, let's assume you have a run_id:
# best_run_id = "your-best-run-id-from-mlflow"

# You also need the original model name
# best_model_name = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# loaded_model, loaded_tokenizer = load_model_from_mlflow(best_run_id, best_model_name)

In [None]:
# Save the fine-tuned model and its tokenizer to ./saved_roberta_model.
# NOTE(review): this cell raises NameError as written - `trainer` is only ever
# assigned inside train_single_model() and objective(), never at notebook
# scope. `tokenizer` is the global left behind by the model-comparison loop,
# i.e. the tokenizer of the last entry in MODELS.
trainer.save_model("./saved_roberta_model")
tokenizer.save_pretrained("./saved_roberta_model")

NameError: name 'trainer' is not defined

In [None]:
# Create a zip file of your model and download it.
# The export directory only exists once the model has actually been saved, so
# check for it first (the same guard style the MLflow backup cell uses for
# ./mlruns) instead of failing with FileNotFoundError.
if os.path.isdir("./saved_roberta_model"):
    shutil.make_archive('saved_roberta_model', 'zip', './saved_roberta_model')

    # Download the zip file
    files.download('saved_roberta_model.zip')
else:
    print(" './saved_roberta_model' not found - save the model before exporting it.")

FileNotFoundError: [Errno 2] No such file or directory: './saved_roberta_model'

## MLflow backup

In [None]:
# 1. Archive the mlruns directory (when present) and download the zip
if os.path.exists("./mlruns"):
    shutil.make_archive('mlflow_complete', 'zip', './', 'mlruns')
    files.download('mlflow_complete.zip')
    print(" MLflow data downloaded!")

# 2. Download the SQLite backend store too, if there is one
if os.path.exists("mlflow.db"):
    files.download("mlflow.db")
    print(" MLflow database downloaded!")

# 3. Export a flat CSV of the runs recorded for the experiment.
#    Any failure while talking to MLflow is reported, not raised.
try:
    experiment = mlflow.get_experiment_by_name("sentiment_analysis_experiment")

    if not experiment:
        print(" Experiment 'sentiment_analysis_experiment' not found")
    else:
        runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

        if runs_df.empty:
            print(" No runs found in MLflow")
        else:
            print("\n MLflow Runs Summary:")
            print(f"Total runs: {len(runs_df)}")

            # List the column names, leaving out tag and artifact-URI columns
            print("\nAvailable columns:")
            for col in sorted(runs_df.columns):
                if col.startswith(('tags.', 'artifact_uri')):
                    continue
                print(f"  - {col}")

            # Write out every column of every run and download the file
            runs_df.to_csv('mlflow_runs_complete.csv', index=False)
            files.download('mlflow_runs_complete.csv')
            print(" Complete MLflow runs data downloaded!")

except Exception as e:
    print(f" Error accessing MLflow: {e}")

# Closing instructions for restoring the dashboard locally
print(
    "\n Backup complete! You can now:",
    "1. Extract mlflow_complete.zip locally",
    "2. Run 'mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns'",
    "3. Open http://localhost:5000 to see your dashboard",
    sep="\n",
)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

 MLflow data downloaded!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>



 MLflow database downloaded!




 Error accessing MLflow: API request to http://127.0.0.1:5000/api/2.0/mlflow/experiments/get-by-name failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow/experiments/get-by-name?experiment_name=sentiment_analysis_experiment (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7fb09c7f00b0>: Failed to establish a new connection: [Errno 111] Connection refused'))

 Backup complete! You can now:
1. Extract mlflow_complete.zip locally
2. Run 'mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns'
3. Open http://localhost:5000 to see your dashboard
