# Sentiment Classification and Model Selection with MLflow

In [None]:
import os
import re
import shutil
import socket
import string
import subprocess
import threading
import time
import warnings
from typing import Dict, Tuple

In [None]:
import mlflow
import mlflow.pytorch
from pyngrok import ngrok
import optuna
import torch
import nltk
from google.colab import files
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from datasets import Dataset

In [None]:
# Fetch the NLTK resources used later by word_tokenize() and stopwords.words().
for nltk_resource in ('punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)
# Silence all Python warnings for the rest of the session to keep cell output short.
warnings.filterwarnings(action="ignore")

## MLflow database creation

In [None]:
# Read the ngrok auth token from the NGROK_AUTHTOKEN environment variable so the secret
# never has to be stored in the notebook. The fallback stays an empty string, as before.
NGROK_TOKEN = os.environ.get("NGROK_AUTHTOKEN", "")
ngrok.set_auth_token(NGROK_TOKEN)

In [None]:
def start_mlflow_server():
    """Spawn `mlflow server` as a child process and return without waiting for it.

    The server is started with the SQLite file `mlflow.db` as backend store,
    `./mlruns` as default artifact root, and is bound to 0.0.0.0 on port 5000.
    """
    server_command = ["mlflow", "server"]
    server_command += ["--backend-store-uri", "sqlite:///mlflow.db"]
    server_command += ["--default-artifact-root", "./mlruns"]
    server_command += ["--host", "0.0.0.0"]
    server_command += ["--port", "5000"]
    # Popen does not block; the handle is intentionally not kept.
    subprocess.Popen(server_command)

In [None]:
# Launch the tracking server. start_mlflow_server() only spawns the process, so the
# helper thread finishes almost immediately and can be joined right away.
server_thread = threading.Thread(target=start_mlflow_server)
server_thread.start()
server_thread.join()

# Wait until the server actually accepts TCP connections instead of sleeping for a fixed
# time: a fixed delay is too short on a cold start and wasted time otherwise.
server_deadline = time.monotonic() + 60
while True:
    try:
        with socket.create_connection(("127.0.0.1", 5000), timeout=1):
            break
    except OSError:
        if time.monotonic() >= server_deadline:
            raise RuntimeError(
                "MLflow server did not start listening on port 5000 within 60 seconds"
            )
        time.sleep(0.5)

In [None]:
# Open a public ngrok HTTP tunnel to local port 5000 (the port the MLflow server uses).
public_url = ngrok.connect(5000, "http")
print(f"MLflow Tracking UI is available at: {public_url}")

In [None]:
# Point the MLflow client at the local tracking server and select the experiment
# that all runs in this notebook are recorded under.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name="sentiment_analysis_experiment")

## Hyperparameters

In [None]:
# Baseline hyperparameters shared by every model in the comparison run.
LEARNING_RATE = 2e-5  # passed to TrainingArguments as learning_rate
BATCH_SIZE_TRAIN = 4  # per-device batch size while training
BATCH_SIZE_EVAL = 8  # per-device batch size while evaluating
EPOCHS = 1  # number of training epochs
WEIGHT_DECAY = 0.01
MAX_LENGTH = 256  # token length each review is padded/truncated to in prepare_dataset()
WARMUP_STEPS = 50  # learning-rate warmup steps
LR_SCHEDULER = "linear"  # learning-rate scheduler type
GRADIENT_CHECKPOINTING = True  # forwarded to TrainingArguments(gradient_checkpointing=...)

In [None]:
# Template training configuration for the baseline comparison. train_single_model()
# clones it per model and only overrides output_dir.
TRAINING_ARGS = TrainingArguments(
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE_TRAIN,
    per_device_eval_batch_size=BATCH_SIZE_EVAL,
    num_train_epochs=EPOCHS,
    weight_decay=WEIGHT_DECAY,
    eval_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="no",  # do not write checkpoints for the baseline runs
    logging_steps=50,
    fp16=torch.cuda.is_available(),  # mixed precision only when a CUDA device is present
    report_to="none",  # no built-in reporting; MLflow logging is done by hand in this file
    disable_tqdm=False,
    seed=42,
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    lr_scheduler_type=LR_SCHEDULER,
    warmup_steps=WARMUP_STEPS,
)

In [None]:
# Hugging Face model identifiers compared in the baseline run, each fine-tuned
# with the same TRAINING_ARGS.
MODELS = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "roberta-base",
    "cardiffnlp/twitter-roberta-base-sentiment-latest",
]

## Helper Functions

In [None]:
def _text_column(frame: pd.DataFrame, column: str) -> pd.Series:
    """Return `column` as strings, with missing values (or a missing column) as ""."""
    if column not in frame.columns:
        return pd.Series("", index=frame.index, dtype=object)
    values = frame[column].astype(object)
    return values.where(values.notna(), "").astype(str)


def load_and_clean_data(filepath: str) -> pd.DataFrame:
    """Load the review CSV, clean the text and return a class-capped sample.

    Steps:
      1. Join the `title` and `text` columns into `full_review`. Missing values
         count as empty strings, so the literal word "nan" never leaks into a review.
      2. Lower-case, strip punctuation and digits, collapse whitespace, tokenize,
         and drop English stop words and one-character tokens (`cleaned_review`).
      3. Keep rows whose cleaned review is longer than 10 characters and whose
         `star_sentiment` is one of Negative / Neutral / Positive.
      4. Sample at most 2000 Negative, 2000 Neutral and 4000 Positive rows
         (random_state=42) and concatenate them in that order.

    Args:
        filepath: Path of the CSV file; it must contain a `star_sentiment` column.

    Returns:
        DataFrame with a fresh integer index, the added `full_review` and
        `cleaned_review` columns, and an integer `label` column (0, 1 or 2).
    """
    df = pd.read_csv(filepath)
    stop_words = set(stopwords.words("english"))
    # Build the punctuation-removal table once instead of once per row.
    punctuation_table = str.maketrans("", "", string.punctuation)

    def clean_text(text: str) -> str:
        if not isinstance(text, str):
            return ""
        text = text.lower().translate(punctuation_table)
        text = re.sub(r"\d+", "", text)
        text = re.sub(r"\s+", " ", text)
        return " ".join(
            w for w in word_tokenize(text) if w not in stop_words and len(w) > 1
        )

    df["full_review"] = (
        _text_column(df, "title") + " " + _text_column(df, "text")
    ).str.strip()
    df["cleaned_review"] = df["full_review"].apply(clean_text)

    sentiment_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
    df["label"] = df["star_sentiment"].map(sentiment_map)
    # Rows with a missing or unknown sentiment get a NaN label. Dropping them before the
    # cast keeps `label` an integer column, which the classification loss requires.
    keep_rows = (df["cleaned_review"].str.len() > 10) & df["label"].notna()
    df = df[keep_rows].copy()  # copy: the frame is modified below
    df["label"] = df["label"].astype(int)

    samples_per_class = {"Negative": 2000, "Neutral": 2000, "Positive": 4000}
    balanced_dfs = []
    for sentiment, class_id in sentiment_map.items():
        class_df = df[df["label"] == class_id]
        n_samples = min(samples_per_class[sentiment], len(class_df))
        balanced_dfs.append(class_df.sample(n=n_samples, random_state=42))
    return pd.concat(balanced_dfs, ignore_index=True)

In [None]:
def get_tokenizer_and_model(model_name: str) -> Tuple[AutoTokenizer, AutoModelForSequenceClassification]:
    """Load the tokenizer and a 3-class sequence-classification model for `model_name`.

    This is the single place where tokenizer and model are created, so every run
    uses the same label mapping (0 = Negative, 1 = Neutral, 2 = Positive).

    Args:
        model_name: Hugging Face model identifier or local path.

    Returns:
        The `(tokenizer, model)` pair.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    needs_pad_token = tokenizer.pad_token is None
    if needs_pad_token:
        # Tokenizers without a pad token reuse the end-of-sequence token for padding.
        tokenizer.pad_token = tokenizer.eos_token

    id2label = {0: "Negative", 1: "Neutral", 2: "Positive"}
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(id2label),
        id2label=id2label,
        label2id={label: class_id for class_id, label in id2label.items()},
    )
    if needs_pad_token:
        # Keep the model config in sync with the tokenizer; otherwise the model still has
        # no pad token id and cannot locate padding in batches of more than one sequence.
        model.config.pad_token_id = tokenizer.pad_token_id
    return tokenizer, model

In [None]:
def prepare_dataset(data_df: pd.DataFrame, model_tokenizer) -> Dataset:
    """Turn a review DataFrame into a tokenized, torch-formatted `Dataset`.

    Only the `cleaned_review` and `label` columns of `data_df` are used. Each
    review is tokenized with `model_tokenizer`, padded and truncated to
    MAX_LENGTH tokens, and the result exposes `input_ids`, `attention_mask`
    and `label` as torch tensors.
    """
    def encode_batch(batch):
        texts = batch["cleaned_review"]
        return model_tokenizer(
            texts, padding="max_length", truncation=True, max_length=MAX_LENGTH
        )

    review_frame = data_df[["cleaned_review", "label"]]
    tokenized = Dataset.from_pandas(review_frame).map(encode_batch, batched=True)
    tensor_columns = ["input_ids", "attention_mask", "label"]
    tokenized.set_format(type="torch", columns=tensor_columns)
    return tokenized

In [None]:
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a `(logits, labels)` evaluation pair.

    The predicted class of each example is the arg-max over the last axis of
    the logits. The result is a dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=-1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted, average="weighted")
    return metrics

In [None]:
def train_single_model(model_name_param: str, train_data, eval_data) -> Dict:
    """Fine-tune one model inside its own MLflow run and return its scores.

    Args:
        model_name_param: Hugging Face model identifier. "/" is replaced by "_"
            for the MLflow run name and the output directory.
        train_data: Tokenized training dataset (see prepare_dataset).
        eval_data: Tokenized evaluation dataset (see prepare_dataset).

    Returns:
        Dict with the keys "model_name", "f1_score" (weighted F1) and "accuracy",
        both computed on `eval_data` after training.
    """
    print(f"Training {model_name_param}...")
    safe_name = model_name_param.replace("/", "_")
    with mlflow.start_run(run_name=safe_name):
        # Log the hyperparameters before training so that a run which fails midway
        # still records the configuration it was started with.
        mlflow.log_params({
            "model_name": model_name_param,
            "learning_rate": LEARNING_RATE,
            "batch_size_train": BATCH_SIZE_TRAIN,
            "batch_size_eval": BATCH_SIZE_EVAL,
            "epochs": EPOCHS,
            "weight_decay": WEIGHT_DECAY,
            "max_length": MAX_LENGTH,
            "warmup_steps": WARMUP_STEPS,
            "lr_scheduler": LR_SCHEDULER,
        })
        model_tokenizer, model = get_tokenizer_and_model(model_name_param)
        # Clone the shared template so that changing output_dir does not affect it.
        # NOTE(review): to_dict() is meant for logging, not cloning - confirm the round
        # trip is lossless on the installed transformers version.
        training_args = TrainingArguments(**TRAINING_ARGS.to_dict())
        training_args.output_dir = f"./results/{safe_name}"
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=eval_data,
            tokenizer=model_tokenizer,
            # Same metric function as the Optuna and final runs, so the per-epoch
            # evaluation reports accuracy/F1 and not just the loss.
            compute_metrics=compute_metrics,
        )
        trainer.train()
        mlflow.pytorch.log_state_dict(trainer.model.state_dict(), artifact_path="model")
        predictions = trainer.predict(eval_data)
        pred_labels = predictions.predictions.argmax(axis=1)
        true_labels = predictions.label_ids
        f1 = f1_score(true_labels, pred_labels, average="weighted")
        accuracy = accuracy_score(true_labels, pred_labels)
        mlflow.log_metrics({"f1_score": f1, "accuracy": accuracy})
        return {"model_name": model_name_param, "f1_score": f1, "accuracy": accuracy}

## Main Execution

In [None]:
# Location of the review CSV that load_and_clean_data() reads.
FILE_PATH = "/content/Datafiniti_Amazon_Consumer_Reviews_of_Amazon_Products_cleaned.csv"

In [None]:
# Load the cleaned, class-capped reviews, then hold out 20% for evaluation.
# Stratifying on the label keeps the class proportions equal in both splits.
df = load_and_clean_data(FILE_PATH)
train_df, eval_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

In [None]:
# Train every candidate model and collect its scores, keyed by model name.
results = {}
for model_name in MODELS:
    print(f"Preparing dataset for model: {model_name}")
    try:
        # Only the tokenizer is needed for tokenization. Release the model that comes
        # with it right away, because train_single_model() loads its own copy.
        tokenizer, unused_model = get_tokenizer_and_model(model_name)
        del unused_model
        train_dataset = prepare_dataset(train_df, tokenizer)
        eval_dataset = prepare_dataset(eval_df, tokenizer)

        results[model_name] = train_single_model(model_name, train_dataset, eval_dataset)
    except Exception as exc:
        # One failing model must not abort the whole comparison. Record the failure
        # under the "error" key that the summary cells below already check for.
        print(f"Training failed for {model_name}: {exc}")
        results[model_name] = {"model_name": model_name, "error": str(exc)}

In [None]:
# Tell the user that the comparison loop has finished.
print("\nTraining completed. Check MLflow UI for results!")

## Base Model Results

In [None]:
# Print the header of the results table: a 70-character rule, the title, the rule
# again, the column captions and a dashed separator - one item per line.
print(
    "\n" + "=" * 70,
    "SUMMARY OF ALL MODEL RESULTS",
    "=" * 70,
    f"{'Model':<50} {'F1 Score':<10} {'Accuracy':<10}",
    "-" * 70,
    sep="\n",
)

In [None]:
# One table row per model. Entries that carry an "error" key show ERROR in both
# score columns instead of numbers.
for model_name, result in results.items():
    if "error" in result:
        f1_cell = f"{'ERROR':<10}"
        accuracy_cell = f"{'ERROR':<10}"
    else:
        f1_cell = f"{result['f1_score']:<10.4f}"
        accuracy_cell = f"{result['accuracy']:<10.4f}"
    print(f"{model_name:<50} {f1_cell} {accuracy_cell}")

In [None]:
# Among the models that finished without an error, report the highest weighted F1.
valid_results = {name: scores for name, scores in results.items() if "error" not in scores}
if valid_results:
    best_model = max(valid_results.items(), key=lambda entry: entry[1]["f1_score"])
    best_name, best_scores = best_model
    print(f"\nBest Model: {best_name} (F1: {best_scores['f1_score']:.4f}, Accuracy: {best_scores['accuracy']:.4f})")

## Optuna Objective

In [None]:
def objective(optuna_trial):
    """Optuna objective: fine-tune MODEL_NAME once and return its weighted F1.

    Each trial runs inside its own MLflow run. The search space covers the
    learning rate (1e-5 to 5e-5, log scale), the training batch size (8 or 16)
    and the number of epochs (2 or 3). Weight decay (0.1) and the cosine
    scheduler are fixed.

    Args:
        optuna_trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The `eval_f1` value of the best checkpoint of this trial, to be maximized.
    """
    trial_output_dir = (
        f"./results/{MODEL_NAME.replace('/', '_')}_optuna_trial_{optuna_trial.number}"
    )
    with mlflow.start_run(run_name=f"optuna_trial_{optuna_trial.number}"):
        learning_rate = optuna_trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
        per_device_train_batch_size = optuna_trial.suggest_categorical("train_batch_size", [8, 16])
        num_train_epochs = optuna_trial.suggest_int("epochs", 2, 3)
        weight_decay = 0.1
        lr_scheduler_type = "cosine"

        # Log the sampled configuration before training so that a trial which
        # crashes still shows up in MLflow with its parameters.
        mlflow.log_params({
            "trial": optuna_trial.number,
            "learning_rate": learning_rate,
            "train_batch_size": per_device_train_batch_size,
            "epochs": num_train_epochs,
            "weight_decay": weight_decay,
            "scheduler": lr_scheduler_type,
        })

        # Reuse existing data splits
        opt_tokenizer, opt_model = get_tokenizer_and_model(MODEL_NAME)
        opt_train_dataset = prepare_dataset(train_df, opt_tokenizer)
        opt_eval_dataset = prepare_dataset(eval_df, opt_tokenizer)
        training_args = TrainingArguments(
            output_dir=trial_output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            learning_rate=learning_rate,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=16,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            lr_scheduler_type=lr_scheduler_type,
            warmup_steps=500,
            logging_steps=100,
            fp16=torch.cuda.is_available(),
            report_to="none",
            disable_tqdm=True,
            seed=42,
            # Checkpoints are written only so the best epoch (by F1) can be restored.
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
        )
        trainer = Trainer(
            model=opt_model,
            args=training_args,
            train_dataset=opt_train_dataset,
            eval_dataset=opt_eval_dataset,
            tokenizer=opt_tokenizer,
            compute_metrics=compute_metrics,
        )
        try:
            trainer.train()
            metrics = trainer.evaluate()
        finally:
            # The best weights are already in memory and the final model is retrained
            # from scratch later, so the trial's checkpoints only take up disk space.
            shutil.rmtree(trial_output_dir, ignore_errors=True)
        mlflow.log_metrics({"f1_score": metrics["eval_f1"], "accuracy": metrics["eval_accuracy"]})
        return metrics["eval_f1"]

## Run Study

In [None]:
# Model that the Optuna search and the final retraining operate on (read by objective()).
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

In [None]:
# Close any MLflow run left open by an earlier cell so each trial can start its own.
mlflow.end_run()
# Seed the sampler with 42, like every other random source in this notebook, so the
# hyperparameter search proposes the same trials on every execution.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=5)

In [None]:
# Show the hyperparameters of the trial with the highest objective value.
print("Best trial:")
trial = study.best_trial
print(trial.params)

In [None]:
def load_model_from_mlflow(run_id: str, model_name_param: str):
    """Rebuild a model and load the weights that an MLflow run logged for it.

    Args:
        run_id: ID of the MLflow run whose state dict was logged under the
            artifact path "model" (as train_single_model does).
        model_name_param: Hugging Face model identifier used to create the
            architecture and tokenizer the weights belong to.

    Returns:
        The `(model, tokenizer)` pair - note the order, model first.
    """
    print(f"Loading model from MLflow run ID: {run_id}")
    load_tokenizer, load_model = get_tokenizer_and_model(model_name_param)

    # The URI must name the artifact directory: mlflow.pytorch.load_state_dict() appends
    # the "state_dict.pth" file name itself, so including it here makes the load fail.
    model_uri = f"runs:/{run_id}/model"
    # map_location is forwarded to torch.load so weights saved from a GPU run can
    # also be restored on a CPU-only machine.
    state_dict = mlflow.pytorch.load_state_dict(model_uri, map_location="cpu")
    load_model.load_state_dict(state_dict)
    print("Model loaded successfully!")
    return load_model, load_tokenizer

Get best trial parameters

In [None]:
# Sampled values of the best trial, keyed "learning_rate", "train_batch_size" and "epochs".
best_params = study.best_trial.params

Retrain with the best parameters and save the resulting model

In [None]:
# Retrain MODEL_NAME with the best Optuna hyperparameters, record the run in MLflow
# and save model and tokenizer to ./saved_roberta_model.
with mlflow.start_run(run_name="best_model_final"):
    best_tokenizer, best_model = get_tokenizer_and_model(MODEL_NAME)
    best_train_dataset = prepare_dataset(train_df, best_tokenizer)
    best_eval_dataset = prepare_dataset(eval_df, best_tokenizer)
    # Without explicit logging this run would be empty: report_to="none" disables
    # the Trainer's own reporting.
    mlflow.log_params({
        "model_name": MODEL_NAME,
        **best_params,
        "weight_decay": 0.1,
        "scheduler": "cosine",
    })
    final_training_args = TrainingArguments(
        output_dir="./results/final_best_model",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        learning_rate=best_params["learning_rate"],
        per_device_train_batch_size=best_params["train_batch_size"],
        per_device_eval_batch_size=16,
        num_train_epochs=best_params["epochs"],
        weight_decay=0.1,
        lr_scheduler_type="cosine",
        warmup_steps=500,
        logging_steps=100,
        fp16=torch.cuda.is_available(),
        report_to="none",
        disable_tqdm=False,
        seed=42,
        # Same checkpoint selection as in objective(): keep the epoch with the best F1,
        # so the saved model corresponds to the score the search optimized.
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
    )
    final_trainer = Trainer(
        model=best_model,
        args=final_training_args,
        train_dataset=best_train_dataset,
        eval_dataset=best_eval_dataset,
        tokenizer=best_tokenizer,
        compute_metrics=compute_metrics,
    )
    final_trainer.train()
    final_metrics = final_trainer.evaluate()
    mlflow.log_metrics({
        "f1_score": final_metrics["eval_f1"],
        "accuracy": final_metrics["eval_accuracy"],
    })

    # Save the best trained model
    final_trainer.save_model("./saved_roberta_model")
    best_tokenizer.save_pretrained("./saved_roberta_model")

    print("Best model saved to ./saved_roberta_model")

In [None]:
# Zip the saved model directory and send the archive to the browser for download.
shutil.make_archive(
    base_name='saved_roberta_model',
    format='zip',
    root_dir='./saved_roberta_model',
)
files.download('saved_roberta_model.zip')

## MLflow backup

In [None]:
# If the local artifact directory exists, zip it (keeping the "mlruns" folder name
# inside the archive) and download the archive.
if os.path.exists("./mlruns"):
    shutil.make_archive(
        base_name='mlflow_complete',
        format='zip',
        root_dir='./',
        base_dir='mlruns',
    )
    files.download('mlflow_complete.zip')
    print("MLflow data downloaded!")

In [None]:
# Download the SQLite backend store of the tracking server, if it was created.
if os.path.exists("mlflow.db"):
    files.download("mlflow.db")
    print("MLflow database downloaded!")

In [None]:
# Export every run of the experiment to a CSV file and download it. This is a
# best-effort backup step: any failure is reported instead of stopping the notebook.
# The try and its except must live in the same cell; split across two cells,
# neither half is valid Python.
try:
    experiment = mlflow.get_experiment_by_name("sentiment_analysis_experiment")

    if experiment:
        runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

        if not runs_df.empty:
            print("MLflow Runs Summary:")
            print(f"Total runs: {len(runs_df)}")

            print("\nAvailable columns:")
            for col in sorted(runs_df.columns):
                # Skip the tag columns and the artifact URI column in the listing.
                if not col.startswith(('tags.', 'artifact_uri')):
                    print(f"  - {col}")

            runs_df.to_csv('mlflow_runs_complete.csv', index=False)
            files.download('mlflow_runs_complete.csv')
            print("Complete MLflow runs data downloaded!")
        else:
            print("No runs found in MLflow")
    else:
        print("Experiment 'sentiment_analysis_experiment' not found")
except Exception as e:
    print(f"Error accessing MLflow: {e}")

In [None]:
# Closing instructions for viewing the downloaded MLflow data locally.
backup_instructions = [
    "\nBackup complete! You can now:",
    "1. Extract mlflow_complete.zip locally",
    "2. Run 'mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./mlruns'",
    "3. Open http://localhost:5000 to see your dashboard",
]
print("\n".join(backup_instructions))