In [None]:
"""
fine_tune_comparative_models_v4.py

This script:
- Uses a custom optimizer and a learning rate scheduler (with warmup) to smooth training.
- Mounts Google Drive (if run in Colab)
- Imports required libraries
- Splits the original English dataset (and Spanish dataset, if available) into training and validation files (if they do not exist)
- Provides utility functions to load models, tokenize data, and train using Hugging Face's Trainer.
- Iterates over a list of models (BERT, BETO, RoBERTa, mBERT) and languages (English & Spanish)
- Saves the trained model into the model directory as well as comparative results into a CSV file.

Ensure your dataset directory contains:
  - The original cleaned English dataset: text_preprocessed_dataset.csv
  - (Optional) The translated Spanish files: text_preprocessed_dataset_es.csv, train_data_es.csv, and validation_data_es.csv.
  - If the English (or Spanish) split files don't exist, they will be created.
"""

In [24]:
# --- Mount Google Drive (if available) ---
try:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)
except ImportError:
    print("Not running in Colab; skipping Drive mount.")

Mounted at /content/drive


In [None]:
#!pip install -qU transformers==4.57.3 datasets accelerate
#import importlib, sys
#importlib.reload(sys.modules['transformers'])

In [25]:
from huggingface_hub import login
from google.colab import userdata

token = userdata.get('HF_TOKEN')   # loads the secret
if token:
    login(token)                   # one-line login
    print("✅ Hugging Face login successful")
else:
    print("⚠️  HF_TOKEN not found in Colab secrets")

✅ Hugging Face login successful


In [26]:
# --- Imports ---
import os
import random
import logging
import numpy as np
import pandas as pd
import torch
import warnings
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    RobertaTokenizer,
    RobertaForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from datasets import load_dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from datasets import ClassLabel

In [27]:
os.environ["WANDB_DISABLED"] = "true"

In [28]:
# --- Set Up Logging ---
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s')

In [29]:
# --- Set Random Seeds for Reproducibility ---
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

In [30]:
set_seed(42)

In [31]:
# --- Configuration ---
CONFIG = {
    "dataset_dir": "/content/drive/MyDrive/AIproject/dataset",  # Folder where ALL CSV files are stored.
    "model_dir": "/content/drive/MyDrive/AIproject/Models",       # Directory used for saving fine-tuned models.
    "output_dir": "./results",                                   # Directory where training checkpoints/results are saved.
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "early_stopping_patience": 2
}

In [32]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging.info("Using device: %s", device)

In [33]:
# --- Step: Split Original English Dataset into Train and Validation ---
original_dataset_path = os.path.join(CONFIG["dataset_dir"], "text_preprocessed_dataset.csv")
train_data_en_path = os.path.join(CONFIG["dataset_dir"], "train_data_en.csv")
val_data_en_path = os.path.join(CONFIG["dataset_dir"], "validation_data_en.csv")

In [34]:
if not os.path.exists(train_data_en_path) or not os.path.exists(val_data_en_path):
    logging.info("Splitting original English dataset into train and validation...")
    df_original = pd.read_csv(original_dataset_path, sep=";")
    df_original = df_original.dropna(subset=["binary_sentiment"])

    # Map binary_sentiment to numeric labels
    label_mapping = {"non-offensive": 0, "offensive": 1}
    df_original["label"] = df_original["binary_sentiment"].map(label_mapping)
    df_original = df_original.dropna(subset=["label"])  # remove rows with unmapped values

    if "binary_sentiment" in df_original.columns:
         train_df, val_df = train_test_split(
             df_original, test_size=0.2, random_state=42, stratify=df_original["label"]
         )
    else:
         train_df, val_df = train_test_split(df_original, test_size=0.2, random_state=42)

    train_df.to_csv(train_data_en_path, index=False)
    val_df.to_csv(val_data_en_path, index=False)
    logging.info("English dataset split into train (%s) and validation (%s)", train_data_en_path, val_data_en_path)
else:
    logging.info("English train/validation files already exist.")

In [35]:
# --- Step: Split Original Spanish Dataset into Train and Validation ---
original_dataset_es_path = os.path.join(CONFIG["dataset_dir"], "text_preprocessed_dataset_es.csv")
train_data_es_path = os.path.join(CONFIG["dataset_dir"], "train_data_es.csv")
val_data_es_path = os.path.join(CONFIG["dataset_dir"], "validation_data_es.csv")

In [36]:
if os.path.exists(original_dataset_es_path) and (not os.path.exists(train_data_es_path) or not os.path.exists(val_data_es_path)):
    logging.info("Splitting original Spanish dataset into train and validation...")
    df_original_es = pd.read_csv(original_dataset_es_path, sep=";")
    df_original_es = df_original_es.dropna(subset=["binary_sentiment"])

    # Map binary_sentiment to numeric labels
    label_mapping = {"non-offensive": 0, "offensive": 1}
    df_original_es["label"] = df_original_es["binary_sentiment"].map(label_mapping)
    df_original_es = df_original_es.dropna(subset=["label"])  # remove rows with unmapped values

    if "binary_sentiment" in df_original_es.columns:
         train_df_es, val_df_es = train_test_split(
             df_original_es, test_size=0.2, random_state=42, stratify=df_original_es["label"]
         )
    else:
         train_df_es, val_df_es = train_test_split(df_original_es, test_size=0.2, random_state=42)

    train_df_es.to_csv(train_data_es_path, index=False)
    val_df_es.to_csv(val_data_es_path, index=False)
    logging.info("Spanish dataset split into train (%s) and validation (%s)", train_data_es_path, val_data_es_path)
else:
    logging.info("Spanish train/validation files already exist or Spanish original dataset is not provided.")


--- Utility Functions ---

In [37]:
def load_tokenizer_and_model(model_name, model_save_path, tokenizer_cls, model_cls, pretrained_name):
    """
    Loads a fine-tuned model and tokenizer if available; otherwise, loads a pretrained version.
    """
    if os.path.exists(model_save_path):
        logging.info(f"Loading fine-tuned {model_name} model from {model_save_path}")
        return tokenizer_cls.from_pretrained(model_save_path), model_cls.from_pretrained(model_save_path)

    return tokenizer_cls.from_pretrained(pretrained_name), model_cls.from_pretrained(pretrained_name)

In [38]:
def get_model_and_tokenizer(model_name, language="en"):
    """
    Loads and returns the appropriate model and tokenizer based on the specified model name and language.

    Parameters:
        model_name (str): One of 'BERT', 'BETO', 'RoBERTa', 'mBERT'
        language (str): 'en' or 'es'

    Returns:
        model, tokenizer: Hugging Face model and tokenizer on the correct device
    """

    # Define paths
    model_dir = CONFIG["model_dir"]

    # Language-specific model mapping
    model_map = {
        "en": {
            "BERT": ("BERT_finetuned", BertTokenizer, BertForSequenceClassification, "bert-base-uncased"),
            "RoBERTa": ("RoBERTa_finetuned", RobertaTokenizer, RobertaForSequenceClassification, "roberta-base"),
            "mBERT": ("mBERT_finetuned", BertTokenizer, BertForSequenceClassification, "bert-base-multilingual-cased")
        },
        "es": {
            "BETO": ("BETO_finetuned", BertTokenizer, BertForSequenceClassification, "dccuchile/bert-base-spanish-wwm-cased"),
            "RoBERTa": ("RoBERTa_es_finetuned", RobertaTokenizer, RobertaForSequenceClassification, "PlanTL-GOB-ES/roberta-base-bne"),
            "mBERT": ("mBERT_finetuned", BertTokenizer, BertForSequenceClassification, "bert-base-multilingual-cased")
        }
    }

    # Validate model
    if language not in model_map or model_name not in model_map[language]:
        raise ValueError(f"Model '{model_name}' not supported for language '{language}'.")

    model_save_path, tokenizer_cls, model_cls, pretrained_name = model_map[language][model_name]
    model_save_path = os.path.join(model_dir, model_save_path)

    # Load model and tokenizer (your helper function)
    tokenizer, model = load_tokenizer_and_model(model_name, model_save_path, tokenizer_cls, model_cls, pretrained_name)

    # Move model to device
    model.to(device)

    return model, tokenizer

In [39]:
def load_language_dataset(language="en"):
    """
    Loads the training and validation datasets for the given language.
    For English, it expects "train_data_en.csv" and "validation_data_en.csv".
    For Spanish, it expects "train_data_es.csv" and "validation_data_es.csv".
    """
    if language == "en":
        dataset_files = {
            "train": os.path.join(CONFIG["dataset_dir"], "train_data_en.csv"),
            "validation": os.path.join(CONFIG["dataset_dir"], "validation_data_en.csv")
        }
    elif language == "es":
        dataset_files = {
            "train": os.path.join(CONFIG["dataset_dir"], "train_data_es.csv"),
            "validation": os.path.join(CONFIG["dataset_dir"], "validation_data_es.csv")
        }
    else:
        raise ValueError("Language not supported.")

    return load_dataset("csv", data_files=dataset_files)

In [40]:
def tokenize_function(examples, tokenizer):
    """
    Tokenizes the 'label_corrected_clean' column in the given examples.
    Note: Your CSV files should contain a column named "label_corrected_clean" that holds the text data.
    """
    return tokenizer(examples["label_corrected_clean"], padding="max_length", truncation=True)

In [41]:
def train_model(model_name, language="en"):
    """
    Trains and evaluates the specified model on the dataset for the given language.
    After training, the model and tokenizer are saved into model_dir.
    """
    logging.info("Training %s model for language: %s", model_name, language)

    # Load model and tokenizer (either from Hugging Face or local fine-tuned version for mBERT)
    model, tokenizer = get_model_and_tokenizer(model_name, language)

    # Load dataset
    dataset = load_language_dataset(language)

     # CAST LABEL (CRITICAL)
    dataset = dataset.cast_column("label", ClassLabel(num_classes=2))

    # ---------- clean ----------
    def _clean(ex):
        ex["label_corrected_clean"] = str(ex["label_corrected_clean"] or "")
        return ex

    dataset = dataset.map(_clean, desc="clean strings")

     # ---------- debug ----------
    for split in dataset:
      df = dataset[split].to_pandas()
      print(
          split,
          "NaNs :", df["label_corrected_clean"].isna().sum(),
            "non-str:", df["label_corrected_clean"].map(lambda x: not isinstance(x, str)).sum())

    # Tokenize dataset
    logging.info("Tokenizing dataset...")
    tokenized_datasets = dataset.map(
        lambda x: tokenize_function(x, tokenizer),
        batched=True,
        desc="Tokenising"
    )

    keep_cols = ["input_ids", "attention_mask", "label"]
    tokenized_datasets = tokenized_datasets.remove_columns(
      [c for c in tokenized_datasets["train"].column_names if c not in keep_cols])

    print("DEBUG sample:", tokenized_datasets["train"][0])

    # --- Create a Custom Optimizer and Scheduler ---
    total_steps = (len(tokenized_datasets["train"]) // CONFIG["per_device_train_batch_size"]) * CONFIG["num_train_epochs"]
    warmup_steps = int(0.1 * total_steps)  # 10% of total steps for warmup

    optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Ensure the output directory exists
    os.makedirs(CONFIG["output_dir"], exist_ok=True)

    training_args = TrainingArguments(
        output_dir=CONFIG["output_dir"],
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=CONFIG["per_device_train_batch_size"],
        per_device_eval_batch_size=CONFIG["per_device_eval_batch_size"],
        num_train_epochs=CONFIG["num_train_epochs"],
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        save_total_limit=2,
        push_to_hub=False,
        report_to="none"  # disables wandb
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        callbacks=[
            EarlyStoppingCallback(
                early_stopping_patience=CONFIG["early_stopping_patience"]
            )
        ],
        optimizers=(optimizer, scheduler)
    )

    trainer.train()

    predictions = trainer.predict(tokenized_datasets["validation"])
    preds = np.argmax(predictions.predictions, axis=-1)
    report = classification_report(predictions.label_ids, preds, output_dict=True)

    # --- Save the fine-tuned model ---
    model_save_path = os.path.join(CONFIG["model_dir"], f"{model_name}_{language}_finetuned")
    os.makedirs(model_save_path, exist_ok=True)
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    logging.info("Model saved to: %s", model_save_path)

    return report

In [42]:
def main():
    languages = ["en", "es"]
    model_map = {
        "en": ["BERT", "RoBERTa", "mBERT"],
        "es": ["BETO", "RoBERTa", "mBERT"]
    }

    results = []
    for language in languages:
        for model_name in model_map[language]:
            logging.info("Starting training for %s in %s", model_name, language)
            try:
                report = train_model(model_name, language)
                results.append({
                    "model": model_name,
                    "language": language,
                    "accuracy": report["accuracy"],
                    "f1": report["weighted avg"]["f1-score"],
                    "precision": report["weighted avg"]["precision"],
                    "recall": report["weighted avg"]["recall"]
                })
            except Exception as e:
                logging.error("Error training %s for language %s: %s", model_name, language, str(e))

    df_results = pd.DataFrame(results)
    os.makedirs("results", exist_ok=True)
    results_path = os.path.join("results", "comparative_results.csv")
    df_results.to_csv(results_path, index=False)
    logging.info("Comparative results saved to %s", results_path)
    print(df_results)

In [43]:
import transformers
print(transformers.__version__)  # should be >= 4.30


4.57.3


In [None]:
if __name__ == "__main__":
    main()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Casting the dataset:   0%|          | 0/497 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/125 [00:00<?, ? examples/s]

clean strings:   0%|          | 0/497 [00:00<?, ? examples/s]

clean strings:   0%|          | 0/125 [00:00<?, ? examples/s]

train NaNs : 0 non-str: 0
validation NaNs : 0 non-str: 0


Tokenising:   0%|          | 0/497 [00:00<?, ? examples/s]

Tokenising:   0%|          | 0/125 [00:00<?, ? examples/s]

DEBUG sample: {'label': 1, 'input_ids': [101, 10047, 13330, 4248, 4168, 4168, 1012, 4012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.015084
2,No log,0.003816
3,No log,0.003104


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train NaNs : 0 non-str: 0
validation NaNs : 0 non-str: 0


Tokenising:   0%|          | 0/497 [00:00<?, ? examples/s]

Tokenising:   0%|          | 0/125 [00:00<?, ? examples/s]

DEBUG sample: {'label': 1, 'input_ids': [0, 757, 7544, 2119, 25683, 242, 4, 175, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.000939
2,No log,0.000318
3,No log,0.000281


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train NaNs : 0 non-str: 0
validation NaNs : 0 non-str: 0


Tokenising:   0%|          | 0/497 [00:00<?, ? examples/s]

Tokenising:   0%|          | 0/125 [00:00<?, ? examples/s]

DEBUG sample: {'label': 1, 'input_ids': [101, 10211, 15633, 79581, 10162, 69609, 60503, 10112, 119, 10212, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.002428
2,No log,0.001042
3,No log,0.000916
