# Fine-tuning a Model for Metaphor Detection with Performance Comparison
This notebook fine-tunes **one** model for token classification (metaphor detection) and compares performance before and after fine-tuning. Results are saved to CSV.

## Requirements

In [None]:
!pip install transformers datasets accelerate evaluate seqeval
!pip install torch matplotlib pandas

## Configuration

In [None]:
# --- Data ---------------------------------------------------------------
# Hugging Face Hub id of the dataset to load.
DATASET_NAME = "mariadelcarmenramirez/metaphor-catalan-iter1"

# --- Model (exactly ONE checkpoint per run) -----------------------------
MODEL_CHECKPOINT = "projecte-aina/roberta-large-ca-v2"
MODEL_NAME = "roberta"  # short tag reused in the output paths below

# --- Training hyperparameters -------------------------------------------
TRAINING_CONFIG = dict(
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    # Memory safety: raise this (2, 4, 8, ...) if training runs out of memory.
    gradient_accumulation_steps=1,
)

# --- Reproducibility ----------------------------------------------------
SEED = 42

# --- Hugging Face Hub upload --------------------------------------------
PUSH_TO_HUB = True
HUB_USERNAME = "mariadelcarmenramirez"

# --- Output locations (both derived from MODEL_NAME) --------------------
OUTPUT_DIR = f"./{MODEL_NAME}-metaphor-detection-cat"
RESULTS_CSV = f"model_comparison_results_{MODEL_NAME}.csv"

## Reproducibility (Random Seed)

In [None]:
import random
import numpy as np
import torch
from transformers import set_seed

# Seed each RNG explicitly: Python's `random`, NumPy, and PyTorch on CPU ...
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(SEED)
# ... plus every CUDA device when one is available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Finally call the transformers helper with the same seed.
set_seed(SEED)
print(f"Seed set to {SEED}")

## Load Dataset from Hugging Face

In [None]:
from datasets import load_dataset

# Load the dataset; the code below reads its "train", "validation" and "test" splits.
dataset_dict = load_dataset(DATASET_NAME)

# Support either `tags` (older upload script) or `ner_tags` (HF convention).
# `tags` is checked first so datasets that already worked keep using the same column.
LABEL_COLUMN_CANDIDATES = ("tags", "ner_tags")
train_columns = dataset_dict["train"].column_names
label_col = next((col for col in LABEL_COLUMN_CANDIDATES if col in train_columns), None)
if label_col is None:
    raise ValueError(
        f"Expected one of the label columns {LABEL_COLUMN_CANDIDATES}, got: {train_columns}"
    )

print(dataset_dict)
print(f"\nTrain examples: {len(dataset_dict['train'])}")
print(f"Validation examples: {len(dataset_dict['validation'])}")
print(f"Test examples: {len(dataset_dict['test'])}")

# Show an example
print("\nExample from training set:")
example = dataset_dict['train'][0]
if 'id' in example:  # the id field is optional, so only print it when present
    print(f"Id: {example['id']}")
print(f"Tokens: {example['tokens']}")
print(f"Labels ({label_col}): {example[label_col]}")

In [None]:
# Read the class names from the label column's feature metadata, then build
# both lookup directions: id -> name and name -> id.
label_list = dataset_dict['train'].features[label_col].feature.names
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

print(f"Labels: {label_list}")
print(f"Number of labels: {len(label_list)}")

## Label Distribution (Class Imbalance Check)

In [None]:
from collections import Counter

# Collect every token-level label id of the training split into one flat list.
all_labels = [tag for ex in dataset_dict["train"] for tag in ex[label_col]]

counts = Counter(all_labels)
total = sum(counts.values())

print("Label distribution (train tokens):")
for label_id in sorted(counts):
    n_tokens = counts[label_id]
    # Fall back to the raw id when it has no entry in id2label.
    name = id2label.get(label_id, str(label_id))
    share = n_tokens / total
    print(f"  {name:20s} {n_tokens:10d}  ({share:.2%})")

## Helper Functions

In [None]:
def tokenize_and_align_labels(examples, tokenizer):
    """
    Tokenize a batch of pre-split sentences and build subword-level labels.

    Args:
        examples: batch mapping with a "tokens" entry (one list of words per
            sentence) and a label entry under the module-level `label_col`
            (one label per word).
        tokenizer: callable tokenizer whose result exposes
            `word_ids(batch_index=...)` and supports item assignment.

    Returns:
        The tokenizer output with an added "labels" entry. For every sentence,
        the first subword of each word carries that word's label; special
        tokens and the remaining subwords carry -100.

    Raises:
        ValueError: if a sentence has a different number of words and labels.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
    )

    aligned = []
    for row, word_labels in enumerate(examples[label_col]):
        words = examples["tokens"][row]
        # Each word must come with exactly one label.
        if len(words) != len(word_labels):
            raise ValueError(
                f"Mismatch at example {row}: {len(words)} tokens "
                f"but {len(word_labels)} labels. Tokens: {words}"
            )

        row_labels = []
        last_word = None
        for word_idx in encoded.word_ids(batch_index=row):
            # Only the first subword of a real word gets a label; positions with
            # no word (word_idx is None) and continuation subwords get -100.
            starts_word = word_idx is not None and word_idx != last_word
            row_labels.append(word_labels[word_idx] if starts_word else -100)
            last_word = word_idx

        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded

In [None]:
import evaluate
import numpy as np

# Load the "seqeval" metric through `evaluate`; the metric functions defined
# below call `metric.compute(...)` on it.
metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    """
    Compute the overall seqeval scores for a set of predictions.

    Args:
        eval_pred: pair `(predictions, labels)`. The arg-max over axis 2 of
            `predictions` is taken as the predicted label id; positions whose
            label is -100 are ignored.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's "overall_*" entries.
    """
    scores_per_class, gold_ids = eval_pred
    pred_ids = np.argmax(scores_per_class, axis=2)

    pred_tags, gold_tags = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only the positions that carry a real label (ignored index is -100).
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        pred_tags.append([id2label[p] for p, _ in kept])
        gold_tags.append([id2label[g] for _, g in kept])

    scores = metric.compute(predictions=pred_tags, references=gold_tags)

    return {
        short: scores[f"overall_{short}"]
        for short in ("precision", "recall", "f1", "accuracy")
    }

def compute_detailed_metrics(eval_pred):
    """
    Compute overall seqeval scores plus a per-type breakdown.

    Args:
        eval_pred: indexable whose first two items are `(predictions, labels)`.
            Any further items are ignored, so the 3-field output of
            `Trainer.predict` (predictions, label ids, metrics) can be passed
            directly as well as a plain 2-tuple. The arg-max over axis 2 of
            `predictions` is taken as the predicted label id; positions whose
            label is -100 are ignored.

    Returns:
        Dict with "overall_precision", "overall_recall", "overall_f1" and
        "overall_accuracy", followed by "<name>_precision", "<name>_recall"
        and "<name>_f1" for every per-type entry the metric reports.
    """
    # Index instead of unpacking: a 3-field prediction output would otherwise
    # raise "too many values to unpack".
    predictions, labels = eval_pred[0], eval_pred[1]
    predictions = np.argmax(predictions, axis=2)

    true_predictions = []
    true_labels = []
    for prediction, label in zip(predictions, labels):
        # Drop positions marked with the ignored index (-100).
        kept = [(p, l) for p, l in zip(prediction, label) if l != -100]
        true_predictions.append([id2label[p] for p, _ in kept])
        true_labels.append([id2label[l] for _, l in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    detailed_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
        "overall_f1": results["overall_f1"],
        "overall_accuracy": results["overall_accuracy"],
    }

    # Per-type entries are the dict-valued items of the result. The seqeval
    # metric keys them by entity type, which is not necessarily a raw tag name
    # from `label_list`, so every per-type entry is reported rather than only
    # those matching a tag name.
    for name, scores in results.items():
        if not isinstance(scores, dict):
            continue
        detailed_results[f"{name}_precision"] = scores.get("precision", 0)
        detailed_results[f"{name}_recall"] = scores.get("recall", 0)
        # The metric stores the F1 score under "f1"; "f1-score" is kept only as
        # a fallback for results that use the older key.
        detailed_results[f"{name}_f1"] = scores.get("f1", scores.get("f1-score", 0))

    return detailed_results

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification

def evaluate_model(model, tokenizer, tokenized_test_dataset, model_name="model"):
    """
    Score `model` on `tokenized_test_dataset` using an evaluation-only Trainer.

    Args:
        model: model to evaluate.
        tokenizer: tokenizer handed to the token-classification data collator.
        tokenized_test_dataset: already tokenized dataset to evaluate on.
        model_name: tag used to build the temporary output directory
            `./tmp_eval_<model_name>`.

    Returns:
        Whatever `Trainer.evaluate` returns for the dataset, with metrics
        computed by `compute_detailed_metrics`.
    """
    eval_trainer = Trainer(
        model=model,
        data_collator=DataCollatorForTokenClassification(tokenizer=tokenizer),
        args=TrainingArguments(
            output_dir=f"./tmp_eval_{model_name}",
            per_device_eval_batch_size=TRAINING_CONFIG["per_device_eval_batch_size"],
            report_to="none",
            seed=SEED,
            data_seed=SEED,
        ),
        compute_metrics=compute_detailed_metrics,
    )
    return eval_trainer.evaluate(tokenized_test_dataset)

## Evaluate Base Model → Fine-tune → Evaluate Fine-tuned

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import traceback
import pandas as pd

import inspect

# Mixed-precision training is switched on only when a CUDA device is available.
use_fp16 = torch.cuda.is_available()

# Optional: Login to Hugging Face
if PUSH_TO_HUB:
    from huggingface_hub import notebook_login
    notebook_login()

# Pre-filled as "failed" so the CSV still gets a meaningful row if the run raises.
results_row = {
    "status": "failed",
    "model": MODEL_NAME,
    "checkpoint": MODEL_CHECKPOINT,
    "seed": SEED,
    "gradient_accumulation_steps": TRAINING_CONFIG.get("gradient_accumulation_steps", 1),
}

try:
    print("\n" + "="*80)
    print(f"Model: {MODEL_NAME} ({MODEL_CHECKPOINT})")
    print("="*80 + "\n")

    # Load tokenizer and base model
    print("Loading tokenizer and base model...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, add_prefix_space=True)
    base_model = AutoModelForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    # Tokenize every split; the original columns are dropped so only the
    # tokenizer outputs and the aligned "labels" remain.
    print("Tokenizing dataset...")
    tokenized_datasets = dataset_dict.map(
        lambda examples: tokenize_and_align_labels(examples, tokenizer),
        batched=True,
        remove_columns=dataset_dict["train"].column_names
    )

    # Evaluate base model
    # NOTE(review): unless the checkpoint already ships a token-classification
    # head, this baseline presumably measures a freshly initialized classifier
    # on top of the pretrained encoder - confirm before interpreting the numbers.
    print("\n" + "-"*60)
    print("EVALUATING BASE MODEL")
    print("-"*60)

    base_results = evaluate_model(
        base_model,
        tokenizer,
        tokenized_datasets["test"],
        model_name=f"{MODEL_NAME}_base"
    )

    print(f"\nBase Model Results:")
    print(f"  Precision: {base_results['eval_overall_precision']:.4f}")
    print(f"  Recall:    {base_results['eval_overall_recall']:.4f}")
    print(f"  F1:        {base_results['eval_overall_f1']:.4f}")
    print(f"  Accuracy:  {base_results['eval_overall_accuracy']:.4f}")

    # Fine-tune model
    print("\n" + "-"*60)
    print("FINE-TUNING")
    print("-"*60 + "\n")

    # A second, separate copy is loaded for training so the base model object
    # evaluated above is not the one being updated.
    model = AutoModelForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification
    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

    training_kwargs = dict(
        output_dir=OUTPUT_DIR,
        save_strategy="epoch",
        learning_rate=TRAINING_CONFIG["learning_rate"],
        per_device_train_batch_size=TRAINING_CONFIG["per_device_train_batch_size"],
        per_device_eval_batch_size=TRAINING_CONFIG["per_device_eval_batch_size"],
        num_train_epochs=TRAINING_CONFIG["num_train_epochs"],
        weight_decay=TRAINING_CONFIG["weight_decay"],
        gradient_accumulation_steps=TRAINING_CONFIG.get("gradient_accumulation_steps", 1),
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        push_to_hub=False,
        logging_steps=100,
        fp16=use_fp16,
        report_to="none",
        seed=SEED,
        data_seed=SEED,
    )
    # Newer transformers releases name this argument `eval_strategy`; older
    # ones only know `evaluation_strategy`. The install is unpinned, so pick
    # whichever name the installed version accepts.
    training_arg_names = inspect.signature(TrainingArguments).parameters
    eval_strategy_arg = (
        "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"
    )
    training_kwargs[eval_strategy_arg] = "epoch"
    training_args = TrainingArguments(**training_kwargs)

    # Same situation for the Trainer: `tokenizer=` was superseded by
    # `processing_class=` in newer releases.
    trainer_arg_names = inspect.signature(Trainer.__init__).parameters
    tokenizer_arg = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        **{tokenizer_arg: tokenizer},
    )

    trainer.train()

    # Save model
    trainer.save_model()
    tokenizer.save_pretrained(training_args.output_dir)
    trainer.save_state()
    print(f"\nModel saved locally to: {training_args.output_dir}")

    # Evaluate fine-tuned model
    print("\n" + "-"*60)
    print("EVALUATING FINE-TUNED MODEL")
    print("-"*60)

    # `trainer.predict` returns three fields (predictions, label_ids, metrics);
    # the metric function expects exactly the (predictions, labels) pair.
    test_output = trainer.predict(tokenized_datasets["test"])
    detailed_results = compute_detailed_metrics(
        (test_output.predictions, test_output.label_ids)
    )

    print(f"\nFine-tuned Model Results:")
    print(f"  Precision: {detailed_results['overall_precision']:.4f}")
    print(f"  Recall:    {detailed_results['overall_recall']:.4f}")
    print(f"  F1:        {detailed_results['overall_f1']:.4f}")
    print(f"  Accuracy:  {detailed_results['overall_accuracy']:.4f}")

    # List every per-label F1 that was actually computed (all "<name>_f1" keys
    # except the overall one).
    print("\nPer-label F1:")
    for key, value in detailed_results.items():
        if key.endswith("_f1") and key != "overall_f1":
            print(f"  {key[:-len('_f1')]:20s}: {value:.4f}")

    improvement_f1 = detailed_results['overall_f1'] - base_results['eval_overall_f1']

    # Fill results row
    results_row.update({
        "status": "success",
        "base_precision": base_results['eval_overall_precision'],
        "base_recall": base_results['eval_overall_recall'],
        "base_f1": base_results['eval_overall_f1'],
        "base_accuracy": base_results['eval_overall_accuracy'],
        "finetuned_precision": detailed_results['overall_precision'],
        "finetuned_recall": detailed_results['overall_recall'],
        "finetuned_f1": detailed_results['overall_f1'],
        "finetuned_accuracy": detailed_results['overall_accuracy'],
        "improvement_f1": improvement_f1,
    })

    # Optionally push
    if PUSH_TO_HUB:
        print("\nPushing to Hugging Face Hub...")
        repo_id = f"{HUB_USERNAME}/metaphor-cat-{MODEL_NAME}"
        model.push_to_hub(repo_id)
        tokenizer.push_to_hub(repo_id)
        print(f"Model pushed to: https://huggingface.co/{repo_id}")

except Exception as e:
    # Report the failure and record it in the results row; the CSV below is
    # still written so the failed run leaves a trace.
    print("\n" + "!"*80)
    print("RUN FAILED")
    print(f"Error: {type(e).__name__}: {e}")
    print("Traceback:")
    traceback.print_exc()
    results_row["error"] = f"{type(e).__name__}: {e}"
    print("!"*80 + "\n")

# Save results to CSV (single-row)
df_results = pd.DataFrame([results_row])
df_results.to_csv(RESULTS_CSV, index=False)
print(f"\nResults saved to '{RESULTS_CSV}'")
df_results

## Test Fine-tuned Model on Sample Sentences

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Reload the tokenizer and model from the directory the training cell saved to.
best_model_path = OUTPUT_DIR
print(f"Loading fine-tuned model from: {best_model_path}")

tokenizer = AutoTokenizer.from_pretrained(best_model_path)
model = AutoModelForTokenClassification.from_pretrained(best_model_path)

metaphor_detector = pipeline(
    "token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

test_sentences = [
    "Van deixar de visitar la família quan van començar les tensions racials.",
    "Santo va treballar per a Disney i operava les tasses de te.",
    "No cal que m'ocupi d'això.",
    "El temps vola quan t'ho passes bé.",
    "Va tocar el cel amb les mans després de guanyar.",
]

banner = "=" * 80
print(banner)
print("TESTING FINE-TUNED MODEL ON SAMPLE SENTENCES")
print(banner + "\n")

for sentence in test_sentences:
    detections = metaphor_detector(sentence)
    print(f"Sentence: {sentence}")

    # Keep only the groups whose label mentions METAPHOR; an empty pipeline
    # result is treated the same as "nothing found".
    hits = [d for d in (detections or []) if 'METAPHOR' in d['entity_group']]
    if hits:
        print("  Metaphors detected:")
        for hit in hits:
            print(f"    - '{hit['word']}' (confidence: {hit['score']:.3f})")
    else:
        print("  No metaphors detected")

    print("-" * 80)

## Conclusion

This notebook has:
1. ✅ Evaluated the base model before fine-tuning
2. ✅ Fine-tuned **one** model for metaphor detection
3. ✅ Evaluated the fine-tuned model on the test set
4. ✅ Reported label distribution + per-label F1
5. ✅ Used a fixed seed for reproducibility
6. ✅ Used try/except so failures are captured cleanly
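7. ✅ Saved the before/after comparison to `model_comparison_results_<MODEL_NAME>.csv` (the `RESULTS_CSV` path set in the configuration cell)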
