# Fine-tuning RoBERTa for Metaphor Detection
This notebook fine-tunes `projecte-aina/roberta-large-ca-v2` for token classification: each word of a sentence is tagged to mark whether it is used metaphorically (metaphor detection).

## Requirements

In [None]:
!pip install transformers datasets accelerate evaluate seqeval
!pip install torch

## Load Dataset from Hugging Face

In [None]:
from datasets import load_dataset

# Hub identifier of the dataset to train on -- replace with your own repository.
DATASET_NAME = "your-username/metaphor-catalan"

# The code below reads the 'train', 'validation' and 'test' splits, so the
# dataset is expected to ship with all three.
dataset_dict = load_dataset(DATASET_NAME)

# Overview of the splits followed by their sizes.
print(dataset_dict)
print(
    f"\nTrain examples: {len(dataset_dict['train'])}",
    f"Validation examples: {len(dataset_dict['validation'])}",
    f"Test examples: {len(dataset_dict['test'])}",
    sep="\n",
)

# Peek at the first training row: its word list and the per-word tag ids.
print("\nExample from training set:")
example = dataset_dict['train'][0]
print(
    f"Tokens: {example['tokens']}",
    f"Labels: {example['ner_tags']}",
    sep="\n",
)

In [None]:
# Read the tag names from the 'ner_tags' feature of the training split and
# build the two lookup tables (id -> name, name -> id) handed to the model.
label_list = dataset_dict['train'].features['ner_tags'].feature.names
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}

print(
    f"Labels: {label_list}",
    f"Number of labels: {len(label_list)}",
    sep="\n",
)

## Load Model and Tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Pretrained checkpoint that both the tokenizer and the model are loaded from.
model_checkpoint = "projecte-aina/roberta-large-ca-v2"

# NOTE(review): add_prefix_space=True is presumably needed because the inputs
# are later passed pre-split into words (is_split_into_words=True) -- confirm.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# Token-classification head sized to the dataset's label set; the label maps
# are stored in the model config so predictions can be reported by name.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label_list),
)

print(f"Model loaded with {len(label_list)} labels")

## Tokenize Dataset

In [None]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and project word labels onto tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per example)
            and "ner_tags" (one list of label ids per example, indexed by word).

    Returns:
        The tokenizer output with an added "labels" entry: one list per example,
        aligned with that example's tokens. The first token of every word
        carries the word's label; special tokens and the remaining tokens of a
        split word carry -100 so the loss ignores them.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
        padding=False,
    )

    def _align(word_tags, word_ids):
        # Walk the token -> word mapping once, remembering the previous entry
        # so only the first token of each word receives the real label.
        aligned = []
        last_seen = None
        for current in word_ids:
            starts_new_word = current is not None and current != last_seen
            aligned.append(word_tags[current] if starts_new_word else -100)
            last_seen = current
        return aligned

    encoded["labels"] = [
        _align(word_tags, encoded.word_ids(batch_index=row))
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Run the alignment over every split in batches. The original columns (taken
# from the train split) are dropped so only the tokenizer outputs and the
# aligned "labels" remain.
tokenized_datasets = dataset_dict.map(
    tokenize_and_align_labels,
    remove_columns=dataset_dict["train"].column_names,
    batched=True,
)

print("Dataset tokenized successfully!", tokenized_datasets, sep="\n")

## Setup Data Collator and Metrics

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator handed to the Trainer below as `data_collator`. Tokenization above
# ran with padding=False, so per-batch padding is presumably done here -- TODO confirm.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate
import numpy as np

# Load the "seqeval" metric through `evaluate`; compute_metrics() below calls
# metric.compute(...) with sequences of label names.
metric = evaluate.load("seqeval")

def compute_metrics(eval_pred):
    """
    Turn raw evaluation outputs into overall precision/recall/F1/accuracy.

    Args:
        eval_pred: pair ``(predictions, labels)``. The predictions are reduced
            with argmax over axis 2 (presumably the label dimension of a
            (batch, sequence, num_labels) array -- confirm); labels hold label
            ids, with -100 marking positions to skip.

    Returns:
        dict with "precision", "recall", "f1" and "accuracy", taken from the
        "overall_*" entries returned by ``metric.compute``.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose gold label is not the ignore index (-100).
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_predictions.append([id2label[p] for p, _ in kept])
        true_labels.append([id2label[g] for _, g in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

## Training Configuration

In [None]:
import torch
from transformers import TrainingArguments

# Login to Hugging Face (optional - for pushing to Hub)
from huggingface_hub import notebook_login
# Uncomment the line below if you want to push to Hub
# notebook_login()

# Training hyper-parameters. Evaluation and checkpointing both happen once per
# epoch, and the checkpoint with the best validation F1 (the "f1" key returned
# by compute_metrics) is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="./roberta-metaphor-detection",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,  # Set to True if you want to push to Hub
    # hub_model_id="your-username/roberta-metaphor-ca",  # Uncomment and set your username
    logging_steps=100,
    # Mixed precision only when a CUDA GPU is present: requesting fp16 on a
    # CPU-only machine makes TrainingArguments fail instead of falling back.
    fp16=torch.cuda.is_available(),
)

## Train the Model

In [None]:
from transformers import Trainer

# Wire the model, hyper-parameters, tokenized splits, collator and metric
# function into a single Trainer. Validation data is used for the per-epoch
# evaluation configured in training_args.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Kick off fine-tuning.
print("Starting training...")
trainer.train()

## Evaluate on Test Set

In [None]:
# Score the held-out test split with the trainer and show the metric dict.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print("\nTest Results:", test_results, sep="\n")

## Save the Model

In [None]:
# Persist the fine-tuned model and the tokenizer side by side in one local
# folder so they can be reloaded together.
output_dir = "./roberta-metaphor-final"

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model saved to {output_dir}")

# To publish the model on the Hub instead (requires the login step earlier):
# trainer.push_to_hub()

## Test the Fine-tuned Model

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a token-classification
# pipeline; aggregation_strategy="simple" makes it return grouped spans
# (each with 'word', 'entity_group' and 'score') rather than single tokens.
metaphor_detector = pipeline(
    task="token-classification",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple",
)

# Run one sentence through the pipeline and list every span it returns.
test_sentence = "Van deixar de visitar la família quan van començar les tensions racials."
results = metaphor_detector(test_sentence)

print(f"\nTest sentence: {test_sentence}", "\nDetected metaphors:", sep="\n")
for result in results:
    print(f"  {result['word']}: {result['entity_group']} (score: {result['score']:.3f})")

In [None]:
# Test with multiple examples
test_sentences = [
    "Van deixar de visitar la família quan van començar les tensions racials.",
    "Santo va treballar per a Disney i operava les tasses de te.",
    "No cal que m'ocupi d'això."
]

print("\n" + "="*60)
print("Testing multiple sentences:")
print("="*60)

for sentence in test_sentences:
    results = metaphor_detector(sentence)
    print(f"\nSentence: {sentence}")

    # The detector was built with aggregation_strategy="simple", which reports
    # `entity_group` with the IOB prefix removed ("B-METAPHOR" -> "METAPHOR").
    # Comparing against 'B-METAPHOR' therefore never matched. Strip an optional
    # "B-"/"I-" prefix so both spellings are accepted.
    # NOTE(review): assumes the dataset's metaphor tag is named "METAPHOR"
    # (with or without an IOB prefix) -- confirm against label_list.
    metaphors = []
    for result in results:
        group = result['entity_group']
        if group[:2] in ("B-", "I-"):
            group = group[2:]
        if group == 'METAPHOR':
            metaphors.append(result)

    # Decide on the filtered list, so a sentence whose spans are all
    # non-metaphor no longer prints an empty "Metaphors found:" header.
    if metaphors:
        print("Metaphors found:")
        for result in metaphors:
            print(f"  - {result['word']} (confidence: {result['score']:.3f})")
    else:
        print("  No metaphors detected")
    print("-" * 60)