In [10]:
# trainer_v2_with_earlystop.ipynb
#%pip install evaluate

import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import numpy as np
from evaluate import load
from span_marker import SpanMarkerModel

# 2Ô∏è‚É£ Load and prepare dataset
data_path = "data/train_v2_small.jsonl"
rows = [json.loads(line) for line in open(data_path, "r", encoding="utf-8")]

samples = []
for item in rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    options = list(item["options"].keys())
    labels = list(item["options"].values())

    correct_indices = [i for i, v in enumerate(labels) if v]
    if not correct_indices:
        continue
    label = correct_indices[0]

    for i, opt in enumerate(options):
        #text_input = f"{text} {acronym} : {opt}"
        text_input = f"Context: {text}\nQuestion: What does {acronym} stand for?\nCandidate answer: {opt}"
        samples.append({"text": text_input, "label": label})

print(f"‚úÖ Loaded {len(samples)} examples")

# 3Ô∏è‚É£ Split dataset into train, validation, and test
train_data, temp_data = train_test_split(samples, test_size=0.2, random_state=42)  # 20% held out
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # split 10% val, 10% test

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")

# 4Ô∏è‚É£ Tokenize
model_name = "tomaarsen/span-marker-bert-base-uncased-acronyms"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 5Ô∏è‚É£ Define model
from transformers import AutoModelForSequenceClassification

num_labels = max(s["label"] for s in samples) + 1
model_name = "distilbert-base-uncased"  # or your preferred model

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True
)



# 6Ô∏è‚É£ Training arguments
args = TrainingArguments(
    output_dir="./results_v6_4",
    eval_strategy="epoch",       # must evaluate each epoch for early stop
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    gradient_accumulation_steps=4,
    num_train_epochs=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,                # keep only last 3 checkpoints
)

# 7Ô∏è‚É£ Define metrics
f1_metric = load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return f1_metric.compute(predictions=preds, references=labels, average="macro")
    
# 8Ô∏è‚É£ Trainer with early stopping
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # <-- ‚èπÔ∏è stop after 3 epochs w/o improvement
)

# 9Ô∏è‚É£ Train
trainer.train()

# üîü Save best model
trainer.save_model("./results_v6_4")
tokenizer.save_pretrained("./results_v6_4")

print("‚úÖ Training complete with early stopping! Best model saved in ./results_v6_4")

# 1Ô∏è‚É£1Ô∏è‚É£ Final evaluation on unseen test set
test_results = trainer.evaluate(test_dataset)
print("üìä Final test results:", test_results)

‚úÖ Loaded 557 examples
Train: 445 | Val: 56 | Test: 56


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 445/445 [00:00<00:00, 6004.20 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 56/56 [00:00<00:00, 3662.69 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 56/56 [00:00<00:00, 3517.03 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
0,No log,2.261196,0.046701
2,No log,2.217373,0.058219
4,No log,2.128693,0.058219
6,No log,2.069372,0.030172


‚úÖ Training complete with early stopping! Best model saved in ./results_v6_4


üìä Final test results: {'eval_loss': 2.212435722351074, 'eval_f1': 0.05847953216374269, 'eval_runtime': 0.1551, 'eval_samples_per_second': 361.058, 'eval_steps_per_second': 12.895, 'epoch': 6.0}
