In [15]:
# trainer_v2_with_earlystop.ipynb
#%pip install evaluate

import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import numpy as np
from evaluate import load
from span_marker import SpanMarkerModel

# 2Ô∏è‚É£ Load and prepare dataset
data_path = "data/train_v2.jsonl"
rows = [json.loads(line) for line in open(data_path, "r", encoding="utf-8")]

samples = []
for item in rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    options = list(item["options"].keys())
    labels = list(item["options"].values())
    
    correct_indices = [i for i, v in enumerate(labels) if v]
    if not correct_indices:
        continue
    label = correct_indices[0]

    for i, opt in enumerate(options):
        #text_input = f"{text} {acronym} : {opt}"
        text_input = f"Context: {text}\nQuestion: What does {acronym} stand for?\nCandidate answer: {opt}"
        samples.append({"text": text_input, "label": label})

print(f"‚úÖ Loaded {len(samples)} examples")

# 3Ô∏è‚É£ Split dataset into train, validation, and test
train_data, temp_data = train_test_split(samples, test_size=0.2, random_state=42)  # 20% held out
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # split 10% val, 10% test

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")

# 4Ô∏è‚É£ Tokenize
model_name = "tomaarsen/span-marker-bert-base-uncased-acronyms"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)
test_dataset = test_dataset.map(preprocess, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 5Ô∏è‚É£ Define model
from transformers import AutoModelForSequenceClassification

num_labels = max(s["label"] for s in samples) + 1
model_name = "distilbert-base-uncased"  # or your preferred model

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    ignore_mismatched_sizes=True
)



# 6Ô∏è‚É£ Training arguments
args = TrainingArguments(
    output_dir="./results_v6",
    eval_strategy="epoch",       # must evaluate each epoch for early stop
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,                # keep only last 3 checkpoints
)

# 7Ô∏è‚É£ Define metrics
f1_metric = load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return f1_metric.compute(predictions=preds, references=labels, average="macro")
    
# 8Ô∏è‚É£ Trainer with early stopping
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # <-- ‚èπÔ∏è stop after 3 epochs w/o improvement
)

# 9Ô∏è‚É£ Train
trainer.train()

# üîü Save best model
trainer.save_model("./results_v6")
tokenizer.save_pretrained("./results_v6")

print("‚úÖ Training complete with early stopping! Best model saved in ./results_v6")

# 1Ô∏è‚É£1Ô∏è‚É£ Final evaluation on unseen test set
test_results = trainer.evaluate(test_dataset)
print("üìä Final test results:", test_results)

‚úÖ Loaded 1792 examples
Train: 1433 | Val: 179 | Test: 180


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1433/1433 [00:00<00:00, 5228.05 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 179/179 [00:00<00:00, 4882.36 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 180/180 [00:00<00:00, 4614.99 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1
1,1.9772,1.801401,0.092398
2,1.6142,1.265621,0.576284
3,1.0552,0.819455,0.682076
4,0.5864,0.57313,0.874119
5,0.3726,0.517407,0.913895
6,0.2614,0.415037,0.943937
7,0.1546,0.359639,0.960902
8,0.0885,0.230431,0.972818
9,0.0352,0.336535,0.965125
10,0.0385,0.302159,0.976585


‚úÖ Training complete with early stopping! Best model saved in ./results_v6


üìä Final test results: {'eval_loss': 0.17309662699699402, 'eval_f1': 0.9781941434614877, 'eval_runtime': 1.1721, 'eval_samples_per_second': 153.565, 'eval_steps_per_second': 19.622, 'epoch': 13.0}


In [None]:
%pip install -U "transformers==4.45.2"
%pip install -U "peft==0.14.0"
%pip install -U "span-marker>=1.6.0"


In [2]:
# 3Ô∏è‚É£ Split dataset into train, validation, and test
train_data, temp_data = train_test_split(samples, test_size=0.2, random_state=42)  # 20% held out
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)  # split 10% val, 10% test

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
test_dataset = Dataset.from_list(test_data)

print(f"Train: {len(train_dataset)} | Val: {len(val_dataset)} | Test: {len(test_dataset)}")


Train: 445 | Val: 56 | Test: 56


In [24]:
import json
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1Ô∏è‚É£ Load model + tokenizer from training output
model_path = "./results_v6"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# 2Ô∏è‚É£ Load test data
test_file = "data/test_v4.jsonl"
test_rows = [json.loads(line) for line in open(test_file, "r", encoding="utf-8")]

# 3Ô∏è‚É£ Define threshold (optional for multi-class, but kept for consistency)
CONFIDENCE_THRESHOLD = 0.5

submission = []

# 4Ô∏è‚É£ Inference loop
for item in test_rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    options = item["options"]

    # Use the same input format as training
    inputs = [
        f"Context: {text}\nQuestion: What does {acronym} stand for?\nCandidate answer: {opt}"
        for opt in options
    ]

    encodings = tokenizer(
        inputs,
        truncation=True,
        padding=True,
        max_length=256,
        return_tensors="pt"
    )

    # DistilBERT doesn‚Äôt use token_type_ids
    if "token_type_ids" in encodings:
        encodings.pop("token_type_ids")

    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)

        # Multi-class: take probability of the predicted class for each option
        pred_classes = torch.argmax(probs, dim=-1)
        pred_scores = probs[range(len(pred_classes)), pred_classes]

        probs = pred_scores.cpu().numpy()

    # Rank options by confidence
    ranked = sorted(list(enumerate(probs)), key=lambda x: x[1], reverse=True)

    selected_indices = []
    last_index = -1
    for idx, prob in ranked:
        if prob < CONFIDENCE_THRESHOLD:
            continue
        if idx > last_index:
            selected_indices.append(idx)
            last_index = idx
        else:
            break

    # Edge case: if none above threshold, take top one
    if len(selected_indices) == 0 and len(ranked) > 0:
        selected_indices = [ranked[0][0]]

    submission.append({
        "id": item["id"],
        "prediction": str(selected_indices)
    })

# 5Ô∏è‚É£ Save results
df = pd.DataFrame(submission)
df.to_csv("submission_v6.csv", index=False)
print("‚úÖ Saved predictions to submission.csv")


‚úÖ Saved predictions to submission.csv
