In [4]:
# trainer_v2_with_earlystop.ipynb
#%pip install evaluate

import json
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.model_selection import train_test_split
import numpy as np
from evaluate import load

# 2Ô∏è‚É£ Load and prepare dataset
data_path = "data/train_v2.jsonl"
rows = [json.loads(line) for line in open(data_path, "r", encoding="utf-8")]

samples = []
for item in rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    options = list(item["options"].keys())
    labels = list(item["options"].values())

    correct_indices = [i for i, v in enumerate(labels) if v]
    if not correct_indices:
        continue
    label = correct_indices[0]

    for i, opt in enumerate(options):
        text_input = f"{text} {acronym} : {opt}"
        samples.append({"text": text_input, "label": label})

print(f"‚úÖ Loaded {len(samples)} examples")

# 3Ô∏è‚É£ Split dataset
train_data, val_data = train_test_split(samples, test_size=0.1, random_state=42)
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

# 4Ô∏è‚É£ Tokenize
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# 5Ô∏è‚É£ Define model
num_labels = max(s["label"] for s in samples) + 1
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# 6Ô∏è‚É£ Training arguments
args = TrainingArguments(
    output_dir="./results_1",
    eval_strategy="epoch",       # must evaluate each epoch for early stop
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_total_limit=3,                # keep only last 3 checkpoints
)

# 7Ô∏è‚É£ Define metrics
accuracy = load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=preds, references=labels)

# 8Ô∏è‚É£ Trainer with early stopping
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # <-- ‚èπÔ∏è stop after 3 epochs w/o improvement
)

# 9Ô∏è‚É£ Train
trainer.train()

# üîü Save best model
trainer.save_model("./results")
tokenizer.save_pretrained("./results")

print("‚úÖ Training complete with early stopping! Best model saved in ./results")


‚úÖ Loaded 1792 examples


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1612/1612 [00:00<00:00, 7805.04 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 180/180 [00:00<00:00, 6380.19 examples/s]
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'processing_class' is not defined

In [2]:
#%pip install datasets
#%pip install scikit-learn
#%pip install evaluate
#%pip install "accelerate>=1.11.0"
#%pip install transformers

In [3]:
import json
import torch
import pandas as pd
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 1Ô∏è‚É£ Load model
model_path = "./results"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# 2Ô∏è‚É£ Load test data
test_file = "data/test_v4.jsonl"
test_rows = [json.loads(line) for line in open(test_file, "r", encoding="utf-8")]

# 3Ô∏è‚É£ Define threshold
CONFIDENCE_THRESHOLD = 0.97  # you can tune this

submission = []

# 4Ô∏è‚É£ Inference loop
for item in test_rows:
    text = item["text"].strip()
    acronym = item["acronym"].strip()
    options = item["options"]

    inputs = [f"{text} {acronym} : {opt}" for opt in options]
    encodings = tokenizer(inputs, truncation=True, padding=True, max_length=256, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)

        # Handle binary or multi-class models
        if probs.shape[-1] == 2:
            probs = probs[:, 1]  # probability of "correct" class
        else:
            probs = probs.max(dim=-1).values  # take max probability per option

        probs = probs.cpu().numpy()

    # Sort by confidence (descending)
    ranked = sorted(list(enumerate(probs)), key=lambda x: x[1], reverse=True)

    # Keep only indices in increasing order
    selected_indices = []
    last_index = -1
    for idx, prob in ranked:
        if prob < CONFIDENCE_THRESHOLD:
            continue
        if idx > last_index:  # enforce ascending order
            selected_indices.append(idx)
            last_index = idx
        else:
            break  # stop when order breaks

    # Edge case: if none above threshold, take top one
    if len(selected_indices) == 0 and len(ranked) > 0:
        selected_indices = [ranked[0][0]]

    submission.append({
        "id": item["id"],
        "prediction": str(selected_indices)
    })

# 5Ô∏è‚É£ Save results
df = pd.DataFrame(submission)
df.to_csv("submission1.csv", index=False)
print("‚úÖ Saved predictions to submission.csv")


‚úÖ Saved predictions to submission.csv
