In [1]:
# pip installs (run in terminal, not inside Python):
# 1) Uninstall CPU-only torch if previously installed
#%pip uninstall -y torch torchvision torchaudio
# 2) Install CUDA build (pick one)
#%pip install --upgrade --index-url https://download.pytorch.org/whl/cu128 torch torchvision torchaudio
# or: pip install --upgrade --index-url https://download.pytorch.org/whl/cu118 torch torchvision torchaudio
# 3) Upgrade libs
#%pip install --upgrade "transformers[torch]" "accelerate>=0.26.0" datasets scikit-learn ipywidgets pandas addict matplotlib easydict einops

import json
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import GroupShuffleSplit
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments)
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np
import torch



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === 1. Load JSONL and expand ===
file_path = "data/train_v2.jsonl"
rows = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        text = item["text"]
        acronym = item["acronym"]
        options = item["options"]
        for option_text, is_correct in options.items():
            rows.append({
                "text": text.strip(),
                "acronym": acronym.strip(),
                "option_text": option_text.strip(),
                "label": int(is_correct)
            })
df = pd.DataFrame(rows)



In [3]:
# === 2. Grouped splits: train/val/test ===
df["group_id"] = (df["text"].str.strip() + "||" + df["acronym"].str.strip()).factorize()[0]
gss1 = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_val_idx, test_idx = next(gss1.split(df, groups=df["group_id"]))
df_train_val = df.iloc[train_val_idx].reset_index(drop=True)
df_test      = df.iloc[test_idx].reset_index(drop=True)
gss2 = GroupShuffleSplit(n_splits=1, test_size=0.1111, random_state=42)
train_idx, val_idx = next(gss2.split(df_train_val, groups=df_train_val["group_id"]))
df_train = df_train_val.iloc[train_idx].reset_index(drop=True)
df_val   = df_train_val.iloc[val_idx].reset_index(drop=True)



In [4]:
# === 3. Tokenizer & preprocess ===
model_name = "camembert/camembert-large"  # switch to xlm-roberta-base if OOM
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(example):
    inputs = f'{example["text"].strip()} {example["acronym"].strip()} : {example["option_text"].strip()}'
    tokenized = tokenizer(inputs, truncation=True, padding="max_length", max_length=256)
    tokenized["labels"] = int(example["label"])
    return tokenized

train_hf = Dataset.from_pandas(df_train).map(preprocess)
val_hf   = Dataset.from_pandas(df_val).map(preprocess)
test_hf  = Dataset.from_pandas(df_test).map(preprocess)



Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1517/1517 [00:00<00:00, 2327.24 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 198/198 [00:00<00:00, 2415.34 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 462/462 [00:00<00:00, 2267.37 examples/s]


In [5]:
# === 4. Model ===a
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert/camembert-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# === 5. Metrics ===
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1m = f1_score(labels, preds, average="macro")
    try:
        proba_pos = (logits - logits.max(axis=1, keepdims=True))
        proba_pos = np.exp(proba_pos) / np.exp(proba_pos).sum(axis=1, keepdims=True)
        auc = roc_auc_score(labels, proba_pos[:, 1])
    except Exception:
        auc = float("nan")
    return {"accuracy": acc, "f1_macro": f1m, "roc_auc": auc}



In [7]:
# === 6. Training args ===
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_v2",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    num_train_epochs=30,
    seed=42,                      # âœ… reproducible
    fp16=False,
    gradient_checkpointing=True,
    auto_find_batch_size=False,   # âœ… turn off the auto tuner
    report_to="none",
    save_total_limit=3,           # âœ… keep only the 3 best checkpoints
    greater_is_better=True
)



In [8]:
# === 7. Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hf,
    eval_dataset=val_hf,
    compute_metrics=compute_metrics,
)

In [9]:
# === 8. Train ===
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro,Roc Auc
0,No log,0.234865,0.924242,0.87404,0.948045
2,No log,0.171555,0.954545,0.921021,0.974794
4,No log,0.160895,0.949495,0.916904,0.970508
6,No log,0.233256,0.949495,0.916904,0.978395
8,No log,0.179808,0.964646,0.941219,0.978738
10,0.162900,0.151203,0.959596,0.933524,0.978395
12,0.162900,0.20987,0.939394,0.890909,0.969993
14,0.162900,0.346571,0.934343,0.885919,0.95559
16,0.162900,0.308662,0.939394,0.898148,0.95559
18,0.162900,0.397539,0.939394,0.900285,0.932785


TrainOutput(global_step=1410, training_loss=0.08021843162834222, metrics={'train_runtime': 2985.0382, 'train_samples_per_second': 15.246, 'train_steps_per_second': 0.472, 'total_flos': 2.098383254237491e+16, 'train_loss': 0.08021843162834222, 'epoch': 29.68421052631579})

In [10]:
# === 9. Save final model and tokenizer ===
trainer.save_model("./results_v2")        # Saves model + config
tokenizer.save_pretrained("./results_v2") # Saves tokenizer files too

('./results_v2/tokenizer_config.json',
 './results_v2/special_tokens_map.json',
 './results_v2/sentencepiece.bpe.model',
 './results_v2/added_tokens.json',
 './results_v2/tokenizer.json')

In [11]:
# === 10. Final test evaluation ===
print(trainer.evaluate(test_hf))

{'eval_loss': 0.24584247171878815, 'eval_accuracy': 0.9329004329004329, 'eval_f1_macro': 0.8897654764745273, 'eval_roc_auc': 0.9567801672640381, 'eval_runtime': 6.643, 'eval_samples_per_second': 69.547, 'eval_steps_per_second': 4.366, 'epoch': 29.68421052631579}


In [12]:
#%pip install pynvml
#%pip install torch
"""
from pynvml import *
nvmlInit()
handle = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(handle)
print(f"Total: {info.total/1e9:.2f} GB")
print(f"Used:  {info.used/1e9:.2f} GB")
print(f"Free:  {info.free/1e9:.2f} GB")
"""


'\nfrom pynvml import *\nnvmlInit()\nhandle = nvmlDeviceGetHandleByIndex(0)\ninfo = nvmlDeviceGetMemoryInfo(handle)\nprint(f"Total: {info.total/1e9:.2f} GB")\nprint(f"Used:  {info.used/1e9:.2f} GB")\nprint(f"Free:  {info.free/1e9:.2f} GB")\n'

In [13]:
#%pip install nvidia-ml-py

In [14]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score

# === 1. Get predictions on the validation set ===
predictions = trainer.predict(test_hf)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Convert validation dataset back to pandas for grouping
val_df = pd.DataFrame(test_hf)

# === 2. Attach predicted labels to each example ===
val_df["pred_label"] = pred_labels
val_df["true_label"] = true_labels

# === 3. Compute per-acronym F1 based on set comparison ===
results = []
for acronym, group in val_df.groupby("acronym"):
    # Get sets of options judged true
    predicted_true = set(group.loc[group["pred_label"] == 1, "option_text"])
    actual_true = set(group.loc[group["true_label"] == 1, "option_text"])
    
    VP = len(predicted_true & actual_true)
    FP = len(predicted_true - actual_true)
    FN = len(actual_true - predicted_true)
    
    precision = VP / (VP + FP) if (VP + FP) > 0 else 0
    recall = VP / (VP + FN) if (VP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    results.append({
        "acronym": acronym,
        "VP": VP,
        "FP": FP,
        "FN": FN,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# === 4. Display per-acronym and global F1 ===
results_df = pd.DataFrame(results)
display(results_df)

# Weighted/global averages
global_VP = results_df["VP"].sum()
global_FP = results_df["FP"].sum()
global_FN = results_df["FN"].sum()

global_precision = global_VP / (global_VP + global_FP) if (global_VP + global_FP) > 0 else 0
global_recall = global_VP / (global_VP + global_FN) if (global_VP + global_FN) > 0 else 0
global_f1 = 2 * global_precision * global_recall / (global_precision + global_recall) if (global_precision + global_recall) > 0 else 0

print("\nðŸ“Š Global Metrics (based on set-level comparison):")
print(f"Precision: {global_precision:.3f}")
print(f"Recall:    {global_recall:.3f}")
print(f"F1-score:  {global_f1:.3f}")


Unnamed: 0,acronym,VP,FP,FN,precision,recall,f1
0,AC,0,1,2,0.0,0.0,0.0
1,AGC,1,0,0,1.0,1.0,1.0
2,BHR,1,0,1,1.0,0.5,0.666667
3,BV,1,0,0,1.0,1.0,1.0
4,CCT,0,0,1,0.0,0.0,0.0
5,CLE,1,0,0,1.0,1.0,1.0
6,CMT,0,0,0,0.0,0.0,0.0
7,CSS,1,0,0,1.0,1.0,1.0
8,DE,1,1,1,0.5,0.5,0.5
9,DT,1,0,0,1.0,1.0,1.0



ðŸ“Š Global Metrics (based on set-level comparison):
Precision: 0.769
Recall:    0.732
F1-score:  0.750
