In [None]:
!pip install transformers
!pip install datasets
!pip install pandas
!pip install torch
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install 'accelerate>=0.26.0'
!pip install transformers[torch]
!pip install scikit-learn
!pip install ipywidgets
!pip install addict matplotlib
!pip install easydict einops

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import pandas as pd
import json
import sys
import traceback

In [2]:
# === 1. Load and expand your JSON lines dataset ===
file_path = "data/train_v2.jsonl"  # <-- put your dataset filename here

rows = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        text = item["text"]
        acronym = item["acronym"]
        options = item["options"]

        for option_text, is_correct in options.items():
            rows.append({
                "text": text.strip(),
                "acronym": acronym.strip(),
                "option_text": option_text.strip(),
                "label": int(is_correct)
            })

# Optional: check what it looks like
df = pd.DataFrame(rows)
print(df.head())

                                                text acronym  \
0  LRA  limite de r√©sistance des attelages PAR po...     PAR   
1  LRA  limite de r√©sistance des attelages PAR po...     PAR   
2  LRA  limite de r√©sistance des attelages PAR po...     PAR   
3  LRA  limite de r√©sistance des attelages PAR po...     PAR   
4                               D√©signa -tion des PN      PN   

                                         option_text  label  
0                           Plan d'action r√©gularit√©      0  
1  Poste d'aiguillage et de r√©gulation : assure l...      1  
2                                    PONT DE L'ARCHE      0  
3                             Plan d'action r√©gional      0  
4  Passages √† niveau : fichier des pn, recensemen...      0  


In [3]:
# === 2. Convert into a Hugging Face Dataset ===
dataset = Dataset.from_pandas(df)

In [4]:
# === 3. Tokenize ===
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")  # French-friendly model

def preprocess(example):
    # map(..., batched=True) provides lists ‚Äî build a single input string per item
    texts = example["text"]
    acronyms = example["acronym"]
    options = example["option_text"]

    if not isinstance(texts, list):
        texts = [texts]
        acronyms = [acronyms]
        options = [options]

    inputs = [t.strip() + " " + a.strip() + " : " + o.strip()
              for t, a, o in zip(texts, acronyms, options)]

    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # keep labels under key 'labels' for Trainer
    tokenized["labels"] = example["label"]
    return tokenized

dataset = dataset.map(preprocess, batched=True)

# Split into train/validation sets (e.g., 90/10)
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
val_dataset = dataset["test"]

Map:   0%|          | 0/2177 [00:00<?, ? examples/s]

In [5]:
# === 4. Initialize model ===
#model = AutoModelForSequenceClassification.from_pretrained("camembert-base", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained("FacebookAI/xlm-roberta-large", num_labels=2)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# === 5. Training configuration ===
# Some transformer versions don't accept newer kwargs (e.g., evaluation_strategy).
# Try the modern constructor first; if it fails (TypeError), fall back to a compatible set.
try:
    training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # or your custom metric
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=42,
)
except TypeError as e:
    # Likely an older transformers version where evaluation_strategy is not supported.
    print("Warning: TrainingArguments raised TypeError when using 'evaluation_strategy'.")
    print("Falling back to older-compatible arguments (omitting evaluation_strategy).")
    # Optionally show the original error for debugging
    traceback.print_exception(e, e, e.__traceback__, file=sys.stdout)
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=10,
        weight_decay=0.01,
        do_eval=True  # older flag that may be recognized
    )

Falling back to older-compatible arguments (omitting evaluation_strategy).
Traceback (most recent call last):
  File "/tmp/ipykernel_9108/591666313.py", line 5, in <module>
    training_args = TrainingArguments(
                    ^^^^^^^^^^^^^^^^^^
TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'


In [7]:
# === 6. Trainer setup ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [11]:
# === 7. Train ===
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 8.19 MiB is free. Process 2686602 has 22.52 GiB memory in use. Process 3862586 has 9.20 GiB memory in use. Of the allocated memory 8.53 GiB is allocated by PyTorch, and 306.96 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [65]:
# Print final metrics
metrics = trainer.evaluate()
print(metrics)


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 31.73 GiB of which 2.19 MiB is free. Process 2686602 has 22.52 GiB memory in use. Process 2801316 has 9.21 GiB memory in use. Of the allocated memory 8.81 GiB is allocated by PyTorch, and 26.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [28]:
# === 8. Save final model and tokenizer ===
trainer.save_model("./results")        # Saves model + config
tokenizer.save_pretrained("./results") # Saves tokenizer files too


('./results/tokenizer_config.json',
 './results/special_tokens_map.json',
 './results/sentencepiece.bpe.model',
 './results/added_tokens.json',
 './results/tokenizer.json')

In [29]:
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score, precision_score, recall_score

# === 1. Get predictions on the validation set ===
predictions = trainer.predict(val_dataset)
pred_labels = np.argmax(predictions.predictions, axis=-1)
true_labels = predictions.label_ids

# Convert validation dataset back to pandas for grouping
val_df = pd.DataFrame(val_dataset)

# === 2. Attach predicted labels to each example ===
val_df["pred_label"] = pred_labels
val_df["true_label"] = true_labels

# === 3. Compute per-acronym F1 based on set comparison ===
results = []
for acronym, group in val_df.groupby("acronym"):
    # Get sets of options judged true
    predicted_true = set(group.loc[group["pred_label"] == 1, "option_text"])
    actual_true = set(group.loc[group["true_label"] == 1, "option_text"])
    
    VP = len(predicted_true & actual_true)
    FP = len(predicted_true - actual_true)
    FN = len(actual_true - predicted_true)
    
    precision = VP / (VP + FP) if (VP + FP) > 0 else 0
    recall = VP / (VP + FN) if (VP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    results.append({
        "acronym": acronym,
        "VP": VP,
        "FP": FP,
        "FN": FN,
        "precision": precision,
        "recall": recall,
        "f1": f1
    })

# === 4. Display per-acronym and global F1 ===
results_df = pd.DataFrame(results)
display(results_df)

# Weighted/global averages
global_VP = results_df["VP"].sum()
global_FP = results_df["FP"].sum()
global_FN = results_df["FN"].sum()

global_precision = global_VP / (global_VP + global_FP) if (global_VP + global_FP) > 0 else 0
global_recall = global_VP / (global_VP + global_FN) if (global_VP + global_FN) > 0 else 0
global_f1 = 2 * global_precision * global_recall / (global_precision + global_recall) if (global_precision + global_recall) > 0 else 0

print("\nüìä Global Metrics (based on set-level comparison):")
print(f"Precision: {global_precision:.3f}")
print(f"Recall:    {global_recall:.3f}")
print(f"F1-score:  {global_f1:.3f}")


Unnamed: 0,acronym,VP,FP,FN,precision,recall,f1
0,AC,0,1,1,0.0,0.0,0.0
1,AGC,1,0,0,1.0,1.0,1.0
2,ATO,1,0,0,1.0,1.0,1.0
3,BB,0,0,0,0.0,0.0,0.0
4,BHR,1,0,1,1.0,0.5,0.666667
5,BV,1,0,0,1.0,1.0,1.0
6,CCL,0,1,0,0.0,0.0,0.0
7,CCT,0,1,0,0.0,0.0,0.0
8,CLE,1,0,0,1.0,1.0,1.0
9,CMT,0,0,0,0.0,0.0,0.0



üìä Global Metrics (based on set-level comparison):
Precision: 0.750
Recall:    0.808
F1-score:  0.778


In [31]:
# === 8. Evaluate on training set ===
train_results = trainer.predict(train_dataset)

# Print metrics
print("=== Evaluation on Training Set ===")
print(f"Loss: {train_results.metrics['test_loss']:.4f}")

# Optionally inspect predictions
predictions = train_results.predictions.argmax(axis=-1)
labels = train_results.label_ids

# Example: print first 10 predictions vs. true labels
for i in range(10):
    print(f"Example {i+1}: Predicted = {predictions[i]}, Actual = {labels[i]}")


=== Evaluation on Training Set ===
Loss: 0.0334
Example 1: Predicted = 0, Actual = 0
Example 2: Predicted = 0, Actual = 0
Example 3: Predicted = 1, Actual = 1
Example 4: Predicted = 0, Actual = 0
Example 5: Predicted = 0, Actual = 0
Example 6: Predicted = 0, Actual = 0
Example 7: Predicted = 0, Actual = 0
Example 8: Predicted = 0, Actual = 0
Example 9: Predicted = 0, Actual = 0
Example 10: Predicted = 1, Actual = 1
