In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
import json
import sys
import traceback

In [3]:
# === 1. Load and expand your JSON lines dataset ===
file_path = "data/train_v2.jsonl"  # <-- put your dataset filename here

rows = []
with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        text = item["text"]
        acronym = item["acronym"]
        options = item["options"]

        for option_text, is_correct in options.items():
            rows.append({
                "text": text.strip(),
                "acronym": acronym.strip(),
                "option_text": option_text.strip(),
                "label": int(is_correct)
            })

# Optional: check what it looks like
df = pd.DataFrame(rows)
print(df.head())

                                                text acronym  \
0  LRA  limite de résistance des attelages PAR po...     PAR   
1  LRA  limite de résistance des attelages PAR po...     PAR   
2  LRA  limite de résistance des attelages PAR po...     PAR   
3  LRA  limite de résistance des attelages PAR po...     PAR   
4                               Désigna -tion des PN      PN   

                                         option_text  label  
0                           Plan d'action régularité      0  
1  Poste d'aiguillage et de régulation : assure l...      1  
2                                    PONT DE L'ARCHE      0  
3                             Plan d'action régional      0  
4  Passages à niveau : fichier des pn, recensemen...      0  


In [4]:
# === 2. Convert into a Hugging Face Dataset ===
dataset = Dataset.from_pandas(df)

In [6]:
# === 3. Tokenize ===
tokenizer = AutoTokenizer.from_pretrained("camembert-base")  # French-friendly model

def preprocess(example):
    # map(..., batched=True) provides lists — build a single input string per item
    texts = example["text"]
    acronyms = example["acronym"]
    options = example["option_text"]

    if not isinstance(texts, list):
        texts = [texts]
        acronyms = [acronyms]
        options = [options]

    inputs = [t.strip() + " " + a.strip() + " : " + o.strip()
              for t, a, o in zip(texts, acronyms, options)]

    tokenized = tokenizer(
        inputs,
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # keep labels under key 'labels' for Trainer
    tokenized["labels"] = example["label"]
    return tokenized

dataset = dataset.map(preprocess, batched=True)

# Split into train/validation sets (e.g., 90/10)
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
val_dataset = dataset["test"]


Map: 100%|██████████| 2177/2177 [00:00<00:00, 5284.30 examples/s]


In [7]:
# === 4. Initialize model ===
model = AutoModelForSequenceClassification.from_pretrained("camembert-base", num_labels=2)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# === 5. Training configuration ===
# Some transformer versions don't accept newer kwargs (e.g., evaluation_strategy).
# Try the modern constructor first; if it fails (TypeError), fall back to a compatible set.
try:
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01
    )
except TypeError as e:
    # Likely an older transformers version where evaluation_strategy is not supported.
    print("Warning: TrainingArguments raised TypeError when using 'evaluation_strategy'.")
    print("Falling back to older-compatible arguments (omitting evaluation_strategy).")
    # Optionally show the original error for debugging
    traceback.print_exception(e, e, e.__traceback__, file=sys.stdout)
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        do_eval=True  # older flag that may be recognized
    )

Falling back to older-compatible arguments (omitting evaluation_strategy).
Traceback (most recent call last):
  File "C:\Users\bdosanjosg\AppData\Local\Temp\ipykernel_5420\709568092.py", line 5, in <module>
    training_args = TrainingArguments(
TypeError: __init__() got an unexpected keyword argument 'evaluation_strategy'


In [9]:
# === 6. Trainer setup ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# === 7. Train ===
trainer.train()

In [10]:
# Print final metrics
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.7028547525405884, 'eval_model_preparation_time': 0.005, 'eval_runtime': 17.0366, 'eval_samples_per_second': 12.796, 'eval_steps_per_second': 1.644}


In [14]:
# === 8. Save final model and tokenizer ===
trainer.save_model("./results")        # Saves model + config
tokenizer.save_pretrained("./results") # Saves tokenizer files too


('./results\\tokenizer_config.json',
 './results\\special_tokens_map.json',
 './results\\tokenizer.json')