<a href="https://colab.research.google.com/github/Readh-H/NLP/blob/main/train_bert_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets seqeval -q


In [None]:
import inspect

import numpy as np
import torch
from datasets import Dataset
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import CamembertTokenizerFast, CamembertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification


In [None]:
# Bring the training data into the Colab working directory; the later cells read
# these files by their exact names.
from google.colab import files
uploaded = files.upload()  # Select the 3 files: slot-filling.in, slot-filling.out, intentlabels


In [None]:
def read_file(filename):
    """Read a UTF-8 text file and return one list of whitespace-separated tokens per line.

    Blank lines yield an empty list, so the result always has as many entries
    as the file has lines.
    """
    rows = []
    with open(filename, mode="r", encoding="utf-8") as handle:
        for raw_line in handle:
            # split() with no argument already discards leading/trailing whitespace.
            rows.append(raw_line.split())
    return rows

# Fixed seed so the train/test split (and everything derived from it) is reproducible.
SPLIT_SEED = 42

# The three files are line-aligned: line i of each file describes the same utterance.
sentences = read_file("slot-filling.in")
slots = read_file("slot-filling.out")
with open("intentlabels", "r", encoding="utf-8") as f:
    intents = [line.strip() for line in f]

assert len(sentences) == len(slots) == len(intents), (
    f"Line counts differ: {len(sentences)} sentences, {len(slots)} slot rows, {len(intents)} intents"
)

# Each token must have exactly one slot tag; catch misaligned rows here rather than
# as an IndexError (or silently dropped tags) during encoding.
misaligned = [
    line_no
    for line_no, (tokens, tags) in enumerate(zip(sentences, slots), start=1)
    if len(tokens) != len(tags)
]
assert not misaligned, f"Token/slot count mismatch on line(s): {misaligned[:10]}"

data = [{"tokens": s, "slots": sl, "intent": i} for s, sl, i in zip(sentences, slots, intents)]
dataset = Dataset.from_list(data).train_test_split(test_size=0.2, seed=SPLIT_SEED)


In [None]:
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

# Build the label vocabularies from *every* split: a tag or intent that only occurs
# in the test split would otherwise be missing from the mapping and raise a KeyError
# when that split is encoded.
unique_slots = sorted({label for split in dataset.values() for tags in split["slots"] for label in tags})
slot2id = {label: idx for idx, label in enumerate(unique_slots)}
id2slot = {v: k for k, v in slot2id.items()}

unique_intents = sorted({intent for split in dataset.values() for intent in split["intent"]})
intent2id = {label: idx for idx, label in enumerate(unique_intents)}
id2intent = {v: k for k, v in intent2id.items()}


In [None]:
def encode_example(example):
    """Tokenize one example and align its word-level slot tags to sub-word tokens.

    Args:
        example: mapping with "tokens" (list of words), "slots" (one tag per word)
            and "intent" (intent name).

    Returns:
        The tokenizer output extended with:
          * "labels": one id per sub-word token. Only the first sub-word of each
            word carries the word's slot id; special/padding tokens and the
            remaining sub-words of a word are set to -100, the value that
            compute_metrics filters out.
          * "intent_label": the id of the example's intent.
    """
    tokenized = tokenizer(example["tokens"], is_split_into_words=True, truncation=True, padding="max_length", max_length=64)
    word_ids = tokenized.word_ids()

    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        # None marks special/padding tokens; a repeated index marks a continuation
        # sub-word. Labelling continuations would duplicate the word's tag
        # (e.g. several "B-" tags for one word) and distort entity-level metrics.
        if word_idx is None or word_idx == previous_word_idx:
            labels.append(-100)
        else:
            labels.append(slot2id[example["slots"][word_idx]])
        previous_word_idx = word_idx

    tokenized["labels"] = labels
    tokenized["intent_label"] = intent2id[example["intent"]]
    return tokenized

# Run encode_example over every example of each split in `dataset`.
encoded_dataset = dataset.map(encode_example)


In [None]:
# Token-classification head with one output per slot label. Passing the label maps
# stores the real slot names in the model config, so saved checkpoints report
# readable tags instead of generic LABEL_n names.
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base",
    num_labels=len(slot2id),
    id2label=id2slot,
    label2id=slot2id,
)


In [None]:
def compute_metrics(eval_pred):
    """Compute precision, recall, F1 and accuracy for slot predictions.

    Args:
        eval_pred: a (logits, labels) pair. The predicted label id is the argmax
            of the logits over the last axis.

    Returns:
        dict with keys "precision", "recall", "f1" and "accuracy", computed on
        tag-name sequences from which every position labelled -100 is removed.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    gold_tags = []
    predicted_tags = []
    for predicted_row, gold_row in zip(predictions, labels):
        # Keep only the positions that carry a real label.
        kept = [(gold, guess) for guess, gold in zip(predicted_row, gold_row) if gold != -100]
        gold_tags.append([id2slot[gold] for gold, _ in kept])
        predicted_tags.append([id2slot[guess] for _, guess in kept])

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
        ("accuracy", accuracy_score),
    )
    return {metric: scorer(gold_tags, predicted_tags) for metric, scorer in scorers}


In [None]:
# Recent transformers releases renamed two keyword arguments used below:
# `evaluation_strategy` -> `eval_strategy` (TrainingArguments) and
# `tokenizer` -> `processing_class` (Trainer). The install cell does not pin a
# version, so pick whichever spelling the installed release accepts instead of
# failing with a TypeError on an unknown keyword.
training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_key = "eval_strategy" if "eval_strategy" in training_arg_names else "evaluation_strategy"

trainer_arg_names = inspect.signature(Trainer.__init__).parameters
tokenizer_key = "processing_class" if "processing_class" in trainer_arg_names else "tokenizer"

args = TrainingArguments(
    output_dir="./camembert-nlu",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch, then reload the checkpoint with the best F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    **{eval_strategy_key: "epoch"},
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_key: tokenizer},
)

trainer.train()
