# Fine-tuning d'un modèle TinyBERT pour NER (la reconnaissance d'entités nommées) en français en utilisant le dataset MultiNERD(https://github.com/Babelscape/multinerd)

In [15]:
! pip install datasets transformers seqeval
! pip install transformers[torch]
! pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
zsh:1: no matches found: transformers[torch]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


## Importation des bibliothèques

In [16]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np


## Chargement du dataset MultiNERD et filtrage des données françaises

In [17]:
# Définir le ratio du sous-ensemble
subset_ratio = 0.1

def select_subset(dataset, ratio):
    """
    Sélectionne un sous-ensemble d'un dataset.
    
    Paramètres :
    dataset : Le dataset à traiter.
    ratio : Le ratio du dataset à sélectionner.
    
    Retourne :
    Un sous-ensemble du dataset.
    """
    # Mélanger aléatoirement le dataset
    dataset = dataset.shuffle(seed=42)
    # Calculer la taille du sous-ensemble selon le ratio spécifié
    subset_size = int(len(dataset) * ratio)
    # Retourner le sous-ensemble du dataset
    return dataset.select(range(subset_size))

# Charger les datasets
datasets = load_dataset("Babelscape/multinerd")

# Filtrer pour ne conserver que les données en français
train_dataset = datasets["train"].filter(lambda exemple: exemple['lang'] == "fr")
val_dataset = datasets["validation"].filter(lambda exemple: exemple['lang'] == "fr")
test_dataset = datasets["test"].filter(lambda exemple: exemple['lang'] == "fr")

# Sélectionner un sous-ensemble pour les datasets d'entraînement, de validation et de test
train_dataset = select_subset(train_dataset, subset_ratio)
val_dataset = select_subset(val_dataset, subset_ratio)
test_dataset = select_subset(test_dataset, subset_ratio)


Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Filter:   0%|          | 0/2678400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/334800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/335986 [00:00<?, ? examples/s]

## Définition des étiquettes et chargement du modèle TinyBERT

In [18]:
# Création d'un dictionnaire pour les étiquettes de la reconnaissance d'entités nommées
labels_vocab = {
    "O": 0, "B-PER": 1, "I-PER": 2, "B-ORG": 3, "I-ORG": 4, "B-LOC": 5, "I-LOC": 6,
    "B-ANIM": 7, "I-ANIM": 8, "B-BIO": 9, "I-BIO": 10, "B-CEL": 11, "I-CEL": 12,
    "B-DIS": 13, "I-DIS": 14, "B-EVE": 15, "I-EVE": 16, "B-FOOD": 17, "I-FOOD": 18,
    "B-INST": 19, "I-INST": 20, "B-MEDIA": 21, "I-MEDIA": 22, "B-MYTH": 23, "I-MYTH": 24,
    "B-PLANT": 25, "I-PLANT": 26, "B-TIME": 27, "I-TIME": 28, "B-VEHI": 29, "I-VEHI": 30,
}
label_list = list(labels_vocab.keys())
labels_vocab_reverse = {v: k for k, v in labels_vocab.items()}

# Chargement du tokenizer et du modèle pré-entraîné TinyBERT
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = AutoModelForTokenClassification.from_pretrained(
    "prajjwal1/bert-tiny", 
    num_labels=len(label_list), 
    label2id=labels_vocab, 
    id2label=labels_vocab_reverse
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Prétraitement et alignement des étiquettes

In [19]:
def tokenize_and_align_labels(examples):
    # Tokenize les exemples
    tokenized_inputs = tokenizer(examples["tokens"], 
                                 truncation=True, 
                                 padding="max_length", 
                                 max_length=128,  # Taille maximale des tokens
                                 is_split_into_words=True,
                                 return_tensors='pt')
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = [-100 if word_id is None else label[word_id] for word_id in word_ids]
        labels.append(label_ids)
    
    # Ajoute les étiquettes dans les exemples
    tokenized_inputs["labels"] = labels
    return tokenized_inputs



## Mappage des ensembles de données

In [20]:
train_tokenized = train_dataset.map(tokenize_and_align_labels, batched=True)
val_tokenized = val_dataset.map(tokenize_and_align_labels, batched=True)
test_tokenized = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/28176 [00:00<?, ? examples/s]

Map:   0%|          | 0/3522 [00:00<?, ? examples/s]

Map:   0%|          | 0/3539 [00:00<?, ? examples/s]

## Configuration des paramètres d'entraînement

In [21]:
training_args = TrainingArguments(
    "TinyBERT-finetuned-ner", # Répertoire de sortie
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    weight_decay=0.01,
    gradient_accumulation_steps=10,  # Optimisation pour les GPUs/CPUs avec peu de mémoire
)

## Calcul des métriques

In [22]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    metric = load_metric("seqeval") # Chargement de la métrique seqeval
    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

## Entraînement et évaluation

In [23]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate(test_tokenized)


  0%|          | 0/8451 [00:00<?, ?it/s]

{'loss': 1.5709, 'grad_norm': 2.356165885925293, 'learning_rate': 1.8816708081883804e-05, 'epoch': 0.18}
{'loss': 0.8357, 'grad_norm': 1.2726070880889893, 'learning_rate': 1.7633416163767603e-05, 'epoch': 0.35}
{'loss': 0.6958, 'grad_norm': 1.2442868947982788, 'learning_rate': 1.6450124245651406e-05, 'epoch': 0.53}
{'loss': 0.6223, 'grad_norm': 1.6277961730957031, 'learning_rate': 1.5266832327535205e-05, 'epoch': 0.71}
{'loss': 0.5784, 'grad_norm': 3.5178229808807373, 'learning_rate': 1.4083540409419005e-05, 'epoch': 0.89}


  0%|          | 0/3522 [00:00<?, ?it/s]

  metric = load_metric("seqeval") # Chargement de la métrique seqeval
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6036356687545776, 'eval_precision': 0.4468864468864469, 'eval_recall': 0.17995661605206073, 'eval_f1': 0.2565879005319807, 'eval_accuracy': 0.8893685463824929, 'eval_runtime': 607.4553, 'eval_samples_per_second': 5.798, 'eval_steps_per_second': 5.798, 'epoch': 1.0}
{'loss': 0.5417, 'grad_norm': 1.3371126651763916, 'learning_rate': 1.2900248491302805e-05, 'epoch': 1.06}
{'loss': 0.5128, 'grad_norm': 2.223402261734009, 'learning_rate': 1.1716956573186607e-05, 'epoch': 1.24}
{'loss': 0.4933, 'grad_norm': 1.8359429836273193, 'learning_rate': 1.0533664655070406e-05, 'epoch': 1.42}
{'loss': 0.4752, 'grad_norm': 1.0332006216049194, 'learning_rate': 9.350372736954207e-06, 'epoch': 1.6}
{'loss': 0.4583, 'grad_norm': 2.142378330230713, 'learning_rate': 8.167080818838008e-06, 'epoch': 1.77}
{'loss': 0.4564, 'grad_norm': 2.4748988151550293, 'learning_rate': 6.983788900721808e-06, 'epoch': 1.95}


  0%|          | 0/3522 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5349894762039185, 'eval_precision': 0.4956006768189509, 'eval_recall': 0.25414316702819956, 'eval_f1': 0.3359908230570691, 'eval_accuracy': 0.9003279464702237, 'eval_runtime': 59.1602, 'eval_samples_per_second': 59.533, 'eval_steps_per_second': 59.533, 'epoch': 2.0}
{'loss': 0.4359, 'grad_norm': 2.631856679916382, 'learning_rate': 5.800496982605609e-06, 'epoch': 2.13}
{'loss': 0.4377, 'grad_norm': 1.9682890176773071, 'learning_rate': 4.61720506448941e-06, 'epoch': 2.31}
{'loss': 0.4231, 'grad_norm': 1.819485068321228, 'learning_rate': 3.4339131463732107e-06, 'epoch': 2.48}
{'loss': 0.4243, 'grad_norm': 3.1097989082336426, 'learning_rate': 2.250621228257011e-06, 'epoch': 2.66}
{'loss': 0.4339, 'grad_norm': 2.9967992305755615, 'learning_rate': 1.0673293101408118e-06, 'epoch': 2.84}


  0%|          | 0/3522 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.5200121402740479, 'eval_precision': 0.4965859714463066, 'eval_recall': 0.27765726681127983, 'eval_f1': 0.35616895764928486, 'eval_accuracy': 0.9024724796513045, 'eval_runtime': 56.1209, 'eval_samples_per_second': 62.757, 'eval_steps_per_second': 62.757, 'epoch': 3.0}
{'train_runtime': 3638.0909, 'train_samples_per_second': 23.234, 'train_steps_per_second': 2.323, 'train_loss': 0.5780868340689619, 'epoch': 3.0}


  0%|          | 0/3539 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.4592796266078949,
 'eval_precision': 0.49177153920619554,
 'eval_recall': 0.2826671612723732,
 'eval_f1': 0.35898945880690186,
 'eval_accuracy': 0.9094756235238709,
 'eval_runtime': 53.5189,
 'eval_samples_per_second': 66.126,
 'eval_steps_per_second': 66.126,
 'epoch': 3.0}

## Utilisation du modèle TinyBERT entraîné pour faire des prédictions

In [24]:
# Utilisation du modèle entraîné pour faire des prédictions
predictions, labels, _ = trainer.predict(test_tokenized)
predictions = np.argmax(predictions, axis=2)

# Suppression des indices ignorés (pour les tokens spéciaux)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Calcul des indicateurs de performance
metric = load_metric("seqeval")
results = metric.compute(predictions=true_predictions, references=true_labels)
print(results)


  0%|          | 0/3539 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
  _warn_prf(average, modifier, msg_start, len(result))


{'ANIM': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 329}, 'CEL': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 95}, 'DIS': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 284}, 'EVE': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 200}, 'FOOD': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 204}, 'INST': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 37}, 'LOC': {'precision': 0.553347280334728, 'recall': 0.43646864686468645, 'f1': 0.48800738007380073, 'number': 4848}, 'MEDIA': {'precision': 0.29411764705882354, 'recall': 0.032467532467532464, 'f1': 0.05847953216374269, 'number': 154}, 'MYTH': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 136}, 'ORG': {'precision': 0.18041237113402062, 'recall': 0.06398537477148081, 'f1': 0.09446693657219972, 'number': 547}, 'PER': {'precision': 0.4133895131086142, 'recall': 0.32813080639167597, 'f1': 0.36585871141495757, 'number': 2691}, 'PLANT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, '

## Sauvegarde du modèle et du tokenizer

In [25]:
# Sauvegarde du modèle
model.save_pretrained("Tinybert-finetuned-ner")

# Sauvegarde du tokenizer
tokenizer.save_pretrained("Tinybert-finetuned-ner")


('Tinybert-finetuned-ner/tokenizer_config.json',
 'Tinybert-finetuned-ner/special_tokens_map.json',
 'Tinybert-finetuned-ner/vocab.txt',
 'Tinybert-finetuned-ner/added_tokens.json',
 'Tinybert-finetuned-ner/tokenizer.json')

## Test

In [29]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Charger le modèle et le tokenizer
tokenizer = AutoTokenizer.from_pretrained("Tinybert-finetuned-ner")
model = AutoModelForTokenClassification.from_pretrained("Tinybert-finetuned-ner")

def predict(texts, tokenizer, model, labels_vocab_reverse):
    # Tokenize les textes
    tokenized_inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Faire une prédiction
    with torch.no_grad():
        outputs = model(**tokenized_inputs)

    predictions = torch.argmax(outputs.logits, dim=-1)

    # Aligner les prédictions avec les tokens
    predicted_labels = []
    for i, input_ids in enumerate(tokenized_inputs["input_ids"]):
        # Convertir les ids en tokens
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        prediction_indices = predictions[i].tolist()
        
        # Aligner les tokens avec les prédictions
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Récupérer les ids des tokens
        aligned_labels = [labels_vocab_reverse[pred_idx] for token, pred_idx, word_id in zip(tokens, prediction_indices, word_ids) if word_id is not None and token not in tokenizer.all_special_tokens]

        predicted_labels.append(aligned_labels)

    return predicted_labels

# Textes à prédire
texts = ["Paris et France", "Je aime bien Paris"]

# Faire des prédictions
predicted_labels = predict(texts, tokenizer, model, labels_vocab_reverse)
for text, labels in zip(texts, predicted_labels):
    print("Text:", text)
    print("Labels:", labels)



Text: Paris et France
Labels: ['B-LOC', 'O', 'B-LOC']
Text: Je aime bien Paris
Labels: ['O', 'O', 'O', 'O', 'B-LOC']
