In [1]:
!pip install seqeval
!pip install peft==0.10.0



In [2]:
import os
import random
import numpy as np
import pandas as pd
from collections import Counter

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback, set_seed, pipeline
)
from seqeval.metrics import classification_report, f1_score, precision_score, recall_score
import plotly.graph_objects as go
import plotly.express as px

# 1. Global settings
# A single named seed instead of a repeated literal. transformers.set_seed
# seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) in one
# call, so separate per-library seeding is not needed.
SEED = 42
set_seed(SEED)

2025-04-21 14:48:36.967964: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745246916.990646     465 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745246916.997576     465 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
# 2. Load and explore the dataset
print("Chargement du dataset FiNER-139...")
dataset = load_dataset('nlpaueb/finer-139')
print(dataset)
first_train_example = dataset["train"][0]
print("Exemple d'entrée :", first_train_example)

# Label inventory: the names attached to the integer ids stored in `ner_tags`
ner_tag_feature = dataset["train"].features["ner_tags"].feature
ner_labels = ner_tag_feature.names
print("Nombre de labels :", len(ner_labels))
print("Exemples de labels :", ner_labels[:10])

Chargement du dataset FiNER-139...
DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 900384
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 112494
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 108378
    })
})
Exemple d'entrée : {'id': 0, 'tokens': ['ITEM', '1', 'Financial', 'Statements', 'Lennar', 'Corporation', 'and', 'Subsidiaries', 'Condensed', 'Consolidated', 'Balance', 'Sheets', '(', 'Dollars', 'in', 'thousands', ',', 'except', 'shares', 'and', 'per', 'share', 'amounts', ')', '(', 'unaudited', ')', '(', '1', ')', 'Under', 'certain', 'provisions', 'of', 'Accounting', 'Standards', 'Codification', '(', '“', 'ASC', '”', ')', 'Topic', '810', ',', 'Consolidations', ',', '(', '“', 'ASC', '810', '”', ')', 'the', 'Company', 'is', 'required', 'to', 'separately', 'disclose', 'on', 'its', 'condensed', 'consolidated', 'balance', 'sheets', 'the', 

In [4]:
# 3. Exploratory analysis: distribution of NER tags in the training split (Plotly)
# Only the `ner_tags` column is read, batch by batch, and counted incrementally.
# This avoids decoding the `tokens` column of every row and avoids building one
# flat Python list holding every tag of the split.
label_counts = Counter()
train_tags_only = dataset["train"].select_columns(["ner_tags"])
for tag_batch in train_tags_only.iter(batch_size=10_000):
    for tag_ids in tag_batch["ner_tags"]:
        label_counts.update(tag_ids)

# Most frequent tag first, so the bar order is meaningful and reproducible.
sorted_counts = label_counts.most_common()
labels_plot = [ner_labels[tag_id] for tag_id, _ in sorted_counts]
counts_plot = [count for _, count in sorted_counts]
fig = px.bar(x=labels_plot, y=counts_plot, labels={'x': 'Entité', 'y': 'Nombre'}, title="Distribution des entités dans le train set")
fig.update_layout(xaxis_tickangle=-45, width=1200, height=500)
fig.show()

In [5]:
# 4. Use the dataset's predefined train / validation / test splits as they are.
#    No re-splitting happens here, so this is not a 70/10/20 split: the sizes
#    printed below are simply those of the published splits.
train_ds = dataset["train"]
val_ds = dataset["validation"]
test_ds = dataset["test"]
print(f"Train: {len(train_ds)}, Val: {len(val_ds)}, Test: {len(test_ds)}")

Train: 900384, Val: 112494, Test: 108378


In [6]:
# 5. Preprocessing: tokenization and label alignment
model_checkpoint = "bert-base-cased"
# The fast tokenizer is requested explicitly (it is already the default):
# the label alignment below calls `word_ids()` on the tokenizer output.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def build_continuation_label_map(label_names):
    """Return, for every label id, the id to assign to non-first sub-tokens of a word.

    A ``B-X`` label maps to the id of ``I-X`` when such a label exists in
    ``label_names``; every other label (``O``, ``I-X``, or a ``B-X`` that has no
    ``I-X`` counterpart) maps to itself.

    Args:
        label_names: list of label strings, indexed by label id.

    Returns:
        A list ``m`` of the same length where ``m[label_id]`` is the
        continuation label id.
    """
    name_to_id = {name: idx for idx, name in enumerate(label_names)}
    return [
        name_to_id.get("I-" + name[2:], idx) if name.startswith("B-") else idx
        for idx, name in enumerate(label_names)
    ]


# Lazily built map for the notebook-level `ner_labels`; filled on first use so
# it is computed once rather than once per example.
_default_continuation_map = None


def align_labels_and_tokens(word_ids, labels, continuation_map=None):
    """Expand word-level label ids to one label id per sub-token.

    The first sub-token of a word keeps the word's label. Following sub-tokens
    of the same word get the word's continuation label (``B-X`` becomes ``I-X``
    when ``I-X`` exists). Positions whose word id is ``None`` get ``-100``.

    The B→I conversion is looked up by label *name*. A parity rule
    (odd id + 1) is only valid when ids are laid out strictly as
    O, B-X, I-X, B-Y, I-Y, ...; with any other ordering it would assign an
    unrelated class, and it can step past the last label id.

    Args:
        word_ids: one entry per sub-token, the index of its source word or
            ``None``.
        labels: label ids, one per word.
        continuation_map: optional list mapping a label id to its continuation
            id (see ``build_continuation_label_map``). When omitted, it is
            derived once from the notebook-level ``ner_labels``.

    Returns:
        A list of label ids with the same length as ``word_ids``.
    """
    global _default_continuation_map
    if continuation_map is None:
        if _default_continuation_map is None:
            _default_continuation_map = build_continuation_label_map(ner_labels)
        continuation_map = _default_continuation_map

    updated_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id is None:
            updated_labels.append(-100)
        elif word_id != current_word:
            # First sub-token of a new word: keep the word's own label.
            updated_labels.append(labels[word_id])
        else:
            # Same word as the previous sub-token: use the continuation label.
            updated_labels.append(continuation_map[labels[word_id]])
        current_word = word_id
    return updated_labels

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-sub-token labels.

    Args:
        examples: a batch with a ``"tokens"`` entry (one word list per
            sentence) and a ``"ner_tags"`` entry (one label-id list per
            sentence).

    Returns:
        The tokenizer output for the batch, truncated to 128 sub-tokens, with
        an added ``"labels"`` entry produced by ``align_labels_and_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"],
        max_length=128,
        truncation=True,
        is_split_into_words=True,
    )
    # One aligned label list per sentence of the batch.
    encoded["labels"] = [
        align_labels_and_tokens(encoded.word_ids(batch_index=row), word_labels)
        for row, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

print("Tokenization et alignement des labels...")
# Drop the raw columns so only the tokenizer outputs and `labels` remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    remove_columns=raw_columns,
    batched=True,
)

# Collator used by the Trainer to assemble batches of tokenized examples.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Tokenization et alignement des labels...



`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



Map:   0%|          | 0/108378 [00:00<?, ? examples/s]

In [7]:
# 6. Modelling: BERT with a token-classification head
# `id2label` is keyed by the integer class id (not its string form), so that
# `model.config.id2label[class_id]` works directly with predicted ids.
id2label = dict(enumerate(ner_labels))
label2id = {label: class_id for class_id, label in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(ner_labels),
    id2label=id2label,
    label2id=label2id
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
!pip install transformers==4.38.0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [9]:
import transformers
import datasets
# Report the library versions that are actually loaded in this kernel.
for library in (transformers, datasets):
    print(library.__version__)

4.38.0
3.5.0


In [10]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [11]:
# 7. Training arguments
batch_size = 16
# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the highest "f1" is reloaded at the end of training.
training_config = dict(
    output_dir="./finer-bert-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=100,
    report_to="none",
)
args = TrainingArguments(**training_config)

import evaluate
metric = evaluate.load("seqeval")

def compute_metrics(p):
    """Compute entity-level metrics for the Trainer.

    Args:
        p: pair ``(predictions, labels)`` where ``predictions`` holds per-token
            class scores (argmax is taken over axis 2) and ``labels`` holds the
            reference label ids, with ``-100`` marking positions to ignore.

    Returns:
        Dict with the overall (micro) ``precision``, ``recall``, ``f1`` and
        ``accuracy`` reported by the seqeval metric, plus ``macro_f1``,
        ``macro_precision`` and ``macro_recall``.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Decode ids to tag names, dropping every position labelled -100.
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, label_ids):
        kept = [
            (predicted_id, label_id)
            for predicted_id, label_id in zip(predicted_row, label_row)
            if label_id != -100
        ]
        true_predictions.append([ner_labels[predicted_id] for predicted_id, _ in kept])
        true_labels.append([ner_labels[label_id] for _, label_id in kept])

    # seqeval is run once. Besides the `overall_*` scalars, the result holds one
    # dict per entity type (precision / recall / f1 / number); the macro scores
    # are the unweighted mean of those per-type values, so the sequences do not
    # need to be re-scored three more times.
    metrics = metric.compute(predictions=true_predictions, references=true_labels)
    per_type_scores = [scores for scores in metrics.values() if isinstance(scores, dict)]

    def macro_average(key):
        # No entity type at all (e.g. only 'O' everywhere): report 0.0.
        if not per_type_scores:
            return 0.0
        return float(np.mean([scores[key] for scores in per_type_scores]))

    return {
        "precision": metrics["overall_precision"],
        "recall": metrics["overall_recall"],
        "f1": metrics["overall_f1"],
        "accuracy": metrics["overall_accuracy"],
        "macro_f1": macro_average("f1"),
        "macro_precision": macro_average("precision"),
        "macro_recall": macro_average("recall")
    }

In [12]:
!pip install accelerate==0.27.2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [None]:
# 8. Training with early stopping
# Stop when the monitored metric has not improved for 2 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("Début de l'entraînement...")
trainer.train()

Début de l'entraînement...



Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.



Epoch,Training Loss,Validation Loss


In [None]:
import accelerate
# Report the accelerate version that is actually loaded in this kernel.
print(accelerate.__version__)

In [None]:
# 9. Advanced evaluation on the test set
print("Évaluation sur le test set...")
test_output = trainer.predict(tokenized_dataset["test"])
labels = test_output.label_ids
predictions = np.argmax(test_output.predictions, axis=2)

# Decode ids to tag names, keeping only positions whose reference label is not -100.
true_predictions = [
    [ner_labels[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [ner_labels[gold_id] for gold_id in gold_row if gold_id != -100]
    for gold_row in labels
]

# Detailed per-entity report
print("Rapport détaillé par entité :")
detailed_report = classification_report(true_labels, true_predictions, digits=3)
print(detailed_report)

# Macro-averaged F1 / precision / recall over entity types
macro_f1, macro_precision, macro_recall = (
    score_fn(true_labels, true_predictions, average='macro')
    for score_fn in (f1_score, precision_score, recall_score)
)
print(f"Macro F1: {macro_f1:.3f} | Macro Precision: {macro_precision:.3f} | Macro Recall: {macro_recall:.3f}")

# Analysis: best and worst recognised entity types
from seqeval.metrics import classification_report as seqeval_classification_report
import re

report = seqeval_classification_report(true_labels, true_predictions, output_dict=True)
# The report has one row per entity type plus the summary rows
# 'micro avg', 'macro avg' and 'weighted avg'. Only the summary rows are
# excluded: filtering on the first letter would also drop every entity type
# whose name starts with "O" and would keep the summary rows as if they were
# entities.
summary_row = re.compile(r'^(micro|macro|weighted) avg$')
entity_scores = sorted(
    (
        (name, row['f1-score'])
        for name, row in report.items()
        if not summary_row.match(name)
    ),
    key=lambda item: item[1],
    reverse=True,
)
print("\nTop 10 entités les mieux reconnues :")
for ent, score in entity_scores[:10]:
    print(f"{ent}: F1 = {score:.3f}")
print("\nTop 10 entités les moins bien reconnues :")
for ent, score in entity_scores[-10:]:
    print(f"{ent}: F1 = {score:.3f}")

# Interactive bar chart of the F1-score of each entity (Plotly)
entity_names = [name for name, _ in entity_scores]
entity_f1_values = [f1_value for _, f1_value in entity_scores]
fig = px.bar(
    x=entity_names,
    y=entity_f1_values,
    labels={'x': 'Entité', 'y': 'F1-score'},
    title="F1-score par entité (macro)",
)
fig.update_layout(xaxis_tickangle=-45, width=1200, height=500)
fig.show()

# Simplified confusion matrix (most frequent tags only)
from sklearn.metrics import confusion_matrix
import numpy as np

# Flatten the per-sentence tag sequences; 'O' is left out of the matrix.
flat_true = [tag for seq in true_labels for tag in seq]
flat_pred = [tag for seq in true_predictions for tag in seq]

# Count every reference tag once. Calling list.count() inside the sort key
# would rescan the whole token list for each candidate tag.
true_counts = Counter(flat_true)
candidate_tags = (set(flat_true) | set(flat_pred)) - {'O'}
# Top 15 tags by reference frequency; ties are broken by name so the selection
# is deterministic.
main_entities = sorted(candidate_tags, key=lambda tag: (-true_counts[tag], tag))[:15]

# Keep only the token pairs where both the reference and the prediction are
# among the selected tags.
selected = set(main_entities)
kept_pairs = [(t, p) for t, p in zip(flat_true, flat_pred) if t in selected and p in selected]
cm = confusion_matrix(
    [t for t, _ in kept_pairs],
    [p for _, p in kept_pairs],
    labels=main_entities
)
fig = px.imshow(cm, x=main_entities, y=main_entities, color_continuous_scale='Blues',
                labels=dict(x="Prédit", y="Vrai", color="Nb"),
                title="Matrice de confusion (top 15 entités)")
fig.update_xaxes(side="top")
fig.show()

# Error analysis: examples of mis-predicted entity tags
print("\nExemples d'erreurs (entités mal prédites) :")
MAX_ERROR_EXAMPLES = 10
error_examples = []
for i, (true_seq, pred_seq) in enumerate(zip(true_labels, true_predictions)):
    for j, (t, p) in enumerate(zip(true_seq, pred_seq)):
        if t != p and t != 'O':
            error_examples.append((i, j, t, p))
            if len(error_examples) >= MAX_ERROR_EXAMPLES:
                break
    if len(error_examples) >= MAX_ERROR_EXAMPLES:
        break

for idx, pos, t, p in error_examples:
    words = test_ds[idx]['tokens']
    # `pos` counts the positions of example `idx` whose label is not -100, i.e.
    # it is a sub-token position, not an index into the word list. Map it back:
    # first to the position in the full label row, then to the source word
    # through the tokenizer's word ids.
    kept_positions = [k for k, label_id in enumerate(labels[idx]) if label_id != -100]
    # The label rows returned by `trainer.predict` are padded to a common width;
    # using that width as max_length guarantees the word ids cover every kept
    # position without repeating the preprocessing constant here.
    word_ids = tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        max_length=labels.shape[1],
    ).word_ids()
    word_index = word_ids[kept_positions[pos]]
    # "Position" is the index of the word in the original sentence.
    print(f"Exemple {idx} - Position {word_index} : Vrai={t}, Prédit={p}, Token='{words[word_index]}'")

In [None]:
# 10. Save the model and the tokenizer locally (same directory for both)
for artifact in (model, tokenizer):
    artifact.save_pretrained("./finer-bert-ner")
print("Modèle et tokenizer sauvegardés localement dans ./finer-bert-ner")


In [None]:

# 11. Inference and visualisation helper for application use
# spaCy is an optional dependency: when it cannot be imported, `displacy` is
# set to None and a message is printed instead of raising.
try:
    from spacy import displacy
except ImportError:
    displacy = None
    print("spacy n'est pas installé, la visualisation colorée ne sera pas disponible.")

# Pipelines already built by `ner_inference`, keyed by model directory, so the
# model is loaded from disk once per directory instead of once per call.
# Remove an entry (or clear the dict) after re-saving a model to the same path.
_ner_pipeline_cache = {}


def ner_inference(text, model_dir="./finer-bert-ner", return_entities=True, display=True):
    """Detect named entities in ``text`` with the model stored in ``model_dir``.

    Args:
        text: the input sentence.
        model_dir: directory holding both the saved model and its tokenizer.
        return_entities: when True, return the detected entities; when False,
            return None.
        display: when True and spaCy's displacy is available, render the
            entities highlighted in the text.

    Returns:
        A list of dicts with keys ``text``, ``label``, ``score``, ``start`` and
        ``end`` (one per aggregated entity), or None when ``return_entities``
        is False.
    """
    nlp = _ner_pipeline_cache.get(model_dir)
    if nlp is None:
        nlp = pipeline("token-classification", model=model_dir, tokenizer=model_dir, aggregation_strategy="simple")
        _ner_pipeline_cache[model_dir] = nlp
    output = nlp(text)

    if display and displacy is not None:
        ents = [{"start": ent["start"], "end": ent["end"], "label": ent["entity_group"]} for ent in output]
        doc = {"text": text, "ents": ents, "title": None}
        displacy.render(doc, style="ent", manual=True)

    if not return_entities:
        return None
    return [
        {
            "text": ent["word"],
            "label": ent["entity_group"],
            # Plain Python float, so the result can be serialised as-is.
            "score": float(ent["score"]),
            "start": ent["start"],
            "end": ent["end"]
        }
        for ent in output
    ]

In [None]:
# 12. Usage example
if __name__ == "__main__":
    phrase = "Apple reported a net income of $20 billion in 2022. Elon Musk is the CEO of Tesla."
    entities = ner_inference(
        phrase,
        model_dir="./finer-bert-ner",
        return_entities=True,
        display=True,
    )
    print("Liste des entités détectées :")
    # One line per detected entity: surface text, label and confidence.
    summary_lines = [
        f"- {ent['text']} ({ent['label']}) [score: {ent['score']:.2f}]"
        for ent in entities
    ]
    for summary_line in summary_lines:
        print(summary_line)

