# BERT-base-multilingual-cased

In [7]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Barre de progression pour Jupyter
pd.set_option('display.max_colwidth', None)
import s3fs

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
import re

import torch
from sklearn.metrics import (
    f1_score, precision_score, recall_score, hamming_loss, accuracy_score
)

In [2]:
os.chdir("/home/onyxia/work/projet_NLP")
print(os.getcwd())

/home/onyxia/work/projet_NLP


In [3]:
data_train = pd.read_pickle('data/df_train.pkl') # dataset containing only english cases
data_test = pd.read_pickle('data/df_test.pkl')

In [4]:
print(torch.cuda.is_available())  # doit renvoyer True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # nom de ton GPU

True
NVIDIA A2


## Training BERT-base-multilingual-cased

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Modèle bilingue FR/EN (multilingual BERT)
model_name = "bert-base-multilingual-cased"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Nombre de classes en sortie
n_labels = 26  # à ajuster selon ton jeu de données

# Modèle pour classification multi-label
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_labels,
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from datasets import Dataset
df = data_train.rename(columns={"target": "labels"})
df["labels"] = df["labels"].apply(lambda x: np.array(x, dtype=np.float32))
dataset = Dataset.from_pandas(df)

# Tokenizer
def tokenize(batch):
    return tokenizer(batch["case_text"], padding="max_length", truncation=True, max_length=512)

dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/9646 [00:00<?, ? examples/s]

In [10]:
from datasets import DatasetDict
dataset_train_val = dataset.train_test_split(test_size=0.1/0.9) # 0.1 /0.9 pour avoir meme taille de validation set et de test set

print("taille dataset entrainement :", dataset_train_val["train"].shape[0])
print("taille dataset validation :", dataset_train_val["test"].shape[0])

taille dataset entrainement : 8574
taille dataset validation : 1072


In [13]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss

training_args = TrainingArguments(
    output_dir="/home/onyxia/work/projet_NLP/results_BERT_base_ml_en_only",
    eval_strategy="epoch",         # évaluation à chaque époque
    save_strategy="epoch",               # checkpoint à chaque époque
    save_total_limit=1,                  # on ne garde que les 2 derniers
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,                 # on peut tenter d’augmenter
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",    # ← on surveille le micro-F1
    greater_is_better=True,              # ← plus c’est grand mieux c’est
    dataloader_num_workers=2,
    report_to="none",
    lr_scheduler_type="linear",          # on explicite le scheduler
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    thresholds = np.linspace(0.1, 0.99, 20)

    best_threshold = 0.5
    best_f1 = 0.0

    # on choisi le seuil qui maximise micro-F1
    for t in thresholds:
        preds = (logits >= t).astype(int)
        f1 = f1_score(labels, preds, average='micro', zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t

    # on calcule les métriques finales avec ce seuil optimal
    preds = (logits >= best_threshold).astype(int)
    return {
        'threshold': best_threshold,
        'micro_precision': precision_score(labels, preds, average='micro', zero_division=0),
        'micro_recall':    recall_score(labels, preds, average='micro', zero_division=0),
        'micro_f1':        f1_score(labels, preds, average='micro', zero_division=0),
        'hamming_loss':    hamming_loss(labels, preds),
        'exact_match':     np.mean(np.all(labels == preds, axis=1))
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train_val['train'],
    eval_dataset=dataset_train_val['test'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # stop si pas d’amélioration
)



In [None]:
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


In [None]:
#trainer.train(resume_from_checkpoint="./results/checkpoint-....")

# Evaluation 

In [None]:
# Chargement
model = AutoModelForSequenceClassification.from_pretrained("/home/onyxia/work/projet_NLP/results_BERT_base_ml_en_only")

### Evaluation on english testset

In [None]:
# 1. Préparez votre DataFrame de test :
#    - Renommez la colonne target en labels
#    - Transformez chaque liste de labels en array float32
df_test = data_test.rename(columns={"target": "labels"})
df_test["labels"] = df_test["labels"].apply(lambda x: np.array(x, dtype=np.float32))

# 2. Créez un Dataset Hugging Face
test_dataset = Dataset.from_pandas(df_test)

# 3. Définissez la même fonction de tokenisation que pour l’entraînement
def tokenize(batch):
    return tokenizer(
        batch["case_text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# 4. Appliquez la tokenisation
test_dataset = test_dataset.map(tokenize, batched=True)

# 5. Facultatif : fixez le format PyTorch pour éviter d’avoir à convertir à la main
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 6a. Évaluation simple : renvoie loss + métriques de compute_metrics
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Résultats sur test :", test_metrics)

Map:   0%|          | 0/1072 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Résultats sur test : {'eval_loss': 0.17420071363449097, 'eval_model_preparation_time': 0.0052, 'eval_threshold': 0.1, 'eval_micro_precision': 0.739689578713969, 'eval_micro_recall': 0.5946524064171123, 'eval_micro_f1': 0.6592885375494071, 'eval_hamming_loss': 0.06185419058553387, 'eval_exact_match': 0.23787313432835822, 'eval_runtime': 13.4788, 'eval_samples_per_second': 79.532, 'eval_steps_per_second': 19.883}


### Evaluation on cases from English text_set translated in French

In [None]:
df_test_translated_in_fr = pd.read_pickle("data/test_Vtranslated_fr.pkl")
print(df_test_translated_in_fr.shape)
print(df_test_translated_in_fr.case_text_fr.head(1))

(300, 14)
0    Concernant les résultats histopathologiques chez ce patient, Bratincsak et al. ont défini l'infiltration lymphocytaire du myocarde à l'autopsie comme l'un des critères de myocardite fulminante. Cabral et coll. décrit le cas d'un garçon de 10 ans atteint d'une myocardite fulminante associée à une infection par le virus de la grippe A ; les résultats histopathologiques à l'autopsie étaient des infiltrats multifocaux comprenant principalement des lymphocytes. Vous suggérez que la dégénérescence de certains myocytes décrite dans notre rapport diffère de la nécrose des cardiomyocytes observée précédemment ; cependant, nous pensons que nos résultats sont similaires aux résultats précédents et qu’une erreur de traduction pourrait être à l’origine de toute différence perçue.\nMaria Lucia Saraiva Lobo, Angela Taguchi, Heloisa Amaral Gaspar, Juliana Ferreira Ferranti, Werther Brunow de Carvalho, Artur Figueiredo Delgado - Unité de soins intensifs pédiatriques, Instituto da Crianca

In [None]:
df_test_translated_in_fr['target'].apply(lambda arr: any(x is None for x in arr)).sum()


0

In [None]:
# 1. Préparez votre DataFrame de test :
#    - Renommez la colonne target en labels
#    - Transformez chaque liste de labels en array float32
df_test_fr = df_test_translated_in_fr.rename(columns={"target": "labels"})
df_test_fr["labels"] = df_test_fr["labels"].apply(lambda x: np.array(x, dtype=np.float32))

# 2. Créez un Dataset Hugging Face
test_dataset = Dataset.from_pandas(df_test_fr)

# 3. Définissez la même fonction de tokenisation que pour l’entraînement
def tokenize(batch):
    return tokenizer(
        batch["case_text_fr"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# 4. Appliquez la tokenisation
test_dataset = test_dataset.map(tokenize, batched=True)

# 5. Facultatif : fixez le format PyTorch pour éviter d’avoir à convertir à la main
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 6a. Évaluation simple : renvoie loss + métriques de compute_metrics
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Résultats sur test :", test_metrics)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Résultats sur test : {'eval_loss': 0.2062920182943344, 'eval_model_preparation_time': 0.0052, 'eval_threshold': 0.1, 'eval_micro_precision': 0.6787072243346007, 'eval_micro_recall': 0.48904109589041095, 'eval_micro_f1': 0.5684713375796179, 'eval_hamming_loss': 0.06948717948717949, 'eval_exact_match': 0.2, 'eval_runtime': 4.0533, 'eval_samples_per_second': 74.014, 'eval_steps_per_second': 18.503}


In [None]:
df = pd.DataFrame([metrics])
display(df)

Unnamed: 0,eval_loss,eval_threshold,eval_micro_precision,eval_micro_recall,eval_micro_f1,eval_hamming_loss,eval_exact_match_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.052049,0.99,0.994627,0.777871,0.872996,0.022227,0.564658,115.6827,92.65,23.167,5.0


In [None]:
os.getcwd()

'/home/onyxia/work/projet_NLP'

In [None]:
#trainer.save_model("./results_BERT_base_ml")

## TEST du modèle

In [None]:
text = """
A 68-year-old man presents with a persistent cough, present for three months, accompanied by increasing shortness of breath when he exerts himself. He also complains of recent lower back pain.
He has a significant smoking history, having smoked the equivalent of 40 packs of cigarettes per year. Notably, he reports coughing up sputum tinged with blood on occasion. During the physical examination, 
the physician observes diminished breath sounds specifically in the lower portion of his right lung. An initial chest X-ray reveals a concerning mass located in the right lower lobe of the lung.
To further investigate, a CT scan of the chest is performed. This imaging confirms the presence of a 4-centimeter mass within the right lower lobe. Additionally, 
the scan reveals enlarged lymph nodes in the region of the lung's hilum (hilar lymphadenopathy). Upon further questioning, the patient admits to experiencing nocturia, characterized by the need to urinate frequently during the night, 
approximately two to three times per night, over the past six months. He initially attributed this to simply drinking more fluids before bed. He also mentions mild, intermittent lower back pain that sometimes radiates down his right leg. 
He had previously dismissed this pain as a normal consequence of aging and stiffness. His medical history includes high blood pressure (hypertension), which is currently being managed with medication. An electrocardiogram (ECG) is performed as part of the evaluation. 
The ECG reveals a left bundle branch block, which is a new finding compared to previous ECG recordings. An echocardiogram shows mild left ventricular hypertrophy. To determine the specific nature of the lung mass and assess the involvement of the lymph nodes, 
the patient is scheduled for a bronchoscopy with a biopsy. In addition, due to his reported nocturia and lower back pain, 
a prostate-specific antigen (PSA) test will be performed to evaluate prostate health. A more comprehensive cardiac assessment is planned to further investigate the newly identified left bundle branch block.
"""

In [None]:
text

"\nA 68-year-old man presents with a persistent cough, present for three months, accompanied by increasing shortness of breath when he exerts himself. He also complains of recent lower back pain.\nHe has a significant smoking history, having smoked the equivalent of 40 packs of cigarettes per year. Notably, he reports coughing up sputum tinged with blood on occasion. During the physical examination, \nthe physician observes diminished breath sounds specifically in the lower portion of his right lung. An initial chest X-ray reveals a concerning mass located in the right lower lobe of the lung.\nTo further investigate, a CT scan of the chest is performed. This imaging confirms the presence of a 4-centimeter mass within the right lower lobe. Additionally, \nthe scan reveals enlarged lymph nodes in the region of the lung's hilum (hilar lymphadenopathy). Upon further questioning, the patient admits to experiencing nocturia, characterized by the need to urinate frequently during the night, \

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("./results_BERT_base_ml")
tokenizer = AutoTokenizer.from_pretrained("./results_BERT_base_ml")

NameError: name 'AutoModelForSequenceClassification' is not defined

In [None]:
# Tokenization
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

# Mettre le modèle en mode évaluation
model.eval()

# Tokenisation
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Prédiction (désactive le calcul de gradients)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Appliquer une sigmoïde pour obtenir les probabilités
probs = torch.sigmoid(logits)

# Seuil pour dire si chaque label est actif ou pas (ici 0.5)
predicted_labels = (probs > 0.5).squeeze().bool().tolist()

# Affichage
print(predicted_labels)

[False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False]


In [None]:
type(predicted_labels)

list

In [None]:
import multilabel_preprocessing as mp
mp.mesh_labels_from_vector(np.array(predicted_labels))

['C04 – neoplasms',
 'C08 – respiratory tract diseases',
 'C14 – cardiovascular diseases']

In [None]:
# install captum if not already installed:
# pip install captum

import torch
from transformers import BertTokenizerFast
from captum.attr import IntegratedGradients
import collections

# load tokenizer and set model to evaluation mode
model = AutoModelForSequenceClassification.from_pretrained("./results_BERT_base_ml")
tokenizer = AutoTokenizer.from_pretrained("./results_BERT_base_ml")
model.eval()

# helper function that returns the logit for a single label
def forward_label(input_ids, attention_mask, label_index):
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    return outputs.logits[:, label_index]

# compute attributions and return the top k tokens for one label
def get_top_tokens_for_label(text, label_idx, top_k=2, max_length=128):
    encoding = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=max_length
    )
    # initialize Integrated Gradients with our forward function
    ig = IntegratedGradients(lambda ids, mask: forward_label(ids, mask, label_idx))
    # use all-[PAD] sequence as baseline
    baseline_ids = torch.full_like(encoding["input_ids"], tokenizer.pad_token_id)
    baseline_mask = torch.zeros_like(encoding["attention_mask"])
    attributions, delta = ig.attribute(
        inputs=encoding["input_ids"],
        baselines=baseline_ids,
        additional_forward_args=(encoding["attention_mask"],),
        return_convergence_delta=True
    )
    # sum absolute attributions across the embedding dimension
    token_importances = attributions.abs().sum(dim=-1).squeeze(0)
    tokens = tokenizer.convert_ids_to_tokens(encoding["input_ids"].squeeze(0))
    # filter out special tokens and pair each token with its score
    scored_tokens = [
        (tok, float(score))
        for tok, score in zip(tokens, token_importances)
        if tok not in tokenizer.all_special_tokens
    ]
    # sort by importance and keep top_k
    scored_tokens.sort(key=lambda x: x[1], reverse=True)
    return scored_tokens[:top_k]

# example: show top tokens for each label on a single text
example_text = "Patient is a 65-year-old male presenting with progressive shortness of breath and cough."
for idx, label in model.config.id2label.items():
    top_tokens = get_top_tokens_for_label(example_text, idx)
    print(f"Label '{label}' top tokens: {top_tokens}")

# aggregate attributions over multiple examples to get global importance
aggregate_scores = collections.Counter()
token_counts = collections.Counter()

for text in validation_texts:
    for idx in model.config.id2label.keys():
        for tok, score in get_top_tokens_for_label(text, idx):
            aggregate_scores[(idx, tok)] += score
            token_counts[(idx, tok)] += 1

# compute average score per token per label
average_scores = {
    key: aggregate_scores[key] / token_counts[key]
    for key in aggregate_scores
}

# display the global top 2 tokens for each label
for idx, label in model.config.id2label.items():
    tokens_for_label = [
        (tok, average_scores[(idx, tok)])
        for (i, tok) in average_scores if i == idx
    ]
    tokens_for_label.sort(key=lambda x: x[1], reverse=True)
    print(f"Label '{label}' global top tokens: {tokens_for_label[:2]}")
