# BERT-base-multilingual-cased

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Barre de progression pour Jupyter
pd.set_option('display.max_colwidth', None)
import s3fs

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
import re

import torch
from sklearn.metrics import (
    f1_score, precision_score, recall_score, hamming_loss, accuracy_score
)

In [2]:
os.chdir("/home/onyxia/work/projet_NLP")
print(os.getcwd())

/home/onyxia/work/projet_NLP


In [3]:
data_train = pd.read_pickle('data/df_train.pkl')
data_test = pd.read_pickle('data/df_test.pkl')

In [4]:
print(torch.cuda.is_available())  # doit renvoyer True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # nom de ton GPU

True
NVIDIA A2


## Addition of French dataset train to training set


In [5]:
data_train_fr = pd.read_pickle('data/df_train_fr.pkl')
data_train_fr["language"] = "fr"
data_train_fr.head(2)
print(data_train_fr.shape)

(300, 3)


In [6]:
# We chose 167 out of the 300 case in french from the 'data_train_fr' to match with the size of the training set in the article of Gérardin et al.
sample_167_train_fr = data_train_fr.sample(167)

print(
    f"We selected {sample_167_train_fr.shape[0]} of the 300 French cases in `data_train_fr` "
    "to match the training set size used in Gérardin et al.'s article."
)


We selected 167 of the 300 French cases in `data_train_fr` to match the training set size used in Gérardin et al.'s article.


In [7]:
data_train = data_train.filter(items=['case_text', 'target'])
data_train["language"] = "en"
old_shape = data_train.shape
print(old_shape)

(9646, 3)


In [8]:
# 1. Vérifier que les colonnes sont strictement identiques avant concaténation
if not data_train.columns.equals(sample_167_train_fr.columns):
    raise ValueError(
        "Les colonnes de data_train et sample_167_train_fr ne correspondent pas !\n"
        f"data_train.columns: {list(data_train.columns)}\n"
        f"sample_167_train_fr.columns: {list(sample_167_train_fr.columns)}"
    )

# 2. Mémoriser l’ancienne forme
old_shape = data_train.shape

# 3. Concaténer uniquement par lignes (axis=0)
data_train = pd.concat([data_train, sample_167_train_fr], axis=0)

# 4. Vérifier qu’aucune colonne n’a été ajoutée ou supprimée
new_shape = data_train.shape
assert new_shape[1] == old_shape[1], (
    f"Le nombre de colonnes a changé : "
    f"avant {old_shape[1]}, après {new_shape[1]}"
)

# 5. Vérifier que le nombre de lignes ajouté correspond bien à sample_167_train_fr.shape[0]
expected_rows = old_shape[0] + sample_167_train_fr.shape[0]
assert new_shape[0] == expected_rows, (
    f"Le nombre de lignes après concat est {new_shape[0]}, "
    f"alors qu’on attendait {expected_rows}"
)

# 6. Message de confirmation
print(
    f"{new_shape[0] - old_shape[0]} rows of the 300 rows of the 'data_train_fr' dataset, "
    "which contains fake patient cases in French generated by GEMINI, "
    "have been added to the 'data_train' dataset."
)


167 rows of the 300 rows of the 'data_train_fr' dataset, which contains fake patient cases in French generated by GEMINI, have been added to the 'data_train' dataset.


## Training BERT-base-multilingual-cased

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Modèle bilingue FR/EN (multilingual BERT)
model_name = "bert-base-multilingual-cased"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Nombre de classes en sortie
n_labels = 26  # à ajuster selon ton jeu de données

# Modèle pour classification multi-label
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_labels,
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# 1. Calculer le nombre de tokens pour chaque case_text
stat=pd.DataFrame()
stat['n_tokens'] = data_train['case_text'].apply(lambda x: len(tokenizer.tokenize(str(x))))

# 2. Afficher les stats descriptives
print(stat['n_tokens'].describe())

Token indices sequence length is longer than the specified maximum sequence length for this model (794 > 512). Running this sequence through the model will result in indexing errors


count     9813.000000
mean       905.661164
std        636.998588
min         34.000000
25%        506.000000
50%        765.000000
75%       1120.000000
max      14186.000000
Name: n_tokens, dtype: float64


In [11]:
from datasets import Dataset
df = data_train.rename(columns={"target": "labels"})
df["labels"] = df["labels"].apply(lambda x: np.array(x, dtype=np.float32))
dataset = Dataset.from_pandas(df)

# Tokenizer
def tokenize(batch):
    return tokenizer(batch["case_text"], padding="max_length", truncation=True, max_length=512)

dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/9813 [00:00<?, ? examples/s]

In [12]:
from datasets import DatasetDict
dataset_train_val = dataset.train_test_split(test_size=0.1/0.9) # 0.1 /0.9 pour avoir meme taille de validation set et de test set

print("taille dataset entrainement :", dataset_train_val["train"].shape[0])
print("taille dataset validation :", dataset_train_val["test"].shape[0])

taille dataset entrainement : 8722
taille dataset validation : 1091


In [13]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss

training_args = TrainingArguments(
    output_dir="../results_BERT_base_ml",
    evaluation_strategy="epoch",         # évaluation à chaque époque
    save_strategy="epoch",               # checkpoint à chaque époque
    save_total_limit=2,                  # on ne garde que les 2 derniers
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,                 # on peut tenter d’augmenter
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4,
    warmup_ratio=0.1,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",    # ← on surveille le micro-F1
    greater_is_better=True,              # ← plus c’est grand mieux c’est
    dataloader_num_workers=2,
    report_to="none",
    lr_scheduler_type="linear",          # on explicite le scheduler
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    thresholds = np.linspace(0.1, 0.99, 20)

    best_threshold = 0.5
    best_f1 = 0.0

    # on choisi le seuil qui maximise micro-F1
    for t in thresholds:
        preds = (logits >= t).astype(int)
        f1 = f1_score(labels, preds, average='micro', zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = t

    # on calcule les métriques finales avec ce seuil optimal
    preds = (logits >= best_threshold).astype(int)
    return {
        'threshold': best_threshold,
        'micro_precision': precision_score(labels, preds, average='micro', zero_division=0),
        'micro_recall':    recall_score(labels, preds, average='micro', zero_division=0),
        'micro_f1':        f1_score(labels, preds, average='micro', zero_division=0),
        'hamming_loss':    hamming_loss(labels, preds),
        'exact_match':     np.mean(np.all(labels == preds, axis=1))
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train_val['train'],
    eval_dataset=dataset_train_val['test'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # stop si pas d’amélioration
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Threshold,Micro Precision,Micro Recall,Micro F1,Hamming Loss,Exact Match,Runtime,Samples Per Second,Steps Per Second
1,0.2444,0.255461,0.1,0.629098,0.11083,0.188459,0.09321,0.067828,13.8898,78.547,19.655
2,0.1928,0.198991,0.1,0.781759,0.34657,0.48024,0.073257,0.149404,13.8337,78.866,19.734
3,0.1693,0.180604,0.1,0.749235,0.442238,0.556186,0.068921,0.186984,13.9204,78.374,19.612
4,0.1401,0.174315,0.1,0.765652,0.481227,0.591,0.065043,0.218148,13.8979,78.501,19.643
5,0.1235,0.173863,0.1,0.743178,0.501444,0.598836,0.065607,0.226398,13.7356,79.428,19.875
6,0.1222,0.17274,0.193684,0.733922,0.539711,0.62201,0.064056,0.231897,13.8206,78.94,19.753
7,0.0974,0.177431,0.1,0.708733,0.559567,0.625378,0.065466,0.216315,13.9763,78.06,19.533
8,0.0905,0.178108,0.1,0.712135,0.563538,0.629182,0.064866,0.218148,13.9211,78.37,19.611
9,0.0785,0.180345,0.1,0.718203,0.548375,0.621904,0.065113,0.229148,13.9509,78.203,19.569


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

TrainOutput(global_step=5450, training_loss=0.15157086852493637, metrics={'train_runtime': 4662.0413, 'train_samples_per_second': 18.709, 'train_steps_per_second': 1.169, 'total_flos': 2.291506885436621e+16, 'train_loss': 0.15157086852493637, 'epoch': 9.983035304906007})

In [None]:
# 1. Préparez votre DataFrame de test :
#    - Renommez la colonne target en labels
#    - Transformez chaque liste de labels en array float32
df_test = data_test.rename(columns={"target": "labels"})
df_test["labels"] = df_test["labels"].apply(lambda x: np.array(x, dtype=np.float32))

# 2. Créez un Dataset Hugging Face
test_dataset = Dataset.from_pandas(df_test)

# 3. Définissez la même fonction de tokenisation que pour l’entraînement
def tokenize(batch):
    return tokenizer(
        batch["case_text"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# 4. Appliquez la tokenisation
test_dataset = test_dataset.map(tokenize, batched=True)

# 5. Facultatif : fixez le format PyTorch pour éviter d’avoir à convertir à la main
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# 6a. Évaluation simple : renvoie loss + métriques de compute_metrics
test_metrics = trainer.evaluate(eval_dataset=test_dataset)
print("Résultats sur test :", test_metrics)

In [None]:
trainer.train(resume_from_checkpoint="./results/checkpoint-3350")

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=3350, training_loss=0.0, metrics={'train_runtime': 0.4082, 'train_samples_per_second': 131291.268, 'train_steps_per_second': 8207.235, 'total_flos': 1.4136845312249856e+16, 'train_loss': 0.0, 'epoch': 5.0})

In [None]:
metrics = trainer.evaluate()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
df = pd.DataFrame([metrics])
display(df)

Unnamed: 0,eval_loss,eval_threshold,eval_micro_precision,eval_micro_recall,eval_micro_f1,eval_hamming_loss,eval_exact_match_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.052049,0.99,0.994627,0.777871,0.872996,0.022227,0.564658,115.6827,92.65,23.167,5.0


In [14]:
os.getcwd()

'/home/onyxia/work/projet_NLP'

In [18]:
trainer.save_model("./results_BERT_base_ml")

## TEST du modèle

In [None]:
text = """
A 68-year-old man presents with a persistent cough, present for three months, accompanied by increasing shortness of breath when he exerts himself. He also complains of recent lower back pain.
He has a significant smoking history, having smoked the equivalent of 40 packs of cigarettes per year. Notably, he reports coughing up sputum tinged with blood on occasion. During the physical examination, 
the physician observes diminished breath sounds specifically in the lower portion of his right lung. An initial chest X-ray reveals a concerning mass located in the right lower lobe of the lung.
To further investigate, a CT scan of the chest is performed. This imaging confirms the presence of a 4-centimeter mass within the right lower lobe. Additionally, 
the scan reveals enlarged lymph nodes in the region of the lung's hilum (hilar lymphadenopathy). Upon further questioning, the patient admits to experiencing nocturia, characterized by the need to urinate frequently during the night, 
approximately two to three times per night, over the past six months. He initially attributed this to simply drinking more fluids before bed. He also mentions mild, intermittent lower back pain that sometimes radiates down his right leg. 
He had previously dismissed this pain as a normal consequence of aging and stiffness. His medical history includes high blood pressure (hypertension), which is currently being managed with medication. An electrocardiogram (ECG) is performed as part of the evaluation. 
The ECG reveals a left bundle branch block, which is a new finding compared to previous ECG recordings. An echocardiogram shows mild left ventricular hypertrophy. To determine the specific nature of the lung mass and assess the involvement of the lymph nodes, 
the patient is scheduled for a bronchoscopy with a biopsy. In addition, due to his reported nocturia and lower back pain, 
a prostate-specific antigen (PSA) test will be performed to evaluate prostate health. A more comprehensive cardiac assessment is planned to further investigate the newly identified left bundle branch block.
"""

In [None]:
text

"\nA 68-year-old man presents with a persistent cough, present for three months, accompanied by increasing shortness of breath when he exerts himself. He also complains of recent lower back pain.\nHe has a significant smoking history, having smoked the equivalent of 40 packs of cigarettes per year. Notably, he reports coughing up sputum tinged with blood on occasion. During the physical examination, \nthe physician observes diminished breath sounds specifically in the lower portion of his right lung. An initial chest X-ray reveals a concerning mass located in the right lower lobe of the lung.\nTo further investigate, a CT scan of the chest is performed. This imaging confirms the presence of a 4-centimeter mass within the right lower lobe. Additionally, \nthe scan reveals enlarged lymph nodes in the region of the lung's hilum (hilar lymphadenopathy). Upon further questioning, the patient admits to experiencing nocturia, characterized by the need to urinate frequently during the night, \

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("./mon_modele_final2")
tokenizer = AutoTokenizer.from_pretrained("./mon_modele_final2")

In [None]:
# Tokenization
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

# Mettre le modèle en mode évaluation
model.eval()

# Tokenisation
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Prédiction (désactive le calcul de gradients)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Appliquer une sigmoïde pour obtenir les probabilités
probs = torch.sigmoid(logits)

# Seuil pour dire si chaque label est actif ou pas (ici 0.5)
predicted_labels = (probs > 0.5).squeeze().bool().tolist()

# Affichage
print(predicted_labels)

[False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False]


In [None]:
type(predicted_labels)

list

In [None]:
import multilabel_preprocessing as mp
mp.mesh_labels_from_vector(np.array(predicted_labels))

['C04 – neoplasms',
 'C08 – respiratory tract diseases',
 'C14 – cardiovascular diseases']

## Autre

| Étape                  | Outils                         | Ce que tu fais                            |
|------------------------|--------------------------------|--------------------------------------------|
| Choix du modèle        | HuggingFace `transformers`     | Utilise un BERT médical pré-entraîné       |
| Préparation des données| `datasets`, `tokenizer`        | Tokenisation + conversion des labels       |
| Modélisation           | `AutoModelForSequenceClassification` | Déclare une classification multi-label |
| Entraînement           | `Trainer`                      | Fine-tuning du modèle sur tes données      |
| Évaluation             | `f1_score`, `hamming_loss`     | Calcul des performances globales           |

In [None]:
raise RuntimeError("⛔ Cette cellule ne doit pas être exécutée.")

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET_OUT = "s3://quentin1999/Data_Projet_NLP"
FILE_KEY_OUT_S3 = "df_target_V3.pkl"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'wb') as file_out:
    df.to_pickle(file_out)

In [None]:
raise RuntimeError("⛔ Cette cellule ne doit pas être exécutée.")

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("./mon_modele_final")
tokenizer = AutoTokenizer.from_pretrained("./mon_modele_final")

## Enregistrer le modèle BERT trained

In [None]:
import shutil
import zipfile

# === 1. Zippage du dossier ===
output_dir = ".././mon_modele_final"
zip_path = ".././mon_modele_final.zip"
shutil.make_archive(base_name="mon_modele_final", format='zip', root_dir=output_dir)

# === 2. Envoi vers S3 Vault ===
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET_OUT = "s3://quentin1999/Data_Projet_NLP"
MODEL_ZIP_KEY = "mon_modele_final.zip"
MODEL_ZIP_PATH_S3 = BUCKET_OUT + "/" + MODEL_ZIP_KEY

with fs.open(MODEL_ZIP_PATH_S3, 'wb') as f_out:
    with open(zip_path, 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)

print("✅ Modèle sauvegardé dans le Vault S3 :", MODEL_ZIP_PATH_S3)

✅ Modèle sauvegardé dans le Vault S3 : s3://quentin1999/Data_Projet_NLP/mon_modele_final.zip


In [None]:
# Téléchargement depuis S3
with fs.open(MODEL_ZIP_PATH_S3, 'rb') as f_in:
    with open("mon_modele_final.zip", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# Dézippage
shutil.unpack_archive("mon_modele_final.zip", extract_dir="./mon_modele_final")

# Chargement
model = AutoModelForSequenceClassification.from_pretrained("./mon_modele_final")
tokenizer = AutoTokenizer.from_pretrained("./mon_modele_final")

In [None]:
from sklearn.metrics import f1_score, hamming_loss
import numpy as np
import torch
from tqdm import tqdm

# 1. Mettre le modèle en mode évaluation
model.eval()

# Exemple si tu as X_test sous forme de textes
for batch in tqdm(X_test):  # ou DataLoader, selon ta structure
    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.cpu().numpy()
    y_pred_logits.append(logits)

# 3. Empiler les logits et binariser
y_pred_logits = np.vstack(y_pred_logits)         # (n_samples, n_classes)
y_pred = (y_pred_logits >= 0.5).astype(int)      # Seuil de 0.5 pour binariser

# 4. Évaluation (en supposant que y_test_array est déjà binairisé)
f1_micro = f1_score(y_test_array, y_pred, average='micro')
f1_macro = f1_score(y_test_array, y_pred, average='macro')
hamming = hamming_loss(y_test_array, y_pred)

print(f"✅ F1 Micro : {f1_micro:.4f}")
print(f"✅ F1 Macro : {f1_macro:.4f}")
print(f"🔁 Hamming Loss : {hamming:.4f}")

NameError: name 'X_test' is not defined