# BERT

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm  # Barre de progression pour Jupyter
pd.set_option('display.max_colwidth', None)
import s3fs

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset

import torch
from sklearn.metrics import (
    f1_score, precision_score, recall_score, hamming_loss, accuracy_score
)

In [2]:
data_train = pd.read_pickle('../data/df_train.pkl')
data_test = pd.read_pickle('../data/df_test.pkl')

In [4]:
print(torch.cuda.is_available())  # doit renvoyer True
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # nom de ton GPU

False


In [4]:
# Nom du mod√®le m√©dical BERT
model_name = "emilyalsentzer/Bio_ClinicalBERT"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tu dois d√©finir le nombre de classes en sortie
n_labels = 26  # par exemple
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=n_labels,
    problem_type="multi_label_classification"
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 1. Calculer le nombre de tokens pour chaque case_text
df['n_tokens'] = df['case_text'].apply(lambda x: len(tokenizer.tokenize(str(x))))

# 2. Afficher les stats descriptives
print(df['n_tokens'].describe())

count    10718.000000
mean       892.022765
std        623.373792
min         32.000000
25%        493.000000
50%        758.000000
75%       1111.000000
max      13554.000000
Name: n_tokens, dtype: float64


In [None]:
from datasets import Dataset
df = data_train.rename(columns={"target": "labels"})
df["labels"] = df["labels"].apply(lambda x: np.array(x, dtype=np.float32))
dataset = Dataset.from_pandas(df)

# Tokenizer
def tokenize(batch):
    return tokenizer(batch["case_text"], padding="max_length", truncation=True, max_length=512)

dataset = dataset.map(tokenize, batched=True)

Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10718/10718 [00:04<00:00, 2313.24 examples/s]


In [12]:
training_args = TrainingArguments(
    output_dir="../results",
    eval_strategy="epoch",              # √©valuation √† chaque √©poque
    save_strategy="epoch",                    # sauvegarde √† chaque √©poque (utile pour reprise)
    save_total_limit=2,                       # limite le nombre de checkpoints
    learning_rate=3e-5,                       # l√©g√®rement augment√© pour convergence plus rapide
    per_device_train_batch_size=4,            # r√©duit √† 4 pour √©viter OOM (GPU 16 Go)
    per_device_eval_batch_size=4,
    num_train_epochs=5,                       # plus d'√©poques si dataset pas trop gros
    weight_decay=0.01,
    fp16=True,                                # active le mixed precision training (optimis√© pour Ampere)
    gradient_accumulation_steps=4,            # simule un batch size plus grand (4x4 = 16)
    warmup_ratio=0.1,                         # warmup sur 10% des steps pour stabiliser l'entra√Ænement
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,              # r√©cup√®re le meilleur mod√®le (selon eval loss)
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    dataloader_num_workers=2,                 # l√©g√®re parall√©lisation de l‚ÄôI/O
    report_to="none",                         # d√©sactive Weights & Biases si non utilis√©
)

def exact_match_accuracy(y_true, y_pred):
    return np.all(y_true == y_pred, axis=1).mean()

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    thresholds = np.linspace(0.1, 0.99, 20)

    best_threshold = 0.5
    best_precision = 0

    for t in thresholds:
        preds = (logits >= t).astype(int)
        micro_precision = precision_score(labels, preds, average='micro', zero_division=0)
        if micro_precision > best_precision:
            best_precision = micro_precision
            best_threshold = t

    # Final prediction with the best threshold
    preds = (logits >= best_threshold).astype(int)

    return {
        'threshold': best_threshold,
        'micro_precision': precision_score(labels, preds, average='micro', zero_division=0),
        'micro_recall': recall_score(labels, preds, average='micro', zero_division=0),
        'micro_f1': f1_score(labels, preds, average='micro', zero_division=0),
        'hamming_loss': hamming_loss(labels, preds),
        'exact_match_accuracy': exact_match_accuracy(labels, preds)
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
trainer.train(resume_from_checkpoint="./results/checkpoint-3350")

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=3350, training_loss=0.0, metrics={'train_runtime': 0.4082, 'train_samples_per_second': 131291.268, 'train_steps_per_second': 8207.235, 'total_flos': 1.4136845312249856e+16, 'train_loss': 0.0, 'epoch': 5.0})

In [None]:
metrics = trainer.evaluate()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
df = pd.DataFrame([metrics])
display(df)

Unnamed: 0,eval_loss,eval_threshold,eval_micro_precision,eval_micro_recall,eval_micro_f1,eval_hamming_loss,eval_exact_match_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.052049,0.99,0.994627,0.777871,0.872996,0.022227,0.564658,115.6827,92.65,23.167,5.0


In [None]:
trainer.save_model(".././mon_modele_final2")
tokenizer.save_pretrained(".././mon_modele_final2")

('./mon_modele_final/tokenizer_config.json',
 './mon_modele_final/special_tokens_map.json',
 './mon_modele_final/vocab.txt',
 './mon_modele_final/added_tokens.json',
 './mon_modele_final/tokenizer.json')

## TEST du mod√®le

In [None]:
text = """
A 68-year-old man presents with a persistent cough, present for three months, accompanied by increasing shortness of breath when he exerts himself. He also complains of recent lower back pain.
He has a significant smoking history, having smoked the equivalent of 40 packs of cigarettes per year. Notably, he reports coughing up sputum tinged with blood on occasion. During the physical examination, 
the physician observes diminished breath sounds specifically in the lower portion of his right lung. An initial chest X-ray reveals a concerning mass located in the right lower lobe of the lung.
To further investigate, a CT scan of the chest is performed. This imaging confirms the presence of a 4-centimeter mass within the right lower lobe. Additionally, 
the scan reveals enlarged lymph nodes in the region of the lung's hilum (hilar lymphadenopathy). Upon further questioning, the patient admits to experiencing nocturia, characterized by the need to urinate frequently during the night, 
approximately two to three times per night, over the past six months. He initially attributed this to simply drinking more fluids before bed. He also mentions mild, intermittent lower back pain that sometimes radiates down his right leg. 
He had previously dismissed this pain as a normal consequence of aging and stiffness. His medical history includes high blood pressure (hypertension), which is currently being managed with medication. An electrocardiogram (ECG) is performed as part of the evaluation. 
The ECG reveals a left bundle branch block, which is a new finding compared to previous ECG recordings. An echocardiogram shows mild left ventricular hypertrophy. To determine the specific nature of the lung mass and assess the involvement of the lymph nodes, 
the patient is scheduled for a bronchoscopy with a biopsy. In addition, due to his reported nocturia and lower back pain, 
a prostate-specific antigen (PSA) test will be performed to evaluate prostate health. A more comprehensive cardiac assessment is planned to further investigate the newly identified left bundle branch block.
"""

In [None]:
text

"\nA 68-year-old man presents with a persistent cough, present for three months, accompanied by increasing shortness of breath when he exerts himself. He also complains of recent lower back pain.\nHe has a significant smoking history, having smoked the equivalent of 40 packs of cigarettes per year. Notably, he reports coughing up sputum tinged with blood on occasion. During the physical examination, \nthe physician observes diminished breath sounds specifically in the lower portion of his right lung. An initial chest X-ray reveals a concerning mass located in the right lower lobe of the lung.\nTo further investigate, a CT scan of the chest is performed. This imaging confirms the presence of a 4-centimeter mass within the right lower lobe. Additionally, \nthe scan reveals enlarged lymph nodes in the region of the lung's hilum (hilar lymphadenopathy). Upon further questioning, the patient admits to experiencing nocturia, characterized by the need to urinate frequently during the night, \

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("./mon_modele_final2")
tokenizer = AutoTokenizer.from_pretrained("./mon_modele_final2")

In [None]:
# Tokenization
inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)

# Mettre le mod√®le en mode √©valuation
model.eval()

# Tokenisation
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Pr√©diction (d√©sactive le calcul de gradients)
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# Appliquer une sigmo√Øde pour obtenir les probabilit√©s
probs = torch.sigmoid(logits)

# Seuil pour dire si chaque label est actif ou pas (ici 0.5)
predicted_labels = (probs > 0.5).squeeze().bool().tolist()

# Affichage
print(predicted_labels)

[False, False, False, True, False, False, False, True, False, False, False, False, False, True, False, False, False, False, False, False, False, False, False, False, False, False]


In [None]:
type(predicted_labels)

list

In [None]:
import multilabel_preprocessing as mp
mp.mesh_labels_from_vector(np.array(predicted_labels))

['C04 ‚Äì neoplasms',
 'C08 ‚Äì respiratory tract diseases',
 'C14 ‚Äì cardiovascular diseases']

## Autre

| √âtape                  | Outils                         | Ce que tu fais                            |
|------------------------|--------------------------------|--------------------------------------------|
| Choix du mod√®le        | HuggingFace `transformers`     | Utilise un BERT m√©dical pr√©-entra√Æn√©       |
| Pr√©paration des donn√©es| `datasets`, `tokenizer`        | Tokenisation + conversion des labels       |
| Mod√©lisation           | `AutoModelForSequenceClassification` | D√©clare une classification multi-label |
| Entra√Ænement           | `Trainer`                      | Fine-tuning du mod√®le sur tes donn√©es      |
| √âvaluation             | `f1_score`, `hamming_loss`     | Calcul des performances globales           |

In [None]:
raise RuntimeError("‚õî Cette cellule ne doit pas √™tre ex√©cut√©e.")

# Create filesystem object
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET_OUT = "s3://quentin1999/Data_Projet_NLP"
FILE_KEY_OUT_S3 = "df_target_V3.pkl"
FILE_PATH_OUT_S3 = BUCKET_OUT + "/" + FILE_KEY_OUT_S3

with fs.open(FILE_PATH_OUT_S3, 'wb') as file_out:
    df.to_pickle(file_out)

In [None]:
raise RuntimeError("‚õî Cette cellule ne doit pas √™tre ex√©cut√©e.")

from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("./mon_modele_final")
tokenizer = AutoTokenizer.from_pretrained("./mon_modele_final")

## Enregistrer le mod√®le BERT trained

In [None]:
import shutil
import zipfile

# === 1. Zippage du dossier ===
output_dir = ".././mon_modele_final"
zip_path = ".././mon_modele_final.zip"
shutil.make_archive(base_name="mon_modele_final", format='zip', root_dir=output_dir)

# === 2. Envoi vers S3 Vault ===
S3_ENDPOINT_URL = "https://" + os.environ["AWS_S3_ENDPOINT"]
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

BUCKET_OUT = "s3://quentin1999/Data_Projet_NLP"
MODEL_ZIP_KEY = "mon_modele_final.zip"
MODEL_ZIP_PATH_S3 = BUCKET_OUT + "/" + MODEL_ZIP_KEY

with fs.open(MODEL_ZIP_PATH_S3, 'wb') as f_out:
    with open(zip_path, 'rb') as f_in:
        shutil.copyfileobj(f_in, f_out)

print("‚úÖ Mod√®le sauvegard√© dans le Vault S3 :", MODEL_ZIP_PATH_S3)

‚úÖ Mod√®le sauvegard√© dans le Vault S3 : s3://quentin1999/Data_Projet_NLP/mon_modele_final.zip


In [None]:
# T√©l√©chargement depuis S3
with fs.open(MODEL_ZIP_PATH_S3, 'rb') as f_in:
    with open("mon_modele_final.zip", 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

# D√©zippage
shutil.unpack_archive("mon_modele_final.zip", extract_dir="./mon_modele_final")

# Chargement
model = AutoModelForSequenceClassification.from_pretrained("./mon_modele_final")
tokenizer = AutoTokenizer.from_pretrained("./mon_modele_final")

In [None]:
from sklearn.metrics import f1_score, hamming_loss
import numpy as np
import torch
from tqdm import tqdm

# 1. Mettre le mod√®le en mode √©valuation
model.eval()

# Exemple si tu as X_test sous forme de textes
for batch in tqdm(X_test):  # ou DataLoader, selon ta structure
    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits.cpu().numpy()
    y_pred_logits.append(logits)

# 3. Empiler les logits et binariser
y_pred_logits = np.vstack(y_pred_logits)         # (n_samples, n_classes)
y_pred = (y_pred_logits >= 0.5).astype(int)      # Seuil de 0.5 pour binariser

# 4. √âvaluation (en supposant que y_test_array est d√©j√† binairis√©)
f1_micro = f1_score(y_test_array, y_pred, average='micro')
f1_macro = f1_score(y_test_array, y_pred, average='macro')
hamming = hamming_loss(y_test_array, y_pred)

print(f"‚úÖ F1 Micro : {f1_micro:.4f}")
print(f"‚úÖ F1 Macro : {f1_macro:.4f}")
print(f"üîÅ Hamming Loss : {hamming:.4f}")

NameError: name 'X_test' is not defined