# Model BERT

In [14]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split

import transformers
from tokenizers import BertWordPieceTokenizer


print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


## Chargement des données

In [15]:
df = pd.read_csv('./datasets/test_filtered_data.csv')

In [16]:
test_portion=.2
comments = df['comment_text']
y = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'isToxic']] # Multilabel
# y = df['isToxic'] # Binary

In [17]:
X_train, X_test, y_train, y_test = train_test_split(comments, y, test_size=test_portion)

## Fonction d'encodage rapide

Cette fonction encode rapidement les textes en séquences d'entiers en utilisant un tokenizer BERT

In [18]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    Encoder for encoding the text into sequence of integers for BERT Input
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [19]:
AUTO = tf.data.experimental.AUTOTUNE

strategy = tf.distribute.get_strategy()
# Configuration
EPOCHS = 6
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192

Tokenizer pré-entraîné BERT (DistilBERT)

In [20]:
# First load the real tokenizer
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')
# Save the loaded tokenizer locally
tokenizer.save_pretrained('.')
# Reload it with the huggingface tokenizers library
fast_tokenizer = BertWordPieceTokenizer('vocab.txt', lowercase=False)
fast_tokenizer

Tokenizer(vocabulary_size=119547, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=False, wordpieces_prefix=##)

## Encodage des données d'entrainement

In [21]:
X_train = fast_encode(X_train, fast_tokenizer, maxlen=MAX_LEN)
X_test = fast_encode(X_test, fast_tokenizer, maxlen=MAX_LEN)

100%|██████████| 47/47 [00:00<00:00, 154.96it/s]
100%|██████████| 12/12 [00:00<00:00, 165.42it/s]


## Création des ensembles de données

In [22]:
# Créez vos datasets
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test, y_test))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(X_test)
    .batch(BATCH_SIZE)
)

Configuration GPU

In [23]:
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=1.0)
config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
session = tf.compat.v1.Session(config=config)

2024-03-15 15:46:30.853415: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-15 15:46:30.853500: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-15 15:46:30.853519: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-15 15:46:30.853911: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-03-15 15:46:30.853930: I tensorflow/core/common_runtime/gpu/gpu

## Construction du modèle BERT

In [24]:
def build_model(transformer, max_len=512):
    """
    function for training the BERT model
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    # out = Dense(1, activation='sigmoid')(cls_token) # binaire
    out = Dense(7, activation='sigmoid')(cls_token) # multilabel
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [25]:
%%time
with strategy.scope():
    transformer_layer = (
        transformers.TFDistilBertModel
        .from_pretrained('distilbert-base-multilingual-cased')
    )
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_word_ids (InputLayer  [(None, 192)]             0         
 )                                                               
                                                                 
 tf_distil_bert_model_1 (TF  TFBaseModelOutput(last_   134734080 
 DistilBertModel)            hidden_state=(None, 192             
                             , 768),                             
                              hidden_states=None, at             
                             tentions=None)                      
                                                                 
 tf.__operators__.getitem_1  (None, 768)               0         
  (SlicingOpLambda)                                              
                                                                 
 dense_1 (Dense)             (None, 7)                 5383

## Entrainement du modèle

In [26]:
n_steps = X_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Epoch 1/6


2024-03-15 15:46:45.069832: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f95450cbc60 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-03-15 15:46:45.069874: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4070 Laptop GPU, Compute Capability 8.9
2024-03-15 15:46:45.076743: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-03-15 15:46:45.097346: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1710514005.159368   62801 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


## Metrics

In [32]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc
from sklearn.metrics import precision_score, recall_score, f1_score


def roc_auc(predictions,target):
    '''
    This methods returns the AUC Score when given the Predictions
    and Labels
    '''
    
    fpr, tpr, thresholds = roc_curve(target, predictions)
    roc_auc = auc(fpr, tpr)
    return roc_auc

In [30]:
# display en BERT binaire
# def display_metrics(model, X_test, y_test):
#     # prédictions et vraies valeurs
#     y_pred = model.predict(X_test)
#     y_true = y_test

#     # Evaluation du modèle
#     evaluate = model.evaluate(X_test, y_test, verbose=0)

#     # Affichage de la loss
#     loss = evaluate[0]
#     print(f"Loss: {loss} ")

#     # Affichage de l'accuracy
#     accuracy_score = evaluate[1]
#     print(f"Accuracy: {accuracy_score} ")

#     # Affichage de l'aire sous la courbe ROC
#     roc = roc_auc(y_pred, y_true)
#     print(f"Auc: {roc} ")

#     # Arrondir les prédictions
#     y_pred = y_pred.round()

#     # Affichage de la matrice de confusion
#     conf = confusion_matrix(y_true, y_pred)
#     print(f"Matrice de confusion : {conf}")

#     # Affichage de la précision
#     precision = precision_score(y_true, y_pred)
#     print(f"Précision : {precision}")

#     # Affichage du rappel
#     recall = recall_score(y_true, y_pred)
#     print(f"Rappel : {recall}")

#     # Affichage du score F1
#     f1 = f1_score(y_true, y_pred)
#     print(f"F1 : {f1}")
    

In [33]:
multi_metrics_score = pd.DataFrame(columns=['model', 'roc_auc', 'precision_score', 'recall_score', 'f1_score'])
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'isToxic']
import statistics


def roc_auc_multilabel(y_pred, y_true):
    # Calculer l'AUC-ROC pour chaque classe

    y_pred_df = pd.DataFrame(y_pred, columns=categories)
    y_true_df = pd.DataFrame(y_true, columns=categories)

    roc_auc_list = []

    for i in range(len(categories)):
        y_true_cat = y_true_df[categories[i]]
        y_pred_cat = y_pred_df[categories[i]]
        roc_auc_list.append(roc_auc(y_pred_cat, y_true_cat))

    return statistics.mean(roc_auc_list)


def display_multi_metrics(model, X_test, y_test, name):
    # prédictions et vraies valeurs
    y_pred = model.predict(X_test)
    y_true = y_test

    # Affichage de l'aire sous la courbe ROC
    roc = roc_auc_multilabel(y_pred, y_true)
    print(f"Auc: {roc} ")

    # Arrondir les prédictions
    y_pred = y_pred.round()

    # Affichage de la précision
    precision = precision_score(y_true, y_pred, average='weighted')
    print(f"Précision : {precision}")

    # Affichage du rappel
    recall = recall_score(y_true, y_pred, average='weighted')
    print(f"Rappel : {recall}")

    # Affichage du score F1
    f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Score F1 : {f1}")

    return {
        'model': name,
        'roc_auc': roc,
        'precision_score': precision,
        'recall_score': recall,
        'f1_score': f1
    }

# Affichage des métriques
metrics = display_multi_metrics(model, X_test, y_test, 'BERT maultilabel model')

# Enregistrement des metrics dans le dataframe
multi_metrics_score = pd.concat([multi_metrics_score, pd.DataFrame.from_dict([metrics])], ignore_index=True)

Auc: 0.9447902716902857 
Précision : 0.7748923863537323
Rappel : 0.692822966507177
Score F1 : 0.7212475944986192


  _warn_prf(average, modifier, msg_start, len(result))
  multi_metrics_score = pd.concat([multi_metrics_score, pd.DataFrame.from_dict([metrics])], ignore_index=True)


In [34]:
# display_metrics(model, X_test, y_test) # Affichage des métriques en BERT binaire