# 🤖 04 - Modèle avancé BERT (Transformers HuggingFace)
Ce notebook permet d'entraîner un modèle BERT pour la classification de sentiment.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizerFast, TFBertForSequenceClassification
import tensorflow as tf
import mlflow
import mlflow.transformers

## 📥 Chargement des données nettoyées

In [None]:
# Load the cleaned tweets CSV.
df = pd.read_csv("../data/processed/tweets_clean.csv")

# A tweet whose cleaned text is empty is stored as an empty CSV field and is
# read back as NaN. The tokenizer only accepts strings, so rows with a missing
# text or label are dropped before splitting.
df = df.dropna(subset=['clean_text', 'sentiment'])

# Force a string dtype so purely numeric-looking texts are not passed on as numbers.
X = df['clean_text'].astype(str)
# NOTE(review): the classifier built later uses num_labels=2 with a sparse
# categorical loss, which expects integer labels in {0, 1} - confirm that the
# 'sentiment' column is encoded that way.
y = df['sentiment']

# Stratified 80/20 split with a fixed seed so the split is reproducible and
# both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

## 🔠 Tokenisation avec `bert-base-uncased`

In [None]:
# Fast WordPiece tokenizer matching the `bert-base-uncased` checkpoint.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# Same tokenization options for both splits: sequences longer than 64 tokens
# are truncated, and each split is padded to its own longest sequence.
_encode_options = dict(truncation=True, padding=True, max_length=64)

train_encodings = tokenizer(list(X_train), **_encode_options)
test_encodings = tokenizer(list(X_test), **_encode_options)

## 🔄 Conversion en TensorFlow Dataset

In [None]:
# Training pipeline: (encodings dict, label) pairs in batches of 16.
# Keras ignores the `shuffle` argument of fit() when given a tf.data.Dataset,
# so the shuffling has to happen here. The buffer spans the whole training set
# and reshuffles on every epoch; the seed keeps runs repeatable.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train.values))
    .shuffle(buffer_size=len(y_train), seed=42)
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

# Evaluation pipeline: deliberately NOT shuffled, so predictions made on it
# stay aligned row-for-row with y_test.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test.values))
    .batch(16)
    .prefetch(tf.data.AUTOTUNE)
)

## 🧠 Entraînement du modèle BERT

In [None]:
# Fine-tune BERT inside an MLflow run so that the parameter, the metrics and
# the trained model are all attached to the same tracked run.
with mlflow.start_run(run_name="BERT_base_sentiment"):

    # Pre-trained checkpoint configured with a 2-class classification head.
    model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    # from_logits=True: the loss applies the softmax itself on the model output.
    # Named `loss_fn` so it does not share a name with the evaluation result below.
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")

    model.compile(optimizer=optimizer, loss=loss_fn, metrics=[metric])

    # NOTE(review): the test split is also used as validation data. That is
    # harmless while nothing is tuned on it, but the reported test score stops
    # being unbiased as soon as early stopping or model selection relies on it.
    model.fit(train_dataset, validation_data=test_dataset, epochs=2)

    # Evaluation
    loss, acc = model.evaluate(test_dataset)
    print(f"✅ Accuracy test : {acc:.4f}")

    mlflow.log_param("model", "bert-base-uncased")
    mlflow.log_metric("test_accuracy", acc)
    mlflow.log_metric("test_loss", loss)

    # mlflow.transformers.log_model expects either a Pipeline or a dict of
    # pipeline components; it has no `tokenizer` keyword. The model and its
    # tokenizer are therefore passed together as a components dict, with the
    # task stated explicitly rather than left to be inferred.
    mlflow.transformers.log_model(
        transformers_model={"model": model, "tokenizer": tokenizer},
        artifact_path="model",
        task="text-classification",
    )

## 🧾 Évaluation détaillée

In [None]:
# Run the fine-tuned model over the test batches.
# Despite its name, `y_probs` holds the raw logits, not softmax probabilities;
# the argmax is the same either way.
_test_output = model.predict(test_dataset)
y_probs = _test_output.logits
y_preds = np.argmax(y_probs, axis=1)

# Per-class precision / recall / F1 against the true test labels.
_report = classification_report(y_test, y_preds)
print(_report)