In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Évite les warnings de parallélisme

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    EvalPrediction
)
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# -----------------------------
# Chargement et préparation des données
# -----------------------------

# Read the pre-encoded train and test CSV files (train first, then test).
full_train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("train_encoded.csv", "test_encoded.csv")
)

# Quick look at the first rows of the training data before splitting.
print("Données d'entraînement :")
print(full_train_df.head())

# Hold out 20% of the training rows as a validation set, stratified on the
# encoded label, with a fixed seed so the split is reproducible.
split_options = dict(
    test_size=0.2,
    stratify=full_train_df['Emotion_encoded'],
    random_state=42,
)
train_df, val_df = train_test_split(full_train_df, **split_options)

for split_title, split_frame in (
    ("Taille entraînement :", train_df),
    ("Taille validation :", val_df),
):
    print(split_title, len(split_frame))

# -----------------------------
# Recréer le label_encoder à partir des labels textuels (si nécessaire)
# -----------------------------

# Determine the number of classes.
# - Without a textual 'Emotion' column, count the distinct encoded labels
#   found in train + validation.
# - With it, rebuild a LabelEncoder from the emotions of every split
#   (train, validation and test) and count its classes.
if 'Emotion' not in train_df.columns:
    encoded_labels = pd.concat([train_df['Emotion_encoded'], val_df['Emotion_encoded']])
    num_labels = len(encoded_labels.unique())
else:
    emotion_columns = [frame['Emotion'] for frame in (train_df, val_df, test_df)]
    all_emotions = pd.concat(emotion_columns).unique()
    label_encoder = LabelEncoder().fit(all_emotions)
    num_labels = len(label_encoder.classes_)

# -----------------------------
# Conversion en Dataset Hugging Face
# -----------------------------

# Convert each split to a Hugging Face Dataset, keeping only the text column
# and the encoded label.
# preserve_index=False: train_df / val_df come out of train_test_split with a
# shuffled, non-default index, which from_pandas would otherwise export as an
# extra "__index_level_0__" column on those two splits only. Dropping it keeps
# the three datasets on the same schema.
MODEL_COLUMNS = ['Processed_Comment', 'Emotion_encoded']
train_dataset = Dataset.from_pandas(train_df[MODEL_COLUMNS], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[MODEL_COLUMNS], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[MODEL_COLUMNS], preserve_index=False)

# -----------------------------
# Tokenisation
# -----------------------------

MODEL_NAME = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(batch):
    """Tokenize the ``Processed_Comment`` entries of a batch.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        batch: Mapping with a ``"Processed_Comment"`` entry holding the text(s)
            to encode.

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    comments = batch["Processed_Comment"]
    encoding_options = {
        "truncation": True,
        # Use padding=True instead to pad dynamically with DataCollatorWithPadding.
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(comments, **encoding_options)

# Tokenize every split in batches, then rename the label column to "labels"
# so that the model recognizes it.
raw_splits = {
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
}
tokenized_datasets = {
    split_name: split_data
    .map(tokenize_function, batched=True)
    .rename_column("Emotion_encoded", "labels")
    for split_name, split_data in raw_splits.items()
}

# -----------------------------
# Modèle
# -----------------------------
# -----------------------------
# Pondération des classes (pour déséquilibre)
# -----------------------------

from sklearn.utils.class_weight import compute_class_weight
import torch

# Balanced class weights, computed from the training labels only.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_df['Emotion_encoded']),
    y=train_df['Emotion_encoded']
)
class_weights = torch.tensor(class_weights, dtype=torch.float)
print("Poids des classes :", class_weights)

# Load the pretrained checkpoint once, configured for num_labels classes.
# The previous version assigned `model.loss_fct = CrossEntropyLoss(...)` and
# then immediately re-created `model` with a second from_pretrained call,
# which threw that assignment away and loaded the checkpoint twice.
# NOTE: an attribute set on the model is not what the forward pass uses to
# compute its loss; for class_weights to take effect they must be applied
# where the loss is computed by the training loop (e.g. a Trainer subclass
# overriding compute_loss).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)

# -----------------------------
# Métriques d'évaluation
# -----------------------------

def compute_metrics(p: EvalPrediction):
    """Compute accuracy and macro-averaged F1 for an evaluation pass.

    Args:
        p: Evaluation output exposing ``predictions`` (per-class scores,
            reduced with an arg-max over axis 1) and ``label_ids``.

    Returns:
        Dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    true_ids = p.label_ids
    predicted_ids = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(true_ids, predicted_ids)
    macro_f1 = f1_score(true_ids, predicted_ids, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

# -----------------------------
# Entraînement
# -----------------------------
class WeightedLossTrainer(Trainer):
    """Trainer that uses a class-weighted cross-entropy loss.

    A plain ``Trainer`` relies on the loss returned by the model itself, so the
    ``class_weights`` tensor computed for the imbalanced classes would never be
    used. This subclass computes the loss from the logits with those weights.

    Args:
        class_weights: Optional 1-D float tensor with one weight per class.
            When ``None`` the loss is an unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the weighted cross-entropy loss (and the outputs if requested).

        ``**kwargs`` absorbs extra arguments that some transformers versions
        pass to ``compute_loss`` (e.g. ``num_items_in_batch``).
        """
        labels = inputs["labels"]
        # Call the model without labels so it does not compute its own loss;
        # `inputs` itself is left untouched for the caller.
        features = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**features)
        logits = outputs.logits

        weights = None
        if self.class_weights is not None:
            # The weights are created on CPU; move them next to the logits.
            weights = self.class_weights.to(device=logits.device, dtype=logits.dtype)

        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            weight=weights,
        )
        return (loss, outputs) if return_outputs else loss


# Fail early with a clear message instead of a shape error inside the loss.
if class_weights.numel() != model.config.num_labels:
    raise ValueError(
        f"class_weights has {class_weights.numel()} entries but the model "
        f"expects {model.config.num_labels} labels"
    )

# NOTE(review): eval_steps presumably only takes effect when the evaluation
# strategy is set to "steps" (the argument name depends on the transformers
# version); as written, evaluation likely only happens in explicit
# trainer.evaluate() calls. Confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=10,
    do_eval=True,
    eval_steps=100,
    save_steps=100,
    learning_rate=2e-5,
    report_to="none",
    load_best_model_at_end=False,
    save_total_limit=2,
    # metric_for_best_model="eval_loss"  # only works if load_best_model_at_end=True AND evaluation_strategy set (skip in old versions)
)

trainer = WeightedLossTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,  # lets the trainer save the tokenizer with the model
    class_weights=class_weights,  # applied in WeightedLossTrainer.compute_loss
)

# Run the fine-tuning.
trainer.train()

# -----------------------------
# Final evaluation on the test split
# -----------------------------

test_split = tokenized_datasets["test"]
test_results = trainer.evaluate(eval_dataset=test_split)
print("\nRésultats sur le jeu de test :", test_results)

# -----------------------------
# Saving
# -----------------------------

# Write the model first, then the tokenizer, into the same directory.
SAVE_DIR = "./finetuned-bert-sentiment"
for artifact in (model, tokenizer):
    artifact.save_pretrained(SAVE_DIR)



  from .autonotebook import tqdm as notebook_tqdm


ImportError: Traceback (most recent call last):
  File "c:\Users\WSI\AppData\Local\Programs\Python\Python312\Lib\site-packages\tensorflow\python\pywrap_tensorflow.py", line 73, in <module>
    from tensorflow.python._pywrap_tensorflow_internal import *
ImportError: DLL load failed while importing _pywrap_tensorflow_internal: Une routine d’initialisation d’une bibliothèque de liens dynamiques (DLL) a échoué.


Failed to load the native TensorFlow runtime.
See https://www.tensorflow.org/install/errors for some common causes and solutions.
If you need help, create an issue at https://github.com/tensorflow/tensorflow/issues and include the entire stack trace above this error message.

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the checkpoint from the directory the training cell writes with
# save_pretrained ("./finetuned-bert-sentiment"). The previous value,
# "finetuned-bert-emotion", is not produced by any visible cell of this
# notebook, so a fresh Restart & Run All could not find it (or would pick up
# a stale model from an earlier run).
model_path = "finetuned-bert-sentiment"  # or os.path.abspath("finetuned-bert-sentiment")

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
# Switch to inference mode.
model.eval()

def predict_emotion(text: str):
    """Predict the emotion class id for a single text.

    Args:
        text: Input string; it is tokenized with truncation to 128 tokens.

    Returns:
        Tuple ``(pred_id, confidence)``: the arg-max index of the logits and
        the largest softmax probability, both as plain Python numbers.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128, padding=True)
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**encoded)
    scores = outputs.logits
    probabilities = torch.softmax(scores, dim=1)
    return scores.argmax().item(), probabilities.max().item()


# Smoke test on a single sentence.
sample_prediction = predict_emotion("I love this!")
print(sample_prediction)

Pipeline en cours d'exécution

* D’abord, un modèle fiable et rapide (DistilBERT)
* Ensuite, le rendre transparent (LIME)
* Enfin, l’intégrer dans un système conversationnel utile (LLM + éthique)