# NLP Santé mentale – Baseline SBERT vs BGE‑M3 (Kaggle)

Notebook conçu pour Kaggle avec le dataset *Mental Health Text Classification Dataset (4‑Class)* ; les fichiers sont lus depuis `base_dir` (ici `../data/`), à adapter selon l'environnement d'exécution.

- Données : fichiers `mental_heath_unbanlanced.csv` (train) et `mental_health_combined_test.csv` (test)
- Baseline : SBERT embeddings + régression logistique
- Nouveau modèle : BGE‑M3 embeddings + régression logistique

Remarque : le fichier `mental_heath_feature_engineered.csv` est optionnel et non utilisé ici.

In [4]:
# Installation des dépendances
!pip -q install -U sentence-transformers scikit-learn pandas numpy

In [5]:
import os
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
# Location of the dataset files
# Only change base_dir if the dataset folder differs
base_dir = "../data/"

# File names are spelled exactly as they appear in the dataset folder listing.
train_path, test_path = (
    os.path.join(base_dir, csv_name)
    for csv_name in ("mental_heath_unbanlanced.csv", "mental_health_combined_test.csv")
)

# Show what is actually present so a wrong base_dir is obvious immediately.
print("Fichiers disponibles dans le dossier dataset :")
print(os.listdir(base_dir))

Fichiers disponibles dans le dossier dataset :
['mental_health_combined_test.csv', 'mental_heath_unbanlanced.csv', 'mental_heath_feature_engineered.csv']


In [10]:
# Load the data
def _load_labeled_csv(csv_path):
    """Read a CSV and keep only the rows where both `text` and `status` are present."""
    return pd.read_csv(csv_path).dropna(subset=["text", "status"]).copy()


df_train = _load_labeled_csv(train_path)
df_test = _load_labeled_csv(test_path)

# Quick sanity checks: sizes, columns and class balance of the training set.
print("Train:", df_train.shape)
print("Test :", df_test.shape)
print("\nColonnes train:", list(df_train.columns))
print("\nRépartition des classes (train):")
print(df_train["status"].value_counts())

Train: (49612, 3)
Test : (992, 2)

Colonnes train: ['Unique_ID', 'text', 'status']

Répartition des classes (train):
status
Normal        18391
Depression    14506
Suicidal      11212
Anxiety        5503
Name: count, dtype: int64


In [11]:
# Minimal EDA: text lengths (characters and whitespace-separated words)
train_text = df_train["text"].astype(str)  # cast once, reused for both measures
df_train["text_len"] = train_text.str.len()
df_train["word_count"] = train_text.str.split().map(len)

# Upper percentiles show how long the tail of very long texts is.
df_train[["text_len", "word_count"]].describe(percentiles=[0.5, 0.9, 0.95, 0.99])

Unnamed: 0,text_len,word_count
count,49612.0,49612.0
mean,399.227143,78.229199
std,617.780591,122.187106
min,7.0,1.0
50%,242.0,47.0
90%,923.0,182.0
95%,1114.45,220.0
99%,2345.56,449.89
max,38785.0,9684.0


In [12]:
# Label encoding: the mapping is learned on the training labels only and then
# reused as-is for the test labels.
label_encoder = LabelEncoder()
label_encoder.fit(df_train["status"])
y = label_encoder.transform(df_train["status"])
y_test = label_encoder.transform(df_test["status"])

# Plain Python lists of strings are what the embedding models receive later on.
X, X_test = (frame["text"].astype(str).tolist() for frame in (df_train, df_test))

print("Classes:", list(label_encoder.classes_))

Classes: ['Anxiety', 'Depression', 'Normal', 'Suicidal']


In [13]:
# Train / validation split
# The test set stays separate and is not used during training.
# Stratified on y so both parts keep the class proportions; fixed seed for repeatability.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train split:", len(X_tr))
print("Val split  :", len(X_val))

Train split: 39689
Val split  : 9923


In [14]:
# On-disk embedding cache so a re-run does not recompute the embeddings
def embed_with_cache(model, texts, path, batch_size=32):
    """Encode ``texts`` with ``model``, caching the resulting array at ``path``.

    Parameters
    ----------
    model : object
        Anything exposing ``encode(texts, batch_size=..., show_progress_bar=...,
        convert_to_numpy=...)`` and returning one row per input text.
    texts : sequence of str
        Texts to embed.
    path : str or os.PathLike
        Location of the ``.npy`` cache file. The suffix is added when missing.
    batch_size : int, default 32
        Forwarded to ``model.encode``.

    Returns
    -------
    numpy.ndarray
        The cached array when it has one row per text, otherwise a freshly
        computed array (which is then written to ``path``).
    """
    path = os.fspath(path)
    # np.save appends ".npy" when it is missing; normalise first so the
    # existence check below looks at the file that is actually written.
    if not path.endswith(".npy"):
        path += ".npy"

    if os.path.exists(path):
        cached = np.load(path)
        # Only trust the cache when it has one row per input text; a row-count
        # mismatch means it was produced from different data.
        if cached.ndim > 0 and cached.shape[0] == len(texts):
            return cached
        print(f"Stale embedding cache ignored (row count mismatch): {path}")

    # Create the target directory BEFORE encoding so that an unwritable location
    # fails immediately rather than after the expensive encoding step.
    cache_dir = os.path.dirname(path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    emb = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )
    np.save(path, emb)
    return emb

In [15]:
# Train and evaluate a simple classifier
def train_and_eval(emb_tr, emb_val, emb_test, y_tr, y_val, y_test, class_names):
    """Fit a class-balanced logistic regression on embeddings and score it.

    Parameters
    ----------
    emb_tr, emb_val, emb_test : array-like
        Feature matrices for the training, validation and test splits.
    y_tr, y_val, y_test : array-like
        Labels matching the three feature matrices.
    class_names : sequence of str
        Display names for the classes, in the same order as the sorted label
        values seen during fitting (``clf.classes_``).

    Returns
    -------
    tuple
        ``(clf, val_pred, test_pred, val_report, test_report)`` where the two
        reports are the dictionaries produced by ``classification_report``
        with ``output_dict=True``.
    """
    clf = LogisticRegression(max_iter=2000, n_jobs=-1, class_weight="balanced")
    clf.fit(emb_tr, y_tr)

    val_pred = clf.predict(emb_val)
    test_pred = clf.predict(emb_test)

    # Pin the reported label set to the classes seen at fit time. Without
    # `labels`, classification_report infers the classes from each split and
    # rejects `target_names` when a class is absent from that split.
    report_kwargs = dict(labels=clf.classes_, target_names=class_names, output_dict=True)
    val_report = classification_report(y_val, val_pred, **report_kwargs)
    test_report = classification_report(y_test, test_pred, **report_kwargs)

    return clf, val_pred, test_pred, val_report, test_report

## Baseline : SBERT embeddings + régression logistique

In [None]:
import os
# Sets the PyTorch MPS fallback flag for the rest of this kernel session.
# NOTE(review): PyTorch presumably reads this variable when `torch` is first
# imported, and `sentence_transformers` is already imported in an earlier
# cell — confirm that setting it here still takes effect.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


In [None]:
# Baseline SBERT model (solid general-purpose embeddings)
sbert_name = "sentence-transformers/all-mpnet-base-v2"

# Load by name so the model id lives in one place. Encoding is pinned to CPU;
# remove `device="cpu"` to let sentence-transformers choose the device itself.
sbert = SentenceTransformer(sbert_name, device="cpu")

# Optional speed-up (possible quality loss)
# sbert.max_seq_length = 256

# Keep the embedding cache in the Kaggle output folder when it exists, otherwise
# in the current directory, so the cell also runs outside Kaggle.
work_dir = "/kaggle/working" if os.path.isdir("/kaggle/working") else "."

X_tr_sbert = embed_with_cache(sbert, X_tr, os.path.join(work_dir, "X_tr_sbert.npy"), batch_size=64)
X_val_sbert = embed_with_cache(sbert, X_val, os.path.join(work_dir, "X_val_sbert.npy"), batch_size=64)
X_test_sbert = embed_with_cache(sbert, X_test, os.path.join(work_dir, "X_test_sbert.npy"), batch_size=64)

clf_sbert, val_pred_sbert, test_pred_sbert, val_rep_sbert, test_rep_sbert = train_and_eval(
    X_tr_sbert, X_val_sbert, X_test_sbert, y_tr, y_val, y_test, label_encoder.classes_
)

# Human-readable reports; the dict versions returned above feed the comparison table.
print("SBERT - validation")
print(classification_report(y_val, val_pred_sbert, target_names=label_encoder.classes_))
print("SBERT - test")
print(classification_report(y_test, test_pred_sbert, target_names=label_encoder.classes_))

In [None]:
# SBERT confusion matrix on the test set, labelled with the class names
# (rows = true class, columns = predicted class). `labels` pins the matrix to
# every encoded class so its shape does not depend on which classes occur.
pd.DataFrame(
    confusion_matrix(
        y_test,
        test_pred_sbert,
        labels=np.arange(len(label_encoder.classes_)),
    ),
    index=pd.Index(label_encoder.classes_, name="réel"),
    columns=pd.Index(label_encoder.classes_, name="prédit"),
)

## Nouveau modèle : BGE‑M3 embeddings + régression logistique

In [None]:
# BGE-M3 model (recent embeddings)
bge_name = "BAAI/bge-m3"
bge = SentenceTransformer(bge_name)

# Optional speed-up (lowers the maximum sequence length that is processed)
# bge.max_seq_length = 256

# Keep the embedding cache in the Kaggle output folder when it exists, otherwise
# in the current directory, so the cell also runs outside Kaggle.
work_dir = "/kaggle/working" if os.path.isdir("/kaggle/working") else "."

X_tr_bge = embed_with_cache(bge, X_tr, os.path.join(work_dir, "X_tr_bge.npy"), batch_size=32)
X_val_bge = embed_with_cache(bge, X_val, os.path.join(work_dir, "X_val_bge.npy"), batch_size=32)
X_test_bge = embed_with_cache(bge, X_test, os.path.join(work_dir, "X_test_bge.npy"), batch_size=32)

clf_bge, val_pred_bge, test_pred_bge, val_rep_bge, test_rep_bge = train_and_eval(
    X_tr_bge, X_val_bge, X_test_bge, y_tr, y_val, y_test, label_encoder.classes_
)

# Human-readable reports; the dict versions returned above feed the comparison table.
print("BGE-M3 - validation")
print(classification_report(y_val, val_pred_bge, target_names=label_encoder.classes_))
print("BGE-M3 - test")
print(classification_report(y_test, test_pred_bge, target_names=label_encoder.classes_))

In [None]:
# BGE-M3 confusion matrix on the test set, labelled with the class names
# (rows = true class, columns = predicted class). `labels` pins the matrix to
# every encoded class so its shape does not depend on which classes occur.
pd.DataFrame(
    confusion_matrix(
        y_test,
        test_pred_bge,
        labels=np.arange(len(label_encoder.classes_)),
    ),
    index=pd.Index(label_encoder.classes_, name="réel"),
    columns=pd.Index(label_encoder.classes_, name="prédit"),
)

## Comparaison synthétique

In [None]:
# Comparison table (macro F1 and accuracy)
def extract_metrics(rep):
    """Pull the headline numbers out of a ``classification_report`` dictionary.

    ``rep`` must contain an ``"accuracy"`` entry and a ``"macro avg"`` mapping
    with ``"f1-score"``, ``"precision"`` and ``"recall"``. Returns a flat dict
    with the keys ``accuracy``, ``f1_macro``, ``precision_macro`` and
    ``recall_macro``, in that order.
    """
    summary = {"accuracy": rep["accuracy"]}
    macro_avg = rep["macro avg"]
    # (output column, key inside the "macro avg" section)
    for column, report_key in (
        ("f1_macro", "f1-score"),
        ("precision_macro", "precision"),
        ("recall_macro", "recall"),
    ):
        summary[column] = macro_avg[report_key]
    return summary

# One entry per (model, split) pair, in the order the rows appear in the table.
reports = [
    ("SBERT", "validation", val_rep_sbert),
    ("SBERT", "test", test_rep_sbert),
    ("BGE-M3", "validation", val_rep_bge),
    ("BGE-M3", "test", test_rep_bge),
]
rows = [
    {"modele": model_label, "split": split_label, **extract_metrics(report)}
    for model_label, split_label, report in reports
]

df_compare = pd.DataFrame(rows)
df_compare

In [None]:
# Save the metrics table (handy for the report)
# Write to the Kaggle output folder when it exists, otherwise to the current
# directory, so the cell also runs outside Kaggle.
work_dir = "/kaggle/working" if os.path.isdir("/kaggle/working") else "."
out_path = os.path.join(work_dir, "metrics_compare.csv")
df_compare.to_csv(out_path, index=False)
out_path

## Notes

- Le jeu `mental_health_combined_test.csv` est réservé à l’évaluation finale.
- Le fichier `mental_heath_feature_engineered.csv` est utile pour des baselines classiques, mais pas nécessaire pour SBERT/BGE.
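- Si la mémoire GPU est limitée, diminuer `batch_size` ou fixer `max_seq_length = 256` sur le modèle (par ex. `bge.max_seq_length = 256`), ce qui réduit la longueur maximale traitée et peut dégrader la qualité sur les textes longs.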