# NLP Santé mentale – Baseline SBERT vs BGE‑M3

Notebook prêt à exécuter sur Kaggle avec le dataset *Mental Health Text Classification Dataset (4‑Class)*.

- Données : fichiers `mental_heath_unbanlanced.csv` (train) et `mental_health_combined_test.csv` (test)
- Baseline : SBERT embeddings + régression logistique
- Nouveau modèle : BGE‑M3 embeddings + régression logistique

Remarque : le fichier `mental_heath_feature_engineered.csv` est optionnel et non utilisé ici.

In [None]:
!pip -q install sentence-transformers --no-deps


In [None]:
import numpy as np
import pandas as pd
import sklearn
import torch
from sentence_transformers import SentenceTransformer

# Report the versions of the core libraries, one per line, for reproducibility.
for lib_label, lib_module in (
    ("numpy", np),
    ("pandas", pd),
    ("sklearn", sklearn),
    ("torch", torch),
):
    print(lib_label, lib_module.__version__)


In [None]:
# Resolve an output directory usable both on Kaggle and on a local machine:
# take the Kaggle working directory when it exists, otherwise use ./models.
import os

OUTPUT_DIR = os.environ.get('KAGGLE_WORKING_DIR', '/kaggle/working')
OUTPUT_DIR = OUTPUT_DIR if os.path.isdir(OUTPUT_DIR) else 'models'
os.makedirs(OUTPUT_DIR, exist_ok=True)
print('OUTPUT_DIR =', OUTPUT_DIR)


In [None]:
import os
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Locations of the Kaggle dataset files.
# Only adjust base_dir if the Kaggle input folder is named differently.
base_dir = "/kaggle/input/mental-health-text-classification-dataset"

train_path, test_path = (
    os.path.join(base_dir, csv_name)
    for csv_name in ("mental_heath_unbanlanced.csv", "mental_health_combined_test.csv")
)

print("Fichiers disponibles dans le dossier dataset :")
print(os.listdir(base_dir))

In [None]:
# Load both CSV files, discarding rows that lack a text or a label.
df_train = pd.read_csv(train_path)
df_train = df_train.dropna(subset=["text", "status"]).copy()

df_test = pd.read_csv(test_path)
df_test = df_test.dropna(subset=["text", "status"]).copy()

print("Train:", df_train.shape)
print("Test :", df_test.shape)
print("\nColonnes train:", list(df_train.columns))
print("\nRépartition des classes (train):")
print(df_train["status"].value_counts())

In [None]:
# Minimal EDA: distribution of text lengths, in characters and in words.
text_as_str = df_train["text"].astype(str)
df_train["text_len"] = text_as_str.str.len()
df_train["word_count"] = text_as_str.str.split().map(len)

df_train[["text_len", "word_count"]].describe(percentiles=[0.5, 0.9, 0.95, 0.99])

In [None]:
# Encode the string labels as integers; the encoder is fitted on the training labels.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_train["status"])
y_test = label_encoder.transform(df_test["status"])

# Raw texts as plain Python lists of str, ready for the embedding models.
X, X_test = (frame["text"].astype(str).tolist() for frame in (df_train, df_test))

print("Classes:", list(label_encoder.classes_))

In [None]:
# Hold out 20% of the training data for validation, stratified by label.
# The separate test set is never used during training.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Train split: {len(X_tr)}")
print(f"Val split  : {len(X_val)}")

In [None]:
# On-disk embedding cache so that re-running the notebook does not recompute them.
def embed_with_cache(model, texts, path, batch_size=32):
    """Return embeddings for *texts*, loading them from *path* when cached.

    Args:
        model: Object exposing ``encode(texts, batch_size=...,
            show_progress_bar=..., convert_to_numpy=...)``.
        texts: Sized collection of strings to embed.
        path: Location of the ``.npy`` cache file. The ``.npy`` suffix is
            added when missing, mirroring what ``np.save`` does on write.
        batch_size: Batch size forwarded to ``model.encode``.

    Returns:
        The array returned by ``model.encode``, or the cached copy of it.
    """
    path = os.fspath(path)
    # np.save silently appends ".npy" to suffix-less names; normalise first so
    # the existence check looks at the file that is actually written.
    if not path.endswith(".npy"):
        path += ".npy"

    if os.path.exists(path):
        cached = np.load(path)
        # Reuse the cache only when it has one row per input text; otherwise
        # it was computed for different data and must be rebuilt.
        if cached.ndim >= 1 and cached.shape[0] == len(texts):
            return cached

    emb = model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        convert_to_numpy=True
    )

    # Make sure the target directory exists before writing the cache.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    np.save(path, emb)
    return emb

In [None]:
# Fit a simple classifier on the embeddings and score it on validation and test.
def train_and_eval(emb_tr, emb_val, emb_test, y_tr, y_val, y_test, class_names):
    """Train a class-balanced logistic regression and evaluate it.

    Returns:
        A 5-tuple ``(clf, val_pred, test_pred, val_report, test_report)``;
        both reports are ``classification_report`` dictionaries.
    """
    def _report(y_true, y_pred):
        return classification_report(
            y_true, y_pred, target_names=class_names, output_dict=True
        )

    clf = LogisticRegression(class_weight="balanced", max_iter=2000, n_jobs=-1)
    clf.fit(emb_tr, y_tr)

    val_pred, test_pred = clf.predict(emb_val), clf.predict(emb_test)

    return clf, val_pred, test_pred, _report(y_val, val_pred), _report(y_test, test_pred)

## Baseline : SBERT embeddings + régression logistique

In [None]:
# SBERT baseline model (strong general-purpose sentence embeddings)
sbert_name = "sentence-transformers/all-mpnet-base-v2"
sbert = SentenceTransformer(sbert_name)

# Optional speed-up, at a possible cost in quality:
# sbert.max_seq_length = 256

# Cache the embeddings under OUTPUT_DIR (resolved at the top of the notebook)
# rather than a hard-coded /kaggle/working path, so the cell also runs locally.
X_tr_sbert = embed_with_cache(sbert, X_tr, os.path.join(OUTPUT_DIR, "X_tr_sbert.npy"), batch_size=64)
X_val_sbert = embed_with_cache(sbert, X_val, os.path.join(OUTPUT_DIR, "X_val_sbert.npy"), batch_size=64)
X_test_sbert = embed_with_cache(sbert, X_test, os.path.join(OUTPUT_DIR, "X_test_sbert.npy"), batch_size=64)

clf_sbert, val_pred_sbert, test_pred_sbert, val_rep_sbert, test_rep_sbert = train_and_eval(
    X_tr_sbert, X_val_sbert, X_test_sbert, y_tr, y_val, y_test, label_encoder.classes_
)

print("SBERT - validation")
print(classification_report(y_val, val_pred_sbert, target_names=label_encoder.classes_))
print("SBERT - test")
print(classification_report(y_test, test_pred_sbert, target_names=label_encoder.classes_))

In [None]:
# SBERT confusion matrix on the test set (raw counts; rows are true labels,
# columns are predicted labels). Shown as the cell output.
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, test_pred_sbert)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the SBERT + logistic regression predictions on the test set
cm_sbert = confusion_matrix(y_test, test_pred_sbert)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_sbert,
    display_labels=label_encoder.classes_
)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() opens its own figure
# when no `ax` is given, which left the plt.figure(figsize=...) figure empty
# and the requested size unused.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap="Blues", values_format="d")
ax.set_title("Matrice de confusion – SBERT")
ax.grid(False)
plt.show()


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(y_true, y_pred):
    """Return accuracy, macro F1 and weighted F1 as plain Python floats."""
    metrics = {"accuracy": float(accuracy_score(y_true, y_pred))}
    for averaging in ("macro", "weighted"):
        metrics[f"f1_{averaging}"] = float(f1_score(y_true, y_pred, average=averaging))
    return metrics

# Holds one metrics row per (modele, embeddings, split) experiment.
RESULTS = []


def add_result(modele, embeddings, split, y_true, y_pred):
    """Record the metrics of one experiment in ``RESULTS`` and return the row.

    Re-running a notebook cell calls this again with the same
    ``(modele, embeddings, split)`` key. Any earlier row for that key is
    replaced, so ``RESULTS`` always reflects the latest run; simply appending
    would leave duplicates, and de-duplicating with ``drop_duplicates`` later
    would keep the stale first row.

    Args:
        modele: Classifier name, e.g. ``"LogisticRegression"``.
        embeddings: Embedding model label, e.g. ``"SBERT"``.
        split: Evaluated split, e.g. ``"validation"`` or ``"test"``.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The row dict that was stored in ``RESULTS``.
    """
    row = {
        "modele": modele,
        "embeddings": embeddings,
        "split": split,
        **compute_metrics(y_true, y_pred),
    }
    key = (modele, embeddings, split)
    # In-place slice assignment keeps the same list object for any code that
    # already holds a reference to RESULTS.
    RESULTS[:] = [
        previous
        for previous in RESULTS
        if (previous["modele"], previous["embeddings"], previous["split"]) != key
    ]
    RESULTS.append(row)
    return row


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Log the SBERT + logistic-regression metrics for both splits in RESULTS.
# The row returned by the last call is displayed as the cell output.
add_result("LogisticRegression", "SBERT", "validation", y_val, val_pred_sbert)
add_result("LogisticRegression", "SBERT", "test", y_test, test_pred_sbert)


In [None]:
# Free GPU memory between the SBERT and BGE runs
import gc, torch

# Drop the SBERT model and its embedding arrays, then run the garbage
# collector and release torch's cached CUDA memory. The SBERT embeddings are
# reloaded from their .npy files further down the notebook.
del sbert, X_tr_sbert, X_val_sbert, X_test_sbert
gc.collect()
torch.cuda.empty_cache()

## Nouveau modèle : BGE‑M3 embeddings + régression logistique

In [None]:
# BGE-M3 model (more recent embedding model)
bge_name = "BAAI/bge-m3"
bge = SentenceTransformer(bge_name)
# Inputs are capped at 128 tokens here; raise the limit (e.g. 256) to give the
# model more context at the cost of speed and GPU memory.
bge.max_seq_length = 128

# Cache the embeddings under OUTPUT_DIR (resolved at the top of the notebook)
# rather than a hard-coded /kaggle/working path, so the cell also runs locally.
X_tr_bge = embed_with_cache(bge, X_tr, os.path.join(OUTPUT_DIR, "X_tr_bge.npy"), batch_size=8)
X_val_bge = embed_with_cache(bge, X_val, os.path.join(OUTPUT_DIR, "X_val_bge.npy"), batch_size=8)
X_test_bge = embed_with_cache(bge, X_test, os.path.join(OUTPUT_DIR, "X_test_bge.npy"), batch_size=8)


clf_bge, val_pred_bge, test_pred_bge, val_rep_bge, test_rep_bge = train_and_eval(
    X_tr_bge, X_val_bge, X_test_bge, y_tr, y_val, y_test, label_encoder.classes_
)

# Log the metrics in RESULTS exactly as is done for the SBERT baseline;
# without these rows BGE-M3 + logistic regression is absent from the final
# comparison table and can never be picked as the best model.
add_result("LogisticRegression", "BGE-M3", "validation", y_val, val_pred_bge)
add_result("LogisticRegression", "BGE-M3", "test", y_test, test_pred_bge)

print("BGE-M3 - validation")
print(classification_report(y_val, val_pred_bge, target_names=label_encoder.classes_))
print("BGE-M3 - test")
print(classification_report(y_test, test_pred_bge, target_names=label_encoder.classes_))

In [None]:
# BGE-M3 confusion matrix on the test set (raw counts; rows are true labels,
# columns are predicted labels). Shown as the cell output.
confusion_matrix(y_test, test_pred_bge)

In [None]:
# Confusion matrix of the BGE-M3 + logistic regression predictions on the test set
cm_bge = confusion_matrix(y_test, test_pred_bge)

disp = ConfusionMatrixDisplay(
    confusion_matrix=cm_bge,
    display_labels=label_encoder.classes_
)

# Draw on an explicit Axes: ConfusionMatrixDisplay.plot() opens its own figure
# when no `ax` is given, which left the plt.figure(figsize=...) figure empty
# and the requested size unused.
fig, ax = plt.subplots(figsize=(6, 6))
disp.plot(ax=ax, cmap="Blues", values_format="d")
ax.set_title("Matrice de confusion – BGE-M3")
ax.grid(False)
plt.show()


## Comparaison synthétique

In [None]:
# Tableau de comparaison (macro F1 et accuracy)
def extract_metrics(rep):
    """Flatten a ``classification_report`` dict into its headline metrics.

    Keeps the overall accuracy plus the macro-averaged F1, precision and
    recall, under the keys ``accuracy``, ``f1_macro``, ``precision_macro``
    and ``recall_macro``.
    """
    summary = {"accuracy": rep["accuracy"]}
    macro_avg = rep["macro avg"]
    for out_key, report_key in (
        ("f1_macro", "f1-score"),
        ("precision_macro", "precision"),
        ("recall_macro", "recall"),
    ):
        summary[out_key] = macro_avg[report_key]
    return summary

# One row per (model, split) pair: SBERT first, then BGE-M3.
rows = [
    {"modele": model_label, "split": split_label, **extract_metrics(report)}
    for model_label, split_label, report in (
        ("SBERT", "validation", val_rep_sbert),
        ("SBERT", "test", test_rep_sbert),
        ("BGE-M3", "validation", val_rep_bge),
        ("BGE-M3", "test", test_rep_bge),
    )
]

df_compare = pd.DataFrame(rows)
df_compare

In [None]:
# Save the comparison metrics under OUTPUT_DIR (resolved at the top of the
# notebook) rather than a hard-coded /kaggle/working path, so the export also
# works outside Kaggle.
out_path = os.path.join(OUTPUT_DIR, "metrics_compare.csv")
df_compare.to_csv(out_path, index=False)
out_path

## Notes

- Le jeu `mental_health_combined_test.csv` est réservé à l’évaluation finale.
- Le fichier `mental_heath_feature_engineered.csv` est utile pour des baselines classiques, mais pas nécessaire pour SBERT/BGE.
- Autres options si la RAM GPU est limitée : réduire `batch_size` ou fixer `max_seq_length = 256`.

In [None]:
!pip -q install -U lightgbm


In [None]:
import os
import json
import numpy as np
import pandas as pd

import lightgbm as lgb

from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score, f1_score


In [None]:
def run_lgbm_multiclass(X_tr_emb, y_tr, X_val_emb, y_val, X_test_emb, y_test, class_names, run_name, seed=42):
    """Train a multiclass LightGBM model on embeddings and report its metrics.

    Training monitors the validation set and stops early after 100 rounds
    without improvement. Classification reports for the validation and test
    sets are printed, and both are logged through ``add_result`` under the
    model name "LightGBM" with ``run_name`` as the embeddings label.

    Returns:
        ``(model, val_pred, test_pred)`` where the predictions are the argmax
        of the predicted class probabilities.
    """
    # LightGBM works well with float32 inputs.
    X_tr_emb, X_val_emb, X_test_emb = (
        features.astype(np.float32)
        for features in (X_tr_emb, X_val_emb, X_test_emb)
    )

    dtrain = lgb.Dataset(X_tr_emb, label=y_tr)
    dval = lgb.Dataset(X_val_emb, label=y_val, reference=dtrain)

    params = dict(
        objective="multiclass",
        num_class=len(class_names),
        metric="multi_logloss",
        learning_rate=0.05,
        num_leaves=63,
        feature_fraction=0.9,
        bagging_fraction=0.9,
        bagging_freq=1,
        min_data_in_leaf=30,
        lambda_l2=1.0,
        verbosity=-1,
        seed=seed,
    )
    training_callbacks = [
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=50),
    ]

    model = lgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        valid_sets=[dval],
        valid_names=["val"],
        callbacks=training_callbacks,
    )

    # Most probable class for each sample.
    val_pred = model.predict(X_val_emb).argmax(axis=1)
    test_pred = model.predict(X_test_emb).argmax(axis=1)

    evaluated = (("validation", y_val, val_pred), ("test", y_test, test_pred))

    # Print both reports first, then log both rows.
    for split_name, y_true, y_hat in evaluated:
        print(f"{run_name} - {split_name}")
        print(classification_report(y_true, y_hat, target_names=class_names))

    for split_name, y_true, y_hat in evaluated:
        add_result("LightGBM", run_name, split_name, y_true, y_hat)

    return model, val_pred, test_pred


In [None]:
# Reload the cached embeddings from OUTPUT_DIR (resolved at the top of the
# notebook) rather than a hard-coded /kaggle/working path. The SBERT arrays
# were deleted earlier to free memory, hence the reload from disk.

# SBERT
X_tr_sbert = np.load(os.path.join(OUTPUT_DIR, "X_tr_sbert.npy"))
X_val_sbert = np.load(os.path.join(OUTPUT_DIR, "X_val_sbert.npy"))
X_test_sbert = np.load(os.path.join(OUTPUT_DIR, "X_test_sbert.npy"))

# BGE-M3
X_tr_bge = np.load(os.path.join(OUTPUT_DIR, "X_tr_bge.npy"))
X_val_bge = np.load(os.path.join(OUTPUT_DIR, "X_val_bge.npy"))
X_test_bge = np.load(os.path.join(OUTPUT_DIR, "X_test_bge.npy"))

print("SBERT :", X_tr_sbert.shape, X_val_sbert.shape, X_test_sbert.shape)
print("BGE-M3:", X_tr_bge.shape, X_val_bge.shape, X_test_bge.shape)

In [None]:
# LightGBM on the SBERT embeddings
model_lgb_sbert, val_pred_lgb_sbert, test_pred_lgb_sbert = run_lgbm_multiclass(
    X_tr_emb=X_tr_sbert,
    y_tr=y_tr,
    X_val_emb=X_val_sbert,
    y_val=y_val,
    X_test_emb=X_test_sbert,
    y_test=y_test,
    class_names=label_encoder.classes_,
    run_name="SBERT",
)


In [None]:
# LightGBM on the BGE-M3 embeddings
model_lgb_bge, val_pred_lgb_bge, test_pred_lgb_bge = run_lgbm_multiclass(
    X_tr_emb=X_tr_bge,
    y_tr=y_tr,
    X_val_emb=X_val_bge,
    y_val=y_val,
    X_test_emb=X_test_bge,
    y_test=y_test,
    class_names=label_encoder.classes_,
    run_name="BGE-M3",
)


In [None]:
def run_linear_svm(X_tr_emb, y_tr, X_val_emb, y_val, X_test_emb, y_test, class_names, run_name):
    """Train a class-balanced ``LinearSVC`` on embeddings and report its metrics.

    Classification reports for the validation and test sets are printed, and
    both are logged through ``add_result`` under the model name "LinearSVM"
    with ``run_name`` as the embeddings label.

    Returns:
        ``(clf, val_pred, test_pred)``.
    """
    # A linear SVM copes well with high-dimensional inputs such as embeddings,
    # which makes it a strong baseline.
    svm = LinearSVC(random_state=42, class_weight="balanced")
    svm.fit(X_tr_emb, y_tr)

    val_pred, test_pred = svm.predict(X_val_emb), svm.predict(X_test_emb)

    evaluated = (("validation", y_val, val_pred), ("test", y_test, test_pred))

    # Print both reports first, then log both rows.
    for split_name, y_true, y_hat in evaluated:
        print(f"{run_name} - {split_name}")
        print(classification_report(y_true, y_hat, target_names=class_names))

    for split_name, y_true, y_hat in evaluated:
        add_result("LinearSVM", run_name, split_name, y_true, y_hat)

    return svm, val_pred, test_pred


In [None]:
# Linear SVM on the SBERT embeddings
clf_svm_sbert, val_pred_svm_sbert, test_pred_svm_sbert = run_linear_svm(
    X_tr_emb=X_tr_sbert,
    y_tr=y_tr,
    X_val_emb=X_val_sbert,
    y_val=y_val,
    X_test_emb=X_test_sbert,
    y_test=y_test,
    class_names=label_encoder.classes_,
    run_name="SBERT",
)


In [None]:
# Linear SVM on the BGE-M3 embeddings
clf_svm_bge, val_pred_svm_bge, test_pred_svm_bge = run_linear_svm(
    X_tr_emb=X_tr_bge,
    y_tr=y_tr,
    X_val_emb=X_val_bge,
    y_val=y_val,
    X_test_emb=X_test_bge,
    y_test=y_test,
    class_names=label_encoder.classes_,
    run_name="BGE-M3",
)


In [None]:
# Tabulate every logged run, ordered by embeddings, then model, then split.
df_results = pd.DataFrame(RESULTS)
df_results = df_results.sort_values(by=["embeddings", "modele", "split"])
df_results


In [None]:
# Export the results table under OUTPUT_DIR (resolved at the top of the
# notebook) rather than a hard-coded /kaggle/working path, so the export also
# works outside Kaggle.
out_csv = os.path.join(OUTPUT_DIR, "results_embeddings_classifiers.csv")
df_results.to_csv(out_csv, index=False)
out_csv


In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report


In [None]:
def run_mlp(X_tr_emb, y_tr, X_val_emb, y_val, X_test_emb, y_test, class_names, run_name):
    """Train a two-hidden-layer MLP on embeddings and report its metrics.

    Early stopping relies on an internal 10% split of the training data, so
    the caller's validation set stays untouched and remains comparable with
    the other models. Classification reports for the validation and test sets
    are printed, and both are logged through ``add_result`` under the model
    name "MLP" with ``run_name`` as the embeddings label.

    Returns:
        ``(clf, val_pred, test_pred)``.
    """
    mlp_settings = dict(
        hidden_layer_sizes=(256, 128),
        activation="relu",
        solver="adam",
        alpha=1e-4,
        batch_size=256,
        learning_rate_init=1e-3,
        max_iter=200,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=10,
        random_state=42,
    )
    mlp = MLPClassifier(**mlp_settings)
    mlp.fit(X_tr_emb, y_tr)

    val_pred, test_pred = mlp.predict(X_val_emb), mlp.predict(X_test_emb)

    evaluated = (("validation", y_val, val_pred), ("test", y_test, test_pred))

    # Print both reports first, then log both rows.
    for split_name, y_true, y_hat in evaluated:
        print(f"{run_name} - {split_name}")
        print(classification_report(y_true, y_hat, target_names=class_names))

    for split_name, y_true, y_hat in evaluated:
        add_result("MLP", run_name, split_name, y_true, y_hat)

    return mlp, val_pred, test_pred


In [None]:
# MLP on the SBERT embeddings
clf_mlp_sbert, val_pred_mlp_sbert, test_pred_mlp_sbert = run_mlp(
    X_tr_emb=X_tr_sbert,
    y_tr=y_tr,
    X_val_emb=X_val_sbert,
    y_val=y_val,
    X_test_emb=X_test_sbert,
    y_test=y_test,
    class_names=label_encoder.classes_,
    run_name="SBERT",
)


In [None]:
# MLP on the BGE-M3 embeddings
clf_mlp_bge, val_pred_mlp_bge, test_pred_mlp_bge = run_mlp(
    X_tr_emb=X_tr_bge,
    y_tr=y_tr,
    X_val_emb=X_val_bge,
    y_val=y_val,
    X_test_emb=X_test_bge,
    y_test=y_test,
    class_names=label_encoder.classes_,
    run_name="BGE-M3",
)

In [None]:
# Rebuild the results table now that the MLP runs have been added to RESULTS.
df_results = pd.DataFrame(RESULTS).sort_values(["embeddings", "modele", "split"])
# Refresh the CSV export too: the file written earlier predates the MLP runs,
# and the next cell reloads df_results from that file, which would otherwise
# drop the MLP rows from the ranking and from the best-model selection.
df_results.to_csv(os.path.join(OUTPUT_DIR, "results_embeddings_classifiers.csv"), index=False)
df_results


In [None]:
import pandas as pd

# Path of the results file exported earlier. It is looked up under OUTPUT_DIR
# (resolved at the top of the notebook) rather than a hard-coded
# /kaggle/working path, so the reload also works outside Kaggle.
results_path = os.path.join(OUTPUT_DIR, "results_embeddings_classifiers.csv")

df_results = pd.read_csv(results_path)

print(df_results.head())
print(df_results.shape)


In [None]:
import pandas as pd

# Keep only the test-split rows. A dedicated name is used because `df_test`
# already holds the test-set DataFrame loaded from CSV near the top of the
# notebook; rebinding it here would break any re-run of the earlier cells.
df_test_results = df_results[df_results["split"] == "test"].copy()

# Safety net: drop any duplicated (embeddings, modele) rows
df_test_results = df_test_results.drop_duplicates(subset=["embeddings", "modele"])

# Rank by accuracy, then by weighted F1 (best first)
df_test_results = df_test_results.sort_values(
    by=["accuracy", "f1_weighted"],
    ascending=False
).reset_index(drop=True)

df_test_results


In [None]:
import joblib
import numpy as np
import os
import json

# Persist the logistic-regression classifiers
joblib.dump(clf_sbert, os.path.join(OUTPUT_DIR, 'clf_lr_sbert.joblib'))
joblib.dump(clf_bge, os.path.join(OUTPUT_DIR, 'clf_lr_bge.joblib'))

# Persist the LightGBM models. Each optional artefact below is saved on a
# best-effort basis: a failure (for instance a model cell that was not run) is
# reported and the remaining saves still proceed.
try:
    model_lgb_sbert.save_model(os.path.join(OUTPUT_DIR, 'clf_lgb_sbert.txt'))
except Exception as e:
    print('LightGBM SBERT not saved:', e)

try:
    model_lgb_bge.save_model(os.path.join(OUTPUT_DIR, 'clf_lgb_bge.txt'))
except Exception as e:
    print('LightGBM BGE not saved:', e)

# Persist the remaining models (linear SVM and MLP)
try:
    joblib.dump(clf_svm_sbert, os.path.join(OUTPUT_DIR, 'clf_svm_sbert.joblib'))
except Exception as e:
    print('SVM SBERT not saved:', e)

try:
    joblib.dump(clf_svm_bge, os.path.join(OUTPUT_DIR, 'clf_svm_bge.joblib'))
except Exception as e:
    print('SVM BGE not saved:', e)

try:
    joblib.dump(clf_mlp_sbert, os.path.join(OUTPUT_DIR, 'clf_mlp_sbert.joblib'))
except Exception as e:
    print('MLP SBERT not saved:', e)

try:
    joblib.dump(clf_mlp_bge, os.path.join(OUTPUT_DIR, 'clf_mlp_bge.joblib'))
except Exception as e:
    print('MLP BGE not saved:', e)

# Persist the SBERT + logistic-regression predictions
np.save(os.path.join(OUTPUT_DIR, 'val_pred_sbert.npy'), val_pred_sbert)
np.save(os.path.join(OUTPUT_DIR, 'test_pred_sbert.npy'), test_pred_sbert)

# Persist the results table
df_results.to_csv(os.path.join(OUTPUT_DIR, 'results_embeddings_classifiers.csv'), index=False)

# Persist the label encoder
joblib.dump(label_encoder, os.path.join(OUTPUT_DIR, 'label_encoder.joblib'))

# Write the metadata of the best-ranked model (file name, type, embedding model).
# NOTE(review): the ranking uses test-split metrics; selecting on the
# validation split would keep the test set as a pure final evaluation —
# confirm which is intended.
try:
    # A dedicated name avoids rebinding `df_test`, which holds the test-set
    # DataFrame loaded from CSV near the top of the notebook.
    df_ranked = df_results[df_results['split'] == 'test'].drop_duplicates(subset=['embeddings', 'modele'])
    df_ranked = df_ranked.sort_values(by=['accuracy', 'f1_weighted'], ascending=False).reset_index(drop=True)
    best = df_ranked.iloc[0].to_dict()
    # Maps (modele, embeddings) to the saved file name and its loader family.
    model_files = {
        ('LightGBM', 'SBERT'): ('clf_lgb_sbert.txt', 'lightgbm'),
        ('LightGBM', 'BGE-M3'): ('clf_lgb_bge.txt', 'lightgbm'),
        ('LogisticRegression', 'SBERT'): ('clf_lr_sbert.joblib', 'sklearn'),
        ('LogisticRegression', 'BGE-M3'): ('clf_lr_bge.joblib', 'sklearn'),
        ('LinearSVM', 'SBERT'): ('clf_svm_sbert.joblib', 'sklearn'),
        ('LinearSVM', 'BGE-M3'): ('clf_svm_bge.joblib', 'sklearn'),
        ('MLP', 'SBERT'): ('clf_mlp_sbert.joblib', 'sklearn'),
        ('MLP', 'BGE-M3'): ('clf_mlp_bge.joblib', 'sklearn'),
    }
    key = (best.get('modele'), best.get('embeddings'))
    filename, model_type = model_files.get(key, (None, None))
    if filename is None:
        print('Best model not saved: unknown key', key)
    else:
        embedding_model_name = None
        if best.get('embeddings') == 'SBERT':
            embedding_model_name = sbert_name
        elif best.get('embeddings') == 'BGE-M3':
            embedding_model_name = bge_name
        meta = {
            'modele': best.get('modele'),
            'embeddings': best.get('embeddings'),
            'model_filename': filename,
            'model_type': model_type,
            'embedding_model_name': embedding_model_name,
        }
        with open(os.path.join(OUTPUT_DIR, 'best_model_meta.json'), 'w', encoding='utf-8') as f:
            json.dump(meta, f, ensure_ascii=False, indent=2)
        print('Best model meta saved:', meta)
except Exception as e:
    print('Best model not saved:', e)

print('Sauvegarde terminee -> Save Version')
