In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [None]:
# ---------- 1. Load JSONL ----------
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook can run on another machine.
path = "/Users/noeamar/Documents/HEC/Jocas-HEC/Data/LLM_labels.jsonl"

# Parse the file one line at a time. Blank lines are skipped; lines that are
# not valid JSON are counted and reported instead of being dropped silently.
records = []
n_invalid_lines = 0
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError:
            # Only malformed JSON is tolerated here: a bare `except` would also
            # swallow KeyboardInterrupt and unrelated programming errors.
            n_invalid_lines += 1

df = pd.DataFrame(records)
if n_invalid_lines:
    print("Lignes JSON invalides ignorées :", n_invalid_lines)
print("Shape initiale :", df.shape)

# ---------- 2. Keep only the rows whose Label_All is a dict ----------
# Label names, in the order used for every label vector.
# Do not include CONFIDENCE!
VALID_LABELS = [
    "DIVERSITY",
    "REMUNERATION_BENEFITS",
    "PROFESSIONAL_OPPORTUNITIES",
    "CULTURE_VALUES",
    "LEADERSHIP",
    "WORK_LIFE_BALANCE",
]


def extract_labels(d):
    """Return the values of `d` for each name in VALID_LABELS, in that order.

    Returns None when `d` is not a dict or when at least one of the
    VALID_LABELS keys is missing from it. Extra keys are ignored.
    """
    if not isinstance(d, dict):
        return None

    ordered_values = []
    for label_name in VALID_LABELS:
        if label_name not in d:
            # A single missing label makes the whole row unusable.
            return None
        ordered_values.append(d[label_name])
    return ordered_values

# Attach a label vector to every row (None when Label_All is not usable), then
# keep only the rows that have one.
df["label_vector"] = df["Label_All"].apply(extract_labels)
df = df[df["label_vector"].notnull()].reset_index(drop=True)
print("Shape après filtre Label_All :", df.shape)

# ---------- 3. Y: numpy array ----------
# np.vstack fails with an opaque "need at least one array to concatenate" when
# nothing survived the filter, so stop early with a message naming the cause.
if df.empty:
    raise ValueError(
        "Aucune ligne dont Label_All contient tous les labels de VALID_LABELS."
    )

# Stack the per-row label lists into a single integer matrix.
y = np.vstack(df["label_vector"].values).astype(int)
print("Y shape :", y.shape)

# ---------- 4. Build the text column ----------
# Fields concatenated, in this order, into one document per row.
TEXT_COLS = [
    "job_title",
    "description_job",
    "description_profil",
    "description_entreprise",
    "description_full",
]

# Missing values are replaced by empty strings before the space-separated join.
text_fields = df[TEXT_COLS].fillna("")
df["text"] = text_fields.agg(" ".join, axis=1)
X_text = df["text"].values

# ---------- 5. TF-IDF ----------
# Unigrams and bigrams, vocabulary capped at 8000 features.
tfidf_settings = {
    "max_features": 8000,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary on every document and vectorize them in one pass.
X_vec = vectorizer.fit_transform(X_text)
print("X_vec shape :", X_vec.shape)

# ---------- 6. Stratified multi-label cross-validation ----------
# StratifiedKFold takes a single target per row, so the folds are stratified
# on the per-row sum of the label values clipped to the range [0, 1].
strat_target = y.sum(axis=1)
strat_target = np.clip(strat_target, 0, 1)  # binary

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One score per fold, aggregated after the loop.
f1_micro_scores, f1_macro_scores = [], []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_vec, strat_target), start=1):
    print(f"\n===== Fold {fold} =====")

    # Fit TF-IDF on the training rows only. Slicing the matrix fitted on the
    # whole corpus would let the vocabulary and the IDF weights see the
    # held-out fold, which leaks test information into the evaluation.
    fold_vectorizer = TfidfVectorizer(**vectorizer.get_params())
    X_train = fold_vectorizer.fit_transform(X_text[train_idx])
    X_test = fold_vectorizer.transform(X_text[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]

    # One independent binary XGBoost classifier per label column.
    model = MultiOutputClassifier(
        XGBClassifier(
            n_estimators=200,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            eval_metric="logloss",
            tree_method="hist",
        )
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    f1_micro = f1_score(y_test, y_pred, average="micro", zero_division=0)
    f1_macro = f1_score(y_test, y_pred, average="macro", zero_division=0)
    f1_micro_scores.append(f1_micro)
    f1_macro_scores.append(f1_macro)

    print(f"F1-micro: {f1_micro:.4f} | F1-macro: {f1_macro:.4f}")
    print("\nClassification report (fold courant) :")
    print(classification_report(
        y_test,
        y_pred,
        target_names=VALID_LABELS,
        zero_division=0
    ))

# NOTE(review): after the loop, `model` is the last fold's model and was
# trained on features from `fold_vectorizer`, not from `vectorizer`; any later
# prediction with it must transform text with `fold_vectorizer`.

# ---------- 7. Overall results ----------
# Mean and standard deviation of each F1 variant across the folds.
micro_mean, micro_std = np.mean(f1_micro_scores), np.std(f1_micro_scores)
macro_mean, macro_std = np.mean(f1_macro_scores), np.std(f1_macro_scores)

print("\n===== Résumé 5-fold CV =====")
print(f"F1-micro moyen : {micro_mean:.4f}  ± {micro_std:.4f}")
print(f"F1-macro moyen : {macro_mean:.4f}  ± {macro_std:.4f}")