In [8]:
import deepchem as dc
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from tqdm import tqdm

# Helper function to combine ECFP and SMILES datasets
def dataset_to_df_with_smiles(ecfp_dataset, raw_dataset, tasks):
    X_list, y_list, ids_list, smiles_list = [], [], [], []
    
    # Iterate through both datasets in parallel
    for (X_batch, y_batch, w_batch, ids_batch), (X_raw, _, _, _) in zip(
        ecfp_dataset.iterbatches(batch_size=128, pad_batches=False),
        raw_dataset.iterbatches(batch_size=128, pad_batches=False)
    ):
        X_list.append(X_batch)
        y_list.append(y_batch)
        ids_list.extend(ids_batch)
        smiles_list.extend(X_raw)  # SMILES strings are in the raw features

    # Stack numerical and label arrays
    X_all = np.vstack(X_list)
    y_all = np.vstack(y_list)

    # Create DataFrames
    df_X = pd.DataFrame(X_all, columns=[f"fp_{i}" for i in range(X_all.shape[1])])
    df_y = pd.DataFrame(y_all, columns=tasks)
    df_y["mol_id"] = ids_list
    df_y["smiles"] = smiles_list

    # Combine all information into a single DataFrame
    df = pd.concat([df_y, df_X], axis=1)
    return df


# Load SIDER dataset with ECFP features (for model input)
tasks, datasets, transformers = dc.molnet.load_sider(featurizer='ECFP', splitter='scaffold')
train_ecfp, valid_ecfp, test_ecfp = datasets

# Load SIDER dataset again with raw SMILES (for visualization / metadata)
_, datasets_raw, _ = dc.molnet.load_sider(featurizer='Raw', splitter='scaffold')
train_raw, valid_raw, test_raw = datasets_raw

# Convert both representations to DataFrames
df_train = dataset_to_df_with_smiles(train_ecfp, train_raw, tasks)
df_valid = dataset_to_df_with_smiles(valid_ecfp, valid_raw, tasks)
df_test  = dataset_to_df_with_smiles(test_ecfp,  test_raw,  tasks)

feature_cols = [col for col in df_train.columns if col.startswith("fp_")]
label_cols = [col for col in df_train.columns if col not in feature_cols + ['mol_id', 'smiles', 'scaffold']]

X_train = df_train[feature_cols].astype(float).values
y_train = df_train[label_cols].astype(float).values

X_valid = df_valid[feature_cols].astype(float).values
y_valid = df_valid[label_cols].astype(float).values

X_test = df_test[feature_cols].astype(float).values
y_test = df_test[label_cols].astype(float).values

pca = PCA().fit(X_train)
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Number of components to reach 80% variance
n_components_80 = np.argmax(cumulative_variance >= 0.80) + 1

pca = PCA(n_components=n_components_80)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)
X_test_pca = pca.transform(X_test)

In [5]:
from sklearn.metrics import (
    accuracy_score, hamming_loss, f1_score, roc_auc_score
)

def evaluate_multilabel_model(y_true, y_pred, y_prob=None, name=None):
    """
    Evaluate multilabel classification performance.
    
    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth binary matrix (n_samples x n_labels)
    y_pred : np.ndarray
        Predicted binary matrix (same shape as y_true)
    y_prob : np.ndarray, optional
        Predicted probabilities (for ROC-AUC if available)
    name : str
        Name of the dataframe evaluated
    """

    metrics = {"Nom :": name}
    metrics["Subset accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Hamming loss"] = hamming_loss(y_true, y_pred)
    metrics["Micro F1"] = f1_score(y_true, y_pred, average="micro")
    metrics["Macro F1"] = f1_score(y_true, y_pred, average="macro")
    metrics["Weighted F1"] = f1_score(y_true, y_pred, average="weighted")
    
    if y_prob is not None:
        try:
            metrics["Micro ROC-AUC"] = roc_auc_score(y_true, y_prob, average="micro")
            metrics["Macro ROC-AUC"] = roc_auc_score(y_true, y_prob, average="macro")
        except ValueError:
            metrics["Micro ROC-AUC"] = np.nan
            metrics["Macro ROC-AUC"] = np.nan

    print("\n📊 Multilabel Evaluation Metrics:")
    for k, v in metrics.items():
        if isinstance(v, float):
            print(f"{k:20s}: {v:.4f}")
        else:
            print(f"{k:20s}: {v}")

    return metrics


1. XGBoost with raw data

In [6]:
# Importations needed
import xgboost as xgb
from sklearn.multiclass import OneVsRestClassifier


# --- Reuse Raw ECFP Data ---
# (Ensure X_train_raw, y_train, X_valid_raw, y_valid, X_test_raw, y_test are available from the previous L1 LR cell)
# feature_cols = [col for col in df_train.columns if col.startswith("fp_")]
# label_cols = [col for col in df_train.columns if col not in feature_cols + ['mol_id', 'smiles', 'scaffold']]
# X_train_raw = df_train[feature_cols].astype(float).values
# y_train = df_train[label_cols].astype(float).values
# ... and so on for valid and test sets ...

# --- Evaluation Function (ensure it's defined) ---
# def evaluate_multilabel_model(...): ...

# --- Train XGBoost Model (Option 1) ---
print("🚀 Training XGBoost model (OneVsRestClassifier, no explicit weights)...")

# Define the XGBoost classifier
# Use eval_metric='logloss' and use_label_encoder=False as recommended for newer versions
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)

# Wrap it with OneVsRestClassifier
ovr_xgb_model = OneVsRestClassifier(xgb_clf, n_jobs=4) # Use n_jobs=-1 for parallel training of binary classifiers

# Train the model
ovr_xgb_model.fit(X_train, y_train)

# --- Predictions ---
print("⚙️ Generating predictions...")
y_train_pred_xgb = ovr_xgb_model.predict(X_train)
y_valid_pred_xgb = ovr_xgb_model.predict(X_valid)
y_test_pred_xgb = ovr_xgb_model.predict(X_test)

# Probabilities are needed for ROC-AUC
y_train_prob_xgb = ovr_xgb_model.predict_proba(X_train)
y_valid_prob_xgb = ovr_xgb_model.predict_proba(X_valid)
y_test_prob_xgb = ovr_xgb_model.predict_proba(X_test)

# --- Evaluation ---
print("\n=== XGBoost (OneVsRest, No Weights) Evaluation ===")
metrics_train_xgb1 = evaluate_multilabel_model(y_train, y_train_pred_xgb, y_train_prob_xgb, "Train (XGB OvR)")
metrics_valid_xgb1 = evaluate_multilabel_model(y_valid, y_valid_pred_xgb, y_valid_prob_xgb, "Validation (XGB OvR)")
metrics_test_xgb1 = evaluate_multilabel_model(y_test, y_test_pred_xgb, y_test_prob_xgb, "Test (XGB OvR)")

🚀 Training XGBoost model (OneVsRestClassifier, no explicit weights)...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


⚙️ Generating predictions...

=== XGBoost (OneVsRest, No Weights) Evaluation ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (XGB OvR)
Subset accuracy     : 0.7318
Hamming loss        : 0.0240
Micro F1            : 0.9787
Macro F1            : 0.9723
Weighted F1         : 0.9785
Micro ROC-AUC       : 0.9974
Macro ROC-AUC       : 0.9971

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (XGB OvR)
Subset accuracy     : 0.0210
Hamming loss        : 0.2313
Micro F1            : 0.8167
Macro F1            : 0.6629
Weighted F1         : 0.8047
Micro ROC-AUC       : 0.8264
Macro ROC-AUC       : 0.5918

📊 Multilabel Evaluation Metrics:
Nom :               : Test (XGB OvR)
Subset accuracy     : 0.0070
Hamming loss        : 0.2481
Micro F1            : 0.7934
Macro F1            : 0.6364
Weighted F1         : 0.7907
Micro ROC-AUC       : 0.8225
Macro ROC-AUC       : 0.6153


In [9]:
# --- Entraînement des modèles XGBoost (Option 2 : Individuels avec poids) ---
print("🚀 Training individual XGBoost models with scale_pos_weight")

n_samples_train = X_train.shape[0]
n_labels = y_train.shape[1]
trained_models_weighted = [] # Liste pour stocker les modèles entraînés

# Boucle sur chaque label pour entraîner un modèle spécifique
for i in tqdm(range(n_labels), desc="Training models per label"):
    y_train_label = y_train[:, i] # Sélectionne la colonne du label i

    # Calcule scale_pos_weight pour ce label
    n_positives = np.sum(y_train_label == 1)
    n_negatives = n_samples_train - n_positives
    # Gère le cas où il n'y a que des 0 ou que des 1 (peu probable mais sûr)
    scale_pos_weight = n_negatives / n_positives if n_positives > 0 and n_negatives > 0 else 1

    # Définit le classifieur XGBoost AVEC le poids calculé
    xgb_clf_weighted = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        scale_pos_weight=scale_pos_weight, # <<< Poids spécifique à ce label
        random_state=42,
        # Tu peux ajouter n_jobs=1 ici si tu veux limiter l'utilisation du CPU par modèle
        # n_jobs=1
    )

    # Entraîne le modèle pour ce label
    xgb_clf_weighted.fit(X_train, y_train_label)
    trained_models_weighted.append(xgb_clf_weighted)

# --- Prédictions en utilisant la liste des modèles entraînés ---
print("⚙️ Generating predictions from individual weighted models...")

# Fonction pour prédire avec la liste de modèles (peut être réutilisée)
def predict_individual_models(X, models):
    n_samples_pred = X.shape[0]
    n_labels_pred = len(models)
    # Initialise les matrices pour les prédictions binaires et les probabilités
    y_pred = np.zeros((n_samples_pred, n_labels_pred))
    y_prob = np.zeros((n_samples_pred, n_labels_pred))

    # Boucle sur chaque modèle (correspondant à un label)
    for i, model in enumerate(models):
        probas = model.predict_proba(X) # Probabilités [classe 0, classe 1]
        y_prob[:, i] = probas[:, 1]      # Stocke la probabilité de la classe positive (1)
        y_pred[:, i] = (probas[:, 1] >= 0.5).astype(int) # Prédit 1 si proba >= 0.5

    return y_pred, y_prob

# Génère les prédictions pour chaque set
y_train_pred_xgb_w, y_train_prob_xgb_w = predict_individual_models(X_train, trained_models_weighted)
y_valid_pred_xgb_w, y_valid_prob_xgb_w = predict_individual_models(X_valid, trained_models_weighted)
y_test_pred_xgb_w, y_test_prob_xgb_w = predict_individual_models(X_test, trained_models_weighted)

# --- Évaluation ---
print("\n=== XGBoost (Individual Models with Weights) Evaluation ===")
metrics_train_xgb2 = evaluate_multilabel_model(y_train, y_train_pred_xgb_w, y_train_prob_xgb_w, "Train (XGB Weighted)")
metrics_valid_xgb2 = evaluate_multilabel_model(y_valid, y_valid_pred_xgb_w, y_valid_prob_xgb_w, "Validation (XGB Weighted)")
metrics_test_xgb2 = evaluate_multilabel_model(y_test, y_test_pred_xgb_w, y_test_prob_xgb_w, "Test (XGB Weighted)")

🚀 Training individual XGBoost models with scale_pos_weight...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Training models per label: 100%|██████████████████████████████████████████████████████████████████████████| 27/27 [00:07<00:00,  3.42it/s]


⚙️ Generating predictions from individual weighted models...

=== XGBoost (Individual Models with Weights) Evaluation ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (XGB Weighted)
Subset accuracy     : 0.6328
Hamming loss        : 0.0439
Micro F1            : 0.9596
Macro F1            : 0.9652
Weighted F1         : 0.9595
Micro ROC-AUC       : 0.9939
Macro ROC-AUC       : 0.9944

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (XGB Weighted)
Subset accuracy     : 0.0140
Hamming loss        : 0.2587
Micro F1            : 0.7893
Macro F1            : 0.6555
Weighted F1         : 0.7850
Micro ROC-AUC       : 0.8032
Macro ROC-AUC       : 0.5862

📊 Multilabel Evaluation Metrics:
Nom :               : Test (XGB Weighted)
Subset accuracy     : 0.0000
Hamming loss        : 0.2717
Micro F1            : 0.7700
Macro F1            : 0.6379
Weighted F1         : 0.7756
Micro ROC-AUC       : 0.7998
Macro ROC-AUC       : 0.6270


2. XGBoost with PCA Data

In [13]:
# Importations nécessaires (si pas déjà faites)
import xgboost as xgb
from sklearn.multiclass import OneVsRestClassifier
# Assurez-vous que X_train_pca, y_train, X_valid_pca, y_valid, X_test_pca, y_test sont disponibles
# Assurez-vous que la fonction evaluate_multilabel_model est définie

# --- Entraîner le modèle XGBoost sur les données PCA ---
print("🚀 Training XGBoost model (OneVsRestClassifier, no explicit weights) on PCA data...")

# Définit le classifieur XGBoost (identique à avant)
xgb_clf_pca = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss', use_label_encoder=False, random_state=42)

# Wrapper OneVsRestClassifier
ovr_xgb_model_pca = OneVsRestClassifier(xgb_clf_pca, n_jobs=4) # Gardons n_jobs=4 comme tu l'as utilisé

# Entraîne sur les données PCA
ovr_xgb_model_pca.fit(X_train_pca, y_train)

# --- Prédictions sur les données PCA ---
print("⚙️ Generating predictions on PCA data...")
y_train_pred_xgb_pca = ovr_xgb_model_pca.predict(X_train_pca)
y_valid_pred_xgb_pca = ovr_xgb_model_pca.predict(X_valid_pca)
y_test_pred_xgb_pca = ovr_xgb_model_pca.predict(X_test_pca)

# Probabilités pour ROC-AUC
y_train_prob_xgb_pca = ovr_xgb_model_pca.predict_proba(X_train_pca)
y_valid_prob_xgb_pca = ovr_xgb_model_pca.predict_proba(X_valid_pca)
y_test_prob_xgb_pca = ovr_xgb_model_pca.predict_proba(X_test_pca)

# --- Évaluation ---
print("\n=== XGBoost (OneVsRest, No Weights) on PCA Data Evaluation ===")
metrics_train_xgb_pca = evaluate_multilabel_model(y_train, y_train_pred_xgb_pca, y_train_prob_xgb_pca, "Train (XGB OvR PCA)")
metrics_valid_xgb_pca = evaluate_multilabel_model(y_valid, y_valid_pred_xgb_pca, y_valid_prob_xgb_pca, "Validation (XGB OvR PCA)")
metrics_test_xgb_pca = evaluate_multilabel_model(y_test, y_test_pred_xgb_pca, y_test_prob_xgb_pca, "Test (XGB OvR PCA)")

🚀 Training XGBoost model (OneVsRestClassifier, no explicit weights) on PCA data...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


⚙️ Generating predictions on PCA data...

=== XGBoost (OneVsRest, No Weights) on PCA Data Evaluation ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (XGB OvR PCA)
Subset accuracy     : 0.9939
Hamming loss        : 0.0007
Micro F1            : 0.9994
Macro F1            : 0.9994
Weighted F1         : 0.9994
Micro ROC-AUC       : 1.0000
Macro ROC-AUC       : 1.0000

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (XGB OvR PCA)
Subset accuracy     : 0.0070
Hamming loss        : 0.2248
Micro F1            : 0.8220
Macro F1            : 0.6423
Weighted F1         : 0.8004
Micro ROC-AUC       : 0.8270
Macro ROC-AUC       : 0.5537

📊 Multilabel Evaluation Metrics:
Nom :               : Test (XGB OvR PCA)
Subset accuracy     : 0.0070
Hamming loss        : 0.2349
Micro F1            : 0.8048
Macro F1            : 0.6253
Weighted F1         : 0.7902
Micro ROC-AUC       : 0.8262
Macro ROC-AUC       : 0.5891


In [14]:
# Importations needed (ensure they are loaded)
import xgboost as xgb
from sklearn.metrics import accuracy_score, hamming_loss, f1_score, roc_auc_score
import numpy as np
import pandas as pd
from tqdm import tqdm

# --- Use PCA-reduced Data ---
# (Ensure X_train_pca, y_train, X_valid_pca, y_valid, X_test_pca, y_test are available)
# These should have been created in a previous cell (around cell ID 14266b9c)

# --- Evaluation Function (ensure it's defined) ---
# def evaluate_multilabel_model(...): ...

# --- Train XGBoost Models (Option 2: Individual models with weights on PCA data) ---
print("🚀 Training individual XGBoost models with scale_pos_weight on PCA data...")

n_samples_train = X_train_pca.shape[0] # Use PCA data shape here
n_labels = y_train.shape[1]
trained_models_weighted_pca = [] # List for storing these models

# Loop over each label
for i in tqdm(range(n_labels), desc="Training models per label (PCA)"):
    y_train_label = y_train[:, i] # Weights are based on original labels

    # Calculate scale_pos_weight for the current label using y_train
    n_positives = np.sum(y_train_label == 1)
    n_negatives = n_samples_train - n_positives
    scale_pos_weight = n_negatives / n_positives if n_positives > 0 and n_negatives > 0 else 1

    # Define the XGBoost classifier WITH the calculated weight
    xgb_clf_weighted_pca = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        scale_pos_weight=scale_pos_weight, # Specific weight for this label
        random_state=42,
        # n_jobs=1 # Optional: limit CPU per model if needed
    )

    # Train the model for this label ON PCA DATA
    xgb_clf_weighted_pca.fit(X_train_pca, y_train_label)
    trained_models_weighted_pca.append(xgb_clf_weighted_pca)

# --- Predictions using individual weighted models on PCA data ---
print("⚙️ Generating predictions from individual weighted models on PCA data...")

# Use the same prediction function as before
# def predict_individual_models(X, models): ...

# Generate predictions for each PCA dataset
y_train_pred_xgb_w_pca, y_train_prob_xgb_w_pca = predict_individual_models(X_train_pca, trained_models_weighted_pca)
y_valid_pred_xgb_w_pca, y_valid_prob_xgb_w_pca = predict_individual_models(X_valid_pca, trained_models_weighted_pca)
y_test_pred_xgb_w_pca, y_test_prob_xgb_w_pca = predict_individual_models(X_test_pca, trained_models_weighted_pca)

# --- Evaluation ---
print("\n=== XGBoost (Individual Models with Weights) on PCA Data Evaluation ===")
metrics_train_xgb_pca2 = evaluate_multilabel_model(y_train, y_train_pred_xgb_w_pca, y_train_prob_xgb_w_pca, "Train (XGB Weighted PCA)")
metrics_valid_xgb_pca2 = evaluate_multilabel_model(y_valid, y_valid_pred_xgb_w_pca, y_valid_prob_xgb_w_pca, "Validation (XGB Weighted PCA)")
metrics_test_xgb_pca2 = evaluate_multilabel_model(y_test, y_test_pred_xgb_w_pca, y_test_prob_xgb_w_pca, "Test (XGB Weighted PCA)")

🚀 Training individual XGBoost models with scale_pos_weight on PCA data...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


⚙️ Generating predictions from individual weighted models on PCA data...

=== XGBoost (Individual Models with Weights) on PCA Data Evaluation ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (XGB Weighted PCA)
Subset accuracy     : 0.9947
Hamming loss        : 0.0007
Micro F1            : 0.9994
Macro F1            : 0.9994
Weighted F1         : 0.9994
Micro ROC-AUC       : 1.0000
Macro ROC-AUC       : 1.0000

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (XGB Weighted PCA)
Subset accuracy     : 0.0140
Hamming loss        : 0.2354
Micro F1            : 0.8107
Macro F1            : 0.6440
Weighted F1         : 0.7935
Micro ROC-AUC       : 0.8189
Macro ROC-AUC       : 0.5643

📊 Multilabel Evaluation Metrics:
Nom :               : Test (XGB Weighted PCA)
Subset accuracy     : 0.0000
Hamming loss        : 0.2453
Micro F1            : 0.7945
Macro F1            : 0.6444
Weighted F1         : 0.7872
Micro ROC-AUC       : 0.8131
Macro ROC-AUC       : 0.593

In [17]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# --- Définir l'estimateur de base ---
# Note: On ne met pas n_jobs ici, RandomizedSearchCV s'en chargera
xgb_base = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)
ovr_xgb_estimator = OneVsRestClassifier(xgb_base) # Pas de n_jobs ici

# --- Définir l'espace de recherche des hyperparamètres ---
# Paramètres pour XGBClassifier à l'intérieur de OneVsRestClassifier
param_distributions_xgb = {
    'estimator__n_estimators': randint(100, 400),       # Nombre d'arbres
    'estimator__max_depth': randint(3, 10),            # Profondeur max (limite l'overfitting)
    'estimator__learning_rate': uniform(0.01, 0.2),    # Taux d'apprentissage
    'estimator__subsample': uniform(0.6, 0.4),         # Fraction d'échantillons par arbre
    'estimator__colsample_bytree': uniform(0.6, 0.4),  # Fraction de features par arbre
    'estimator__gamma': uniform(0, 0.5),               # Régularisation (min_split_loss)
    'estimator__reg_alpha': uniform(0, 1),             # Régularisation L1 (alpha)
    'estimator__reg_lambda': uniform(0, 1)             # Régularisation L2 (lambda)
    # scale_pos_weight n'est pas inclus ici car difficile à optimiser globalement via OneVsRest
}

# --- Configuration de RandomizedSearchCV ---
# Utilisation de f1_macro pour l'évaluation, sensible à l'imbalance
random_search_xgb = RandomizedSearchCV(
    estimator=ovr_xgb_estimator,
    param_distributions=param_distributions_xgb,
    n_iter=25,          # Nombre de combinaisons à tester (augmente si temps/ressources ok)
    scoring='f1_macro',
    cv=3,               # Validation croisée 3 plis
    verbose=2,
    random_state=42,
    n_jobs=4            # Nombre de coeurs à utiliser pour la recherche (ajuste selon ta machine)
)

print("🚀 Lancement de RandomizedSearchCV pour XGBoost (sur PCA)...")

# --- Lancer la recherche ---
# Important: Fournir les données de validation pour l'Early Stopping
# Note: RandomizedSearchCV ne supporte pas nativement l'early stopping direct via OneVsRestClassifier.
# L'early stopping devrait être implémenté dans une boucle personnalisée si nécessaire,
# ou on se fie à la CV pour choisir des paramètres qui généralisent bien.
# Pour simplifier, nous allons omettre l'early stopping direct ici et nous fier à la CV.
random_search_xgb.fit(X_train_pca, y_train)

# --- Afficher les meilleurs résultats ---
print("\n✅ Meilleurs paramètres trouvés pour XGBoost (sur PCA):")
print(random_search_xgb.best_params_)
print(f"\n🏆 Meilleur score F1-macro (CV) : {random_search_xgb.best_score_:.4f}")

# --- Évaluer le meilleur modèle XGBoost trouvé sur le Test set ---
best_xgb_model_pca = random_search_xgb.best_estimator_

print("\n⚙️ Génération des prédictions du meilleur modèle XGBoost (sur PCA)...")
y_test_pred_best_xgb_pca = best_xgb_model_pca.predict(X_test_pca)
y_test_prob_best_xgb_pca = best_xgb_model_pca.predict_proba(X_test_pca)

print("\n=== Évaluation du Meilleur XGBoost (RandomSearch) sur Test (PCA) ===")
metrics_test_best_xgb_pca = evaluate_multilabel_model(
    y_test, y_test_pred_best_xgb_pca, y_test_prob_best_xgb_pca, "Test (Best XGB PCA)"
)

🚀 Lancement de RandomizedSearchCV pour XGBoost (sur PCA)...
Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.



✅ Meilleurs paramètres trouvés pour XGBoost (sur PCA):
{'estimator__colsample_bytree': np.float64(0.8989280440549523), 'estimator__gamma': np.float64(0.2698460661945399), 'estimator__learning_rate': np.float64(0.12735023313276966), 'estimator__max_depth': 3, 'estimator__n_estimators': 351, 'estimator__reg_alpha': np.float64(0.6070342476866847), 'estimator__reg_lambda': np.float64(0.27599918202254337), 'estimator__subsample': np.float64(0.718509402281633)}

🏆 Meilleur score F1-macro (CV) : 0.6274

⚙️ Génération des prédictions du meilleur modèle XGBoost (sur PCA)...

=== Évaluation du Meilleur XGBoost (RandomSearch) sur Test (PCA) ===

📊 Multilabel Evaluation Metrics:
Nom :               : Test (Best XGB PCA)
Subset accuracy     : 0.0070
Hamming loss        : 0.2445
Micro F1            : 0.7951
Macro F1            : 0.6116
Weighted F1         : 0.7811
Micro ROC-AUC       : 0.8216
Macro ROC-AUC       : 0.5877


In [18]:
# --- Valeurs de C à tester ---
# C=1.0 (moins de régularisation), C=0.5, C=0.1 (plus de régularisation)
C_values = [1.0, 0.5, 0.1]
results_l1 = []

print("🚀 Test de différentes forces de régularisation L1 pour Régression Logistique...")

for c_val in C_values:
    print(f"\n--- Test avec C = {c_val} ---")
    # Définit et entraîne le modèle
    l1_model_tuned = OneVsRestClassifier(
        LogisticRegression(
            penalty='l1',
            C=c_val,
            solver='liblinear',
            class_weight='balanced',
            max_iter=1000 # Augmenté pour convergence
        ),
        n_jobs=4 # Utilise 4 coeurs
    )
    l1_model_tuned.fit(X_train_raw, y_train)

    # Prédictions sur le set de validation (ou test, selon préférence)
    print("⚙️ Génération des prédictions (Validation)...")
    y_valid_pred_l1_tuned = l1_model_tuned.predict(X_valid_raw)
    y_valid_prob_l1_tuned = l1_model_tuned.predict_proba(X_valid_raw)

    # Évaluation sur le set de validation
    print(f"\n=== Évaluation LR L1 (C={c_val}) sur Validation ===")
    metrics = evaluate_multilabel_model(
        y_valid, y_valid_pred_l1_tuned, y_valid_prob_l1_tuned, f"Validation (LR L1 C={c_val})"
    )
    metrics['C'] = c_val # Ajoute C aux résultats
    results_l1.append(metrics)

# --- Afficher un résumé des résultats ---
print("\n=== Résumé des performances LR L1 sur Validation ===")
results_df_l1 = pd.DataFrame(results_l1)
# Affiche les colonnes pertinentes pour la comparaison
print(results_df_l1[['C', 'Macro F1', 'Micro F1', 'Macro ROC-AUC', 'Micro ROC-AUC', 'Hamming loss']].round(4))

# --- Optionnel: Ré-entraîner le meilleur C sur Train+Valid et évaluer sur Test ---
# Trouve le meilleur C basé sur le Macro F1 sur la validation
best_c_l1 = results_df_l1.loc[results_df_l1['Macro F1'].idxmax()]['C']
print(f"\n✅ Meilleur C trouvé basé sur Macro F1 (Validation) : {best_c_l1}")

# (Si souhaité, tu peux ajouter ici le code pour ré-entraîner avec le best_c_l1 sur
# np.vstack((X_train_raw, X_valid_raw)) et np.vstack((y_train, y_valid)),
# puis évaluer sur X_test_raw, y_test)

🚀 Test de différentes forces de régularisation L1 pour Régression Logistique...

--- Test avec C = 1.0 ---


NameError: name 'LogisticRegression' is not defined

[CV] END estimator__colsample_bytree=0.749816047538945, estimator__gamma=0.4753571532049581, estimator__learning_rate=0.15639878836228102, estimator__max_depth=7, estimator__n_estimators=120, estimator__reg_alpha=0.15601864044243652, estimator__reg_lambda=0.15599452033620265, estimator__subsample=0.6232334448672797; total time= 1.1min
[CV] END estimator__colsample_bytree=0.996884623716487, estimator__gamma=0.30874075481385826, estimator__learning_rate=0.1323306320976562, estimator__max_depth=7, estimator__n_estimators=335, estimator__reg_alpha=0.023062425041415757, estimator__reg_lambda=0.5247746602583891, estimator__subsample=0.7599443886861021; total time= 2.1min
[CV] END estimator__colsample_bytree=0.6186662652854461, estimator__gamma=0.4868777594207296, estimator__learning_rate=0.05655426808606085, estimator__max_depth=8, estimator__n_estimators=274, estimator__reg_alpha=0.6183860093330873, estimator__reg_lambda=0.38246199126716274, estimator__subsample=0.9932923543227152; total ti