# Data importation and evaluation function

In [None]:
import deepchem as dc
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Helper function to combine an ECFP-featurized dataset with its raw counterpart
def dataset_to_df_with_smiles(ecfp_dataset, raw_dataset, tasks):
    """Merge an ECFP dataset and its raw-featurized twin into one DataFrame.

    Both datasets are walked batch by batch in lockstep, so they must hold the
    same molecules in the same order. Batches are requested with
    ``deterministic=True``: with the default (``deterministic=False``) each
    dataset may be shuffled independently, which would silently attach the
    raw features of one molecule to the fingerprint/labels of another.

    Parameters
    ----------
    ecfp_dataset : object exposing ``iterbatches``
        Source of the fingerprint matrix, the label matrix and the ids.
    raw_dataset : object exposing ``iterbatches``
        Source of the per-molecule raw features stored in the ``smiles`` column.
    tasks : list of str
        Column names given to the label matrix (one per label column).

    Returns
    -------
    pd.DataFrame
        One row per molecule with, in order: the task columns, ``mol_id``,
        ``smiles`` and the fingerprint columns ``fp_0 .. fp_{n-1}``.

    Raises
    ------
    ValueError
        If a pair of batches does not carry the same ids, i.e. the two
        datasets are not row-aligned.
    """
    X_list, y_list, ids_list, smiles_list = [], [], [], []

    # Same batching options for both datasets; deterministic order is what
    # keeps the two iterators aligned row for row.
    batch_kwargs = dict(batch_size=128, deterministic=True, pad_batches=False)

    # Iterate through both datasets in parallel
    for (X_batch, y_batch, _, ids_batch), (X_raw, _, _, ids_raw) in zip(
        ecfp_dataset.iterbatches(**batch_kwargs),
        raw_dataset.iterbatches(**batch_kwargs),
    ):
        # Fail loudly rather than build a frame with mismatched rows.
        if list(ids_batch) != list(ids_raw):
            raise ValueError(
                "ecfp_dataset and raw_dataset are not aligned: "
                "batch ids differ between the two datasets"
            )

        X_list.append(X_batch)
        y_list.append(y_batch)
        ids_list.extend(ids_batch)
        # NOTE(review): the original comment says the raw features are SMILES
        # strings; with DeepChem's 'Raw' featurizer they may be RDKit Mol
        # objects instead (the SMILES would then be in the ids) — confirm.
        smiles_list.extend(X_raw)

    # Stack numerical and label arrays
    X_all = np.vstack(X_list)
    y_all = np.vstack(y_list)

    # Create DataFrames
    df_X = pd.DataFrame(X_all, columns=[f"fp_{i}" for i in range(X_all.shape[1])])
    df_y = pd.DataFrame(y_all, columns=tasks)
    df_y["mol_id"] = ids_list
    df_y["smiles"] = smiles_list

    # Combine all information into a single DataFrame
    df = pd.concat([df_y, df_X], axis=1)
    return df


# ECFP-featurized SIDER splits: the fingerprints are the model input
tasks, datasets, transformers = dc.molnet.load_sider(splitter='scaffold', featurizer='ECFP')
train_ecfp, valid_ecfp, test_ecfp = datasets

# Same loader with the 'Raw' featurizer (kept for visualization / metadata)
_, datasets_raw, _ = dc.molnet.load_sider(splitter='scaffold', featurizer='Raw')
train_raw, valid_raw, test_raw = datasets_raw

# One DataFrame per split, pairing each ECFP split with its raw counterpart
df_train, df_valid, df_test = (
    dataset_to_df_with_smiles(ecfp_split, raw_split, tasks)
    for ecfp_split, raw_split in zip(datasets, datasets_raw)
)

# Fingerprint columns are the features; everything that is neither a
# fingerprint nor a metadata column is treated as a label.
feature_cols = [col for col in df_train.columns if col.startswith("fp_")]
non_label_cols = set(feature_cols) | {'mol_id', 'smiles', 'scaffold'}
label_cols = [col for col in df_train.columns if col not in non_label_cols]

# Float feature / label matrices for each split
X_train, y_train = (df_train[cols].astype(float).to_numpy() for cols in (feature_cols, label_cols))
X_valid, y_valid = (df_valid[cols].astype(float).to_numpy() for cols in (feature_cols, label_cols))
X_test, y_test = (df_test[cols].astype(float).to_numpy() for cols in (feature_cols, label_cols))

# Full PCA on the training features, used only to read the variance profile
pca = PCA().fit(X_train)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

# Smallest number of components whose cumulative variance reaches 80%
# (argmax returns the first index where the condition holds)
n_components_80 = 1 + np.argmax(cumulative_variance >= 0.80)

# Refit with that many components on train only, then project every split
pca = PCA(n_components=n_components_80)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca, X_test_pca = pca.transform(X_valid), pca.transform(X_test)

In [None]:
from sklearn.metrics import (
    accuracy_score, hamming_loss, f1_score, roc_auc_score
)

def evaluate_multilabel_model(y_true, y_pred, y_prob=None, name=None):
    """
    Evaluate multilabel classification performance and print a summary.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth binary matrix (n_samples x n_labels)
    y_pred : np.ndarray
        Predicted binary matrix (same shape as y_true)
    y_prob : np.ndarray, optional
        Predicted probabilities (for ROC-AUC if available)
    name : str
        Name of the dataframe evaluated

    Returns
    -------
    dict
        The evaluated name under ``"Nom :"`` followed by subset accuracy,
        Hamming loss and micro/macro/weighted F1. When ``y_prob`` is given,
        micro and macro ROC-AUC are added; an average whose computation
        raises ``ValueError`` is reported as NaN.
    """

    metrics = {"Nom :": name}
    metrics["Subset accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Hamming loss"] = hamming_loss(y_true, y_pred)
    metrics["Micro F1"] = f1_score(y_true, y_pred, average="micro")
    metrics["Macro F1"] = f1_score(y_true, y_pred, average="macro")
    metrics["Weighted F1"] = f1_score(y_true, y_pred, average="weighted")

    if y_prob is not None:
        # Each average gets its own try block: a failure of one average
        # (for example the macro score when a label has a single class in
        # y_true) must not discard the other one when it is computable.
        for key, average in (("Micro ROC-AUC", "micro"), ("Macro ROC-AUC", "macro")):
            try:
                metrics[key] = roc_auc_score(y_true, y_prob, average=average)
            except ValueError:
                metrics[key] = np.nan

    print("\n📊 Multilabel Evaluation Metrics:")
    for k, v in metrics.items():
        # Floats are rounded for display; everything else is printed as is.
        if isinstance(v, float):
            print(f"{k:20s}: {v:.4f}")
        else:
            print(f"{k:20s}: {v}")

    return metrics


___

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from tqdm import tqdm

# Multi-label Random Forest: one class-balanced forest per label (one-vs-rest)
rf_model = OneVsRestClassifier(
    RandomForestClassifier(
        n_estimators=200,
        max_depth=30,
        class_weight='balanced',
        random_state=42,
        n_jobs=3,
    )
)

# Fit on the PCA-reduced training features
rf_model.fit(X_train_pca, y_train)

# Hard label predictions for every split
y_train_pred_rf_pca, y_valid_pred_rf_pca, y_test_pred_rf_pca = (
    rf_model.predict(X_split) for X_split in (X_train_pca, X_valid_pca, X_test_pca)
)

# Per-label probabilities for every split (used for ROC-AUC)
y_train_prob_rf_pca, y_valid_prob_rf_pca, y_test_prob_rf_pca = (
    rf_model.predict_proba(X_split) for X_split in (X_train_pca, X_valid_pca, X_test_pca)
)

print("=== Random Forest on PCA-reduced Data ===")
evaluate_multilabel_model(y_train, y_train_pred_rf_pca, y_prob=y_train_prob_rf_pca, name="Train (RF)")
evaluate_multilabel_model(y_valid, y_valid_pred_rf_pca, y_prob=y_valid_prob_rf_pca, name="Validation (RF)")
# Kept as the final bare expression so the test metrics dict is the cell output
evaluate_multilabel_model(y_test, y_test_pred_rf_pca, y_prob=y_test_prob_rf_pca, name="Test (RF)")


=== Random Forest on PCA-reduced Data ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (RF)
Subset accuracy     : 0.9956
Hamming loss        : 0.0007
Micro F1            : 0.9994
Macro F1            : 0.9994
Weighted F1         : 0.9994
Micro ROC-AUC       : 1.0000
Macro ROC-AUC       : 1.0000

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (RF)
Subset accuracy     : 0.0280
Hamming loss        : 0.2085
Micro F1            : 0.8404
Macro F1            : 0.6323
Weighted F1         : 0.8096
Micro ROC-AUC       : 0.8432
Macro ROC-AUC       : 0.5857

📊 Multilabel Evaluation Metrics:
Nom :               : Test (RF)
Subset accuracy     : 0.0210
Hamming loss        : 0.2303
Micro F1            : 0.8167
Macro F1            : 0.6123
Weighted F1         : 0.7957
Micro ROC-AUC       : 0.8456
Macro ROC-AUC       : 0.6183


{'Nom :': 'Test (RF)',
 'Subset accuracy': 0.02097902097902098,
 'Hamming loss': 0.23025123025123026,
 'Micro F1': 0.8166632295318622,
 'Macro F1': 0.6123079001165772,
 'Weighted F1': 0.7956500082320085,
 'Micro ROC-AUC': 0.8455607463883991,
 'Macro ROC-AUC': 0.618323359341634}

### Random Forest (RF) on PCA Data 🌲
**Training:** Massive overfitting (near-perfect scores). The model is memorizing the training data.

**Test:**
- Micro F1: 0.817 (Moderate performance on frequent labels).
- Macro F1: 0.612 (Poor performance on rare labels).
- Macro ROC AUC: 0.618 (Poor average discrimination across labels).
- Hamming Loss: 0.230 (~23% of labels mispredicted).

**RF (on PCA) Conclusion:** The model still overfits heavily, although PCA slightly reduced it compared to an unregularized model. It performs moderately on common labels but fails on rare labels. The improvement over Logistic Regression on PCA is minimal.

**Comparison with Initial Logistic Regression (LR) (on raw ECFP)**

Initial Logistic Regression (Test):

- Micro F1: 0.771
- Macro F1: 0.622
- Macro ROC AUC: 0.596
- Hamming Loss: 0.271

Comparison:

Overfitting: Both models overfit, but the Random Forest (even on PCA) shows even more extreme overfitting on the training set (perfect scores) than the initial LR.

Performance (Common Labels): RF (on PCA) is slightly better on common labels (Micro F1: 0.817 vs 0.771 for LR).

Performance (Rare Labels): Neither model performs well on rare labels (Macro F1 around 0.62 for both; Macro ROC AUC around 0.60 for both).

Overall Label Errors: RF (on PCA) makes slightly fewer overall label errors (Hamming Loss 0.230 vs 0.271 for LR).

Overall Conclusion: The Random Forest (even with PCA and balanced weights) overfits massively and doesn't significantly improve generalization compared to the initial Logistic Regression, especially for rare labels. PCA reduced dimensionality but wasn't sufficient to solve the core generalization challenge related to scaffolds and label imbalance for this more complex model.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from tqdm import tqdm
# 1. Define the base estimator: one class-balanced Random Forest per label
base_estimator = OneVsRestClassifier(
    RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
)

# 2. Define the hyperparameter distributions to sample from.
# The 'estimator__' prefix routes each parameter to the RandomForestClassifier
# wrapped inside the OneVsRestClassifier.
param_dist = {
    'estimator__n_estimators': randint(low=100, high=1001),

    'estimator__max_depth': [10, 20, 30, 40, 50, None],

    'estimator__min_samples_leaf': randint(low=1, high=6),
}

# 3. Set up the randomized search (RandomizedSearchCV, not GridSearchCV:
# candidates are sampled from param_dist rather than enumerated).
# 'f1_macro' is a reasonable score for imbalanced multi-label problems.
random_search = RandomizedSearchCV(
    estimator=base_estimator,
    param_distributions=param_dist,  # named param_grid in GridSearchCV
    n_iter=5,          # number of random candidates drawn
    cv=5,
    scoring='f1_macro',
    # Two candidate fits run in parallel (this is NOT "all cores").
    # NOTE(review): the inner forests use n_jobs=-1, so the two levels of
    # parallelism may oversubscribe the CPU — confirm this is intended.
    n_jobs=2,
    verbose=2,
    random_state=42  # for reproducibility
)

print("🚀 Lancement de GridSearchCV pour RandomForestClassifier...")
# 4. Run the search on the PCA-reduced training data.
# fit() is called directly: wrapping its return value in tqdm() showed no
# progress (the bar was only created after fitting had finished); progress
# is already reported through verbose=2.
random_search.fit(X_train_pca, y_train)

# 5. Show the best parameters found
print("\n✅ Meilleurs paramètres trouvés :")
print(random_search.best_params_)
print(f"\n🏆 Meilleur score F1-macro sur les plis de validation : {random_search.best_score_:.4f}")

# Use the best estimator found (refit on the full training set) for prediction
best_rf_model_grid = random_search.best_estimator_

# Evaluate the best model on the test set
y_test_pred_best_grid = best_rf_model_grid.predict(X_test_pca)
y_test_prob_best_grid = best_rf_model_grid.predict_proba(X_test_pca)
print("\n=== Évaluation du meilleur Random Forest (GridSearch) sur l'ensemble de test ===")
evaluate_multilabel_model(y_test, y_test_pred_best_grid, y_test_prob_best_grid, "Test (Best RF - Grid)")

🚀 Lancement de GridSearchCV pour RandomForestClassifier...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END estimator__max_depth=40, estimator__min_samples_leaf=5, estimator__n_estimators=370; total time= 1.5min
[CV] END estimator__max_depth=40, estimator__min_samples_leaf=5, estimator__n_estimators=370; total time= 1.6min
[CV] END estimator__max_depth=40, estimator__min_samples_leaf=5, estimator__n_estimators=370; total time= 2.5min
[CV] END estimator__max_depth=40, estimator__min_samples_leaf=5, estimator__n_estimators=370; total time= 2.5min
[CV] END estimator__max_depth=30, estimator__min_samples_leaf=5, estimator__n_estimators=120; total time=  46.7s
[CV] END estimator__max_depth=30, estimator__min_samples_leaf=5, estimator__n_estimators=120; total time=  43.7s
[CV] END estimator__max_depth=40, estimator__min_samples_leaf=5, estimator__n_estimators=370; total time= 2.2min
[CV] END estimator__max_depth=30, estimator__min_samples_leaf=5, estimator__n_estimators