# Dataset import

In [5]:
import deepchem as dc
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

# Helper function to combine ECFP and SMILES datasets
def dataset_to_df_with_smiles(ecfp_dataset, raw_dataset, tasks):
    """Merge an ECFP-featurized dataset and its raw counterpart into one DataFrame.

    Both datasets are walked batch by batch, in lockstep, and must describe
    the same molecules in the same order.

    Parameters
    ----------
    ecfp_dataset : object exposing ``iterbatches(batch_size=..., pad_batches=...)``
        Yields ``(X, y, w, ids)`` batches. ``X`` becomes the ``fp_*`` columns,
        ``y`` the task columns and ``ids`` the ``mol_id`` column.
    raw_dataset : object exposing the same ``iterbatches`` interface
        Its features fill the ``smiles`` column; its ids are only used to
        verify alignment with ``ecfp_dataset``.
        NOTE(review): whether these raw features are SMILES strings or
        molecule objects depends on the featurizer -- confirm.
    tasks : list of str
        Column names for the label matrix.

    Returns
    -------
    pd.DataFrame
        Columns: the tasks, ``mol_id``, ``smiles``, then ``fp_0`` ... ``fp_{n-1}``.

    Raises
    ------
    ValueError
        If the datasets are empty, differ in length, or their ids disagree.
    """
    from itertools import zip_longest

    # Sentinel marking "this iterator ran out first"; a plain zip would
    # silently truncate to the shorter dataset instead.
    exhausted = object()
    X_list, y_list, ids_list, smiles_list = [], [], [], []

    paired_batches = zip_longest(
        ecfp_dataset.iterbatches(batch_size=128, pad_batches=False),
        raw_dataset.iterbatches(batch_size=128, pad_batches=False),
        fillvalue=exhausted,
    )
    for ecfp_batch, raw_batch in paired_batches:
        if ecfp_batch is exhausted or raw_batch is exhausted:
            raise ValueError(
                "ecfp_dataset and raw_dataset yield a different number of batches"
            )

        X_batch, y_batch, _, ids_batch = ecfp_batch
        X_raw, _, _, raw_ids = raw_batch

        # Rows are matched purely by position, so any disagreement here would
        # attach the wrong raw entry to a fingerprint.
        if len(X_raw) != len(X_batch) or not np.array_equal(
            np.asarray(ids_batch), np.asarray(raw_ids)
        ):
            raise ValueError(
                "ecfp_dataset and raw_dataset are not aligned (ids or batch sizes differ)"
            )

        X_list.append(X_batch)
        y_list.append(y_batch)
        ids_list.extend(ids_batch)
        smiles_list.extend(X_raw)

    if not X_list:
        raise ValueError("cannot build a DataFrame from empty datasets")

    # Stack numerical and label arrays
    X_all = np.vstack(X_list)
    y_all = np.vstack(y_list)

    # Labels and metadata first, fingerprint bits after
    df_X = pd.DataFrame(X_all, columns=[f"fp_{i}" for i in range(X_all.shape[1])])
    df_y = pd.DataFrame(y_all, columns=tasks)
    df_y["mol_id"] = ids_list
    df_y["smiles"] = smiles_list

    return pd.concat([df_y, df_X], axis=1)


# SIDER is loaded twice with the same scaffold split: once featurized as ECFP
# fingerprints (model input), once with the 'Raw' featurizer (metadata).
tasks, datasets, transformers = dc.molnet.load_sider(featurizer='ECFP', splitter='scaffold')
_, datasets_raw, _ = dc.molnet.load_sider(featurizer='Raw', splitter='scaffold')

train_ecfp, valid_ecfp, test_ecfp = datasets
train_raw, valid_raw, test_raw = datasets_raw

# One merged DataFrame per split (labels + ids + raw features + fingerprint bits)
df_train, df_valid, df_test = (
    dataset_to_df_with_smiles(ecfp_split, raw_split, tasks)
    for ecfp_split, raw_split in (
        (train_ecfp, train_raw),
        (valid_ecfp, valid_raw),
        (test_ecfp, test_raw),
    )
)

# Fingerprint columns are the features; everything that is neither a
# fingerprint bit nor a metadata column is treated as a label.
feature_cols = [col for col in df_train.columns if col.startswith("fp_")]
non_label_cols = set(feature_cols) | {'mol_id', 'smiles', 'scaffold'}
label_cols = [col for col in df_train.columns if col not in non_label_cols]

# Dense float matrices for scikit-learn
X_train, X_valid, X_test = (
    frame[feature_cols].astype(float).values for frame in (df_train, df_valid, df_test)
)
y_train, y_valid, y_test = (
    frame[label_cols].astype(float).values for frame in (df_train, df_valid, df_test)
)

# A full PCA on the training set gives the cumulative explained-variance curve
cumulative_variance = np.cumsum(PCA().fit(X_train).explained_variance_ratio_)

# First index where the curve reaches 80%, converted to a component count
n_components_80 = np.argmax(cumulative_variance >= 0.80) + 1
print(f"Number of components for 80% variance: {n_components_80}")

# Refit with that many components on the training set only, then project
# validation and test with the same fitted transform.
pca = PCA(n_components=n_components_80)
X_train_pca = pca.fit_transform(X_train)
X_valid_pca = pca.transform(X_valid)
X_test_pca = pca.transform(X_test)

Number of components for 80% variance: 197


---
### Baseline Modeling

Now that the dataset has been explored and verified — including the structure, label balance, and feature sparsity — we can start testing basic machine learning models.

The goal of this section is to establish **baseline performances** using simple algorithms (e.g., Logistic Regression, Random Forest, or simple Neural Networks) on the ECFP features.  
These baselines will help us understand how well standard models can capture relationships between molecular fingerprints and side effects before moving to more complex or specialized architectures.


### First, we need a way to evaluate our models

All our models will be evaluated with the same metrics:

- **Subset Accuracy**  
  Measures the fraction of samples where *all* labels are correctly predicted.  
  → Very strict metric; typically **low** in multilabel tasks.  
  **Good model:** > 0.5 is excellent.

- **Hamming Loss**  
  Fraction of labels that are incorrectly predicted (either 0 instead of 1 or vice versa).  
  → Lower is better.  
  **Good model:** close to 0 means few label errors.

- **Micro F1-score**  
  Aggregates contributions of all labels to compute a global F1.  
  → Favors frequent labels; robust to imbalance.  
  **Good model:** > 0.6–0.7 generally indicates solid performance.

- **Macro F1-score**  
  Averages F1 across all labels equally.  
  → Highlights performance on *rare* labels.  
  **Good model:** similar to Micro-F1; if much lower, the model struggles on rare labels.

- **Weighted F1-score**  
  Weighted average of F1 by label frequency.  
  → Balances importance between frequent and rare labels.  
  **Good model:** typically close to Micro-F1.

- **ROC-AUC (Micro / Macro)**  
  Measures the model’s ranking ability (probabilities).  
  → 1.0 = perfect separation, 0.5 = random guessing.  
  **Good model:** > 0.8 for Micro-AUC, > 0.7 for Macro-AUC is usually strong.


In [6]:
from sklearn.metrics import (
    accuracy_score, hamming_loss, f1_score, roc_auc_score
)

def evaluate_multilabel_model(y_true, y_pred, y_prob=None, name=None):
    """
    Evaluate multilabel classification performance and print a short report.

    Parameters
    ----------
    y_true : np.ndarray
        Ground-truth binary matrix (n_samples x n_labels)
    y_pred : np.ndarray
        Predicted binary matrix (same shape as y_true)
    y_prob : np.ndarray, optional
        Predicted probabilities (for ROC-AUC if available)
    name : str
        Name of the dataframe evaluated

    Returns
    -------
    dict
        The evaluated name under ``"Nom :"`` followed by one entry per metric.
        The two ROC-AUC entries are present only when ``y_prob`` is given, and
        each is NaN when scikit-learn raises ``ValueError`` for that average.
    """

    metrics = {"Nom :": name}
    metrics["Subset accuracy"] = accuracy_score(y_true, y_pred)
    metrics["Hamming loss"] = hamming_loss(y_true, y_pred)
    metrics["Micro F1"] = f1_score(y_true, y_pred, average="micro")
    metrics["Macro F1"] = f1_score(y_true, y_pred, average="macro")
    metrics["Weighted F1"] = f1_score(y_true, y_pred, average="weighted")

    if y_prob is not None:
        # Each average gets its own try block: a ValueError from one of them
        # must not discard a value the other computed successfully.
        for metric_name, average in (("Micro ROC-AUC", "micro"), ("Macro ROC-AUC", "macro")):
            try:
                metrics[metric_name] = roc_auc_score(y_true, y_prob, average=average)
            except ValueError:
                metrics[metric_name] = np.nan

    print("\n📊 Multilabel Evaluation Metrics:")
    for k, v in metrics.items():
        if isinstance(v, float):
            print(f"{k:20s}: {v:.4f}")
        else:
            print(f"{k:20s}: {v}")

    return metrics


## 1. Logistic regression

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# --- Fit the baseline: one independent logistic regression per label ---
print("Training baseline Logistic Regression model (One-vs-Rest)...")
model = OneVsRestClassifier(LogisticRegression(max_iter=500, solver='liblinear'))
model.fit(X_train, y_train)

# --- Hard predictions and probabilities, grouped per split ---
y_train_pred, y_train_prob = model.predict(X_train), model.predict_proba(X_train)
y_valid_pred, y_valid_prob = model.predict(X_valid), model.predict_proba(X_valid)
y_test_pred, y_test_prob = model.predict(X_test), model.predict_proba(X_test)

# --- Report each split; the last call's dict is the cell's displayed value ---
evaluate_multilabel_model(y_true=y_train, y_pred=y_train_pred, y_prob=y_train_prob, name="Train")
evaluate_multilabel_model(y_true=y_valid, y_pred=y_valid_pred, y_prob=y_valid_prob, name="Validation")
evaluate_multilabel_model(y_true=y_test, y_pred=y_test_pred, y_prob=y_test_prob, name="Test")

Training baseline Logistic Regression model (One-vs-Rest)...

📊 Multilabel Evaluation Metrics:
Nom :               : Train
Subset accuracy     : 0.6047
Hamming loss        : 0.0361
Micro F1            : 0.9679
Macro F1            : 0.9551
Weighted F1         : 0.9675
Micro ROC-AUC       : 0.9952
Macro ROC-AUC       : 0.9946

📊 Multilabel Evaluation Metrics:
Nom :               : Validation
Subset accuracy     : 0.0140
Hamming loss        : 0.2486
Micro F1            : 0.8004
Macro F1            : 0.6613
Weighted F1         : 0.7947
Micro ROC-AUC       : 0.8000
Macro ROC-AUC       : 0.5749

📊 Multilabel Evaluation Metrics:
Nom :               : Test
Subset accuracy     : 0.0000
Hamming loss        : 0.2707
Micro F1            : 0.7714
Macro F1            : 0.6223
Weighted F1         : 0.7726
Micro ROC-AUC       : 0.7920
Macro ROC-AUC       : 0.5956


{'Nom :': 'Test',
 'Subset accuracy': 0.0,
 'Hamming loss': 0.2706552706552707,
 'Micro F1': 0.7713848173266243,
 'Macro F1': 0.6222638991076471,
 'Weighted F1': 0.7725664660808413,
 'Micro ROC-AUC': 0.7920089068870102,
 'Macro ROC-AUC': 0.5955655325627189}

### First results

##### Training set

- Subset accuracy: 0.6047 → very high; most molecules’ labels are perfectly predicted.
- Hamming loss: 0.0361 → only 3.6% of labels are incorrect.
- Micro / Macro / Weighted F1: ≈ 0.96–0.97 → excellent, almost perfect fitting.
- ROC-AUC: ≈ 0.995 → near-perfect discrimination.

Conclusion: The model fits the training data almost perfectly — clear overfitting symptoms.

##### Validation set

- Subset accuracy: 0.0140 → strong drop in performance.
- Hamming loss: 0.2486 → about one quarter of labels are wrong.
- Micro F1: 0.80 → still good for frequent labels.
- Macro F1: 0.66 → poor for rare labels.
- ROC-AUC: 0.80 (micro) / 0.57 (macro) → uneven generalization.

Conclusion: Large generalization gap — the model memorizes training patterns and struggles with unseen scaffolds.

##### Test set

- Subset accuracy: 0.0000 → no molecule predicted perfectly.
- Hamming loss: 0.2707 → 27% of labels incorrect.
- Micro F1: 0.77 → moderate performance on common labels.
- Macro F1: 0.62 → poor performance on rare labels.
- ROC-AUC: 0.79 (micro) / 0.60 (macro) → confirms limited generalization.

Conclusion: The model completely overfits to the training scaffolds and fails to generalize to new chemical structures.

To address the clear overfitting observed in the SIDER scaffold split, we’ll focus on improving generalization and handling label imbalance. The main goal is to help the model learn meaningful chemical–biological patterns rather than memorizing scaffolds.

**Possible solutions:**

- Increase regularization → reduces model complexity and prevents memorization of training data.
- Apply dimensionality reduction (e.g., PCA, SVD) → lowers feature redundancy in high-dimensional fingerprints.
- Use class weighting or resampling → balances frequent and rare side-effect labels.
- Ensure strict scaffold separation → avoids any structural leakage between splits.
- Try more robust models (e.g., Random Forest, XGBoost) → better capture non-linear relationships without overfitting as easily.

These adjustments aim to improve the model’s ability to generalize across unseen molecular scaffolds while maintaining balanced predictive performance.

In [8]:
# Same One-vs-Rest logistic regression, now with balanced class weights and
# trained on the PCA-projected features instead of the raw fingerprints.
model_pca = OneVsRestClassifier(
    LogisticRegression(
        max_iter=500,
        solver='liblinear',
        class_weight='balanced',
    )
)
model_pca.fit(X_train_pca, y_train)

# Hard predictions and probabilities, grouped per split
y_train_pred_pca, y_train_prob_pca = model_pca.predict(X_train_pca), model_pca.predict_proba(X_train_pca)
y_valid_pred_pca, y_valid_prob_pca = model_pca.predict(X_valid_pca), model_pca.predict_proba(X_valid_pca)
y_test_pred_pca, y_test_prob_pca = model_pca.predict(X_test_pca), model_pca.predict_proba(X_test_pca)

# Report each split; the last call's dict is the cell's displayed value
print("=== Logistic Regression on PCA-reduced Data ===")
evaluate_multilabel_model(y_true=y_train, y_pred=y_train_pred_pca, y_prob=y_train_prob_pca, name="Train (PCA)")
evaluate_multilabel_model(y_true=y_valid, y_pred=y_valid_pred_pca, y_prob=y_valid_prob_pca, name="Validation (PCA)")
evaluate_multilabel_model(y_true=y_test, y_pred=y_test_pred_pca, y_prob=y_test_prob_pca, name="Test (PCA)")


=== Logistic Regression on PCA-reduced Data ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (PCA)
Subset accuracy     : 0.0272
Hamming loss        : 0.2209
Micro F1            : 0.7933
Macro F1            : 0.7567
Weighted F1         : 0.8011
Micro ROC-AUC       : 0.8666
Macro ROC-AUC       : 0.8638

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (PCA)
Subset accuracy     : 0.0000
Hamming loss        : 0.3025
Micro F1            : 0.7537
Macro F1            : 0.6449
Weighted F1         : 0.7649
Micro ROC-AUC       : 0.7579
Macro ROC-AUC       : 0.5694

📊 Multilabel Evaluation Metrics:
Nom :               : Test (PCA)
Subset accuracy     : 0.0070
Hamming loss        : 0.3217
Micro F1            : 0.7244
Macro F1            : 0.6078
Weighted F1         : 0.7434
Micro ROC-AUC       : 0.7456
Macro ROC-AUC       : 0.5862


{'Nom :': 'Test (PCA)',
 'Subset accuracy': 0.006993006993006993,
 'Hamming loss': 0.32167832167832167,
 'Micro F1': 0.7243675099866844,
 'Macro F1': 0.6077583706053509,
 'Weighted F1': 0.7434387494481324,
 'Micro ROC-AUC': 0.7456005872432856,
 'Macro ROC-AUC': 0.5861874940246806}

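Our logistic regression trained on the PCA-reduced data narrows the gap between training and validation scores, but its validation and test scores are slightly lower than those of the previous model (e.g. test Micro F1 of 0.72 vs 0.77). It still overfits. We need to try another model to evaluate the relevance of our PCA. We cannot rely on logistic regression alone to classify our dataset.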

If all our models overfit, we will need to go back to our data and retry our logistic regression, but for now, let's try Random Forest in the 3rd notebook.

In [9]:
# --- Train the L1-penalized logistic regression ---
print("🚀 Entraînement de la Régression Logistique L1 (Lasso)...")
# C=0.1 corresponds to a fairly strong L1 regularization
l1_model = OneVsRestClassifier(
    LogisticRegression(penalty='l1', C=0.1, solver='liblinear', class_weight='balanced', max_iter=1000)
)
# Trained on the raw fingerprint features, not on the PCA projection
l1_model.fit(X_train, y_train)

# --- Predictions ---
y_train_pred_l1 = l1_model.predict(X_train)
y_valid_pred_l1 = l1_model.predict(X_valid)
y_test_pred_l1 = l1_model.predict(X_test)

y_train_prob_l1 = l1_model.predict_proba(X_train)
y_valid_prob_l1 = l1_model.predict_proba(X_valid)
y_test_prob_l1 = l1_model.predict_proba(X_test)

# Report labels say "(L1)": this model never sees the PCA-reduced data, so the
# "(PCA)" tag copied from the previous cell mislabeled these results.
print("=== Logistic Regression on Lasso L1 penalty ===")
evaluate_multilabel_model(y_train, y_train_pred_l1, y_train_prob_l1, "Train (L1)")
evaluate_multilabel_model(y_valid, y_valid_pred_l1, y_valid_prob_l1, "Validation (L1)")
evaluate_multilabel_model(y_test, y_test_pred_l1, y_test_prob_l1, "Test (L1)")

🚀 Entraînement de la Régression Logistique L1 (Lasso)...
=== Logistic Regression on Lasso L1 penalty ===

📊 Multilabel Evaluation Metrics:
Nom :               : Train (PCA)
Subset accuracy     : 0.0044
Hamming loss        : 0.3308
Micro F1            : 0.6925
Macro F1            : 0.6389
Weighted F1         : 0.7095
Micro ROC-AUC       : 0.7382
Macro ROC-AUC       : 0.7322

📊 Multilabel Evaluation Metrics:
Nom :               : Validation (PCA)
Subset accuracy     : 0.0000
Hamming loss        : 0.3341
Micro F1            : 0.7454
Macro F1            : 0.6581
Weighted F1         : 0.7744
Micro ROC-AUC       : 0.6867
Macro ROC-AUC       : 0.5449

📊 Multilabel Evaluation Metrics:
Nom :               : Test (PCA)
Subset accuracy     : 0.0000
Hamming loss        : 0.3592
Micro F1            : 0.7111
Macro F1            : 0.6245
Weighted F1         : 0.7528
Micro ROC-AUC       : 0.6981
Macro ROC-AUC       : 0.6157


{'Nom :': 'Test (PCA)',
 'Subset accuracy': 0.0,
 'Hamming loss': 0.35923335923335925,
 'Micro F1': 0.7111018537804624,
 'Macro F1': 0.6245410080397917,
 'Weighted F1': 0.7527603122901252,
 'Micro ROC-AUC': 0.69808913981957,
 'Macro ROC-AUC': 0.6156967889007495}

Unlike the previous models, the L1-regularized Logistic Regression shows significantly lower performance on the training set. This is a strong indication that the L1 penalty is effectively reducing overfitting. The model is no longer memorizing the training data. The scores are much more modest, suggesting a simpler model constrained by the regularization.

The validation performance is surprisingly similar to or even slightly better than the training performance in some metrics (F1 scores), but worse in others (AUC). The gap between training and validation is almost eliminated, confirming that overfitting is significantly reduced. However, the overall performance, especially the Macro ROC-AUC (0.54), is quite poor. The strong regularization might be causing underfitting or filtering out too many relevant features.

The test performance is generally consistent with the validation set, confirming good generalization (i.e., minimal overfitting). However, the absolute performance level remains low, especially concerning the Macro F1 and Macro ROC-AUC scores.