# Binder Classification — RDKit Descriptor Analysis

This notebook isolates the RDKit descriptor workflow from `binder_classification_smiles.ipynb` so we can focus on descriptor-driven features, compare multiple classifiers, quantify their stability via stratified k-fold validation, and perform binary particle swarm optimization (BPSO) feature selection on the descriptor space.

## 1) Setup & Imports

In [1]:
import os
os.environ.setdefault('OMP_NUM_THREADS', '1')
os.environ.setdefault('RDKIT_MAX_THREADS', '1')

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    precision_recall_curve,
    roc_curve,
    roc_auc_score,
    auc,
    matthews_corrcoef,
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
    average_precision_score,
)
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone

## 2) Dataset Loading & Binder Label

In [2]:
# Load curated IC50 dataset and define the binary binder label
#
# Pipeline of this cell:
#   1. read the TSV export and coerce "Standard Value" to numeric,
#   2. drop rows without a usable value or SMILES string,
#   3. label compounds with Standard Value <= 2000 as binders (1) else 0,
#   4. remove SMILES containing salt / counter-ion substrings,
#   5. drop repeated SMILES strings, keeping the first occurrence.

# The separator is spelled as the "\t" escape instead of a raw tab character so
# it survives editors and formatters that normalise whitespace.
df_smiles = pd.read_csv("ic50.tsv", sep="\t", low_memory=False)
df_smiles["Standard Value"] = pd.to_numeric(df_smiles["Standard Value"], errors="coerce")

df_smiles = df_smiles.dropna(subset=["Standard Value", "Smiles"]).copy()
# NOTE(review): the 2000 cut-off presumably assumes nM units and ignores any
# relation/qualifier column — confirm against the export.
df_smiles["Binder"] = (df_smiles["Standard Value"] <= 2000).astype(int)

# Patterns are matched as plain substrings (regex=False). Because of that, ".C"
# already covers ".Cl" and every other fragment that starts with a carbon.
unwanted_patterns = [
    ".C", ".Cl", ".NA+", ".Na+", ".na+", "[Na+]", "Cl.", ".O=C(O)C(F)(F)F"
]
mask = pd.Series(False, index=df_smiles.index)
for pattern in unwanted_patterns:
    mask |= df_smiles["Smiles"].str.contains(pattern, regex=False, na=False)
removed = int(mask.sum())
if removed:
    df_smiles = df_smiles[~mask].copy()
    print(f"Removed {removed} compounds due to unwanted ion patterns")

# Duplicates are detected on the exact SMILES text; the first row wins.
# NOTE(review): repeated measurements with conflicting labels are therefore
# resolved by file order — confirm this is intended.
dupe_mask = df_smiles.duplicated(subset="Smiles", keep="first")
removed_dupes = int(dupe_mask.sum())
if removed_dupes:
    df_smiles = df_smiles[~dupe_mask].copy()
    print(f"Removed {removed_dupes} duplicate SMILES entries")

print(f"Dataset size: {len(df_smiles)} compounds")
print(f"Binder ratio: {df_smiles['Binder'].mean()*100:.2f}%")
print("Sample SMILES:")
for i, (_, row) in enumerate(df_smiles.head(5).iterrows(), 1):
    # Long SMILES are truncated to 70 characters for display only.
    smiles_str = row['Smiles'][:70] + "..." if len(row['Smiles']) > 70 else row['Smiles']
    print(f"{i}. {smiles_str} | Binder: {row['Binder']}")

Removed 25 compounds due to unwanted ion patterns
Removed 787 duplicate SMILES entries
Dataset size: 2039 compounds
Binder ratio: 64.64%
Sample SMILES:
1. CN(C)OC(=O)CCC(=O)O | Binder: 0
2. CC(=O)CC(=O)CCC(=O)O | Binder: 0
3. Cc1cccc(C)c1Oc1cc2c(N3CCCC3)nc(-n3cc(C(=O)O)cn3)nc2cc1F | Binder: 1
4. N#Cc1cccc(NC(=O)c2ccc3cccnc3c2O)c1 | Binder: 0
5. COc1ccc(CNC(=O)c2ccc3cccnc3c2O)cc1 | Binder: 0


## 3) RDKit Descriptor Generation

In [3]:
# Compute RDKit molecular descriptors and clean the resulting feature matrix

# Descriptors._descList holds (name, function) pairs; only the names are needed
# to configure the calculator.
descriptor_names = [entry[0] for entry in Descriptors._descList]
descriptor_calculator = MoleculeDescriptors.MolecularDescriptorCalculator(
    descriptor_names
)

def smiles_to_descriptors(smiles: str):
    """Return the descriptor values for one SMILES string, or None on failure.

    The string is parsed with ``Chem.MolFromSmiles``; the resulting molecule is
    passed to the module-level ``descriptor_calculator``. ``None`` is returned
    both when parsing yields no molecule and when descriptor calculation raises.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        try:
            return descriptor_calculator.CalcDescriptors(mol)
        except Exception:
            # Best effort: a molecule whose descriptors cannot be computed is
            # reported as unusable instead of aborting the whole run.
            pass
    return None

print("Calculating RDKit descriptors...")
descriptor_rows = []
descriptor_indices = []

# Remember the positional index of every molecule that produced descriptors so
# the labels can be re-aligned with the feature rows further down.
for idx, smiles in enumerate(df_smiles['Smiles']):
    values = smiles_to_descriptors(smiles)
    if values is None:
        continue
    descriptor_rows.append(values)
    descriptor_indices.append(idx)

# Infinite values are treated as missing.
descriptor_df = pd.DataFrame(descriptor_rows, columns=descriptor_names).replace(
    [np.inf, -np.inf], np.nan
)

# Columns with no finite value at all carry no information and are removed.
all_nan_cols = descriptor_df.columns[descriptor_df.isna().all()].tolist()
if len(all_nan_cols) > 0:
    descriptor_df = descriptor_df.drop(columns=all_nan_cols)

# Remaining gaps are filled with the per-column median.
# NOTE(review): the median is taken over all molecules, i.e. before the
# train/test split performed later — confirm this small leakage is acceptable.
descriptor_df = descriptor_df.fillna(descriptor_df.median())

# Drop whole descriptors whose magnitude reaches the float32 maximum.
float32_limit = np.finfo(np.float32).max
too_large_mask = descriptor_df.abs() >= float32_limit
oversized_by_column = too_large_mask.any(axis=0)
if oversized_by_column.any():
    bad_columns = descriptor_df.columns[oversized_by_column].tolist()
    descriptor_df = descriptor_df.drop(columns=bad_columns)
    print(f"Removed {len(bad_columns)} descriptor(s) that exceeded float32 limits: {bad_columns}")

descriptor_df = descriptor_df.astype(np.float64)

# Feature matrix and labels, row-aligned through descriptor_indices.
X_desc = descriptor_df.to_numpy()
y_desc = df_smiles['Binder'].iloc[descriptor_indices].to_numpy()

print(f"Descriptor matrix shape: {descriptor_df.shape}")
print(f"Total valid molecules: {len(descriptor_indices)}")

Calculating RDKit descriptors...
Removed 1 descriptor(s) that exceeded float32 limits: ['Ipc']
Descriptor matrix shape: (2039, 216)
Total valid molecules: 2039


## 4) Baseline Random Forest Holdout Evaluation

In [4]:
# Stratified 75/25 holdout split of the descriptor matrix.
X_train_desc, X_test_desc, y_train_desc, y_test_desc = train_test_split(
    X_desc, y_desc, test_size=0.25, stratify=y_desc, random_state=42
)

# Baseline Random Forest trained on every descriptor column.
desc_rf = RandomForestClassifier(
    n_estimators=400,
    max_depth=24,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
).fit(X_train_desc, y_train_desc)

# Positive-class probabilities feed the ranking metrics, hard labels the rest.
y_desc_prob = desc_rf.predict_proba(X_test_desc)[:, 1]
y_desc_pred = desc_rf.predict(X_test_desc)

# PR-AUC here is the trapezoidal area under the precision-recall curve.
# NOTE(review): the BPSO objective in this notebook uses average_precision_score
# instead, so the two PR figures are not computed the same way.
desc_precision, desc_recall, _ = precision_recall_curve(y_test_desc, y_desc_prob)
desc_pr_auc = auc(desc_recall, desc_precision)
desc_roc_auc = roc_auc_score(y_test_desc, y_desc_prob)
desc_acc = accuracy_score(y_test_desc, y_desc_pred)
desc_bal_acc = balanced_accuracy_score(y_test_desc, y_desc_pred)
desc_mcc = matthews_corrcoef(y_test_desc, y_desc_pred)

print("RDKit descriptor Random Forest performance (holdout test set)")
print(f"PR-AUC: {desc_pr_auc:.5f}")
print(f"ROC-AUC: {desc_roc_auc:.5f}")
print(f"MCC: {desc_mcc:.5f}")
print(f"Accuracy: {desc_acc:.5f}")
print(f"Balanced Accuracy: {desc_bal_acc:.5f}")
print("Classification report:")
print(classification_report(y_test_desc, y_desc_pred, digits=4))

# Impurity-based importances, labelled with the surviving descriptor names.
descriptor_importances = pd.Series(
    desc_rf.feature_importances_, index=descriptor_df.columns
)
top_desc = descriptor_importances.sort_values(ascending=False).head(15)
print("Top descriptor importances (Random Forest)")
print(top_desc.to_frame(name='Importance'))

RDKit descriptor Random Forest performance (holdout test set)
PR-AUC: 0.98669
ROC-AUC: 0.97449
MCC: 0.84867
Accuracy: 0.93137
Balanced Accuracy: 0.91919
Classification report:
              precision    recall  f1-score   support

           0     0.9240    0.8778    0.9003       180
           1     0.9351    0.9606    0.9477       330

    accuracy                         0.9314       510
   macro avg     0.9295    0.9192    0.9240       510
weighted avg     0.9312    0.9314    0.9310       510

Top descriptor importances (Random Forest)
                     Importance
AvgIpc                 0.038577
MaxAbsPartialCharge    0.036648
PEOE_VSA11             0.036358
MinPartialCharge       0.035454
BCUT2D_MRLOW           0.026597
SMR_VSA3               0.025926
fr_Ar_N                0.023560
EState_VSA8            0.019652
PEOE_VSA12             0.019053
VSA_EState7            0.018084
VSA_EState2            0.016334
fr_NH0                 0.015760
SlogP_VSA3             0.015250
MinAbs

## 5) Helper Functions for Stratified K-Fold Evaluation

In [5]:
# Utilities shared by every classifier evaluation

def _positive_class_probs(model, X):
    if hasattr(model, "predict_proba"):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, "decision_function"):
        raw = model.decision_function(X)
        return 1.0 / (1.0 + np.exp(-raw))
    # Last resort: fall back to hard predictions
    return model.predict(X).astype(float)

def _metric_summary(y_true, y_pred, y_prob):
    """Collect the four evaluation metrics used throughout the notebook.

    ``y_prob`` drives the ranking metrics (trapezoidal area under the
    precision-recall curve and ROC-AUC); ``y_pred`` drives MCC and accuracy.
    Returns a dict keyed ``PR_AUC``, ``ROC_AUC``, ``MCC``, ``Accuracy``.
    """
    pr_precision, pr_recall, _thresholds = precision_recall_curve(y_true, y_prob)

    summary = {}
    summary["PR_AUC"] = auc(pr_recall, pr_precision)
    summary["ROC_AUC"] = roc_auc_score(y_true, y_prob)
    summary["MCC"] = matthews_corrcoef(y_true, y_pred)
    summary["Accuracy"] = accuracy_score(y_true, y_pred)
    return summary

def run_stratified_kfold(model, X, y, n_splits=5, random_state=42):
    """Cross-validate ``model`` with shuffled stratified folds.

    A fresh clone of ``model`` is fitted on every training fold and scored on
    the matching held-out fold via ``_metric_summary``. ``X`` and ``y`` are
    indexed positionally with integer arrays.

    Returns a dict mapping each metric name to its mean over the folds (only
    the mean is reported; per-fold values are not returned).
    """
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    def _score_fold(train_idx, test_idx):
        # Clone so that no fitted state leaks between folds or into `model`.
        fitted = clone(model)
        fitted.fit(X[train_idx], y[train_idx])
        X_eval, y_eval = X[test_idx], y[test_idx]
        hard_labels = fitted.predict(X_eval)
        soft_scores = _positive_class_probs(fitted, X_eval)
        return _metric_summary(y_eval, hard_labels, soft_scores)

    per_fold = [_score_fold(train_idx, test_idx) for train_idx, test_idx in cv.split(X, y)]
    metric_names = per_fold[0]
    return {
        name: np.mean([fold_result[name] for fold_result in per_fold])
        for name in metric_names
    }

## 6) Model Comparison with Stratified K-Fold

In [6]:
# Candidate classifiers; each is cloned per fold by run_stratified_kfold, so the
# instances stored here are never fitted themselves.
model_registry = {
    "RandomForest": RandomForestClassifier(
        n_estimators=400,
        max_depth=24,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42,
    ),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=42,
    ),
    "GradientBoosting": GradientBoostingClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=3,
        random_state=42,
    ),
    # The linear model is wrapped with a scaler so standardisation is re-fitted
    # inside every training fold.
    "LogisticRegression": make_pipeline(
        StandardScaler(),
        LogisticRegression(
            penalty='elasticnet',
            solver='saga',
            l1_ratio=0.4,
            max_iter=4000,
            class_weight='balanced',
            n_jobs=-1,
            random_state=42,
        ),
    ),
    "GaussianNB": GaussianNB(var_smoothing=1e-9),
}

# Every model is evaluated with both 5 and 10 stratified folds.
fold_options = (5, 10)
cv_rows = []
for model_name, estimator in model_registry.items():
    for folds in fold_options:
        metrics = run_stratified_kfold(estimator, X_desc, y_desc, n_splits=folds)
        metrics["Model"] = model_name
        metrics["Folds"] = folds
        cv_rows.append(metrics)

cv_df = (
    pd.DataFrame(cv_rows)[["Model", "Folds", "PR_AUC", "ROC_AUC", "MCC", "Accuracy"]]
    .sort_values(by=["Model", "Folds"])
    .reset_index(drop=True)
)
print("Cross-validated descriptor model comparison")
cv_df

Cross-validated descriptor model comparison


Unnamed: 0,Model,Folds,PR_AUC,ROC_AUC,MCC,Accuracy
0,ExtraTrees,5,0.979063,0.962775,0.793828,0.905346
1,ExtraTrees,10,0.978519,0.96325,0.799978,0.908292
2,GaussianNB,5,0.922037,0.857921,0.588457,0.813163
3,GaussianNB,10,0.921846,0.856946,0.594422,0.815609
4,GradientBoosting,5,0.983757,0.970325,0.803279,0.909758
5,GradientBoosting,10,0.983753,0.972582,0.814154,0.91466
6,LogisticRegression,5,0.969588,0.946579,0.752369,0.882789
7,LogisticRegression,10,0.971063,0.948861,0.761443,0.886709
8,RandomForest,5,0.984241,0.971153,0.807324,0.91172
9,RandomForest,10,0.984653,0.973391,0.812071,0.914182


## 7) BPSO Feature Selection on Descriptor Space

In [7]:

class BPSOFeatureSelector:
    """Binary Particle Swarm Optimization wrapper for descriptor feature selection.

    Each particle is a boolean mask over the feature columns. A mask is scored
    by fitting a clone of ``estimator`` on the selected columns and averaging
    ``average_precision_score`` over a fixed set of stratified folds. Velocities
    follow the usual inertia / cognitive / social update and are turned into
    bit probabilities with a sigmoid.

    Parameters
    ----------
    estimator : object
        Unfitted estimator that is cloned for every mask evaluation.
    n_particles : int
        Swarm size (must be at least 1).
    n_iterations : int
        Number of swarm updates performed after the initial evaluation.
    inertia, cognitive, social : float
        Coefficients of the velocity update.
    cv : int
        Number of stratified folds used to score a mask.
    max_features : int or None
        Upper bound on selected features; ``None`` disables the bound.
    min_features : int
        Lower bound on selected features; a falsy value disables the bound.
    random_state : int
        Seed for both the swarm's random generator and the fold shuffling.

    Attributes (set by ``fit``)
    ---------------------------
    support_ : boolean mask of the best feature subset found.
    selected_features_ : integer column indices where ``support_`` is True.
    best_score_ : cross-validated average precision of ``support_``.
    history_ : list of dicts with ``iteration``, ``best_score`` and
        ``mean_features`` (mean mask size across particles) per iteration.
    cv_splits_ : the (train, validation) index pairs reused for every mask.
    """

    def __init__(
        self,
        estimator,
        n_particles=20,
        n_iterations=25,
        inertia=0.729,
        cognitive=1.49445,
        social=1.49445,
        cv=3,
        max_features=256,
        min_features=8,
        random_state=42,
    ):
        self.estimator = estimator
        self.n_particles = n_particles
        self.n_iterations = n_iterations
        self.inertia = inertia
        self.cognitive = cognitive
        self.social = social
        self.cv = cv
        self.max_features = max_features
        self.min_features = min_features
        self.random_state = random_state

    @staticmethod
    def _sigmoid(values):
        """Logistic function evaluated without overflowing ``np.exp``.

        Uses ``exp(-|z|)``, which always lies in (0, 1]. For ``z >= 0`` the
        result is identical to the naive ``1 / (1 + exp(-z))``.
        """
        values = np.asarray(values, dtype=float)
        decay = np.exp(-np.abs(values))
        return np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def _ensure_valid_mask(self, mask, rng):
        """Repair ``mask`` in place so it respects the feature-count bounds.

        An empty mask gets one random feature; a mask above ``max_features`` has
        random selected features switched off; a mask below ``min_features`` has
        random unselected features switched on (as many as are available).
        Returns the same (mutated) array.
        """
        if not mask.any():
            mask[rng.integers(0, mask.size)] = True
        if self.max_features is not None and mask.sum() > self.max_features:
            drop = rng.choice(np.where(mask)[0], size=mask.sum() - self.max_features, replace=False)
            mask[drop] = False
        if self.min_features and mask.sum() < self.min_features:
            available = np.where(~mask)[0]
            if available.size:
                add = rng.choice(available, size=min(self.min_features - mask.sum(), available.size), replace=False)
                mask[add] = True
        return mask

    def _predict_scores(self, estimator, X):
        """Return ranking scores for the positive class from a fitted estimator.

        Prefers ``predict_proba``; otherwise squashes ``decision_function``
        output with an overflow-safe sigmoid; otherwise returns the hard
        predictions unchanged.
        """
        if hasattr(estimator, "predict_proba"):
            return estimator.predict_proba(X)[:, 1]
        if hasattr(estimator, "decision_function"):
            return self._sigmoid(estimator.decision_function(X))
        return estimator.predict(X)

    def _evaluate_mask(self, X, y, mask):
        """Mean average precision of ``mask`` over the stored CV splits.

        An all-False mask scores 0.0 without fitting anything.
        """
        if mask.sum() == 0:
            return 0.0
        scores = []
        for train_idx, valid_idx in self.cv_splits_:
            est = clone(self.estimator)
            est.fit(X[train_idx][:, mask], y[train_idx])
            y_scores = self._predict_scores(est, X[valid_idx][:, mask])
            scores.append(average_precision_score(y[valid_idx], y_scores))
        return float(np.mean(scores))

    def fit(self, X, y):
        """Run the swarm search on ``(X, y)`` and store the best mask found.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, has no feature columns, or if
            ``n_particles`` is smaller than 1.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        # Fail early with a clear message instead of a ZeroDivisionError or an
        # empty-argmax error further down.
        if X.ndim != 2 or X.shape[1] == 0:
            raise ValueError("X must be a 2D array with at least one feature column.")
        if self.n_particles < 1:
            raise ValueError("n_particles must be at least 1.")
        rng = np.random.default_rng(self.random_state)
        # The folds are materialised once so every mask is scored on identical splits.
        self.cv_splits_ = list(
            StratifiedKFold(n_splits=self.cv, shuffle=True, random_state=self.random_state).split(X, y)
        )
        n_features = X.shape[1]
        # Sparse start: each bit is on with probability at most 0.1.
        init_prob = min(0.1, (self.max_features or n_features) / n_features)
        positions = rng.random((self.n_particles, n_features)) < init_prob
        velocities = rng.standard_normal((self.n_particles, n_features)) * 0.1
        for i in range(self.n_particles):
            positions[i] = self._ensure_valid_mask(positions[i], rng)
        personal_best_positions = positions.copy()
        personal_best_scores = np.array([
            self._evaluate_mask(X, y, positions[i]) for i in range(self.n_particles)
        ])
        best_idx = np.argmax(personal_best_scores)
        global_best_position = personal_best_positions[best_idx].copy()
        global_best_score = personal_best_scores[best_idx]
        self.history_ = [
            {
                "iteration": 0,
                "best_score": float(global_best_score),
                "mean_features": float(positions.sum(axis=1).mean()),
            }
        ]
        for iteration in range(1, self.n_iterations + 1):
            for i in range(self.n_particles):
                r1 = rng.random(n_features)
                r2 = rng.random(n_features)
                personal_best = personal_best_positions[i].astype(float)
                position_float = positions[i].astype(float)
                global_best = global_best_position.astype(float)
                velocities[i] = (
                    self.inertia * velocities[i]
                    + self.cognitive * r1 * (personal_best - position_float)
                    + self.social * r2 * (global_best - position_float)
                )
                # The sampling step keeps the original sigmoid expression on
                # purpose: seeded runs stay bit-for-bit reproducible. The
                # position differences are in {-1, 0, 1}, so for inertia < 1
                # the velocities stay bounded and np.exp cannot overflow here.
                probs = 1.0 / (1.0 + np.exp(-velocities[i]))
                new_position = rng.random(n_features) < probs
                new_position = self._ensure_valid_mask(new_position, rng)
                positions[i] = new_position
                score = self._evaluate_mask(X, y, new_position)
                if score > personal_best_scores[i]:
                    personal_best_scores[i] = score
                    personal_best_positions[i] = new_position.copy()
                    # The global best is updated immediately, so later particles
                    # in the same iteration already see it.
                    if score > global_best_score:
                        global_best_score = score
                        global_best_position = new_position.copy()
            self.history_.append(
                {
                    "iteration": iteration,
                    "best_score": float(global_best_score),
                    "mean_features": float(positions.sum(axis=1).mean()),
                }
            )
        self.support_ = global_best_position.astype(bool)
        self.best_score_ = float(global_best_score)
        self.selected_features_ = np.where(self.support_)[0]
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by the fitted mask.

        Raises ``RuntimeError`` when called before ``fit``.
        """
        if not hasattr(self, "support_"):
            raise RuntimeError("The selector is not fitted yet.")
        return np.asarray(X)[:, self.support_]

    def fit_transform(self, X, y):
        """Fit the selector on ``(X, y)`` and return the reduced ``X``."""
        return self.fit(X, y).transform(X)


In [8]:
# Run BPSO on the training descriptors
# Only the training split is handed to the selector; the holdout rows are merely
# projected onto the chosen columns afterwards.
bpso_base_estimator = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        penalty="l2",
        solver="saga",
        max_iter=4000,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    ),
)

bpso_selector = BPSOFeatureSelector(
    estimator=bpso_base_estimator,
    n_particles=20,
    n_iterations=25,
    cv=3,
    min_features=8,
    max_features=256,
    random_state=42,
)
bpso_selector.fit(X_train_desc, y_train_desc)

# Boolean column mask of the best subset, applied to both splits.
selected_desc_mask = bpso_selector.support_
X_train_desc_bpso = bpso_selector.transform(X_train_desc)
X_test_desc_bpso = bpso_selector.transform(X_test_desc)

print(f"BPSO selected {selected_desc_mask.sum()} / {X_train_desc.shape[1]} descriptors")
print(f"Best cross-validated average precision: {bpso_selector.best_score_:.4f}")

# Per-iteration trace of the best score and the mean mask size.
bpso_desc_history = pd.DataFrame(bpso_selector.history_)
try:
    display(bpso_desc_history)
except Exception as exc:
    # Outside IPython `display` may be missing; fall back to plain text.
    print("Display unavailable:", exc)
    print(bpso_desc_history.head())



BPSO selected 117 / 216 descriptors
Best cross-validated average precision: 0.9718


Unnamed: 0,iteration,best_score,mean_features
0,0,0.936339,21.7
1,1,0.969327,124.1
2,2,0.969327,123.55
3,3,0.969327,117.1
4,4,0.969327,117.35
5,5,0.969327,117.7
6,6,0.969327,116.05
7,7,0.969327,116.6
8,8,0.969327,117.1
9,9,0.971001,115.75


In [9]:
# Train Random Forest on the BPSO-selected descriptors
# Hyper-parameters match the baseline forest so only the feature set differs.
rf_desc_bpso = RandomForestClassifier(
    n_estimators=400,
    max_depth=24,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
).fit(X_train_desc_bpso, y_train_desc)

y_desc_bpso_prob = rf_desc_bpso.predict_proba(X_test_desc_bpso)[:, 1]
y_desc_bpso_pred = rf_desc_bpso.predict(X_test_desc_bpso)

# Ranking metrics from probabilities (PR-AUC is the trapezoidal area under the
# precision-recall curve), thresholded metrics from the hard labels.
precision_desc_bpso, recall_desc_bpso, _ = precision_recall_curve(
    y_test_desc, y_desc_bpso_prob
)
pr_auc_desc_bpso = auc(recall_desc_bpso, precision_desc_bpso)
roc_auc_desc_bpso = roc_auc_score(y_test_desc, y_desc_bpso_prob)
acc_desc_bpso = accuracy_score(y_test_desc, y_desc_bpso_pred)
bal_acc_desc_bpso = balanced_accuracy_score(y_test_desc, y_desc_bpso_pred)
mcc_desc_bpso = matthews_corrcoef(y_test_desc, y_desc_bpso_pred)

print("=" * 60)
print("Random Forest with BPSO-selected descriptors")
print("=" * 60)
print(classification_report(y_test_desc, y_desc_bpso_pred, digits=4))
print(f"PR-AUC: {pr_auc_desc_bpso:.5f}")
print(f"ROC-AUC: {roc_auc_desc_bpso:.5f}")
print(f"MCC: {mcc_desc_bpso:.5f}")
print(f"Accuracy: {acc_desc_bpso:.5f}")
print(f"Balanced Accuracy: {bal_acc_desc_bpso:.5f}")

Random Forest with BPSO-selected descriptors
              precision    recall  f1-score   support

           0     0.9162    0.8500    0.8818       180
           1     0.9213    0.9576    0.9391       330

    accuracy                         0.9196       510
   macro avg     0.9187    0.9038    0.9105       510
weighted avg     0.9195    0.9196    0.9189       510

PR-AUC: 0.98555
ROC-AUC: 0.97236
MCC: 0.82238
Accuracy: 0.91961
Balanced Accuracy: 0.90379


In [10]:
# Side-by-side holdout metrics: baseline forest vs. forest on the BPSO subset.
comparison_rows = [
    {
        "Model": "RF + all descriptors",
        "Features Used": X_train_desc.shape[1],
        "Selected Fraction (%)": 100.0,
        "PR-AUC": desc_pr_auc,
        "ROC-AUC": desc_roc_auc,
        "Accuracy": desc_acc,
        "Balanced Accuracy": desc_bal_acc,
        "MCC": desc_mcc,
    },
    {
        "Model": "RF + BPSO descriptors",
        "Features Used": int(selected_desc_mask.sum()),
        # Mean of a boolean mask is the selected share of columns.
        "Selected Fraction (%)": selected_desc_mask.mean() * 100,
        "PR-AUC": pr_auc_desc_bpso,
        "ROC-AUC": roc_auc_desc_bpso,
        "Accuracy": acc_desc_bpso,
        "Balanced Accuracy": bal_acc_desc_bpso,
        "MCC": mcc_desc_bpso,
    },
]

comparison_desc_bpso_df = pd.DataFrame(comparison_rows).set_index("Model")
comparison_desc_bpso_df = comparison_desc_bpso_df.sort_values("PR-AUC", ascending=False)

print("Full descriptor set vs BPSO-selected subset")
try:
    display(
        comparison_desc_bpso_df.style.format({
            "Features Used": "{:.0f}",
            "Selected Fraction (%)": "{:.2f}",
            "PR-AUC": "{:.3f}",
            "ROC-AUC": "{:.3f}",
            "Accuracy": "{:.3f}",
            "Balanced Accuracy": "{:.3f}",
            "MCC": "{:.3f}",
        })
    )
except Exception as exc:
    # Styling can fail (e.g. missing optional dependency); print rounded text.
    print("Styled display unavailable:", exc)
    print(comparison_desc_bpso_df.round(3))

Full descriptor set vs BPSO-selected subset
Styled display unavailable: The '.style' accessor requires jinja2
                       Features Used  Selected Fraction (%)  PR-AUC  ROC-AUC  \
Model                                                                          
RF + all descriptors             216                100.000   0.987    0.974   
RF + BPSO descriptors            117                 54.167   0.986    0.972   

                       Accuracy  Balanced Accuracy    MCC  
Model                                                      
RF + all descriptors      0.931              0.919  0.849  
RF + BPSO descriptors     0.920              0.904  0.822  
