In [None]:
"""Random descriptor subsets → embedding & regression

• RANDOM_PLAN = {N: K}: Select K random subsets of N descriptors.
• Subsets are drawn AFTER descriptor filtering (dataset-specific).
• Each subset is saved under ./random/{dataset}/{N}/set{i}/flag.txt as comma-separated indices into the filtered descriptor list.
• For each saved subset, run the full pipeline (dataset × model).
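• Per-set metrics.json files from the K runs are aggregated (mean and population std of each numeric metric) into aggregate_metrics.json under ./result/rdkit_random/{dataset}/{model}/N_{N}/; sets without a readable metrics.json are left out.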
"""

In [None]:
import os, sys

# Make the parent of the current working directory importable (the project
# imports below are resolved from there). The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate sys.path entry.
_PARENT_DIR = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _PARENT_DIR not in sys.path:
    sys.path.append(_PARENT_DIR)

In [None]:
from __future__ import annotations  # future imports must be the first statement in the cell

import json
import math
import random
import warnings
from datetime import datetime
from pathlib import Path
from statistics import mean, pstdev

import numpy as np

In [None]:
from Chemprompt.embeddings.full_descriptors_embedding import LLMModel
from Chemprompt.data.data_loader import DataLoader
from Chemprompt.models.sklearn_model import ScikitLearnModel

In [None]:
# Datasets to process, in run order; each name is passed to
# DataLoader.load_dataset and used as a directory name for flags and results.
DATASETS = [
    "FreeSolv",
    "ESOL",
    "Lipo",
]

In [None]:
# Embedding models to evaluate. Each entry is joined as "<repo>/<name>" before
# being handed to LLMModel; "name" alone is used in the result directory path.
MODEL_LIST = [{"repo": "CohereLabs", "name": "aya-expanse-8b"}]

In [None]:
# Sampling plan: {number_of_descriptors: number_of_random_sets}.
# Every subset size currently gets the same number of random sets.
RANDOM_PLAN = dict.fromkeys((10, 30, 50, 70, 90), 100)

In [None]:
# Nominal count of RDKit descriptors. It is only logged at start-up for
# reference; subset sampling uses the per-dataset filtered count instead.
TOTAL: int = 236

In [None]:
# Output locations, resolved to absolute paths relative to the current working
# directory: RESULT_ROOT holds per-set model results, FLAG_ROOT the saved subsets.
RESULT_ROOT = Path("./result/rdkit_random").resolve()
FLAG_ROOT = Path("./random").resolve()

# Create both directory trees up front; already-existing directories are fine.
for _root in (RESULT_ROOT, FLAG_ROOT):
    _root.mkdir(parents=True, exist_ok=True)

In [None]:
# Helper Functions
def log(msg: str):
    """Print ``msg`` prefixed with the current local time as ``[YYYY-MM-DD HH:MM:SS]``."""
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{timestamp}] {msg}")

In [None]:
def generate_unique_sets(n_true: int, k: int, total: int) -> list[list[int]]:
    """Generate k unique random index sets of size n_true from total descriptors."""
    if n_true > total:
        raise ValueError(f"Requested {n_true} > available descriptors {total}")
    unique: set[frozenset[int]] = set()
    while len(unique) < k:
        idx = random.sample(range(total), n_true)
        unique.add(frozenset(idx))
    return [sorted(list(s)) for s in unique]

In [None]:
def indices_to_flag(indices: list[int], total: int) -> list[bool]:
    """Convert list of selected indices to boolean mask.

    Args:
        indices: Positions to mark as selected; duplicates are harmless.
        total: Length of the returned mask.

    Returns:
        A list of ``total`` booleans that is True exactly at the given positions.

    Raises:
        IndexError: If any index lies outside ``0 <= index < total``.
    """
    flag = [False] * total
    for i in indices:
        # Plain list indexing would let a negative index wrap around and silently
        # flag the wrong descriptor, so the range is checked explicitly.
        if not 0 <= i < total:
            raise IndexError(f"Descriptor index {i} is out of range for {total} descriptors")
        flag[i] = True
    return flag

In [None]:
def load_metrics_if_exists(path: Path) -> dict | None:
    """Read metrics.json if it exists and return as dict."""
    f = path / "metrics.json"
    if f.exists():
        try:
            return json.loads(f.read_text())
        except Exception:
            return None
    return None

In [None]:
# Core Routine (Sampling After Filtering)
def _aggregate_metrics(metrics_list: list[dict]) -> dict:
    """Summarise numeric metrics across runs as ``<key>_mean`` / ``<key>_std``.

    Metric names are taken from the first run; for each name, the values that are
    int/float across all runs are pooled. ``_std`` is the population standard
    deviation (``statistics.pstdev``).
    """
    if not metrics_list:
        return {}
    summary = {}
    numeric_keys = [k for k, v in metrics_list[0].items() if isinstance(v, (int, float))]
    for key in numeric_keys:
        vals = [m.get(key) for m in metrics_list if isinstance(m.get(key), (int, float))]
        if vals:
            summary[f"{key}_mean"] = float(mean(vals))
            summary[f"{key}_std"] = float(pstdev(vals))
    return summary


def run_dataset_with_plan(
    dataset: str,
    model_info: dict,
    plan: dict[int, int],
    flag_root: Path,
    result_root: Path | None = None,
):
    """Run the random-subset experiment for one dataset and one embedding model.

    Descriptors are filtered for the dataset first; for every ``N -> K`` entry in
    ``plan``, K unique random subsets of N filtered-descriptor indices are drawn.
    Each subset is written to ``flag_root/<dataset>/<N>/set<i>/flag.txt`` as
    comma-separated indices, embedded with the LLM, and evaluated with a
    ``ScikitLearnModel`` whose outputs go to
    ``<result_root>/<dataset>/<model name>/N_<N>/set<i>``. Any per-set
    ``metrics.json`` found there is aggregated into ``aggregate_metrics.json``
    in the ``N_<N>`` directory.

    Args:
        dataset: Dataset name passed to ``DataLoader.load_dataset``.
        model_info: Dict with ``"repo"`` and ``"name"`` keys identifying the model.
        plan: Mapping of subset size N to number of random subsets K.
        flag_root: Root directory under which the sampled subsets are saved.
        result_root: Root directory for model outputs; defaults to the module-level
            ``RESULT_ROOT`` when omitted.
    """
    repo, name = model_info["repo"], model_info["name"]
    model_repo = f"{repo}/{name}"
    out_root = RESULT_ROOT if result_root is None else Path(result_root)

    # Load data
    loader = DataLoader()
    smiles, y = loader.load_dataset(dataset)

    # Initialize LLM model and filter descriptors.
    # NOTE(review): the device is hard-coded to "cuda:0".
    llm = LLMModel(model_repo, dtype="half", device="cuda:0")
    llm._filter_descriptors(smiles)  # per the original note, this initializes llm.property_names
    M = len(llm.property_names)
    log(f"[INFO] {dataset}: filtered descriptors = {M}")

    # For each N in the plan, generate K random subsets after filtering
    for n_true, k_sets in plan.items():
        if n_true > M:
            log(f"[SKIP] {dataset}: N={n_true} > filtered descriptors ({M})")
            continue

        # Asking for more unique subsets than exist would make the sampler spin
        # forever, so cap K at the number of distinct subsets C(M, N).
        max_sets = math.comb(M, n_true)
        if k_sets > max_sets:
            log(f"[WARN] {dataset}: N={n_true} allows only {max_sets} distinct subsets; "
                f"reducing K from {k_sets} to {max_sets}")
            k_sets = max_sets

        sets = generate_unique_sets(n_true, k_sets, total=M)
        metrics_list = []
        n_missing = 0

        # Result directory per N
        save_root_for_n = out_root / dataset / name / f"N_{n_true}"
        save_root_for_n.mkdir(parents=True, exist_ok=True)

        for idx, indices in enumerate(sets, 1):
            set_dir = flag_root / dataset / str(n_true) / f"set{idx}"
            set_dir.mkdir(parents=True, exist_ok=True)

            # Save flag indices
            (set_dir / "flag.txt").write_text(",".join(map(str, indices)), encoding="utf-8")
            flag_vec = indices_to_flag(indices, total=M)

            log(f"Embeddings ▶ dataset={dataset}, model={model_repo}, N={n_true}, set={idx}/{k_sets}")
            embeds = llm.get_embeddings(smiles, flag_vec)

            # Save per-set results
            save_dir = save_root_for_n / f"set{idx}"
            save_dir.mkdir(parents=True, exist_ok=True)

            model = ScikitLearnModel("regression", save_dir=str(save_dir))
            model.fit_and_evaluate_fold(smiles, embeds, y)

            # Collect metrics.
            # NOTE(review): the recorded run output of this notebook shows no "[AGG]"
            # line, which suggests fit_and_evaluate_fold may not write metrics.json
            # at all — confirm against ScikitLearnModel.
            metrics = load_metrics_if_exists(save_dir)
            if metrics is None:
                n_missing += 1
            else:
                metrics_list.append(metrics)

        if n_missing:
            log(f"[WARN] {dataset}, N={n_true}: {n_missing}/{len(sets)} sets had no readable metrics.json")

        # Aggregate metrics across K sets
        if metrics_list:
            agg = _aggregate_metrics(metrics_list)
            (save_root_for_n / "aggregate_metrics.json").write_text(
                json.dumps(agg, indent=2), encoding="utf-8"
            )
            log(f"[AGG] {dataset}, N={n_true}: aggregate_metrics.json saved")
        else:
            log(f"[WARN] {dataset}, N={n_true}: no metrics collected; aggregate_metrics.json not written")

In [1]:
# Seed for the global `random` generator used when drawing descriptor subsets.
# With a fixed seed a re-run draws the same subsets, provided nothing else
# consumes the global `random` stream differently between runs.
RANDOM_SEED = 42

if __name__ == "__main__":
    random.seed(RANDOM_SEED)
    log(f"[INFO] Total RDKit descriptors fixed at: {TOTAL}")
    log(f"[INFO] Random seed: {RANDOM_SEED}")

    for dataset in DATASETS:
        for model_info in MODEL_LIST:
            # Blank separator line first, so the timestamp prefix stays on the
            # same line as the section header instead of dangling on its own.
            print()
            log(f"=== Dataset: {dataset} | Model: {model_info['repo']}/{model_info['name']} ===")
            run_dataset_with_plan(dataset, model_info, RANDOM_PLAN, FLAG_ROOT)

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/HDD1/bbq9088/miniconda3/envs/ChEmPrompt1027/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


[2025-10-29 07:11:29] [INFO] Total RDKit descriptors fixed at: 236
[2025-10-29 07:11:29] 
=== Dataset: FreeSolv | Model: CohereLabs/aya-expanse-8b ===
(642, 2)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Filtering descriptors: 100%|███████████████████████████████████████████████████████████████| 642/642 [00:04<00:00, 150.27it/s]


[2025-10-29 07:11:39] [INFO] FreeSolv: filtered descriptors = 120
[2025-10-29 07:11:39] Embeddings ▶ dataset=FreeSolv, model=CohereLabs/aya-expanse-8b, N=10, set=1/1


Embedding SMILES: 100%|█████████████████████████████████████████████████████████████████████| 642/642 [00:38<00:00, 16.68it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_10/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_10/set1/regression_CombinedResults.csv
[2025-10-29 07:12:18] Embeddings ▶ dataset=FreeSolv, model=CohereLabs/aya-expanse-8b, N=30, set=1/1


Embedding SMILES: 100%|█████████████████████████████████████████████████████████████████████| 642/642 [00:48<00:00, 13.32it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_30/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_30/set1/regression_CombinedResults.csv
[2025-10-29 07:13:07] Embeddings ▶ dataset=FreeSolv, model=CohereLabs/aya-expanse-8b, N=50, set=1/1


Embedding SMILES: 100%|█████████████████████████████████████████████████████████████████████| 642/642 [00:57<00:00, 11.24it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_50/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_50/set1/regression_CombinedResults.csv
[2025-10-29 07:14:04] Embeddings ▶ dataset=FreeSolv, model=CohereLabs/aya-expanse-8b, N=70, set=1/1


Embedding SMILES: 100%|█████████████████████████████████████████████████████████████████████| 642/642 [01:07<00:00,  9.53it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_70/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_70/set1/regression_CombinedResults.csv
[2025-10-29 07:15:12] Embeddings ▶ dataset=FreeSolv, model=CohereLabs/aya-expanse-8b, N=90, set=1/1


Embedding SMILES: 100%|█████████████████████████████████████████████████████████████████████| 642/642 [01:18<00:00,  8.16it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_90/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/FreeSolv/aya-expanse-8b/N_90/set1/regression_CombinedResults.csv
[2025-10-29 07:16:31] 
=== Dataset: ESOL | Model: CohereLabs/aya-expanse-8b ===
(1128, 2)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Filtering descriptors: 100%|█████████████████████████████████████████████████████████████| 1128/1128 [00:09<00:00, 120.67it/s]


[2025-10-29 07:16:45] [INFO] ESOL: filtered descriptors = 133
[2025-10-29 07:16:45] Embeddings ▶ dataset=ESOL, model=CohereLabs/aya-expanse-8b, N=10, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1128/1128 [01:06<00:00, 16.97it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_10/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_10/set1/regression_CombinedResults.csv
[2025-10-29 07:17:52] Embeddings ▶ dataset=ESOL, model=CohereLabs/aya-expanse-8b, N=30, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1128/1128 [01:24<00:00, 13.35it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_30/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_30/set1/regression_CombinedResults.csv
[2025-10-29 07:19:17] Embeddings ▶ dataset=ESOL, model=CohereLabs/aya-expanse-8b, N=50, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1128/1128 [01:45<00:00, 10.69it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_50/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_50/set1/regression_CombinedResults.csv
[2025-10-29 07:21:04] Embeddings ▶ dataset=ESOL, model=CohereLabs/aya-expanse-8b, N=70, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1128/1128 [02:00<00:00,  9.33it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_70/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_70/set1/regression_CombinedResults.csv
[2025-10-29 07:23:06] Embeddings ▶ dataset=ESOL, model=CohereLabs/aya-expanse-8b, N=90, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1128/1128 [02:23<00:00,  7.86it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_90/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/ESOL/aya-expanse-8b/N_90/set1/regression_CombinedResults.csv
[2025-10-29 07:25:30] 
=== Dataset: Lipo | Model: CohereLabs/aya-expanse-8b ===
(1400, 2)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Filtering descriptors: 100%|██████████████████████████████████████████████████████████████| 1400/1400 [00:19<00:00, 71.13it/s]


[2025-10-29 07:25:54] [INFO] Lipo: filtered descriptors = 133
[2025-10-29 07:25:54] Embeddings ▶ dataset=Lipo, model=CohereLabs/aya-expanse-8b, N=10, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1400/1400 [01:26<00:00, 16.10it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_10/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_10/set1/regression_CombinedResults.csv
[2025-10-29 07:27:22] Embeddings ▶ dataset=Lipo, model=CohereLabs/aya-expanse-8b, N=30, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1400/1400 [01:51<00:00, 12.57it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_30/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_30/set1/regression_CombinedResults.csv
[2025-10-29 07:29:15] Embeddings ▶ dataset=Lipo, model=CohereLabs/aya-expanse-8b, N=50, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1400/1400 [05:39<00:00,  4.12it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_50/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_50/set1/regression_CombinedResults.csv
[2025-10-29 07:34:56] Embeddings ▶ dataset=Lipo, model=CohereLabs/aya-expanse-8b, N=70, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1400/1400 [11:49<00:00,  1.97it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_70/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_70/set1/regression_CombinedResults.csv
[2025-10-29 07:46:47] Embeddings ▶ dataset=Lipo, model=CohereLabs/aya-expanse-8b, N=90, set=1/1


Embedding SMILES: 100%|███████████████████████████████████████████████████████████████████| 1400/1400 [03:39<00:00,  6.38it/s]


Predictions saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_90/set1/regression_Predictions.csv
Combined results saved to /HDD1/bbq9088/ChEmPrompt_Lab/5.2 Existence of Optimal Subset of Descriptors/result/rdkit_random/Lipo/aya-expanse-8b/N_90/set1/regression_CombinedResults.csv
