In [None]:
import os, sys

# Make the parent of the current working directory importable (presumably the
# directory that contains the `Chemprompt` package imported further down —
# confirm against the repository layout).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guard the append so that re-running this cell does not keep stacking
# duplicate entries onto sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [None]:
import gc
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

In [None]:
from Chemprompt.data.data_loader import DataLoader
from Chemprompt.embeddings.full_descriptors_embedding import LLMModel
from Chemprompt.models.sklearn_model import ScikitLearnModel

In [None]:
# GA-Optimized Best Full Flags (Full Descriptors)
#
# One bit string per cross-validation fold, in fold order (entry 0 is used for
# fold 1, entry 1 for fold 2, ...). Each character is a keep/drop switch for the
# descriptor at the same position: "1" keeps it, anything else drops it.
# The K-fold loop asserts that each string's length equals the number of
# descriptors left after filtering, so these strings are only valid for the
# dataset/descriptor set they were optimized on.
flag_list = [
    "0000101101000000000000000001101101000000001000000000000000001000000000000000000011000000000001101000001010001000001100000010000001000001000001100", # fold 1
    "0000000110000000000000100001101001000000000100000000000000001010000000000100000000100000000000100000000100000110001111000010000011100010000000101", # fold 2
    "0000100011000000000000100001110111000000100101110000000000001000000000000010000010000000000000100000000110001010001011000010000001100010000011000", # fold 3
    "0000101101000000000000100001010100000000100001110000100000000000000000000010000010000000000000101000001010000100001100000010000011000011100011100", # fold 4
    "0000100110000000000000100001101101000000000100010000000000001000000000000000000001000000000000100000000110000000001001000000000010100011000010100" # fold 5
]

In [None]:
# ---- Experiment configuration ----
# Device string handed to the embedding model.
device = "cuda:0"

# The model is addressed as "<model_repo>/<model_name>" when it is loaded;
# `model_name` is also used in the result directory name.
model_name = "aya-expanse-8b"
model_repo = "CohereLabs"

# Name of the dataset passed to the data loader (also part of the result path).
dataset_choice = "Caco2_Wang"

In [None]:
# Data Loading
#
# Load the inputs `x` and targets `y` for the configured dataset.
# (`x` is presumably a sequence of SMILES strings — confirm against DataLoader.)
loader = DataLoader()
x, y = loader.load_dataset(dataset_choice)

# Targets as a column vector of shape (n_samples, 1).
y = np.array(y).reshape(-1, 1)

# The K-fold loop indexes `x` and `y` with the same positional indices, so the
# two must line up one-to-one. This also catches a multi-column `y`, which
# reshape(-1, 1) would otherwise silently flatten into extra rows.
assert len(x) == y.shape[0], (
    f"Input/target length mismatch: {len(x)} inputs vs {y.shape[0]} targets"
)

In [None]:
# Precision modes to run; each value is forwarded as the model's `dtype`
# ("half" is reported as fp16, anything else as fp32).
precisions = ["half", "full"]

# Shuffled 5-fold splitter with a fixed seed, so fold membership is the same on
# every run. The per-fold flags in `flag_list` are looked up by fold number.
kf = KFold(shuffle=True, random_state=42, n_splits=5)

# Result accumulator (not populated by the cells visible here).
results = list()

In [1]:
# ==========================================
# Precision Loop (FP16 vs FP32)
# ==========================================
# For every precision mode:
#   1. load the LLM in that precision,
#   2. filter the descriptor set on the full dataset,
#   3. run K-fold CV, using the GA-optimized descriptor flag of each fold to
#      build embeddings and fit/evaluate a regression model.
# Per-fold artifacts are written under ./result/<model>_<dataset>_<mode>_<timestamp>/fold<k>/.

# Fold count comes from the splitter instead of being hard-coded, and there
# must be exactly one GA flag per fold.
n_splits = kf.get_n_splits()
assert len(flag_list) == n_splits, (
    f"Expected one GA flag per fold: got {len(flag_list)} flags for {n_splits} folds"
)

for precision in precisions:
    model_type = "fp16" if precision == "half" else "fp32"
    print("\n==============================")
    print(f"Running precision mode: {model_type}")
    print("==============================")

    current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
    save_base_dir = f"./result/{model_name}_{dataset_choice}_{model_type}_{current_time}"
    os.makedirs(save_base_dir, exist_ok=True)

    # Drop the model from the previous precision mode (or from an earlier run
    # of this cell) BEFORE loading the next one. Otherwise both models are
    # alive at the same time while the new one is being constructed, which can
    # exhaust GPU memory when switching from fp16 to fp32.
    llm_model = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    llm_model = LLMModel(
        model_repo=f"{model_repo}/{model_name}",
        dtype=precision,
        device=device
    )

    # Filter descriptors globally (on the full dataset, before any split)
    llm_model._filter_descriptors(x)
    total_features = len(llm_model.property_names)
    # NOTE(review): `total_features` is the descriptor count after filtering,
    # not the number of GA-selected descriptors (that is printed per fold).
    select_method = f"GA-selected descriptors (fixed {total_features}D subset)"
    print(select_method)

    # ==========================================
    # K-Fold Loop
    # ==========================================
    for fold_idx, (train_idx, test_idx) in enumerate(kf.split(x), 1):
        print(f"\n===== Fold {fold_idx}/{n_splits} =====")

        x_train = [x[i] for i in train_idx]
        x_test  = [x[i] for i in test_idx]
        y_train = y[train_idx]
        y_test  = y[test_idx]

        # Restore GA-optimized flag: "1" keeps the descriptor at that position.
        flag_str = flag_list[fold_idx - 1]
        # Reject typos/whitespace instead of silently treating them as "drop".
        assert set(flag_str) <= {"0", "1"}, (
            f"Fold {fold_idx}: flag contains characters other than '0'/'1'"
        )
        full_flag = np.array([c == "1" for c in flag_str])
        assert len(full_flag) == total_features, (
            f"Flag length {len(full_flag)} != descriptor count {total_features}"
        )

        print(f"[INFO] Fold {fold_idx}: Loaded flag of length {len(full_flag)}")

        # ==========================================
        # Selected Descriptor Names
        # ==========================================
        # Pair each kept position with its RDKit descriptor key and the name
        # used in the prompt, and record them for this fold.
        rdkit_names = list(llm_model._descriptor_funcs.keys())
        prompt_names = llm_model.property_names

        assert len(rdkit_names) == len(prompt_names) == len(full_flag), \
            "Descriptor name length mismatch"

        selected_rows = [
            {
                "index": i,
                "rdkit_name": rname,
                "prompt_name": pname
            }
            for i, (keep, rname, pname) in enumerate(
                zip(full_flag, rdkit_names, prompt_names)
            )
            if keep
        ]

        print(f"[INFO] Fold {fold_idx}: Selected {len(selected_rows)} descriptors")

        fold_dir = os.path.join(save_base_dir, f"fold{fold_idx}")
        os.makedirs(fold_dir, exist_ok=True)

        pd.DataFrame(selected_rows).to_csv(
            os.path.join(fold_dir, "selected_descriptors_detailed.csv"),
            index=False
        )

        # ==========================================
        # Step 1. Generate LLM Embeddings
        # ==========================================
        X_train_emb = llm_model.get_embeddings(x_train, full_flag)
        X_test_emb  = llm_model.get_embeddings(x_test, full_flag)

        # ==========================================
        # Step 2. Train Regression Model
        # ==========================================
        # Model outputs are saved into this fold's directory.
        model = ScikitLearnModel("regression", save_dir=fold_dir)
        model.fit_and_evaluate(x_train, X_train_emb, y_train, X_test_emb, y_test)
        print("Model training completed.")


print("\nAll experiments completed successfully!")

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/HDD1/bbq9088/miniconda3/envs/Chemprompt/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


(642, 2)

Running precision mode: fp16


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Filtering descriptors: 100%|██████████████████████████████████████| 642/642 [00:04<00:00, 153.83it/s]


GA-selected descriptors (fixed 120D subset)

===== Fold 1/5 =====
[INFO] Fold 1: Loaded flag of length 120


Embedding SMILES: 100%|████████████████████████████████████████████| 513/513 [00:37<00:00, 13.80it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 129/129 [00:09<00:00, 14.03it/s]


Predictions saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold1/regression_Predictions.csv
Combined results saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold1/regression_CombinedResults.csv

[RESULTS]
rmse: 0.328
r2: 0.892
pearson: 0.945
spearman: 0.938
Model training completed.

===== Fold 2/5 =====
[INFO] Fold 2: Loaded flag of length 120


Embedding SMILES: 100%|████████████████████████████████████████████| 513/513 [00:34<00:00, 14.82it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 129/129 [00:08<00:00, 14.71it/s]


Predictions saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold2/regression_Predictions.csv
Combined results saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold2/regression_CombinedResults.csv

[RESULTS]
rmse: 0.316
r2: 0.900
pearson: 0.953
spearman: 0.943
Model training completed.

===== Fold 3/5 =====
[INFO] Fold 3: Loaded flag of length 120


Embedding SMILES: 100%|████████████████████████████████████████████| 514/514 [00:35<00:00, 14.37it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 128/128 [00:08<00:00, 14.37it/s]


Predictions saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold3/regression_Predictions.csv
Combined results saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold3/regression_CombinedResults.csv

[RESULTS]
rmse: 0.341
r2: 0.884
pearson: 0.943
spearman: 0.942
Model training completed.

===== Fold 4/5 =====
[INFO] Fold 4: Loaded flag of length 120


Embedding SMILES: 100%|████████████████████████████████████████████| 514/514 [00:36<00:00, 13.94it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 128/128 [00:09<00:00, 13.94it/s]


Predictions saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold4/regression_Predictions.csv
Combined results saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold4/regression_CombinedResults.csv

[RESULTS]
rmse: 0.325
r2: 0.894
pearson: 0.946
spearman: 0.945
Model training completed.

===== Fold 5/5 =====
[INFO] Fold 5: Loaded flag of length 120


Embedding SMILES: 100%|████████████████████████████████████████████| 514/514 [00:34<00:00, 14.76it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 128/128 [00:08<00:00, 14.76it/s]


Predictions saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold5/regression_Predictions.csv
Combined results saved to ./result/aya-expanse-8b_FreeSolv_fp16_20251031_055430/fold5/regression_CombinedResults.csv

[RESULTS]
rmse: 0.425
r2: 0.819
pearson: 0.908
spearman: 0.919
Model training completed.

Running precision mode: fp32


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 1 has a total capacity of 79.19 GiB of which 172.94 MiB is free. Process 106585 has 49.69 GiB memory in use. Including non-PyTorch memory, this process has 29.31 GiB memory in use. Of the allocated memory 28.41 GiB is allocated by PyTorch, and 131.15 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)