In [None]:
import os, sys

# Make the parent of the current working directory importable; the project
# imports in a later cell (`from Chemprompt...`) presumably rely on this when the
# notebook is run from a sub-directory of the repository — confirm.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path a second time.
_parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import pearsonr, spearmanr

In [None]:
from Chemprompt.data.data_loader import DataLoader
from Chemprompt.embeddings.full_descriptors_embedding import LLMModel
from Chemprompt.models.sklearn_model import ScikitLearnModel
from Chemprompt.genetic.genetic import genetic
from Chemprompt.feature_selection.descriptor_selection import descriptor_selection

In [None]:
# Experiment Configuration
# Dataset name: passed to DataLoader.load_dataset() and embedded in the
# result/output path names built further down.
dataset_choice = "FreeSolv"
# The two parts are joined as f"{model_repo}/{model_name}" to form the model
# identifier handed to LLMModel; model_name alone is also used in result_path.
model_repo = "CohereLabs"
model_name = "aya-expanse-8b"
# Device string forwarded to LLMModel.
device = "cuda:0"

In [None]:
# Genetic Algorithm Parameters
# num_generations, pop_size, top_k and mutation_rate are forwarded unchanged to
# genetic(...) inside the cross-validation loop.
# NOTE(review): the saved cell output shows "Generation 1/2" and lists three
# individuals per generation, which does not match num_generations / pop_size
# below — the stored output appears to come from a run with different settings;
# confirm before quoting the logged metrics.
num_generations = 50
pop_size = 40
top_k = 5
mutation_rate = 0.05
# Passed as `k` to descriptor_selection.fit_feature_selector().
K = 50
# Passed as `method` to fit_feature_selector() and embedded in the GA dataset_name.
mode = "f_regression"

In [None]:
# Cross-Validation Settings
# Folds are numbered from 1 in the loop; any fold outside the inclusive window
# [start_fold, end_fold] is skipped, so a subset of folds can be run.
start_fold = 1
end_fold = 5
# Number of splits given to KFold.
n_splits = 5

In [None]:
# Ensure the summary directory exists before any path pointing into it is built.
os.makedirs("./result", exist_ok=True)

# Run timestamp (YYYYMMDD_HHMMSS); it is also reused for the per-fold output
# directories created in the cross-validation loop.
current_time = datetime.strftime(datetime.now(), "%Y%m%d_%H%M%S")

# Summary file name for this model/dataset/run combination.
result_path = (
    f"./result/{model_name}_{dataset_choice}_{current_time}_summary.txt"
)

In [None]:
# Step 1. Load LLM Model
# Instantiates the project's LLMModel wrapper from the "<model_repo>/<model_name>"
# identifier and forwards the configured device string.
# NOTE(review): dtype="half" presumably selects half-precision weights —
# confirm against the LLMModel implementation.
print("[INFO] Loading LLM model...")
llm_model = LLMModel(
    model_repo=f"{model_repo}/{model_name}",
    dtype="half",
    device=device
)

In [None]:
# Step 2. Load Dataset
# The loader returns the model inputs `x` and the targets `y`.
# `x` is later passed as `smiles_list` to genetic(), so it is presumably a
# sequence of SMILES strings — confirm against DataLoader.
print(f"[INFO] Loading dataset: {dataset_choice}")
loader = DataLoader()
x, y = loader.load_dataset(dataset_choice)
# Force the targets into a 2-D column array of shape (n_samples, 1).
y = np.array(y).reshape(-1, 1)

In [None]:
# Step 3. Global Descriptor Filtering (One-time)
# Runs once on the complete input list, before any train/test split is made;
# only `x` is passed, so the targets are not involved in this step.
# NOTE(review): this calls an underscore-prefixed (private by convention) method
# of LLMModel, and the filtering sees molecules that later end up in test folds —
# confirm both are intended.
print("[INFO] Performing global descriptor filtering...")
llm_model._filter_descriptors(x)
# property_names is read afterwards to report how many descriptors remain.
print(f"[INFO] {len(llm_model.property_names)} descriptors retained after filtering.")

In [1]:
# Step 4. K-Fold Cross-Validation
# For every executed fold: fit the descriptor selector on the training split only,
# run the genetic search restricted to the selected descriptors, embed the train
# and test inputs with the flag returned by the search, then fit and evaluate the
# regression model. One record per executed fold is appended to `results`
# (previously the list was created but never filled).
# NOTE(review): `result_path` is defined in an earlier cell but nothing in this
# cell writes to it — confirm whether a summary file is still expected.
results = []
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(x, y), 1):
    # Folds are 1-based; skip those outside the configured [start_fold, end_fold] window.
    if fold < start_fold or fold > end_fold:
        continue

    print(f"\n===== Fold {fold}/{n_splits} =====")

    # Split data
    x_train = [x[i] for i in train_idx]
    x_test = [x[i] for i in test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Step 5. Feature Selection (Train Set Only)
    print("[INFO] Running feature selection on training data...")

    # Get descriptor values directly from llm_model (SMILES only)
    X_train = llm_model.get_descriptor_values(x_train)
    X_train_df = pd.DataFrame(X_train, columns=llm_model.property_names)

    # Fit selector on training data
    selector = descriptor_selection()
    selector.fit_feature_selector(X_train_df, y_train, k=K, method=mode)

    # Map selected columns to descriptor indices
    selected_indices = selector.selected_indices
    print(f"[INFO] Selected {len(selected_indices)} features for Fold {fold}")

    # Sanity check: the two counts should match if no index was selected twice.
    print(f"unique selected_indices: {len(set(selected_indices))}, total: {len(selected_indices)}")

    # Step 6. Genetic Algorithm
    ga = genetic(
        smiles_list=x_train,
        y=y_train,
        llm_model=llm_model,
        dataset_name=f"{dataset_choice}_fold_{fold}_GA_{mode}",
        selected_indices=selected_indices,
        num_generations=num_generations,
        pop_size=pop_size,
        top_k=top_k,
        mutation_rate=mutation_rate,
        save_predictions=True
    )
    # run() returns two flags; only full_flag is consumed by get_embeddings below.
    best_flag, full_flag = ga.run()

    # Step 7. LLM Embedding Generation
    X_train_emb = llm_model.get_embeddings(x_train, full_flag)
    X_test_emb = llm_model.get_embeddings(x_test, full_flag)

    # Step 8. Regression Model Training & Evaluation
    save_dir = f"./ga_result/{dataset_choice}_fold_{fold}_{current_time}"
    os.makedirs(save_dir, exist_ok=True)

    model = ScikitLearnModel("regression", save_dir=save_dir)
    # NOTE(review): data_list receives the *training* descriptor values — confirm
    # this is what ScikitLearnModel.fit_and_evaluate expects.
    fit_output = model.fit_and_evaluate(
        data_list=X_train,
        train_x=X_train_emb,
        train_y=y_train,
        test_x=X_test_emb,
        test_y=y_test,
    )

    # Keep the per-fold outcome instead of discarding it. The return value of
    # fit_and_evaluate is stored as-is; it may be None if that method only
    # prints/saves its metrics — TODO confirm.
    results.append({
        "fold": fold,
        "best_flag": best_flag,
        "full_flag": full_flag,
        "save_dir": save_dir,
        "fit_output": fit_output,
    })

print("all done.")

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/HDD1/bbq9088/miniconda3/envs/Chemprompt/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'
Skipped loading some PyTorch models, missing a dependency. No module named 'tensorflow'


[INFO] Loading LLM model...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

[INFO] Loading dataset: FreeSolv
(642, 2)
[INFO] Performing global descriptor filtering...


Filtering descriptors: 100%|██████████████████████████████████████| 642/642 [00:04<00:00, 153.16it/s]


[INFO] 120 descriptors retained after filtering.

===== Fold 1/5 =====
[INFO] Running feature selection on training data...
[Feature Selection] 50 features selected using f_regression.
[INFO] Selected 50 features for Fold 1
unique selected_indices: 50, total: 50

=== Generation 1/2 ===


Embedding flag 01101000001011000000001000010000000000001000001100000000001010100110000000010001000000
Embedding flag 11000000000111001000001000010001000000000000000000000000000010001110100010000001000000
Embedding flag 10000000000100001000001001000000000000000000001000000000001000000110000000010001100000


Ind 1: R²=0.8574 | flag=01110101101010011100011101100110010100111111010101
Ind 2: R²=0.8483 | flag=11000011111010100000001011111010010011000111100111
Ind 3: R²=0.8474 | flag=10000010011100001000010001100111011110010001100010

=== Generation 2/2 ===


Embedding flag 11000000000111001000001000010001000000000000000000000000000010001110100010000001000000
Embedding flag 01101000001011000000001000010000000000001000001100000000001010100110000000010001000000
Embedding flag 01101000001011000000001000010000000000001000000000000000000010001110100010000001000000


Ind 1: R²=0.8483 | flag=11000011111010100000001011111010010011000111100111
Ind 2: R²=0.8574 | flag=01110101101010011100011101100110010100111111010101
Ind 3: R²=0.8575 | flag=01110101101010010000001011111010010011000111100111

Best overall R²=0.8575 | flag=01110101101010010000001011111010010011000111100111


Embedding SMILES: 100%|████████████████████████████████████████████| 513/513 [00:37<00:00, 13.55it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 129/129 [00:09<00:00, 13.67it/s]


Predictions saved to ./ga_result/FreeSolv_fold_1_20251031_052757/regression_Predictions.csv
Combined results saved to ./ga_result/FreeSolv_fold_1_20251031_052757/regression_CombinedResults.csv

[RESULTS]
rmse: 0.328
r2: 0.892
pearson: 0.945
spearman: 0.938

===== Fold 2/5 =====
[INFO] Running feature selection on training data...
[Feature Selection] 50 features selected using f_regression.
[INFO] Selected 50 features for Fold 2
unique selected_indices: 50, total: 50

=== Generation 1/2 ===


Embedding flag 01001000011111001000000000000101000000001000000100000000100010100110000010000001001000
Embedding flag 00001000000100001000000001010100000000001000001000000000101000100000100010000001000000
Embedding flag 11001000001101000000000001011000000000000000000100001000000010100000100000000000001000


Ind 1: R²=0.7794 | flag=01011111110000111010010110110111100110110110110001
Ind 2: R²=0.8053 | flag=00010010010110101100011010001110101111000010100100
Ind 3: R²=0.7896 | flag=11010110100111000011000110001001011100111110011000

=== Generation 2/2 ===


Embedding flag 11001000001101000000000001011000000000000000000100001000000010100000100000000000001000
Embedding flag 00001000000100001000000001010100000000001000001000000000101000100000100010000001000000
Embedding flag 11001000001101000000000001010100000000001000001000000000101000100000100010000001000000


Ind 1: R²=0.7896 | flag=11010110100111000011000110001001011100111110011000
Ind 2: R²=0.8053 | flag=00010010010110101100011010001110101111000010100100
Ind 3: R²=0.7682 | flag=11010110100110101100011010001110101111000010100100

Best overall R²=0.8053 | flag=00010010010110101100011010001110101111000010100100


Embedding SMILES: 100%|████████████████████████████████████████████| 513/513 [00:35<00:00, 14.51it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 129/129 [00:08<00:00, 14.67it/s]


Predictions saved to ./ga_result/FreeSolv_fold_2_20251031_052757/regression_Predictions.csv
Combined results saved to ./ga_result/FreeSolv_fold_2_20251031_052757/regression_CombinedResults.csv

[RESULTS]
rmse: 0.316
r2: 0.900
pearson: 0.953
spearman: 0.943

===== Fold 3/5 =====
[INFO] Running feature selection on training data...
[Feature Selection] 50 features selected using f_regression.
[INFO] Selected 50 features for Fold 3
unique selected_indices: 50, total: 50

=== Generation 1/2 ===


Embedding flag 10001000010001000000000000001001000000000000000100000000000010100110100000000001100000
Embedding flag 01101000001000000000001000010001000000001000000000000100100000101100100010000001000000
Embedding flag 10101000001101000000000000001001000000001000001000000100101010101000000000000000100000


Ind 1: R²=0.8429 | flag=10011000100001100100001101110110000011001110011101
Ind 2: R²=0.8533 | flag=01110100001010110001100111011101101001010100010001
Ind 3: R²=0.8288 | flag=10110110100001111001111110000011101010010010111110

=== Generation 2/2 ===


Embedding flag 10001000010001000000000000001001000000000000000100000000000010100110100000000001100000
Embedding flag 01101000001000000000001000010001000000001000000000000100100000101100100010000001000000
Embedding flag 10001000001000000000001000010001000000001000000000000100100000101100100010000001000000


Ind 1: R²=0.8429 | flag=10011000100001100100001101110110000011001110011101
Ind 2: R²=0.8533 | flag=01110100001010110001100111011101101001010100010001
Ind 3: R²=0.8345 | flag=10010100001010110001100111011101101001010100010001

Best overall R²=0.8533 | flag=01110100001010110001100111011101101001010100010001


Embedding SMILES: 100%|████████████████████████████████████████████| 514/514 [00:36<00:00, 14.11it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 128/128 [00:09<00:00, 14.11it/s]


Predictions saved to ./ga_result/FreeSolv_fold_3_20251031_052757/regression_Predictions.csv
Combined results saved to ./ga_result/FreeSolv_fold_3_20251031_052757/regression_CombinedResults.csv

[RESULTS]
rmse: 0.341
r2: 0.884
pearson: 0.943
spearman: 0.942

===== Fold 4/5 =====
[INFO] Running feature selection on training data...
[Feature Selection] 50 features selected using f_regression.
[INFO] Selected 50 features for Fold 4
unique selected_indices: 50, total: 50

=== Generation 1/2 ===


Embedding flag 10101000000100001000000010010000000000001000010100000000101000100000000000000000000000
Embedding flag 11100000000110001000001000011001000000000000000100001000101000000100100000000001100000
Embedding flag 01101000001101000000000001011000000000000000010100001000000010001100000000000001101000


Ind 1: R²=0.8413 | flag=10110010010101001101011010000000100111101010000101
Ind 2: R²=0.8321 | flag=11100011011001110001111000110110001101101010000100
Ind 3: R²=0.8616 | flag=01110110100011100101100101100111001001111010011101

=== Generation 2/2 ===


Embedding flag 10101000000100001000000010010000000000001000010100000000101000100000000000000000000000
Embedding flag 01101000001101000000000001011000000000000000010100001000000010001100000000000001101000
Embedding flag 01101000001101000000000001011000000000000000010100001000000010001100000000000001000000


Ind 1: R²=0.8413 | flag=10110010010101001101011010000000100111101010000101
Ind 2: R²=0.8616 | flag=01110110100011100101100101100111001001111010011101
Ind 3: R²=0.8515 | flag=01110110100011100101100101100100100111101010000101

Best overall R²=0.8616 | flag=01110110100011100101100101100111001001111010011101


Embedding SMILES: 100%|████████████████████████████████████████████| 514/514 [00:37<00:00, 13.83it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 128/128 [00:09<00:00, 13.78it/s]


Predictions saved to ./ga_result/FreeSolv_fold_4_20251031_052757/regression_Predictions.csv
Combined results saved to ./ga_result/FreeSolv_fold_4_20251031_052757/regression_CombinedResults.csv

[RESULTS]
rmse: 0.325
r2: 0.894
pearson: 0.946
spearman: 0.945

===== Fold 5/5 =====
[INFO] Running feature selection on training data...
[Feature Selection] 50 features selected using f_regression.
[INFO] Selected 50 features for Fold 5
unique selected_indices: 50, total: 50

=== Generation 1/2 ===


Embedding flag 00100000011111001000001001000000000000000000000100001100000010100000000010000000000000
Embedding flag 00000000000011000000001001011000000000001000000000000100100000001000100000010000000000
Embedding flag 11000000010101000000000010011000000000000000001100000100100010000000000010000001100000


Ind 1: R²=0.8636 | flag=00111111110100000111001100010000100010011111101110
Ind 2: R²=0.8755 | flag=00000011010111010001100010101000110011011011111111
Ind 3: R²=0.8768 | flag=11010101001011001101101000010110110000011101000001

=== Generation 2/2 ===


Embedding flag 00000000000011000000001001011000000000001000000000000100100000001000100000010000000000
Embedding flag 11000000010101000000000010011000000000000000001100000100100010000000000010000001100000
Embedding flag 00000000000011000000001001011000000000001000000000000100100000001000100000010000000000


Ind 1: R²=0.8755 | flag=00000011010111010001100010101000110011011011111111
Ind 2: R²=0.8768 | flag=11010101001011001101101000010110110000011101000001
Ind 3: R²=0.8632 | flag=00000011010111010001100010101000110010011101000001

Best overall R²=0.8768 | flag=11010101001011001101101000010110110000011101000001


Embedding SMILES: 100%|████████████████████████████████████████████| 514/514 [00:35<00:00, 14.61it/s]
Embedding SMILES: 100%|████████████████████████████████████████████| 128/128 [00:08<00:00, 14.61it/s]


Predictions saved to ./ga_result/FreeSolv_fold_5_20251031_052757/regression_Predictions.csv
Combined results saved to ./ga_result/FreeSolv_fold_5_20251031_052757/regression_CombinedResults.csv

[RESULTS]
rmse: 0.425
r2: 0.819
pearson: 0.908
spearman: 0.919
