In [1]:
pip install scikit-learn xgboost lightgbm catboost joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset you created
df = pd.read_csv("descriptor.csv")
df = df.copy()
df = df[df['Kd'].notnull()]  # Remove NaN
df = df[np.isfinite(df['Kd'])]  # Remove inf/-inf


In [4]:
df.keys()

Index(['SlogP_VSA5', 'BCUT2D_CHGHI', 'SMR_VSA5', 'MinEStateIndex', 'Chi4v',
       'Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy',
       'IsoelectricPoint', 'Kd'],
      dtype='object')

In [5]:
features = ['SlogP_VSA5', 'BCUT2D_CHGHI', 'SMR_VSA5', 'MinEStateIndex', 'Chi4v','Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy','IsoelectricPoint']
target = 'Kd'

X = df[features]
y = df[target]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.22, random_state=42)


In [6]:
# Objective functions
def rf_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [5, 10]),
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def xgb_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', XGBRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            random_state=42, verbosity=0
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def lgb_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LGBMRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def cat_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', CatBoostRegressor(
            iterations=trial.suggest_categorical('iterations', [100, 200]),
            depth=trial.suggest_categorical('depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            verbose=0,
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()


In [7]:
# Train and save best models
objectives = {'rf': rf_objective, 'xgb': xgb_objective, 'lgb': lgb_objective, 'cat': cat_objective}
oof_preds = []
model_paths = {}

In [8]:
for name, obj_func in objectives.items():
    print(f"Running Optuna tuning for {name.upper()}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(obj_func, n_trials=10)

    print(f"Best params for {name}: {study.best_params}")

    if name == 'rf':
        model = RandomForestRegressor(**study.best_params, random_state=42)
    elif name == 'xgb':
        model = XGBRegressor(**study.best_params, random_state=42, verbosity=0)
    elif name == 'lgb':
        model = LGBMRegressor(**study.best_params, random_state=42)
    elif name == 'cat':
        model = CatBoostRegressor(**study.best_params, verbose=0, random_state=42)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    print(f"{name.upper()} MSE: {mse:.4f}")

    joblib.dump(pipeline, f"{name}_optuna_model.pkl")
    model_paths[name] = f"{name}_optuna_model.pkl"
    oof_preds.append(preds.reshape(-1, 1))


[I 2025-06-02 15:00:26,118] A new study created in memory with name: no-name-0e1b5b47-f22c-4013-bd6d-6d9ca4d99927


Running Optuna tuning for RF...


[I 2025-06-02 15:01:01,093] Trial 0 finished with value: -1.9086487699046304 and parameters: {'n_estimators': 200, 'max_depth': 5}. Best is trial 0 with value: -1.9086487699046304.
[I 2025-06-02 15:02:07,541] Trial 1 finished with value: -1.5811556942033205 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 1 with value: -1.5811556942033205.
[I 2025-06-02 15:02:53,983] Trial 2 finished with value: -1.9086487699046304 and parameters: {'n_estimators': 200, 'max_depth': 5}. Best is trial 1 with value: -1.5811556942033205.
[I 2025-06-02 15:03:24,508] Trial 3 finished with value: -1.5806399063875343 and parameters: {'n_estimators': 100, 'max_depth': 10}. Best is trial 3 with value: -1.5806399063875343.
[I 2025-06-02 15:04:29,568] Trial 4 finished with value: -1.5811556942033205 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 3 with value: -1.5806399063875343.
[I 2025-06-02 15:05:29,745] Trial 5 finished with value: -1.5811556942033205 and parameters:

Best params for rf: {'n_estimators': 100, 'max_depth': 10}


[I 2025-06-02 15:08:14,533] A new study created in memory with name: no-name-0d7ff106-4677-4ff8-9108-3a33b32f2725


RF MSE: 1.4792
Running Optuna tuning for XGB...


[I 2025-06-02 15:08:15,084] Trial 0 finished with value: -1.6961506368526302 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6961506368526302.
[I 2025-06-02 15:08:15,605] Trial 1 finished with value: -1.6961506368526302 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6961506368526302.
[I 2025-06-02 15:08:16,301] Trial 2 finished with value: -1.5457088639391834 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 2 with value: -1.5457088639391834.
[I 2025-06-02 15:08:17,022] Trial 3 finished with value: -1.5457088639391834 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 2 with value: -1.5457088639391834.
[I 2025-06-02 15:08:17,540] Trial 4 finished with value: -1.5859348274855585 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 2 with value: -1.5

Best params for xgb: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}


[I 2025-06-02 15:08:20,601] A new study created in memory with name: no-name-1e882b16-2f63-4810-987f-4802ef0923d9


XGB MSE: 1.4448
Running Optuna tuning for LGB...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000971 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000994 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104


[I 2025-06-02 15:08:21,223] Trial 0 finished with value: -1.6965993887740909 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6965993887740909.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000875 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000923 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104






[I 2025-06-02 15:08:21,982] Trial 1 finished with value: -1.5362289498346078 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000835 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000827 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001002 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104


[I 2025-06-02 15:08:22,475] Trial 2 finished with value: -1.5703947208826694 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352


[I 2025-06-02 15:08:22,942] Trial 3 finished with value: -1.6448692666493872 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000978 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000790 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000820 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104


[I 2025-06-02 15:08:23,415] Trial 4 finished with value: -1.6965993887740909 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000825 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000959 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352


[I 2025-06-02 15:08:23,817] Trial 5 finished with value: -1.6448692666493872 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000766 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104


[I 2025-06-02 15:08:24,284] Trial 6 finished with value: -1.6005279018116612 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000826 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000946 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000928 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104


[I 2025-06-02 15:08:24,829] Trial 7 finished with value: -1.6448692666493872 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000765 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352


[I 2025-06-02 15:08:25,115] Trial 8 finished with value: -1.6861319527637573 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000767 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000794 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28528, number of used features: 10
[LightGBM] [Info] Start training from score 6.061141




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.055352


[I 2025-06-02 15:08:25,501] Trial 9 finished with value: -1.6448692666493872 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 1 with value: -1.5362289498346078.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000799 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 28529, number of used features: 10
[LightGBM] [Info] Start training from score 6.051104
Best params for lgb: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2549
[LightGBM] [Info] Number of data points in the train set: 42793, number of used features: 10
[LightGBM] [Info] Start training from score 6.055866


[I 2025-06-02 15:08:25,760] A new study created in memory with name: no-name-5212b447-865b-4569-a88c-906982143fd4


LGB MSE: 1.4334
Running Optuna tuning for CAT...


[I 2025-06-02 15:08:28,919] Trial 0 finished with value: -1.6928998348913955 and parameters: {'iterations': 200, 'depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6928998348913955.
[I 2025-06-02 15:08:30,893] Trial 1 finished with value: -1.697847939382976 and parameters: {'iterations': 200, 'depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.6928998348913955.
[I 2025-06-02 15:08:32,426] Trial 2 finished with value: -1.802314524798379 and parameters: {'iterations': 100, 'depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6928998348913955.
[I 2025-06-02 15:08:34,393] Trial 3 finished with value: -1.697847939382976 and parameters: {'iterations': 200, 'depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.6928998348913955.
[I 2025-06-02 15:08:37,364] Trial 4 finished with value: -1.6928998348913955 and parameters: {'iterations': 200, 'depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6928998348913955.
[I 2025-06-02 1

Best params for cat: {'iterations': 200, 'depth': 6, 'learning_rate': 0.1}
CAT MSE: 1.4862


In [12]:
# Train meta-model (stacking)
X_meta = np.hstack(oof_preds)
meta_model = LinearRegression()
meta_model.fit(X_meta, y_val)
meta_preds = meta_model.predict(X_meta)
meta_mse = mean_squared_error(y_val, meta_preds)
print(f"Meta-model (stacking) MSE: {meta_mse:.4f}")


Meta-model (stacking) MSE: 1.4165


In [13]:
joblib.dump(meta_model, "meta_model_optuna.pkl")
np.save("stacking_features_optuna.npy", X_meta)