In [1]:
pip install scikit-learn xgboost lightgbm catboost joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset you created
df = pd.read_csv("descriptor_based_dataset.csv")
df = df.copy()
df = df[df['Kd'].notnull()]  # Remove NaN
df = df[np.isfinite(df['Kd'])]  # Remove inf/-inf

features = ['MolWt', 'LogP', 'RotatableBonds', 'HDonors', 'HAcceptors',
            'Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy']
target = 'Kd'

X = df[features]
y = df[target]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)


In [4]:
# Objective functions
def rf_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [5, 10]),
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def xgb_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', XGBRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            random_state=42, verbosity=0
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def lgb_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LGBMRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def cat_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', CatBoostRegressor(
            iterations=trial.suggest_categorical('iterations', [100, 200]),
            depth=trial.suggest_categorical('depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            verbose=0,
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()


In [5]:
# Train and save best models
objectives = {'rf': rf_objective, 'xgb': xgb_objective, 'lgb': lgb_objective, 'cat': cat_objective}
oof_preds = []
model_paths = {}

In [6]:
for name, obj_func in objectives.items():
    print(f"Running Optuna tuning for {name.upper()}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(obj_func, n_trials=10)

    print(f"Best params for {name}: {study.best_params}")

    if name == 'rf':
        model = RandomForestRegressor(**study.best_params, random_state=42)
    elif name == 'xgb':
        model = XGBRegressor(**study.best_params, random_state=42, verbosity=0)
    elif name == 'lgb':
        model = LGBMRegressor(**study.best_params, random_state=42)
    elif name == 'cat':
        model = CatBoostRegressor(**study.best_params, verbose=0, random_state=42)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    print(f"{name.upper()} MSE: {mse:.4f}")

    joblib.dump(pipeline, f"{name}_optuna_model.pkl")
    model_paths[name] = f"{name}_optuna_model.pkl"
    oof_preds.append(preds.reshape(-1, 1))


[I 2025-05-06 16:51:34,255] A new study created in memory with name: no-name-3de8c7e7-c593-4d67-bb6b-609b5800945d


Running Optuna tuning for RF...


[I 2025-05-06 16:52:16,367] Trial 0 finished with value: -1.4740248792908932 and parameters: {'n_estimators': 100, 'max_depth': 10}. Best is trial 0 with value: -1.4740248792908932.
[I 2025-05-06 16:52:33,790] Trial 1 finished with value: -1.9715577895629428 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: -1.4740248792908932.
[I 2025-05-06 16:52:48,901] Trial 2 finished with value: -1.9715577895629428 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: -1.4740248792908932.
[I 2025-05-06 16:53:57,439] Trial 3 finished with value: -1.4729298662893395 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 3 with value: -1.4729298662893395.
[I 2025-05-06 16:55:51,740] Trial 4 finished with value: -1.4729298662893395 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 3 with value: -1.4729298662893395.
[I 2025-05-06 16:57:57,008] Trial 5 finished with value: -1.970086570933273 and parameters: 

Best params for rf: {'n_estimators': 200, 'max_depth': 10}
RF MSE: 1.4491


[I 2025-05-06 17:15:05,093] A new study created in memory with name: no-name-e4211f34-34ac-49f8-b406-a9e1a8c0d10e


Running Optuna tuning for XGB...


[I 2025-05-06 17:15:32,966] Trial 0 finished with value: -1.4890219821506332 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.4890219821506332.
[I 2025-05-06 17:16:01,883] Trial 1 finished with value: -1.6035031163435856 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.4890219821506332.
[I 2025-05-06 17:16:14,249] Trial 2 finished with value: -1.7420494732526937 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.4890219821506332.
[I 2025-05-06 17:16:29,984] Trial 3 finished with value: -1.7420494732526937 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.4890219821506332.
[I 2025-05-06 17:16:56,352] Trial 4 finished with value: -1.4650077382567395 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 4 with value: -1

Best params for xgb: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}


[I 2025-05-06 17:18:32,493] A new study created in memory with name: no-name-d43fefdb-9432-40e1-bd77-996476904203


XGB MSE: 1.3551
Running Optuna tuning for LGB...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005231 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.048862 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:18:46,037] Trial 0 finished with value: -1.4651543049058011 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.4651543049058011.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009312 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001305 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.010219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:18:56,454] Trial 1 finished with value: -1.6045432341557913 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.4651543049058011.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001752 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:19:03,426] Trial 2 finished with value: -1.6068737402021334 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.4651543049058011.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001976 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001450 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.069525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:19:17,790] Trial 3 finished with value: -1.3256212882782183 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 3 with value: -1.3256212882782183.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:19:21,418] Trial 4 finished with value: -1.6068737402021334 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 3 with value: -1.3256212882782183.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008886 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.109351 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:20:00,605] Trial 5 finished with value: -1.4221832165403046 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 3 with value: -1.3256212882782183.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.082907 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.106771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:20:21,126] Trial 6 finished with value: -1.415215465860662 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 3 with value: -1.3256212882782183.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.137405 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029673 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001274 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:20:39,239] Trial 7 finished with value: -1.7433241765819547 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 3 with value: -1.3256212882782183.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.046197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.167496 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.122541 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:21:01,911] Trial 8 finished with value: -1.6045432341557913 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 3 with value: -1.3256212882782183.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031151 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1737
[LightGBM] [Info] Number of data points in the train set: 36654, number of used features: 9
[LightGBM] [Info] Start training from score 6.053076




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000955 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1739
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.058094




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.214442 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1736
[LightGBM] [Info] Number of data points in the train set: 36655, number of used features: 9
[LightGBM] [Info] Start training from score 6.060639


[I 2025-05-06 17:21:41,407] Trial 9 finished with value: -1.4221832165403046 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 3 with value: -1.3256212882782183.


Best params for lgb: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1752
[LightGBM] [Info] Number of data points in the train set: 54982, number of used features: 9
[LightGBM] [Info] Start training from score 6.057270


[I 2025-05-06 17:21:52,545] A new study created in memory with name: no-name-76c48786-7f63-4412-889d-d1c483dddbfc


LGB MSE: 1.2662
Running Optuna tuning for CAT...


[I 2025-05-06 17:22:28,359] Trial 0 finished with value: -1.477405439044367 and parameters: {'iterations': 200, 'depth': 6, 'learning_rate': 0.1}. Best is trial 0 with value: -1.477405439044367.
[I 2025-05-06 17:22:45,034] Trial 1 finished with value: -1.9186270505242806 and parameters: {'iterations': 100, 'depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.477405439044367.
[I 2025-05-06 17:22:59,330] Trial 2 finished with value: -1.7773012720369399 and parameters: {'iterations': 100, 'depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.477405439044367.
[I 2025-05-06 17:23:35,258] Trial 3 finished with value: -1.477405439044367 and parameters: {'iterations': 200, 'depth': 6, 'learning_rate': 0.1}. Best is trial 0 with value: -1.477405439044367.
[I 2025-05-06 17:24:10,284] Trial 4 finished with value: -1.477405439044367 and parameters: {'iterations': 200, 'depth': 6, 'learning_rate': 0.1}. Best is trial 0 with value: -1.477405439044367.
[I 2025-05-06 17:24:30

Best params for cat: {'iterations': 200, 'depth': 6, 'learning_rate': 0.1}
CAT MSE: 1.4298


In [7]:
# Train meta-model (stacking)
X_meta = np.hstack(oof_preds)
meta_model = LinearRegression()
meta_model.fit(X_meta, y_val)
meta_preds = meta_model.predict(X_meta)
meta_mse = mean_squared_error(y_val, meta_preds)
print(f"Meta-model (stacking) MSE: {meta_mse:.4f}")


Meta-model (stacking) MSE: 1.2442


In [8]:
joblib.dump(meta_model, "meta_model_optuna.pkl")
np.save("stacking_features_optuna.npy", X_meta)