In [1]:
pip install scikit-learn xgboost lightgbm catboost joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the dataset you created
df = pd.read_csv("descriptor.csv")
df = df.copy()
df = df[df['Kd'].notnull()]  # Remove NaN
df = df[np.isfinite(df['Kd'])]  # Remove inf/-inf

features = ['MolWt', 'LogP', 'RotatableBonds', 'HDonors', 'HAcceptors',
            'Prot_MW', 'Aromaticity', 'Instability', 'Hydropathy']
target = 'Kd'

X = df[features]
y = df[target]


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.22, random_state=42)


In [4]:
# Objective functions
def rf_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [5, 10]),
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def xgb_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', XGBRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            random_state=42, verbosity=0
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def lgb_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', LGBMRegressor(
            n_estimators=trial.suggest_categorical('n_estimators', [100, 200]),
            max_depth=trial.suggest_categorical('max_depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()

def cat_objective(trial):
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('model', CatBoostRegressor(
            iterations=trial.suggest_categorical('iterations', [100, 200]),
            depth=trial.suggest_categorical('depth', [4, 6]),
            learning_rate=trial.suggest_categorical('learning_rate', [0.05, 0.1]),
            verbose=0,
            random_state=42
        ))
    ])
    return cross_val_score(model, X_train, y_train, cv=3, scoring='neg_mean_squared_error').mean()


In [5]:
# Train and save best models
objectives = {'rf': rf_objective, 'xgb': xgb_objective, 'lgb': lgb_objective, 'cat': cat_objective}
oof_preds = []
model_paths = {}

In [6]:
for name, obj_func in objectives.items():
    print(f"Running Optuna tuning for {name.upper()}...")
    study = optuna.create_study(direction='maximize')
    study.optimize(obj_func, n_trials=10)

    print(f"Best params for {name}: {study.best_params}")

    if name == 'rf':
        model = RandomForestRegressor(**study.best_params, random_state=42)
    elif name == 'xgb':
        model = XGBRegressor(**study.best_params, random_state=42, verbosity=0)
    elif name == 'lgb':
        model = LGBMRegressor(**study.best_params, random_state=42)
    elif name == 'cat':
        model = CatBoostRegressor(**study.best_params, verbose=0, random_state=42)

    pipeline = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    print(f"{name.upper()} MSE: {mse:.4f}")

    joblib.dump(pipeline, f"{name}_optuna_model.pkl")
    model_paths[name] = f"{name}_optuna_model.pkl"
    oof_preds.append(preds.reshape(-1, 1))


[I 2025-05-18 21:46:29,262] A new study created in memory with name: no-name-1363f0fb-bcf3-4ee3-8840-cc9de8fa015b


Running Optuna tuning for RF...


[I 2025-05-18 21:47:12,081] Trial 0 finished with value: -1.5525346745427653 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 0 with value: -1.5525346745427653.
[I 2025-05-18 21:47:23,502] Trial 1 finished with value: -1.9360633966302603 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: -1.5525346745427653.
[I 2025-05-18 21:48:05,765] Trial 2 finished with value: -1.5525346745427653 and parameters: {'n_estimators': 200, 'max_depth': 10}. Best is trial 0 with value: -1.5525346745427653.
[I 2025-05-18 21:48:26,996] Trial 3 finished with value: -1.5540203315744243 and parameters: {'n_estimators': 100, 'max_depth': 10}. Best is trial 0 with value: -1.5525346745427653.
[I 2025-05-18 21:48:49,908] Trial 4 finished with value: -1.9363167079433985 and parameters: {'n_estimators': 200, 'max_depth': 5}. Best is trial 0 with value: -1.5525346745427653.
[I 2025-05-18 21:49:31,890] Trial 5 finished with value: -1.5525346745427653 and parameters:

Best params for rf: {'n_estimators': 200, 'max_depth': 10}


[I 2025-05-18 21:51:19,885] A new study created in memory with name: no-name-ef306989-49fd-4591-a786-371cf3f80dc6


RF MSE: 1.5042
Running Optuna tuning for XGB...


[I 2025-05-18 21:51:20,420] Trial 0 finished with value: -1.6227096832215888 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6227096832215888.
[I 2025-05-18 21:51:20,962] Trial 1 finished with value: -1.7024799842603306 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.6227096832215888.
[I 2025-05-18 21:51:22,202] Trial 2 finished with value: -1.5574796804815232 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 2 with value: -1.5574796804815232.
[I 2025-05-18 21:51:23,083] Trial 3 finished with value: -1.5574796804815232 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 2 with value: -1.5574796804815232.
[I 2025-05-18 21:51:23,585] Trial 4 finished with value: -1.5553015727444326 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 4 with value: -1

Best params for xgb: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}


[I 2025-05-18 21:51:25,977] A new study created in memory with name: no-name-57682981-6b44-4cf5-835d-ea03dec7aa99


XGB MSE: 1.5090
Running Optuna tuning for LGB...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001079 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093








[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000718 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134


[I 2025-05-18 21:51:26,953] Trial 0 finished with value: -1.5838368695328608 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000721 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134


[I 2025-05-18 21:51:27,470] Trial 1 finished with value: -1.6100739755011932 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000697 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189


[I 2025-05-18 21:51:27,782] Trial 2 finished with value: -1.695905027924003 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000703 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000777 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start traini



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093


[I 2025-05-18 21:51:28,219] Trial 3 finished with value: -1.644866087218172 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000759 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189


[I 2025-05-18 21:51:28,541] Trial 4 finished with value: -1.8133913852310306 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000700 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000706 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start traini



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134


[I 2025-05-18 21:51:29,260] Trial 5 finished with value: -1.5838368695328608 and parameters: {'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.05}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000704 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000676 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000743 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start traini

[I 2025-05-18 21:51:29,581] Trial 6 finished with value: -1.8133913852310306 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000753 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093


[I 2025-05-18 21:51:30,082] Trial 7 finished with value: -1.7041838914331482 and parameters: {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000717 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000692 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000691 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134


[I 2025-05-18 21:51:30,393] Trial 8 finished with value: -1.695905027924003 and parameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1}. Best is trial 0 with value: -1.5838368695328608.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000716 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1722
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060189




[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000733 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1733
[LightGBM] [Info] Number of data points in the train set: 28589, number of used features: 9
[LightGBM] [Info] Start training from score 6.060093
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1727
[LightGBM] [Info] Number of data points in the train set: 28590, number of used features: 9
[LightGBM] [Info] Start training from score 6.051134


[I 2025-05-18 21:51:30,815] Trial 9 finished with value: -1.5823644088943576 and parameters: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}. Best is trial 9 with value: -1.5823644088943576.
[I 2025-05-18 21:51:30,981] A new study created in memory with name: no-name-58c3134c-b351-42ef-ac75-6a0b6745b8ff


Best params for lgb: {'n_estimators': 100, 'max_depth': 6, 'learning_rate': 0.1}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001018 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1741
[LightGBM] [Info] Number of data points in the train set: 42884, number of used features: 9
[LightGBM] [Info] Start training from score 6.057139
LGB MSE: 1.5232
Running Optuna tuning for CAT...


[I 2025-05-18 21:51:32,572] Trial 0 finished with value: -1.9554233566689503 and parameters: {'iterations': 100, 'depth': 4, 'learning_rate': 0.05}. Best is trial 0 with value: -1.9554233566689503.
[I 2025-05-18 21:51:33,738] Trial 1 finished with value: -1.840765587809889 and parameters: {'iterations': 100, 'depth': 4, 'learning_rate': 0.1}. Best is trial 1 with value: -1.840765587809889.
[I 2025-05-18 21:51:35,854] Trial 2 finished with value: -1.8418546394446622 and parameters: {'iterations': 200, 'depth': 4, 'learning_rate': 0.05}. Best is trial 1 with value: -1.840765587809889.
[I 2025-05-18 21:51:38,013] Trial 3 finished with value: -1.8418546394446622 and parameters: {'iterations': 200, 'depth': 4, 'learning_rate': 0.05}. Best is trial 1 with value: -1.840765587809889.
[I 2025-05-18 21:51:39,608] Trial 4 finished with value: -1.7257885250710914 and parameters: {'iterations': 100, 'depth': 6, 'learning_rate': 0.1}. Best is trial 4 with value: -1.7257885250710914.
[I 2025-05-18 21

Best params for cat: {'iterations': 200, 'depth': 6, 'learning_rate': 0.1}
CAT MSE: 1.5597


In [7]:
# Train meta-model (stacking)
X_meta = np.hstack(oof_preds)
meta_model = LinearRegression()
meta_model.fit(X_meta, y_val)
meta_preds = meta_model.predict(X_meta)
meta_mse = mean_squared_error(y_val, meta_preds)
print(f"Meta-model (stacking) MSE: {meta_mse:.4f}")


Meta-model (stacking) MSE: 1.4828


In [8]:
test_df = pd.read_csv("descriptor_test.csv")
x_test = test_df.drop(columns=['Kd'])
y_test = test_df['Kd']
y_pred_test = meta_model.predict(x_test)

print("Test MSE:", mean_squared_error(y_test, y_pred_test))




ValueError: X has 9 features, but LinearRegression is expecting 4 features as input.

In [9]:
joblib.dump(meta_model, "meta_model_optuna.pkl")
np.save("stacking_features_optuna.npy", X_meta)