In [1]:
pip install scikit-learn xgboost joblib

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from joblib import dump, load
import pandas as pd
import numpy as np


In [3]:
# Read the descriptor table and keep only rows whose target value is usable.
df = pd.read_csv("descriptor_based_dataset.csv")
# np.isfinite is False for NaN as well as +/-inf, so this single mask drops
# both missing and infinite Kd values.
df = df.loc[np.isfinite(df["Kd"])]

# The Kd column is the regression target; every other column is a feature.
y = df["Kd"]
X = df.drop(columns="Kd")

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=42
)


In [4]:
# Summary statistics of the filtered frame, rendered as this cell's output.
df.describe()

Unnamed: 0,MolWt,LogP,RotatableBonds,HDonors,HAcceptors,Prot_MW,Aromaticity,Instability,Hydropathy,Kd
count,61092.0,61092.0,61092.0,61092.0,61092.0,61092.0,61092.0,61092.0,61092.0,61092.0
mean,493.110912,3.36501,7.312643,2.777319,6.906813,75943.994115,0.084169,46.450565,-0.356622,6.055919
std,296.495108,3.025195,8.49822,3.995878,5.662067,54539.68093,0.02012,10.518773,0.297445,1.576809
min,46.073,-48.0123,0.0,0.0,0.0,2448.5128,0.0,0.30793,-1.610233,0.02
25%,370.384,2.5445,4.0,1.0,5.0,44460.1117,0.070175,40.82794,-0.522418,5.0
50%,450.345,3.6333,6.0,2.0,6.0,59353.7879,0.082873,46.330469,-0.370892,5.451737
75%,527.669,5.0242,8.0,3.0,8.0,92849.5813,0.097222,52.685901,-0.238547,7.05061
max,10112.436,16.7844,178.0,98.0,276.0,794048.9258,0.186441,90.878018,1.171429,15.221849


In [5]:
# Base learners: three tree-ensemble regressors, each built with 100
# estimators and the same fixed seed, paired with the short label that
# StackingRegressor uses to identify them.
base_models = [
    (label, estimator_cls(n_estimators=100, random_state=42))
    for label, estimator_cls in (
        ("rf", RandomForestRegressor),
        ("xgb", XGBRegressor),
        ("gbr", GradientBoostingRegressor),
    )
]

# The final estimator is a plain linear regression over the base learners'
# predictions.
meta_model = LinearRegression()

# cv=5 selects 5-fold cross-validation for producing the predictions the
# final estimator is trained on.
stacking_model = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)

In [6]:
# Fit the stacked ensemble on the training split only; the validation split
# is kept aside for scoring.
stacking_model.fit(X_train, y_train)


In [7]:
# Score the fitted stack on the held-out validation split.
y_pred = stacking_model.predict(X_val)
# Mean squared error is in squared units of the Kd column.
mse = mean_squared_error(y_true=y_val, y_pred=y_pred)
print("Stacked Ensemble MSE:", mse)

Stacked Ensemble MSE: 1.0872787307383882


In [8]:
# Persist the fitted ensemble to disk in the working directory.
# NOTE: joblib files are pickle-based, so only load this file from a trusted
# source.
dump(stacking_model, 'stacked_model.joblib')

['stacked_model.joblib']