In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap as sh
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVR

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the two input tables from CSV files located in the working directory.
# Each table is expected to carry a 'Standard Value' column, which the
# following cell uses as the regression target.
df_ic50 = pd.read_csv(filepath_or_buffer='final_ic50.csv')
df_ki = pd.read_csv(filepath_or_buffer='final_ki.csv')

In [4]:
def _split_features_target(df, target='Standard Value', test_size=0.2, random_state=42):
    """Separate ``target`` from ``df`` and make a seeded train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the feature columns plus the ``target`` column.
    target : str
        Name of the column to predict; every other column is a feature.
    test_size : float
        Fraction of rows held out for testing.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X, y, X_train, X_test, y_train, y_test)``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``df``.
    """
    X = df.drop(columns=[target])
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X, y, X_train, X_test, y_train, y_test


# Both datasets go through the same split so their results are comparable.
(X_ic50, y_ic50,
 X_ic50_train, X_ic50_test,
 y_ic50_train, y_ic50_test) = _split_features_target(df_ic50)

(X_ki, y_ki,
 X_ki_train, X_ki_test,
 y_ki_train, y_ki_test) = _split_features_target(df_ki)

In [5]:
# Candidate regressors keyed by display name. Each value is an
# (estimator, colour) pair; the colour is not used anywhere in this cell.
models = {
    "LGBM": (LGBMRegressor(random_state=42), 'blue'),
    "XGBoost": (xgb.XGBRegressor(random_state=42), 'red'),
    "Random Forest": (RandomForestRegressor(random_state=42), 'green'),
    "Gradient Boosting": (GradientBoostingRegressor(random_state=42), 'purple'),
    "SVR": (SVR(), 'brown')
}


def _fit_and_report(model, name, label, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its hold-out scores.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict``
        Fitted in place on ``X_train`` / ``y_train``.
    name : str
        Model name shown in the printed line.
    label : str
        Dataset name shown in the printed line (e.g. ``"IC50"``).
    X_train, X_test, y_train, y_test
        The train/test split to fit and score on.

    Returns
    -------
    tuple
        ``(r2, rmse)`` computed on the test split.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    print(f"{name} - {label} R^2: {r2:.3f}, RMSE: {rmse:.3f}")
    return r2, rmse


# No figure is opened here: nothing in this cell draws, so creating one only
# produced an empty "<Figure ... with 0 Axes>" output.
for name, (model, _color) in models.items():
    # The same estimator object is fitted on IC50 first and then re-fitted on
    # Ki, so after the loop each entry in `models` holds the Ki fit.
    r2_ic50, rmse_ic50 = _fit_and_report(
        model, name, "IC50", X_ic50_train, X_ic50_test, y_ic50_train, y_ic50_test
    )
    r2_ki, rmse_ki = _fit_and_report(
        model, name, "Ki", X_ki_train, X_ki_test, y_ki_train, y_ki_test
    )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32
[LightGBM] [Info] Number of data points in the train set: 812, number of used features: 16
[LightGBM] [Info] Start training from score 9473.413264
LGBM - IC50 R^2: 0.245, RMSE: 65252.701
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000047 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16
[LightGBM] [Info] Number of data points in the train set: 507, number of used features: 8
[LightGBM] [Info] Start training from score 413.854737
LGBM - Ki R^2: 0.219, RMSE: 755.041
XGBoost - IC50 R^2: 0.497, RMSE: 53269.468
XGBoost - Ki R^2: 0.178, RMSE: 774.620
Random Forest - IC50 R^2: 0.627, R

<Figure size 800x600 with 0 Axes>

In [6]:
# Ridge regression baseline on the IC50 split.
# `Ridge` is imported in the notebook's top import cell rather than here, so
# all dependencies are visible in one place.
model = Ridge(alpha=1.0)  # alpha is the L2 regularisation strength
model.fit(X_ic50_train, y_ic50_train)

# Score on the held-out IC50 rows with the same metrics used for the other models.
y_pred_ic50 = model.predict(X_ic50_test)
r2_ic50 = r2_score(y_ic50_test, y_pred_ic50)
rmse_ic50 = root_mean_squared_error(y_ic50_test, y_pred_ic50)
print(f" IC50 R^2: {r2_ic50:.3f}, RMSE: {rmse_ic50:.3f}")

 IC50 R^2: 0.511, RMSE: 52506.301


In [7]:

# Ridge regression baseline on the Ki split.
# A fresh estimator is built here so the cell does not depend on whatever
# `model` happened to be bound to by an earlier cell (running cells out of
# order could otherwise score a different estimator under the Ridge label).
model = Ridge(alpha=1.0)
model.fit(X_ki_train, y_ki_train)

# Score on the held-out Ki rows.
y_pred_ki = model.predict(X_ki_test)
r2_ki = r2_score(y_ki_test, y_pred_ki)
rmse_ki = root_mean_squared_error(y_ki_test, y_pred_ki)
print(f"Ki R^2: {r2_ki:.3f}, RMSE: {rmse_ki:.3f}")

Ki R^2: 0.467, RMSE: 624.086


<h3>Ridge clearly beat LGBM and XGBoost on both IC50 and Ki, but Random Forest scored higher on IC50 (R^2 0.627 vs. 0.511)</h3>