# Import Libraries

In [29]:
# Data Manipulation
import pandas as pd
import numpy as np

# File Paths
from pathlib import Path

# General CPU Usage Optimization
import os
os.environ['OMP_NUM_THREADS'] = '16'
os.environ['MKL_NUM_THREADS'] = '16'
os.environ['OPENBLAS_NUM_THREADS'] = '16'
os.environ['NUMEXPR_NUM_THREADS'] = '16'

# Machine Learning - Model
import xgboost as xgb

# Machine Learning - Hyperparameter Optimization
import optuna
import optuna.integration

# Machine Learning - Model Evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Machine Learning - Model Saving
import joblib

# Progress bars
tqdm_notebook_available = False
try:
    from tqdm.notebook import tqdm
    tqdm.pandas() # Enable tqdm for pandas apply
    tqdm_notebook_available = True
except ImportError:
    print("tqdm.notebook not found.")

# Load Data Splits

In [2]:
splits_dir = Path("../data/splits")
print(f"\nLoading data splits from {splits_dir}...")

try:
    X_train = pd.read_parquet(splits_dir / "X_train.parquet")
    X_val = pd.read_parquet(splits_dir / "X_val.parquet")
    X_test = pd.read_parquet(splits_dir / "X_test.parquet")
    
    y_train = pd.read_parquet(splits_dir / "y_train.parquet")
    y_val = pd.read_parquet(splits_dir / "y_val.parquet")
    y_test = pd.read_parquet(splits_dir / "y_test.parquet")
    print("Data splits loaded successfully.")
except FileNotFoundError:
    print(f"Error: One or more split files not found in '{splits_dir}'.")
    print("Please ensure you have run '02_Split_Features.ipynb' to generate and save the splits.")

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

# Display first few rows to verify data
print("\nFirst 5 rows of X_train:")
display(X_train.head())

print("\nFirst 5 rows of y_train:")
display(y_train.head())


Loading data splits from ..\data\splits...
Data splits loaded successfully.
X_train shape: (13119, 2268)
X_val shape: (2812, 2268)
X_test shape: (2812, 2268)
y_train shape: (13119, 1)
y_val shape: (2812, 1)
y_test shape: (2812, 1)

First 5 rows of X_train:


Unnamed: 0,molregno,canonical_smiles,num_activities,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,morgan_fp_2038,morgan_fp_2039,morgan_fp_2040,morgan_fp_2041,morgan_fp_2042,morgan_fp_2043,morgan_fp_2044,morgan_fp_2045,morgan_fp_2046,morgan_fp_2047
0,2307646,COc1cccc2c1OCc1c-2nc2cnc3ccccc3c2c1C,6,6.033142,6.033142,0.494176,0.494176,0.476742,12.56,328.371,...,0,0,0,0,0,0,0,0,0,0
1,2081122,COc1cc(/C(C#N)=C/c2ccc3c(c2)OCCO3)cc(OC)c1OC,9,9.645791,9.645791,0.459195,0.459195,0.604738,12.923077,353.374,...,0,0,0,0,0,0,0,0,0,0
2,2199496,COC(=O)[C@@H]1CCCN1Cc1ccc(-c2ncc(-c3ccc(OCC=C(...,6,11.953178,11.953178,0.169552,-0.173158,0.359463,15.909091,447.535,...,0,0,0,0,0,0,0,0,0,0
3,2221960,O=C(/C=C/c1cccn(C/C=C/c2ccccc2Br)c1=O)NO,4,12.253458,12.253458,0.216419,-0.686457,0.479732,11.217391,375.222,...,0,0,0,0,0,0,0,0,0,0
4,2879093,Cc1cc(C2c3c(-c4cccc5[nH]c(=O)oc45)n[nH]c3C(=O)...,2,14.128489,14.128489,0.124437,-3.116139,0.437556,16.121212,472.879,...,0,0,0,0,0,0,0,0,0,0



First 5 rows of y_train:


Unnamed: 0,pGI50
14387,5.734742
12543,7.164746
12810,4.928428
13172,6.882724
18712,6.094208


# Prepare X for XGBoost

In [3]:
print("\nPreparing X for XGBoost training (dropping identifiers)...")
X_train_xgb = X_train.drop(columns=['molregno', 'canonical_smiles'], errors='ignore')
X_val_xgb = X_test.drop(columns=['molregno', 'canonical_smiles'], errors='ignore')
X_test_xgb = X_test.drop(columns=['molregno', 'canonical_smiles'], errors='ignore')

print(f"X_train_xgb shape (numerical features only): {X_train_xgb.shape}")
print(f"X_val_xgb shape (numerical features only): {X_val_xgb.shape}")
print(f"X_test_xgb shape (numerical features only): {X_test_xgb.shape}")

display(X_train_xgb.head())
display(y_train.head())

print("Converting data to numpy arrays...")
X_train_xgb = X_train_xgb.values.astype(np.float32)
y_train = y_train.values.astype(np.float32)

X_val_xgb = X_val_xgb.values.astype(np.float32)
y_val = y_val.values.astype(np.float32)

X_test_xgb = X_test_xgb.values.astype(np.float32)
y_test = y_test.values.astype(np.float32)

print(f"X_train_xgb type after conversion: {type(X_train_xgb)}")
print(f"X_train_xgb dtype: {X_train_xgb.dtype}")
print("\nData preparation for XGBoost complete. Ready for model definition and training.")


Preparing X for XGBoost training (dropping identifiers)...
X_train_xgb shape (numerical features only): (13119, 2266)
X_val_xgb shape (numerical features only): (2812, 2266)
X_test_xgb shape (numerical features only): (2812, 2266)


Unnamed: 0,num_activities,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,morgan_fp_2038,morgan_fp_2039,morgan_fp_2040,morgan_fp_2041,morgan_fp_2042,morgan_fp_2043,morgan_fp_2044,morgan_fp_2045,morgan_fp_2046,morgan_fp_2047
0,6,6.033142,6.033142,0.494176,0.494176,0.476742,12.56,328.371,312.243,328.121178,...,0,0,0,0,0,0,0,0,0,0
1,9,9.645791,9.645791,0.459195,0.459195,0.604738,12.923077,353.374,334.222,353.126323,...,0,0,0,0,0,0,0,0,0,0
2,6,11.953178,11.953178,0.169552,-0.173158,0.359463,15.909091,447.535,418.303,447.215806,...,0,0,0,0,0,0,0,0,0,0
3,4,12.253458,12.253458,0.216419,-0.686457,0.479732,11.217391,375.222,360.102,374.026604,...,0,0,0,0,0,0,0,0,0,0
4,2,14.128489,14.128489,0.124437,-3.116139,0.437556,16.121212,472.879,453.727,472.111375,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,pGI50
14387,5.734742
12543,7.164746
12810,4.928428
13172,6.882724
18712,6.094208


Converting data to numpy arrays...
X_train_xgb type after conversion: <class 'numpy.ndarray'>
X_train_xgb dtype: float32

Data preparation for XGBoost complete. Ready for model definition and training.


# Optimize Hyperparameters

## Create Optuna Objective Function

In [14]:
def objective(trial):
    # Suggest these hyperparams
    # Learning rate
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3, log=True)
    # Max depth of a tree
    max_depth = trial.suggest_int("max_depth", 3, 12)
    # Subsample of rows used to fit a tree
    subsample = trial.suggest_float("subsample", 0.6, 1.0)
    # Subsample of columns used to fit a tree
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0)
    # L1 regularization
    reg_alpha = trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True)
    # L2 Regularization
    reg_lambda = trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True)
    # Min loss required to further partition a leaf node
    gamma = trial.suggest_float("gamma", 1e-4, 1.0, log=True)
    # Num of trees
    n_estimators = trial.suggest_int("n_estimators", 500, 3000)
    # Min sum of instance weight needed in a child
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10)

    # Initialize model with suggested hyperparams
    model = xgb.XGBRegressor(
        objective='reg:squarederror',  # For regression tasks, minimizes squared error
        eval_metric='rmse',  # Evaluation metric to be used during training (for early stopping)
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        gamma=gamma,
        min_child_weight=min_child_weight,
        random_state=42,  # For reproducibility of the model training itself
        tree_method='hist',
        device='cuda:0',  # Use GPU
        early_stopping_rounds=50,
    )

    # Train model on training data
    try:
        evals_result = {}
        model.fit(
            X_train_xgb, y_train,
            eval_set=[(X_val_xgb, y_val)], # Use the validation set for early stopping/pruning
            verbose=False,
        )
        # --- DEBUGGING PRINT STATEMENT ---
        # Check the actual reported RMSE at the first few iterations
        if "validation_0" in evals_result and "rmse" in evals_result["validation_0"]:
            initial_rmse_val = evals_result["validation_0"]["rmse"][0] # Get RMSE at iteration 0
            print(f"Trial {trial.number}: Reported RMSE at Iteration 0 = {initial_rmse_val:.4f}")
            if np.isnan(initial_rmse_val) or np.isinf(initial_rmse_val):
                print(f"Trial {trial.number}: WARNING! Initial RMSE is NaN/Inf. This might be causing immediate pruning.")
    
    except xgb.core.XGBoostError as e:
        # Handle cases where training might fail for some hyperparameter combinations
        # (e.g., if a combination leads to numerical instability).
        print(f"XGBoost training error for trial {trial.number}: {e}")
        trial.set_user_attr("exception_type", "XGBoostError")
        trial.set_user_attr("exception_message", str(e))
        return float('inf') # Return a very high value if training fails, to discourage this trial

    # Make predictions and evaluate on validation set
    y_pred_val = model.predict(X_val_xgb)
    rmse = float(np.sqrt(np.mean((y_pred_val - y_val)**2)))
    r2 = float(r2_score(y_val, y_pred_val))
    
    trial.set_user_attr("r2_score", r2)  # Store r2 score as well in study logs

    return rmse  # Optuna minimizes THIS, not r2

## Run Optuna Study

In [15]:
optuna.logging.set_verbosity(optuna.logging.INFO)

print("Optuna logging verbosity set to INFO.")

Optuna logging verbosity set to INFO.


In [None]:
# Define the path for the Optuna study storage
study_dir = Path("../studies/xgboost_study")
study_dir.mkdir(parents=True, exist_ok=True)

study_db_path = f"sqlite:///{study_dir / 'xgb_optuna_study.db'}"
study_name = "xgboost_regression_pGI50"
print(f"Optuna study will be stored at: {study_db_path}")

# Create Pruner object
pruner = optuna.pruners.MedianPruner(
    n_startup_trials=10,  # Run at least 10 trials completely before starting to prune
    n_warmup_steps=10,  # Don't prune trials until they've completed 50 boosting rounds
    interval_steps=10  # Check for pruning every 10 boosting rounds
)

# Check if a study with the same name already exists in the database
# If it does, load it to resume the optimization.
try:
    study = optuna.load_study(study_name=study_name, storage=study_db_path)
    print(f"Loaded existing study '{study_name}' from {study_db_path}. Resuming optimization.")
except KeyError:
    # If the study does not exist, create a new one
    print(f"Creating new study '{study_name}' at {study_db_path}.")
    study = optuna.create_study(
        study_name=study_name,
        direction="minimize",
        storage=study_db_path,
        pruner=pruner
    )

print("\nStarting Optuna optimization...")
study.optimize(objective, n_trials=300, timeout=7200) # Run 150 trials or for 1 hour (3600 seconds), whichever comes first
print("\nOptuna optimization finished.")

# Print best trial results
print("\n--- Best Trial Results ---")
print(f"Best trial number: {study.best_trial.number}")
print(f"Best RMSE (Validation): {study.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# You can also access user attributes (like R2 score) from the best trial
if "r2_score" in study.best_trial.user_attrs:
    print(f"Best R2 Score (Validation): {study.best_trial.user_attrs['r2_score']:.4f}")

Optuna study will be stored at: sqlite:///..\studies\xgboost_study\xgb_optuna_study.db
Loaded existing study 'xgboost_regression_pGI50' from sqlite:///..\studies\xgboost_study\xgb_optuna_study.db. Resuming optimization.

Starting Optuna optimization...


[I 2025-07-15 00:20:02,168] Trial 782 finished with value: 0.9900553226470947 and parameters: {'learning_rate': 0.0016587239632441964, 'max_depth': 7, 'subsample': 0.973925550314919, 'colsample_bytree': 0.606521323874092, 'reg_alpha': 0.0003534651101778127, 'reg_lambda': 0.00022362584339581426, 'gamma': 0.2027101912960382, 'n_estimators': 622, 'min_child_weight': 4}. Best is trial 238 with value: 0.9899262921512715.
[I 2025-07-15 00:20:11,008] Trial 783 finished with value: 0.9900605082511902 and parameters: {'learning_rate': 0.0013333948137170108, 'max_depth': 6, 'subsample': 0.9921758522240521, 'colsample_bytree': 0.6195823143439292, 'reg_alpha': 0.0001203136138234615, 'reg_lambda': 0.0002646529119328142, 'gamma': 0.8270193793739218, 'n_estimators': 505, 'min_child_weight': 5}. Best is trial 238 with value: 0.9899262921512715.
[I 2025-07-15 00:20:19,432] Trial 784 finished with value: 0.9900442957878113 and parameters: {'learning_rate': 0.0010921629984086089, 'max_depth': 5, 'subsamp

# Train Final XGBoost Model

## Retrieve Best Hyperparams from Optuna Study

In [19]:
study_dir = Path("../studies/xgboost_study")
study_db_path = f"sqlite:///{study_dir / 'xgb_optuna_study.db'}"
study_name = "xgboost_regression_pGI50"

try:
    study = optuna.load_study(study_name=study_name, storage=study_db_path)
    print("Best trial parameters (XGBoost):", study.best_trial.params)
    best_params = study.best_trial.params
except KeyError:
    print("Study does not exist. Please make sure that the previous cell has been run.")

Best trial parameters (XGBoost): {'learning_rate': 0.0020930505614224202, 'max_depth': 7, 'subsample': 0.9874008464174688, 'colsample_bytree': 0.6198264363422864, 'reg_alpha': 0.0001419307486809107, 'reg_lambda': 0.0002640498949844627, 'gamma': 0.45206313012880717, 'n_estimators': 519, 'min_child_weight': 5}


In [22]:
# Set other hyperparams not suggested by Optuna
best_params['objective'] = 'reg:squarederror'
best_params['eval_metric'] = 'rmse'
best_params['random_state'] = 42
best_params['tree_method'] = 'hist'
best_params['device'] = 'gpu'

# Initialize the model
final_xgb_model = xgb.XGBRegressor(**best_params)
print("Final model has been initialized with best parameters. Ready for training.")

Final model has been initialized with best parameters. Ready for training.


## Train Model

In [23]:
print("\nTraining final XGBoost model on the full training set (training data + validation data)...")
X_train_xgb_final = np.concatenate([X_train_xgb, X_val_xgb]).astype(np.float32)
y_train_xgb_final = np.concatenate([y_train, y_val]).astype(np.float32)

final_xgb_model.fit(X_train_xgb_final, y_train_xgb_final, verbose=False)
print("Final model training complete. Ready for test set evaluation.")


Training final XGBoost model on the full training set (training data + validation data)...
Final model training complete. Ready for test set evaluation.


## Evaluate Model

In [24]:
print("Making predictions and evaluating on the test set...")
final_xgb_model.set_params(device='cpu')  # Prevent device mismatch errors
y_pred_test_xgb = final_xgb_model.predict(X_test_xgb)
rmse_test_xgb = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))
r2_test_xgb = r2_score(y_test, y_pred_test_xgb)

print(f"\n--- Final XGBoost Model Performance on Test Set ---")
print(f"Test RMSE: {rmse_test_xgb:.4f}")
print(f"Test R2 Score: {r2_test_xgb:.4f}")

print(f"Compared with Best Validation RMSE from Optuna Study: {study.best_value:.4f}")

Making predictions and evaluating on the test set...

--- Final XGBoost Model Performance on Test Set ---
Test RMSE: 0.8660
Test R2 Score: 0.2176
Compared with Best Validation RMSE from Optuna Study: 0.9899


# Save Final XGBoost Model

In [30]:
model_filename = Path(f'final_xgboost_model_rmse_{rmse_test_xgb:.4f}_r2_{r2_test_xgb:.4f}.joblib')

save_dir = Path('../models/xgb')
save_dir.mkdir(parents=True, exist_ok=True)

full_path = save_dir / model_filename

print(f"\nSaving the final XGBoost model to: {full_path}...")
try:
    joblib.dump(final_xgb_model, full_path)
    print("Model saved successfully!")
except Exception as e:
    print(f"Error saving model: {e}")


Saving the final XGBoost model to: ..\models\xgb\final_xgboost_model_rmse_0.8660_r2_0.2176.joblib...
Model saved successfully!
