# Initial Setup

## Setup Environment

In [1]:
# File Paths
from pathlib import Path

# General CPU Usage Optimization
import os

# Pin the OpenMP / MKL / OpenBLAS / NumExpr thread-count variables to 16.
# This runs in the first cell, i.e. before the numeric libraries are imported
# further down in the notebook.
os.environ.update(
    {
        thread_var: '16'
        for thread_var in (
            'OMP_NUM_THREADS',
            'MKL_NUM_THREADS',
            'OPENBLAS_NUM_THREADS',
            'NUMEXPR_NUM_THREADS',
        )
    }
)

## Import Core Libraries

In [2]:
# Data Manipulation
import pandas as pd
import numpy as np

# Machine Learning - Model
import xgboost as xgb

# Machine Learning - Hyperparameter Optimization
import optuna
import optuna.integration

# Machine Learning - Model Evaluation
from sklearn.metrics import mean_squared_error, r2_score

# Machine Learning - Model Saving
import joblib

## Import Utility Libraries

In [3]:
# Progress bars
# The flag starts out False and is only flipped once both the import and the
# pandas hook have succeeded.
tqdm_notebook_available = False
try:
    from tqdm.notebook import tqdm
    tqdm.pandas() # Enable tqdm for pandas apply method
except ImportError:
    print("tqdm.notebook not found. Install with 'pip install tqdm'.")
else:
    tqdm_notebook_available = True
    print("tqdm.notebook found and enabled for pandas.")

tqdm.notebook found and enabled for pandas.


# Load Data Splits

In [4]:
# Folder containing the train / validation / test parquet splits.
splits_dir = Path("../data/splits")
print(f"\nLoading data splits from {splits_dir}...")

try:
    X_train = pd.read_parquet(splits_dir / "X_train.parquet")
    X_val = pd.read_parquet(splits_dir / "X_val.parquet")
    X_test = pd.read_parquet(splits_dir / "X_test.parquet")

    y_train = pd.read_parquet(splits_dir / "y_train.parquet")
    y_val = pd.read_parquet(splits_dir / "y_val.parquet")
    y_test = pd.read_parquet(splits_dir / "y_test.parquet")
    print("Data splits loaded successfully.")
except FileNotFoundError:
    print(f"Error: One or more split files not found in '{splits_dir}'.")
    print("Please ensure you have run '02_Split_Features.ipynb' to generate and save the splits.")
    # Stop here. Falling through would either hit a NameError on the next
    # line or silently reuse frames left over from an earlier run.
    raise

# Features and targets are later converted to arrays and paired by row
# position, so each X/y pair must have the same number of rows.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_val) == len(y_val), "X_val / y_val row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")

print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

# Display first few rows to verify data
print("\nFirst 5 rows of X_train:")
display(X_train.head())

print("\nFirst 5 rows of y_train:")
display(y_train.head())


Loading data splits from ..\data\splits...
Data splits loaded successfully.
X_train shape: (13119, 2268)
X_val shape: (2812, 2268)
X_test shape: (2812, 2268)
y_train shape: (13119, 1)
y_val shape: (2812, 1)
y_test shape: (2812, 1)

First 5 rows of X_train:


Unnamed: 0,molregno,canonical_smiles,num_activities,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,...,morgan_fp_2038,morgan_fp_2039,morgan_fp_2040,morgan_fp_2041,morgan_fp_2042,morgan_fp_2043,morgan_fp_2044,morgan_fp_2045,morgan_fp_2046,morgan_fp_2047
0,2307646,COc1cccc2c1OCc1c-2nc2cnc3ccccc3c2c1C,6,6.033142,6.033142,0.494176,0.494176,0.476742,12.56,328.371,...,0,0,0,0,0,0,0,0,0,0
1,2081122,COc1cc(/C(C#N)=C/c2ccc3c(c2)OCCO3)cc(OC)c1OC,9,9.645791,9.645791,0.459195,0.459195,0.604738,12.923077,353.374,...,0,0,0,0,0,0,0,0,0,0
2,2199496,COC(=O)[C@@H]1CCCN1Cc1ccc(-c2ncc(-c3ccc(OCC=C(...,6,11.953178,11.953178,0.169552,-0.173158,0.359463,15.909091,447.535,...,0,0,0,0,0,0,0,0,0,0
3,2221960,O=C(/C=C/c1cccn(C/C=C/c2ccccc2Br)c1=O)NO,4,12.253458,12.253458,0.216419,-0.686457,0.479732,11.217391,375.222,...,0,0,0,0,0,0,0,0,0,0
4,2879093,Cc1cc(C2c3c(-c4cccc5[nH]c(=O)oc45)n[nH]c3C(=O)...,2,14.128489,14.128489,0.124437,-3.116139,0.437556,16.121212,472.879,...,0,0,0,0,0,0,0,0,0,0



First 5 rows of y_train:


Unnamed: 0,pGI50
14387,5.734742
12543,7.164746
12810,4.928428
13172,6.882724
18712,6.094208


# Prepare Data for XGBoost

In [5]:
# Drop identifier columns which are not features for the model
print("\nPreparing X for XGBoost training (dropping identifiers)...")
identifier_columns = ['molregno', 'canonical_smiles']
X_train_xgb = X_train.drop(columns=identifier_columns, errors='ignore')
X_val_xgb = X_val.drop(columns=identifier_columns, errors='ignore')
X_test_xgb = X_test.drop(columns=identifier_columns, errors='ignore')
# NOTE(review): 'num_activities' stays in the feature matrix. Confirm it is a
# legitimate predictor and not information derived from the target.

print(f"X_train_xgb shape (numerical features only): {X_train_xgb.shape}")
print(f"X_val_xgb shape (numerical features only): {X_val_xgb.shape}")
print(f"X_test_xgb shape (numerical features only): {X_test_xgb.shape}")

display(X_train_xgb.head())
# Row slice instead of .head(): the targets are rebound to NumPy arrays below,
# and a slice works for both a DataFrame (first run) and an ndarray (re-run).
display(y_train[:5])

print("Converting data to numpy arrays...")
# XGBoost typically works well with NumPy arrays
# The feature frames are rebuilt from X_* above on every run, so they are
# always DataFrames at this point.
X_train_xgb = X_train_xgb.to_numpy(dtype=np.float32)
X_val_xgb = X_val_xgb.to_numpy(dtype=np.float32)
X_test_xgb = X_test_xgb.to_numpy(dtype=np.float32)

# The target names are reused in place, so convert with np.asarray: it accepts
# both the original DataFrame and the already-converted array. That keeps this
# cell re-runnable without reloading the splits.
y_train = np.asarray(y_train, dtype=np.float32)
y_val = np.asarray(y_val, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32)

print(f"X_train_xgb type after conversion: {type(X_train_xgb)}")
print(f"X_train_xgb dtype: {X_train_xgb.dtype}")
print("\nData preparation for XGBoost complete. Ready for model definition and training.")


Preparing X for XGBoost training (dropping identifiers)...
X_train_xgb shape (numerical features only): (13119, 2266)
X_val_xgb shape (numerical features only): (2812, 2266)
X_test_xgb shape (numerical features only): (2812, 2266)


Unnamed: 0,num_activities,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,morgan_fp_2038,morgan_fp_2039,morgan_fp_2040,morgan_fp_2041,morgan_fp_2042,morgan_fp_2043,morgan_fp_2044,morgan_fp_2045,morgan_fp_2046,morgan_fp_2047
0,6,6.033142,6.033142,0.494176,0.494176,0.476742,12.56,328.371,312.243,328.121178,...,0,0,0,0,0,0,0,0,0,0
1,9,9.645791,9.645791,0.459195,0.459195,0.604738,12.923077,353.374,334.222,353.126323,...,0,0,0,0,0,0,0,0,0,0
2,6,11.953178,11.953178,0.169552,-0.173158,0.359463,15.909091,447.535,418.303,447.215806,...,0,0,0,0,0,0,0,0,0,0
3,4,12.253458,12.253458,0.216419,-0.686457,0.479732,11.217391,375.222,360.102,374.026604,...,0,0,0,0,0,0,0,0,0,0
4,2,14.128489,14.128489,0.124437,-3.116139,0.437556,16.121212,472.879,453.727,472.111375,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,pGI50
14387,5.734742
12543,7.164746
12810,4.928428
13172,6.882724
18712,6.094208


Converting data to numpy arrays...
X_train_xgb type after conversion: <class 'numpy.ndarray'>
X_train_xgb dtype: float32

Data preparation for XGBoost complete. Ready for model definition and training.


# Optimize Hyperparameters

## Define Optuna Objective Function

In [6]:
def objective(trial):
    """Optuna objective: fit one XGBoost regressor and return its validation RMSE.

    Reads the notebook-level arrays ``X_train_xgb`` / ``y_train`` for fitting
    and ``X_val_xgb`` / ``y_val`` for early stopping and scoring.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to store
            user attributes.

    Returns:
        float: RMSE on the validation set, or ``float('inf')`` when XGBoost
        raises ``XGBoostError`` while fitting.

    Side effects:
        Stores ``r2_score`` and, when the fitted model exposes it,
        ``best_iteration`` as trial user attributes. On a training error it
        stores ``exception_type`` and ``exception_message`` instead.

    Note:
        No intermediate values are reported to the trial, so a pruner attached
        to the study has nothing to act on.
    """
    # Suggest these hyperparameters to Optuna. Ranges are defined based on common practice for XGBoost.
    # Names and ranges must stay as they are: the study is persisted and resumed.
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3, log=True)
    max_depth = trial.suggest_int("max_depth", 3, 12) # Max depth of a tree
    subsample = trial.suggest_float("subsample", 0.6, 1.0) # Subsample ratio of the training instance
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.6, 1.0) # Subsample ratio of columns when constructing each tree
    reg_alpha = trial.suggest_float("reg_alpha", 1e-4, 1.0, log=True) # L1 regularization term
    reg_lambda = trial.suggest_float("reg_lambda", 1e-4, 1.0, log=True) # L2 regularization term
    gamma = trial.suggest_float("gamma", 1e-4, 1.0, log=True) # Minimum loss reduction required to make a further partition on a leaf node
    n_estimators = trial.suggest_int("n_estimators", 500, 3000) # Upper bound on boosting rounds (early stopping may end sooner)
    min_child_weight = trial.suggest_int("min_child_weight", 1, 10) # Minimum sum of instance weight needed in a child

    # Initialize model with suggested hyperparams
    model = xgb.XGBRegressor(
        objective='reg:squarederror',  # Objective function (minimizes squared error)
        eval_metric='rmse',  # Evaluation metric to be monitored during training (used for early stopping)
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        gamma=gamma,
        min_child_weight=min_child_weight,
        random_state=42,  # Seed for reproducibility of model training (tree building)
        tree_method='hist', # Use histogram-based algorithm for faster training
        device='cuda:0',  # Run on CUDA device 0
        early_stopping_rounds=50, # Stop if validation metric doesn't improve for 50 rounds
    )

    # Train model on training data, evaluating on validation set for early stopping
    try:
        model.fit(
            X_train_xgb, y_train,
            eval_set=[(X_val_xgb, y_val)], # Validation set monitored for early stopping
            verbose=False,
        )
    except xgb.core.XGBoostError as e:
        # Handle cases where specific hyperparameter combinations might lead to training errors
        print(f"XGBoost training error for trial {trial.number}: {e}")
        trial.set_user_attr("exception_type", "XGBoostError")
        trial.set_user_attr("exception_message", str(e))
        return float('inf') # Return a very high value (infinity) to Optuna to discourage this problematic trial

    # n_estimators is only an upper bound while early stopping is active.
    # Record the round that actually scored best so the tree count of the
    # scored model can be recovered from the study afterwards.
    best_iteration = getattr(model, "best_iteration", None)
    if best_iteration is not None:
        trial.set_user_attr("best_iteration", int(best_iteration))

    # Make predictions on validation set using the best booster from early stopping
    y_pred_val = model.predict(X_val_xgb)

    # Calculate RMSE and R2 score on validation set
    rmse = float(np.sqrt(mean_squared_error(y_val, y_pred_val)))
    r2 = float(r2_score(y_val, y_pred_val))

    trial.set_user_attr("r2_score", r2)  # Store R2 score as a user attribute for later inspection

    return rmse  # Optuna minimizes this RMSE value to find the best trial

## Run Optuna Study

In [7]:
optuna.logging.set_verbosity(optuna.logging.INFO) # Set Optuna logging level

print("Optuna logging verbosity set to INFO.")
# Define the path for the Optuna study database storage
study_dir = Path("../studies/xgboost_study")
study_dir.mkdir(parents=True, exist_ok=True)

study_db_path = f"sqlite:///{study_dir / 'xgb_optuna_study.db'}"
study_name = "xgboost_regression_pGI50"
print(f"Optuna study will be stored at: {study_db_path}")

# Budget for this run of the cell: whichever limit is reached first ends it.
N_TRIALS = 300
TIMEOUT_SECONDS = 7200  # 2 hours

# Create a Pruner object to stop unpromising trials early
# NOTE(review): a pruner only acts on trials that report intermediate values.
# objective() in this notebook reports none, so no trial is currently pruned.
pruner = optuna.pruners.MedianPruner(
    n_startup_trials=10,  # Number of trials to run completely before starting pruning
    n_warmup_steps=10,  # Don't prune a trial before this many reported steps
    interval_steps=10  # Check for pruning every this many reported steps
)

# Check if a study with the same name already exists in the database
# If it does, load it to resume the optimization
try:
    # Pass the pruner here as well, so that resuming uses the same
    # configuration as a freshly created study.
    study = optuna.load_study(study_name=study_name, storage=study_db_path, pruner=pruner)
    print(f"Loaded existing study '{study_name}' from {study_db_path}. Resuming optimization.")
except KeyError:
    # If the study does not exist, create a new one
    print(f"Creating new study '{study_name}' at {study_db_path}.")
    study = optuna.create_study(
        study_name=study_name,
        direction="minimize", # Minimize the RMSE
        storage=study_db_path,
        pruner=pruner,
    )

print("\nStarting Optuna optimization...")

# Run up to N_TRIALS additional trials or for TIMEOUT_SECONDS
study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT_SECONDS, show_progress_bar=True)

print("\nOptuna optimization finished.")

# Print best trial results from the study
print("\n--- Best Trial Results ---")
print(f"Best trial number: {study.best_trial.number}")
print(f"Best RMSE (Validation): {study.best_value:.4f}")
print("Best hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

# Access and print the R2 score stored as a user attribute for the best trial
if "r2_score" in study.best_trial.user_attrs:
    print(f"Best R2 Score (Validation): {study.best_trial.user_attrs['r2_score']:.4f}")

Optuna logging verbosity set to INFO.
Optuna study will be stored at: sqlite:///..\studies\xgboost_study\xgb_optuna_study.db
Loaded existing study 'xgboost_regression_pGI50' from sqlite:///..\studies\xgboost_study\xgb_optuna_study.db. Resuming optimization.

Starting Optuna optimization...


  0%|          | 0/300 [00:00<?, ?it/s]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


[I 2025-07-15 23:20:09,394] Trial 1063 finished with value: 0.7694555719797417 and parameters: {'learning_rate': 0.0037184251773346028, 'max_depth': 6, 'subsample': 0.9982931829052706, 'colsample_bytree': 0.6673293221697834, 'reg_alpha': 0.00010284858978825288, 'reg_lambda': 0.000757183626390342, 'gamma': 0.9225937105407367, 'n_estimators': 744, 'min_child_weight': 4}. Best is trial 1056 with value: 0.7633740559033817.
[I 2025-07-15 23:20:31,069] Trial 1064 finished with value: 0.7661969131325894 and parameters: {'learning_rate': 0.003811779089832076, 'max_depth': 6, 'subsample': 0.9980310030228269, 'colsample_bytree': 0.6661423256948277, 'reg_alpha': 0.00010053570082067321, 'reg_lambda': 0.0007550071794087357, 'gamma': 0.9949509709163977, 'n_estimators': 774, 'min_child_weight': 4}. Best is trial 1056 with value: 0.7633740559033817.
[I 2025-07-15 23:20:52,646] Trial 1065 finished with value: 0.7681481183936383 and parameters: {'learning_rate': 0.0036942161729721598, 'max_depth': 6, 's

KeyboardInterrupt: 

# Train Final XGBoost Model

## Retrieve Best Hyperparams from Optuna Study

In [8]:
# Re-load the study to ensure the latest best parameters
study_dir = Path("../studies/xgboost_study")
study_db_path = f"sqlite:///{study_dir / 'xgb_optuna_study.db'}"
study_name = "xgboost_regression_pGI50"

try:
    study = optuna.load_study(study_name=study_name, storage=study_db_path)
except KeyError:
    print("Study does not exist. Please make sure that the previous Optuna study cell has been run.")
    # Stop here: without a study there are no best parameters, and continuing
    # would leave `best_params` undefined (or stale from an earlier run).
    raise

# Take a copy, so that adding the fixed model settings later does not modify
# the dictionary held by the trial object.
best_params = dict(study.best_trial.params)
print("Best trial parameters (XGBoost):", best_params)

Best trial parameters (XGBoost): {'learning_rate': 0.00820193368271431, 'max_depth': 6, 'subsample': 0.9897893799354487, 'colsample_bytree': 0.7223075745984062, 'reg_alpha': 0.00012824377407451583, 'reg_lambda': 0.00397611471104642, 'gamma': 0.6775917319015243, 'n_estimators': 1148, 'min_child_weight': 9}


## Initialize Final XGBoost Model

In [9]:
# Merge in the settings that were held constant during the Optuna search and
# are therefore absent from the tuned parameter dictionary.
best_params.update(
    objective='reg:squarederror',  # Objective function for regression
    eval_metric='rmse',            # Evaluation metric
    random_state=42,               # For reproducibility of the final model
    tree_method='hist',            # Histogram-based method for efficiency
    device='cuda:0',               # Device for final training ('cuda:0' for GPU, 'cpu' for CPU)
)

# Build the final regressor from the tuned + fixed parameters
final_xgb_model = xgb.XGBRegressor(**best_params)
print("Final model has been initialized with best parameters. Ready for training.")

Final model has been initialized with best parameters. Ready for training.


## Train Model

In [10]:
print("\nTraining final XGBoost model on the full training set (combined original training data + original validation data)...")

# Stack the validation rows underneath the training rows, so that the final
# model is fitted on both splits.
X_train_xgb_final = np.concatenate((X_train_xgb, X_val_xgb), axis=0).astype(np.float32)
y_train_xgb_final = np.concatenate((y_train, y_val), axis=0).astype(np.float32)

# Fit without an eval_set: the validation data is now part of the training
# data, so no early stopping takes place here.
final_xgb_model.fit(X=X_train_xgb_final, y=y_train_xgb_final, verbose=False)
print("Final model training complete. Ready for test set evaluation.")


Training final XGBoost model on the full training set (combined original training data + original validation data)...
Final model training complete. Ready for test set evaluation.


## Evaluate Model

In [11]:
print("Making predictions and evaluating on the test set...")

# Set device to 'cpu' for prediction to prevent potential device
# mismatch errors if the model was trained on GPU: X_test_xgb is a NumPy
# array, i.e. it lives in host memory.
final_xgb_model.set_params(device='cpu')
y_pred_test_xgb = final_xgb_model.predict(X_test_xgb)

# Calculate final RMSE and R2 score on the test set
rmse_test_xgb = np.sqrt(mean_squared_error(y_test, y_pred_test_xgb))
r2_test_xgb = r2_score(y_test, y_pred_test_xgb)

print(f"\n--- Final XGBoost Model Performance on Test Set ---")
print(f"Test RMSE: {rmse_test_xgb:.4f}")
print(f"Test R2 Score: {r2_test_xgb:.4f}")

# For comparison, print the best validation RMSE from the Optuna study
print(f"Compared with Best Validation RMSE from Optuna Study: {study.best_value:.4f}")

Making predictions and evaluating on the test set...

--- Final XGBoost Model Performance on Test Set ---
Test RMSE: 0.6955
Test R2 Score: 0.4953
Compared with Best Validation RMSE from Optuna Study: 0.6995


# Save Final XGBoost Model

In [30]:
# Construct filename including final performance metrics for easy identification
# (the metrics come from the test-set evaluation cell, which must have been
# run on the same model beforehand)
model_filename = Path(f'final_xgboost_model_rmse_{rmse_test_xgb:.4f}_r2_{r2_test_xgb:.4f}.joblib')

# Directory to save the model
save_dir = Path('../models/xgb')
save_dir.mkdir(parents=True, exist_ok=True)

full_path = save_dir / model_filename

print(f"\nSaving the final XGBoost model to: {full_path}...")
try:
    # Use joblib to save the trained XGBoost model
    joblib.dump(final_xgb_model, full_path)
    print("Model saved successfully!")
except Exception as e:
    # Best effort: report the failure instead of aborting the notebook
    print(f"Error saving model: {e}")


Saving the final XGBoost model to: ..\models\xgb\final_xgboost_model_rmse_0.8660_r2_0.2176.joblib...
Model saved successfully!
