In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import numpy as np

# Assume X and Y are your preprocessed features and target values.
# For example, you might load them like:
# X = np.load('preprocessed_features.npy')
# Y = np.load('target_kd_values.npy')

# Steps 1-2: partition the data into train / validation / test (roughly 70 / 15 / 15).
# First hold out 30% of the samples; the remaining 70% becomes the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

# Then cut the held-out portion in half: one half for validation, one for test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.50,
    random_state=42,
)

# Print the shape of each feature partition as a quick sanity check.
for shape_caption, feature_split in (
    ("Train set shape:", X_train),
    ("Validation set shape:", X_val),
    ("Test set shape:", X_test),
):
    print(shape_caption, feature_split.shape)

# 3. Sweep candidate tree depths: fit each candidate on the training split and
#    score it on the validation split.
max_depths = list(range(3, 10))
val_mae_values = []

# Settings shared by every candidate; only max_depth changes between fits.
# NOTE: eval_metric is set to MAE, but fit() below is called without an eval_set.
sweep_params = {
    'objective': 'reg:absoluteerror',  # train against the absolute-error loss
    'eval_metric': 'mae',
    'random_state': 42,                # fixed seed for every candidate
}

for candidate_depth in max_depths:
    model = xgb.XGBRegressor(max_depth=candidate_depth, **sweep_params)
    model.fit(X_train, y_train)

    # Validation MAE is computed explicitly from the model's predictions.
    val_mae = mean_absolute_error(y_val, model.predict(X_val))
    val_mae_values.append(val_mae)
    print(f"max_depth: {candidate_depth}, Validation MAE: {val_mae:.2f}")

# Keep the depth with the smallest validation MAE.
best_index = np.argmin(val_mae_values)
best_depth = max_depths[best_index]
best_val_mae = val_mae_values[best_index]
print(f"\nBest max_depth: {best_depth} with Validation MAE: {best_val_mae:.2f}")

# 4. Visualise how the validation MAE changes with max_depth.
#    Uses the explicit Figure/Axes interface instead of pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(max_depths, val_mae_values, marker='o', linestyle='-')
ax.set_xlabel('max_depth')
ax.set_ylabel('Validation MAE')
ax.set_title('Validation MAE vs. max_depth for XGBoost Regressor')
ax.grid(True)
plt.show()

# 5. Refit with the selected depth on training + validation data, then score
#    the result on the test split.
# Join the two splits along the first axis (axis 0 is np.concatenate's default).
X_train_val = np.concatenate([X_train, X_val])
y_train_val = np.concatenate([y_train, y_val])

# Same estimator settings as the depth sweep, with max_depth fixed to the winner.
final_params = dict(
    objective='reg:absoluteerror',
    eval_metric='mae',
    max_depth=best_depth,
    random_state=42,
)
final_model = xgb.XGBRegressor(**final_params)
final_model.fit(X_train_val, y_train_val)

# Score on the test split, which is not part of the data the final model was fit on.
y_pred_test = final_model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
print(f"\nFinal Test MAE: {test_mae:.2f} nM")
