In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_percentage_error, make_scorer
from xgboost import XGBRegressor

def train_xgb_regressor_gridsearch(
    train_csv,
    test_csv=None,
    target_column="target",
    use_split=False
):
    """
    Trains an XGBoost regressor with GridSearchCV and computes MAPE.
    
    Args:
        train_csv (str): Path to training CSV file.
        test_csv (str): Path to test CSV file (optional if use_split=True).
        target_column (str): Target column name in CSV.
        use_split (bool): If True, splits the training CSV into 80-20.
    """

    # --- Load data ---
    df_train = pd.read_csv(train_csv)
    if test_csv:
        df_test = pd.read_csv(test_csv)
    elif use_split:
        df_train, df_test = train_test_split(df_train, test_size=0.2, random_state=42)
    else:
        raise ValueError("Either provide a test CSV or set use_split=True.")

    # --- Fill NaN values ---
    df_train = df_train.fillna(df_train.mean(numeric_only=True))
    df_test = df_test.fillna(df_test.mean(numeric_only=True))

    # --- Split features and target ---
    X_train = df_train.drop(columns=[target_column]).values
    y_train = df_train[target_column].values.reshape(-1, 1)
    X_test = df_test.drop(columns=[target_column]).values
    y_test = df_test[target_column].values.reshape(-1, 1)

    # --- Scale features ---
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)

    # --- Scale target to (0,1) ---
    scaler_y = MinMaxScaler(feature_range=(0, 1))
    y_train_scaled = scaler_y.fit_transform(y_train).ravel()
    y_test_scaled = scaler_y.transform(y_test).ravel()

    # --- Base XGB model ---
    xgb_base = XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        verbosity=0
    )

    # --- Define parameter grid ---
    param_grid = {
        "n_estimators": [50,75,100,200, 500, 800],
        "max_depth": [2,3,4, 6, 8],
        "learning_rate": [0.01, 0.05, 0.1,0.15,0.2],
        "subsample": [0.8, 1.0,2.0],
        "colsample_bytree": [0.8, 1.0],
        "reg_lambda": [0.5, 1.0, 2.0]
    }

    # --- GridSearchCV setup ---
    scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
    grid_search = GridSearchCV(
        estimator=xgb_base,
        param_grid=param_grid,
        scoring=scorer,
        cv=3,
        n_jobs=-1,
        verbose=2
    )

    print("Running grid search...")
    grid_search.fit(X_train, y_train_scaled)

    print("\nBest Parameters Found:")
    print(grid_search.best_params_)

    # --- Train best model on full training data ---
    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train_scaled)

    # --- Predict and inverse-transform ---
    y_pred_scaled = best_model.predict(X_test).reshape(-1, 1)
    y_pred = scaler_y.inverse_transform(y_pred_scaled)

    # --- Evaluate ---
    mape = mean_absolute_percentage_error(y_test, y_pred) * 100
    print(f"\nBest Test MAPE (original scale): {mape:.3f}%")

    return best_model, grid_search.best_params_, mape, y_pred, y_test


# Example usage
model, best_params, mape, y_pred, y_test = train_xgb_regressor_gridsearch(
    train_csv="amgOriginal.csv",
    test_csv="/g/g90/dhakal1/All/MIN_example/generated_cycles/generated_cycle_01.csv",
    target_column="relative_runtime"
)

Running grid search...
Fitting 3 folds for each of 2700 candidates, totalling 8100 fits
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=50, reg_lambda=2.0, subsample=2.0; total time=   3.8s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=500, reg_lambda=0.5, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=500, reg_lambda=0.5, subsample=1.0; total time=   0.2s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=500, reg_lambda=0.5, subsample=2.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=500, reg_lambda=0.5, subsample=2.0; total time=   0.0s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=500, reg_lambda=0.5, subsample=2.0; total time=   0.1s
[CV] END colsample_bytree=0.8, learning_rate=0.01, max_depth=2, n_estimators=500, reg_lambda=1.0, subsample=0.8; total ti

2700 fits failed out of a total of 8100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2700 fits failed with the following error:
Traceback (most recent call last):
  File "/g/g90/dhakal1/All/dev/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/g/g90/dhakal1/All/dev/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/g/g90/dhakal1/All/dev/lib/python3.9/site-packages/xgboost/sklearn.py", line 1108, in fit
    self._Booster = train(
  File "/g/g90/dhakal1/All/dev/lib/python3.9/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/g/g90/dhakal1/All/de


Best Parameters Found:
{'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'reg_lambda': 0.5, 'subsample': 1.0}

Best Test MAPE (original scale): 79.857%
