# Baseline XGBoost Example — with Plots

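This notebook generates a synthetic feature CSV (only if it is missing), builds a next-step price target from the numeric columns, evaluates an XGBoost regressor with time-ordered cross-validation (scikit-learn `TimeSeriesSplit`), plots the per-fold MAE/RMSE and the mean feature importances, and finally trains a model on the full dataset and saves it to disk.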

In [None]:
import subprocess, sys
from pathlib import Path
# Locations of the feature CSV and of the script that can (re)create it.
ROOT = Path('0_LiteratureReview')
CSV = ROOT / 'feature_template.csv'
GEN = ROOT / 'generate_synthetic_data.py'

# Only regenerate when the CSV is absent; an existing file is left untouched.
if not CSV.exists():
    if not GEN.exists():
        raise FileNotFoundError('generate_synthetic_data.py not found under 0_LiteratureReview')
    # Launch the generator with the interpreter that runs this kernel;
    # check_call raises CalledProcessError if the script exits non-zero.
    print('Generating synthetic CSV via', GEN)
    subprocess.check_call([sys.executable, str(GEN)])

# Report whether the CSV is present after the (optional) generation step.
print('CSV ready:', CSV.exists())


In [None]:
# Load and prepare data
import pandas as pd, numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import joblib, math
# Read the CSV, order it chronologically, index it by timestamp, and attach the
# supervised target: the price of the *next* row. The final row has no next
# price, so its target is NaN and the row is dropped.
DF = (
    pd.read_csv(str(CSV), parse_dates=['timestamp'])
    .sort_values('timestamp')
    .set_index('timestamp')
    .assign(target=lambda frame: frame['price'].shift(-1))
    .dropna(subset=['target'])
)

# Every numeric column except the target is used as a model input.
features = [
    name
    for name in DF.columns
    if pd.api.types.is_numeric_dtype(DF[name]) and name != 'target'
]

# Fill gaps: forward-fill, then backward-fill whatever is still missing, then 0
# for columns with no values at all.
# NOTE(review): bfill copies later observations into earlier rows, which is a
# potential look-ahead for time-series evaluation -- confirm this is acceptable.
X = (
    DF[features]
    .ffill()
    .bfill()
    .fillna(0)
)
y = DF['target'].values
print('X shape', X.shape)


In [None]:
# Rolling CV evaluation (collect per-fold metrics and feature importances)
#
# For each TimeSeriesSplit fold a fresh XGBRegressor is fitted on the training
# indices and scored on the test indices. Results are accumulated in:
#   maes, rmses -- one float per fold
#   fold_imps   -- one importance vector per fold, ordered like `features`
n_splits = 5
tscv = TimeSeriesSplit(n_splits=n_splits)
maes = []
rmses = []
fold_imps = []
for fold, (tr, te) in enumerate(tscv.split(X), 1):
    X_train, X_test = X.iloc[tr], X.iloc[te]
    y_train, y_test = y[tr], y[te]
    model = XGBRegressor(n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, verbosity=0)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    rmse = math.sqrt(mean_squared_error(y_test, pred))
    maes.append(mae)
    rmses.append(rmse)
    # Feature importance (gain-based, fallback to feature_importances_).
    try:
        booster = model.get_booster()
        imp = booster.get_score(importance_type='gain')
        # get_score() is keyed by the booster's own feature names. When the
        # model is fitted on a DataFrame those are the column names, not
        # 'f0', 'f1', ...; the positional names only apply when the booster
        # carries no feature names at all.
        imp_keys = booster.feature_names or [f'f{i}' for i in range(len(features))]
        # Features never used in a split are absent from the dict -> 0.0.
        imp_arr = [imp.get(key, 0.0) for key in imp_keys]
    except Exception:
        # Best-effort fallback via the sklearn API. Note these values may be on
        # a different scale than the raw gain scores above.
        imp_arr = list(model.feature_importances_)
    fold_imps.append(imp_arr)
    print(f'Fold {fold}: MAE={mae:.4f}, RMSE={rmse:.4f}')
print('\nCV Mean MAE={:.4f}, RMSE={:.4f}'.format(np.mean(maes), np.mean(rmses)))


In [None]:
# Plot fold MAE/RMSE and aggregate feature importance
import matplotlib.pyplot as plt
import numpy as _np

# The 'seaborn-v0_8' style name only exists in matplotlib >= 3.6 (older
# releases call it 'seaborn'). Use whichever is installed; if neither is,
# keep the current default style instead of failing the cell.
seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if seaborn_style is not None:
    plt.style.use(seaborn_style)

fig, ax = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: error per CV fold (fold numbers start at 1).
fold_ids = range(1, len(maes) + 1)
ax[0].plot(fold_ids, maes, marker='o', label='MAE')
ax[0].plot(range(1, len(rmses) + 1), rmses, marker='o', label='RMSE')
ax[0].set_xlabel('CV Fold')
ax[0].set_xticks(fold_ids)
ax[0].set_ylabel('Error')
ax[0].legend()
ax[0].set_title('Fold MAE and RMSE')

# Right panel: aggregate feature importances across folds (mean).
imp_matrix = _np.array(fold_imps)          # rows = folds, columns = features
mean_imp = _np.mean(imp_matrix, axis=0)
feat_order = _np.argsort(mean_imp)[::-1]   # indices sorted by descending importance
top_n = min(20, len(features))
top_idx = feat_order[:top_n]
# barh draws bottom-up, so reverse the order to put the largest bar on top.
ax[1].barh([features[i] for i in top_idx[::-1]], mean_imp[top_idx[::-1]])
ax[1].set_xlabel('Mean importance across folds')
ax[1].set_title('Mean Feature Importance (top {})'.format(top_n))
plt.tight_layout()
plt.show()


In [None]:
# Fit the final model on all rows and persist it together with its feature list
model_path = ROOT / 'baseline_model.pkl'
final_params = dict(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    random_state=42,
    verbosity=0,
)
final = XGBRegressor(**final_params)
final.fit(X, y)

# The saved artifact is a dict so a loader gets both the estimator and the
# column order it was trained on.
artifact = {'model': final, 'features': features}
joblib.dump(artifact, str(model_path))
print('Saved final model to', str(model_path))
