# Baseline XGBoost Example

This notebook runs the synthetic data generator, prepares features, evaluates a simple XGBoost baseline with time-series CV, and saves a final model.


In [None]:
import subprocess, sys
from pathlib import Path
# Locate the synthetic-data generator script, run it with the current
# interpreter, and report whether the expected CSV is present afterwards.
ROOT = Path('0_LiteratureReview')
candidates = [
    ROOT / 'generate_synthetic_data.py',  # checked first
    Path('generate_synthetic_data.py'),   # fallback: relative to the working directory
]
existing = [path for path in candidates if path.exists()]
GEN = existing[0] if existing else None
if GEN is None:
    searched = ','.join(map(str, candidates))
    raise FileNotFoundError('generate_synthetic_data.py not found in expected locations: ' + searched)
print('Using generator at', GEN)
# check_call raises CalledProcessError if the generator exits with a non-zero status.
subprocess.check_call([sys.executable, str(GEN)])
csv_path = ROOT / 'feature_template.csv'
print('CSV ready:', csv_path.exists())


In [None]:
import pandas as pd, numpy as np, joblib, math
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

# load & prepare
# Read the feature table, ordered and indexed by its 'timestamp' column.
DF = (
    pd.read_csv('0_LiteratureReview/feature_template.csv', parse_dates=['timestamp'])
    .sort_values('timestamp')
    .set_index('timestamp')
)
# One-step-ahead target: the next row's price. Rows without a next price
# (at least the final row) are dropped.
DF['target'] = DF['price'].shift(-1)
DF = DF.dropna(subset=['target'])
# Every numeric column except the target is a feature.
features = [c for c in DF.columns if c != 'target' and pd.api.types.is_numeric_dtype(DF[c])]
# NOTE(review): bfill() runs before the CV split, so gaps that ffill() cannot
# fill (leading NaNs) are filled from later rows — confirm this look-ahead is
# acceptable for the baseline.
X = DF[features].ffill().bfill().fillna(0)
y = DF['target'].values
print('X shape', X.shape)

# simple CV
# TimeSeriesSplit keeps each test fold strictly after its training rows.
tscv = TimeSeriesSplit(n_splits=5)
fold_mae = []
for tr, te in tscv.split(X):
    m = XGBRegressor(n_estimators=50).fit(X.iloc[tr], y[tr])
    p = m.predict(X.iloc[te])
    fold_mae.append(mean_absolute_error(y[te], p))
    print('MAE', fold_mae[-1])
print('Mean MAE', sum(fold_mae) / len(fold_mae))

# save model
# The model left over from the last CV fold was fit on X.iloc[tr] only and has
# never seen the final test fold. Refit on every row so the saved artifact is
# the actual final model rather than a CV by-product.
m = XGBRegressor(n_estimators=50).fit(X, y)
joblib.dump({'model': m, 'features': features}, '0_LiteratureReview/baseline_model.pkl')
print('Saved model')
