# Batch XGBoost Training for All Municipalities

This notebook trains one XGBoost model for every municipality in the dataset, reusing the best hyperparameters found for São Paulo (no per-municipality tuning is performed). The predictions and metrics for each municipality are saved to a dedicated results directory so they can be analyzed later, and the same notebook can be rerun on other time series.

In [None]:
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from src.preprocessing import load_city_data, filter_city, clean_timeseries, prepare_data_for_model
from src.train import evaluate_model, save_predictions, save_metrics
from tqdm import tqdm

# --- XGBoost hyperparameters (best configuration found for São Paulo) ---
# The same settings are reused for every municipality in the batch.
xgb_best_params = dict(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0,
    min_child_weight=1,
    random_state=42,
)

# --- Data-preparation options forwarded to prepare_data_for_model ---
# They are also stored alongside each city's metrics via save_metrics.
model_params = dict(
    sequence_length=8,
    forecast_horizon=4,
    normalization=None,
    val_size=None,
)
target_column = 'target'

# --- Input CSV and output directory (relative paths) ---
data_path = '../data/df_base_morb_resp.csv'
results_dir = '../results/xgboost_batch_all_municipalities'
os.makedirs(results_dir, exist_ok=True)  # no error if it already exists

# --- Load the full dataset and collect the distinct municipality codes ---
df = load_city_data(data_path)
city_codes = sorted(df['CD_MUN'].unique())

# --- Batch training ---
# For each municipality: build the supervised windows, fit an XGBoost
# regressor with the shared hyperparameters, then save the test-set
# predictions and metrics under results_dir.
skipped_cities = []  # municipalities without enough data to train/evaluate
for cd_mun in tqdm(city_codes, desc='Municipalities'):
    city_name = str(cd_mun)
    df_city = filter_city(df, cd_mun=cd_mun)
    df_city = clean_timeseries(df_city, target_column=target_column)
    data_dict = prepare_data_for_model(
        df=df_city,
        target_column=target_column,
        sequence_length=model_params['sequence_length'],
        forecast_horizon=model_params['forecast_horizon'],
        normalization=model_params['normalization'],
        val_size=model_params.get('val_size', None)
    )
    X_train = data_dict['X_train']
    y_train = data_dict['y_train']
    X_test = data_dict['X_test']
    y_test = data_dict['y_test']
    test_df = data_dict['test_df']

    # Skip municipalities whose series produced an empty split. Without this
    # guard a single short series aborts the whole batch: reshape(0, -1)
    # cannot infer the second dimension of a zero-size array, and
    # values[-0:] below would select the entire column instead of nothing.
    # NOTE(review): assumes the splits are NumPy arrays (they are reshaped
    # below) -- confirm against prepare_data_for_model.
    if X_train.shape[0] == 0 or X_test.shape[0] == 0 or len(y_test) == 0:
        skipped_cities.append(city_name)
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f'Skipping municipality {city_name}: empty train or test split')
        continue

    # Flatten each sample to a single feature vector for XGBoost
    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)

    # Train and predict
    xgb = XGBRegressor(**xgb_best_params)
    xgb.fit(X_train, y_train)
    y_pred = xgb.predict(X_test)

    # Align dates with the targets: take the last len(y_test) entries of the
    # 'week' column, or fall back to a positional index when it is absent.
    if 'week' in test_df.columns:
        test_dates = test_df['week'].values[-len(y_test):]
    else:
        test_dates = np.arange(len(y_test))

    # Save predictions and metrics
    preds_file = save_predictions(
        y_true=y_test,
        y_pred=y_pred,
        dates=test_dates,
        city_name=city_name,
        model_name='xgboost',
        output_dir=results_dir
    )
    metrics = evaluate_model(xgb, X_test, y_test)
    metrics_file = save_metrics(
        metrics=metrics,
        city_name=city_name,
        model_name='xgboost',
        output_dir=results_dir,
        params=model_params
    )

if skipped_cities:
    print(f'Skipped {len(skipped_cities)} municipalities with insufficient data: {skipped_cities}')

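All predictions and metrics are saved in the `results/xgboost_batch_all_municipalities` directory, per city (predictions and metrics are saved separately for each municipality code). You can rerun this notebook for other time series by changing `data_path`, `target_column`, and the parameter dictionaries at the top of the code cell.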