# Batch XGBoost Training for All Municipalities

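This notebook trains one XGBoost model per municipality in the dataset, reusing the best hyperparameters found for São Paulo. Municipalities that fail the data-quality checks (more than 30% missing target values, too few samples, NaNs in the prepared arrays, all-zero features or targets, or a constant training target) are skipped and listed in a summary at the end. For every municipality that is trained, the results (predictions, metrics, model, and scaler) are saved in its own subfolder for reproducibility and easy deployment.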

In [None]:
%pip install tqdm

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1


In [None]:
import sys
import os

# Resolve the directory one level above the current working directory
# (presumably the notebook runs from a subfolder of the project — confirm).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
print(f"Project root: {project_root}")

# Put the project root itself (not its `src` directory) at the front of
# sys.path so the `src.*` imports below resolve; skip if already present so
# re-running the cell does not add duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
    print(f"Added {project_root} to sys.path")
import sys
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from src.preprocessing import load_city_data, filter_city, clean_timeseries, prepare_data_for_model
from src.train import evaluate_model, save_predictions, save_metrics
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import joblib

# --- Data-preparation settings ---
# Name of the column to forecast, and the options forwarded as-is to
# prepare_data_for_model for every municipality. 'normalization' is None here;
# feature scaling is done separately with a StandardScaler in the training loop.
target_column = 'target'
model_params = {
    'sequence_length': 8,
    'forecast_horizon': 4,
    'normalization': None,
    'val_size': None
}

# --- XGBoost hyperparameters: the best configuration found for São Paulo ---
# The same fixed set is reused for every municipality (no per-city tuning).
xgb_best_params = {
    'n_estimators': 200,
    'max_depth': 4,
    'learning_rate': 0.05,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'min_child_weight': 1,
    'random_state': 42
}

# --- Input CSV and output root (both relative to the working directory) ---
data_path = '../data/df_base_morb_resp.csv'
results_dir = '../results/xgboost_batch_all_municipalities(morbresp)'
os.makedirs(results_dir, exist_ok=True)

# --- Load the full dataset once and list each municipality code in sorted order ---
df = load_city_data(data_path)
unique_codes = df['CD_MUN'].unique()
city_codes = sorted(unique_codes)

def reduce_to_1d(arr):
    """Collapse a 1-D or 2-D array-like into a 1-D NumPy array.

    Args:
        arr: Array-like input; converted with ``np.asarray``.

    Returns:
        The array itself when it is already 1-D, the flattened column when it
        has shape ``(n, 1)``, and the per-row sum when it has several columns.

    Raises:
        ValueError: If the input is neither 1-D nor 2-D.
    """
    values = np.asarray(arr)

    # Reject scalars and arrays with three or more dimensions up front.
    if values.ndim not in (1, 2):
        raise ValueError(f"Unexpected array shape: {values.shape}")

    if values.ndim == 1:
        return values

    # 2-D: a single column is simply flattened; several columns are
    # aggregated into one value per row by summing across columns.
    n_columns = values.shape[1]
    return values.ravel() if n_columns == 1 else values.sum(axis=1)

skipped_cities = []  # (city_name, reason) pairs for every municipality that is not trained


def _find_skip_reason(X_train, X_test, y_train, y_test):
    """Return a message for the first failed data-quality check, or None if all pass."""
    if X_train.shape[0] < 2 or X_test.shape[0] < 1:
        return f"Not enough samples (train: {X_train.shape[0]}, test: {X_test.shape[0]})"
    # Checked in the order X_train, X_test, y_train, y_test; stops at the first hit.
    if any(np.isnan(a).any() for a in (X_train, X_test, y_train, y_test)):
        return "NaN values present in features or targets"
    if np.all(X_train == 0) or np.all(X_test == 0):
        return "All features are zero"
    if np.all(y_train == 0) or np.all(y_test == 0):
        return "All targets are zero"
    if np.unique(y_train).size == 1:
        return f"Target is constant: {y_train[0]}"
    return None


# --- Batch training: one model per municipality code ---
for cd_mun in tqdm(city_codes, desc='Municipalities'):
    # The stringified code doubles as the subfolder name and file-name prefix.
    city_name = str(cd_mun)
    df_city = clean_timeseries(filter_city(df, cd_mun=cd_mun), target_column=target_column)

    # --- Missing target values: skip above 30%, otherwise interpolate ---
    target_nans = df_city[target_column].isna().sum()
    total_rows = len(df_city)
    if total_rows > 0:
        nan_ratio = target_nans / total_rows
    else:
        # An empty frame is treated as fully missing so that it gets skipped.
        nan_ratio = 1
    if nan_ratio > 0.3:
        skipped_cities.append((city_name, f"too many NaNs in target column ({nan_ratio:.1%})"))
        continue
    if target_nans > 0:
        # limit_direction='both' also fills gaps at the start and end of the series.
        df_city[target_column] = df_city[target_column].interpolate(method='linear', limit_direction='both')

    # --- Build the train/test arrays with the shared data-preparation settings ---
    data_dict = prepare_data_for_model(
        df=df_city,
        target_column=target_column,
        sequence_length=model_params['sequence_length'],
        forecast_horizon=model_params['forecast_horizon'],
        normalization=model_params['normalization'],
        val_size=model_params.get('val_size')
    )
    X_train, y_train = data_dict['X_train'], data_dict['y_train']
    X_test, y_test = data_dict['X_test'], data_dict['y_test']
    test_df = data_dict['test_df']

    # --- Data quality checks: record the reason and move on if any check fails ---
    skip_reason = _find_skip_reason(X_train, X_test, y_train, y_test)
    if skip_reason:
        skipped_cities.append((city_name, skip_reason))
        continue

    # Flatten every sample to a single feature vector for XGBoost.
    X_train = X_train.reshape(len(X_train), -1)
    X_test = X_test.reshape(len(X_test), -1)

    # --- Feature scaling: fit on the training rows only, then apply to the test rows ---
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # --- Train with the shared hyperparameters and predict on the test rows ---
    xgb = XGBRegressor(**xgb_best_params)
    xgb.fit(X_train_scaled, y_train)
    y_pred = xgb.predict(X_test_scaled)

    # --- Persist model and scaler in this municipality's own subfolder ---
    city_dir = os.path.join(results_dir, city_name)
    os.makedirs(city_dir, exist_ok=True)
    model_file = os.path.join(city_dir, f'{city_name}_xgboost_model.json')
    xgb.save_model(model_file)
    scaler_file = os.path.join(city_dir, f'{city_name}_scaler.pkl')
    joblib.dump(scaler, scaler_file)

    # --- Save predictions: targets and predictions are reduced to one value per row ---
    y_test_1d = reduce_to_1d(y_test)
    y_pred_1d = reduce_to_1d(y_pred)
    if 'week' in test_df.columns:
        # Align dates with the predictions by taking the trailing rows of the test frame.
        test_dates = test_df['week'].values[-len(y_test_1d):]
    else:
        # No date column available: fall back to a plain positional index.
        test_dates = np.arange(len(y_test_1d))
    preds_file = save_predictions(
        y_true=y_test_1d,
        y_pred=y_pred_1d,
        dates=test_dates,
        city_name=city_name,
        model_name='xgboost',
        output_dir=city_dir
    )

    # --- Evaluate on the scaled test set and store metrics with the data-prep settings ---
    metrics = evaluate_model(xgb, X_test_scaled, y_test)
    metrics_file = save_metrics(
        metrics=metrics,
        city_name=city_name,
        model_name='xgboost',
        output_dir=city_dir,
        params=model_params
    )

# --- Report which municipalities were skipped, with the recorded reason for each ---
if not skipped_cities:
    print("\nNo cities were skipped.")
else:
    print("\nSummary of skipped cities:")
    for city, reason in skipped_cities:
        print(f"City {city} skipped: {reason}")

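For every municipality that was trained, the predictions, metrics, model, and scaler are saved in a subfolder named after its municipality code (`CD_MUN`) under the results directory; skipped municipalities get no subfolder and appear only in the summary above. Keeping the fitted scaler next to each model supports reproducibility and correct deployment, because new inputs must be scaled with that same scaler before prediction.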