In [5]:
# Objetivo: Baseline XGBoost multiclasses (corrigido) com walk-forward, embargo e calibração.
# Correção: agora padroniza rótulos usando LabelEncoder (0..K-1) antes de treinar/calibrar para evitar erro base_score.
# Entradas/saídas e premissas iguais às versões anteriores.

from pathlib import Path
from datetime import datetime
import json
import logging
import hashlib
import platform
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import matplotlib.pyplot as plt

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Config
SEED = 42
np.random.seed(SEED)

GOLD_DIR = Path("/home/wrm/BOLSA_2026/gold/IBOV")
GOLD_DIR.mkdir(parents=True, exist_ok=True)
GOLD_PARQUET = GOLD_DIR / "gold_ibov_features.parquet"
PROV = GOLD_DIR / "PROVENANCE_IBOV.txt"

# Outputs
PARTITIONS_CSV = GOLD_DIR / "partitions_ibov.csv"
METRICS_CSV = GOLD_DIR / "metrics_ibov_baseline.csv"
THRESHOLDS_JSON = GOLD_DIR / "thresholds_tau_ibov.json"

# Hyperparameters (conservative defaults) - removed use_label_encoder (deprecated/unused)
HYPER = {
    "max_depth": 4,
    "min_child_weight": 3,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "n_estimators": 400,
    "learning_rate": 0.08,
    "random_state": SEED,
    "eval_metric": "mlogloss",
}

EMBARGO = 5  # pregões

def sha256_of_file(p: Path) -> str:
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            h.update(chunk)
    return h.hexdigest()

def load_gold(p: Path) -> pd.DataFrame:
    if not p.exists():
        raise FileNotFoundError(p)
    df = pd.read_parquet(p)
    return df

def select_features(df: pd.DataFrame) -> dict:
    banned_prefix = ("r_d", "y_d", "k_d")
    banned_exact = {"date"}
    banned_contains = {"ticker"}
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    features = [
        c
        for c in numeric_cols
        if not any(c.startswith(bp) for bp in banned_prefix)
        and c not in banned_exact
        and not any(b in c.lower() for b in banned_contains)
    ]
    feat_map = {1: features, 3: features, 5: features}
    return feat_map

def make_partitions(df: pd.DataFrame, n_splits=5) -> pd.DataFrame:
    dates = pd.to_datetime(df["date"]).sort_values().unique()
    n = len(dates)
    k = min(max(4, n_splits), 6)
    block_size = max(1, n // k)
    partitions = []
    for i in range(k):
        test_start_idx = i * block_size
        test_end_idx = min(n - 1, (i + 1) * block_size - 1) if i < k - 1 else n - 1
        test_start = dates[test_start_idx]
        test_end = dates[test_end_idx]

        train_start = dates[0]
        train_end = dates[max(0, test_start_idx - 1)]

        embargo_start = dates[max(0, test_start_idx - EMBARGO)] if test_start_idx - EMBARGO >= 0 else train_end
        embargo_end = train_end

        partitions.append(
            {
                "block_id": i + 1,
                "train_start": str(train_start.date()),
                "train_end": str(train_end.date()),
                "embargo_start": str(embargo_start.date()) if embargo_start is not None else None,
                "embargo_end": str(embargo_end.date()) if embargo_end is not None else None,
                "test_start": str(test_start.date()),
                "test_end": str(test_end.date()),
            }
        )
    pdf = pd.DataFrame(partitions)
    pdf.to_csv(PARTITIONS_CSV, index=False)
    logging.info(f"Partitions saved: {PARTITIONS_CSV}")
    return pdf

def train_and_evaluate(df: pd.DataFrame, feat_map: dict, partitions: pd.DataFrame):
    metrics_rows = []
    thresholds = {}
    targets = {1: (0.45, 0.55), 3: (0.38, 0.45), 5: (0.30, 0.38)}

    for h in (1, 3, 5):
        features = feat_map[h]
        (GOLD_DIR / f"feature_list_d{h}.txt").write_text("\n".join(features))
        thresholds[h] = None

        for _, row in partitions.iterrows():
            bid = int(row["block_id"])
            # masks by date
            test_mask = (
                (pd.to_datetime(df["date"]).dt.date >= pd.to_datetime(row["test_start"]).date())
                & (pd.to_datetime(df["date"]).dt.date <= pd.to_datetime(row["test_end"]).date())
            )
            train_mask = (
                (pd.to_datetime(df["date"]).dt.date >= pd.to_datetime(row["train_start"]).date())
                & (pd.to_datetime(df["date"]).dt.date <= pd.to_datetime(row["train_end"]).date())
            )

            X_train = df.loc[train_mask, features].copy()
            y_train = df.loc[train_mask, f"y_d{h}_cls"].copy()
            X_test = df.loc[test_mask, features].copy()
            y_test = df.loc[test_mask, f"y_d{h}_cls"].copy()

            if y_train.dropna().empty or y_test.dropna().empty:
                logging.warning(f"Empty train or test for horizon {h} block {bid}; skipping")
                continue

            # compute class weight map on original labels
            classes_orig = np.unique(y_train.dropna())
            try:
                cls_w = compute_class_weight(class_weight="balanced", classes=classes_orig, y=y_train.dropna())
                class_weight_map = {c: float(w) for c, w in zip(classes_orig, cls_w)}
            except Exception:
                class_weight_map = {c: 1.0 for c in classes_orig}
            if 1 in class_weight_map:
                class_weight_map[1] *= 1.3

            # sample weights aligned with original y_train
            sample_weight = y_train.map(lambda v: class_weight_map.get(v, 1.0)).values

            # Encode labels to 0..K-1 for XGBoost & calibration
            le = LabelEncoder()
            # fit on train labels (dropna). We must handle NaNs if present - drop rows with NaN labels in training
            notna_train = y_train.notna()
            X_train_enc = X_train.loc[notna_train]
            y_train_enc = y_train.loc[notna_train].astype(int)
            y_train_le = le.fit_transform(y_train_enc)
            # rebuild sample_weight to match encoded rows
            sample_weight_enc = y_train.loc[notna_train].map(lambda v: class_weight_map.get(v, 1.0)).values

            n_classes = len(le.classes_)
            if n_classes < 2:
                logging.warning(f"Not enough classes in training for horizon {h} block {bid}; skipping")
                continue

            # construct classifier with explicit multiclass objective/num_class
            clf = xgb.XGBClassifier(**HYPER, objective="multi:softprob", num_class=n_classes)
            clf.fit(X_train_enc, y_train_le, sample_weight=sample_weight_enc)

            # Calibration
            if len(X_train_enc) < 50:
                calibrated = clf
                calibrator_name = "none"
            else:
                X_tr, X_cal, y_tr_enc, y_cal_enc = train_test_split(
                    X_train_enc, y_train_le, test_size=0.1, random_state=SEED, stratify=y_train_le
                )
                clf_inner = xgb.XGBClassifier(**HYPER, objective="multi:softprob", num_class=n_classes)
                sw_tr = pd.Series(y_tr_enc).map(lambda enc: 1.0).values  # we already used class weights earlier; optional
                clf_inner.fit(X_tr, y_tr_enc)

                best_cal = None
                best_score = float("inf")
                for method in ["sigmoid", "isotonic"]:
                    try:
                        cal = CalibratedClassifierCV(clf_inner, cv="prefit", method=method)
                        cal.fit(X_cal, y_cal_enc)
                        probs_cal = cal.predict_proba(X_cal)
                        eps = 1e-15
                        ll = -np.mean(np.log(np.maximum(eps, probs_cal[np.arange(len(y_cal_enc)), y_cal_enc])))
                        if ll < best_score:
                            best_score = ll
                            best_cal = (method, cal)
                    except Exception as e:
                        logging.debug(f"Calibration method {method} failed: {e}")

                if best_cal is None:
                    calibrated = clf_inner
                    calibrator_name = "none"
                else:
                    calibrator_name, calibrated = best_cal

            # Prepare test labels encoded (only for rows where label not NaN)
            test_notna = y_test.notna()
            X_test_enc = X_test.loc[test_notna]
            y_test_orig = y_test.loc[test_notna].astype(int)
            # If some classes in test were unseen by train, label encoder can't transform them -> handle by mapping where possible
            # For simplicity, map only values present in le.classes_
            mapping_orig_to_enc = {orig: enc for enc, orig in enumerate(le.classes_)}
            y_test_enc = []
            keep_idx = []
            for i, v in enumerate(y_test_orig.values):
                if v in mapping_orig_to_enc:
                    y_test_enc.append(mapping_orig_to_enc[v])
                    keep_idx.append(i)
                else:
                    # skip rows with unseen label
                    pass
            if len(y_test_enc) == 0:
                logging.warning(f"No test rows have labels seen during train for horizon {h} block {bid}; skipping")
                continue
            # align X_test_enc to rows we kept
            X_test_enc = X_test_enc.iloc[keep_idx]
            y_test_enc = np.array(y_test_enc)

            # Predict probabilities on test
            probs = calibrated.predict_proba(X_test_enc)
            classes_order = list(calibrated.classes_)  # encoded classes

            # find encoded indices corresponding to original 1 and -1 (if present)
            encoded_up = mapping_orig_to_enc.get(1, None)
            encoded_down = mapping_orig_to_enc.get(-1, None)
            idx_up = classes_order.index(encoded_up) if (encoded_up is not None and encoded_up in classes_order) else None
            idx_down = classes_order.index(encoded_down) if (encoded_down is not None and encoded_down in classes_order) else None

            if idx_up is None or idx_down is None:
                logging.warning(f"Classes 1 or -1 missing in model.classes_ for horizon {h} block {bid}")

            def compute_neutral_fraction(tau, probs_arr, idx_u, idx_d):
                if idx_u is None or idx_d is None:
                    return 0.0
                p_up = probs_arr[:, idx_u]
                p_down = probs_arr[:, idx_d]
                dominant = np.maximum(p_up, p_down)
                neutral = dominant < tau
                return float(neutral.mean())

            lo, hi = targets[h]
            taus = np.linspace(0.5, 0.95, 91)
            best_tau = None
            best_gap = float("inf")
            for t in taus:
                frac = compute_neutral_fraction(t, probs, idx_up, idx_down)
                if lo <= frac <= hi:
                    best_tau = float(t)
                    break
                gap = min(abs(frac - lo), abs(frac - hi))
                if gap < best_gap:
                    best_gap = gap
                    best_tau = float(t)

            thresholds[h] = best_tau if thresholds[h] is None else thresholds[h]

            # Decision on test (use probs from calibrated and encoded indices)
            if idx_up is None or idx_down is None:
                y_pred_enc = calibrated.predict(X_test_enc)
            else:
                p_up_test = probs[:, idx_up]
                p_down_test = probs[:, idx_down]
                y_pred_enc = np.where(np.maximum(p_up_test, p_down_test) < best_tau,  # neutral
                                      -999,  # temporary marker
                                      np.where(p_up_test > p_down_test, encoded_up, encoded_down))

                # map -999 (neutral) to encoded class for neutral: we need to choose an encoded label for neutral (there is none)
                # we'll decode encoded preds back to original domain by mapping encoded_up->1, encoded_down->-1, neutral->0
                # create y_pred_orig accordingly
                y_pred_orig = []
                for val in y_pred_enc:
                    if val == -999:
                        y_pred_orig.append(0)
                    elif val == encoded_up:
                        y_pred_orig.append(1)
                    elif val == encoded_down:
                        y_pred_orig.append(-1)
                    else:
                        # fallback decode if possible
                        decoded = le.inverse_transform([val])[0] if val in range(len(le.classes_)) else 0
                        y_pred_orig.append(int(decoded))
                # prepare y_test_orig_aligned for metric computation
                y_test_orig_aligned = y_test_orig.iloc[keep_idx].values
                y_pred = np.array(y_pred_orig)
            # If idx missing and we used predict, decode encoded predictions back to original labels
            if (idx_up is None or idx_down is None):
                # decode encoded predictions
                y_pred = le.inverse_transform(y_pred_enc)

                # The test labels to compare need to be decoded similarly:
                y_test_orig_aligned = y_test_enc  # these are encoded; decode
                y_test_orig_aligned = le.inverse_transform(y_test_enc)

            # compute metrics on original label space (-1,0,1 expected)
            acc_bal = balanced_accuracy_score(y_test_orig_aligned, y_pred)
            try:
                mcc = matthews_corrcoef(y_test_orig_aligned, y_pred)
            except Exception:
                mcc = float("nan")
            f1_up = f1_score(y_test_orig_aligned, y_pred, labels=[1], average="macro", zero_division=0)
            f1_down = f1_score(y_test_orig_aligned, y_pred, labels=[-1], average="macro", zero_division=0)

            metrics_rows.append(
                {
                    "horizon": h,
                    "block_id": bid,
                    "acc_bal": float(acc_bal),
                    "mcc_macro": float(mcc),
                    "f1_up": float(f1_up),
                    "f1_down": float(f1_down),
                    "n_test": int(len(y_test_orig_aligned)),
                    "calibrator": calibrator_name,
                    "tau_used": best_tau,
                }
            )

            # Optional calibration curve for 'up' class
            try:
                if idx_up is not None:
                    p_up = probs[:, idx_up]
                    frac_pos, mean_pred = calibration_curve((y_test_orig_aligned == 1).astype(int), p_up, n_bins=10)
                    plt.figure()
                    plt.plot(mean_pred, frac_pos, marker="o")
                    plt.plot([0, 1], [0, 1], linestyle="--", color="k")
                    plt.xlabel("Mean predicted prob (up)")
                    plt.ylabel("Fraction of positives")
                    plt.title(f"Calibration up: h{h} block{bid}")
                    png = GOLD_DIR / f"calibration_curve_d{h}_block{bid}.png"
                    plt.savefig(png)
                    plt.close()
            except Exception as e:
                logging.debug(f"Calibration plot failed: {e}")

    # save metrics and thresholds
    pd.DataFrame(metrics_rows).to_csv(METRICS_CSV, index=False)
    with THRESHOLDS_JSON.open("w") as f:
        json.dump({f"d{h}": thresholds[h] for h in thresholds}, f, indent=2)
    logging.info(f"Metrics saved: {METRICS_CSV}")
    logging.info(f"Thresholds saved: {THRESHOLDS_JSON}")

    # append provenance
    prov_summary = {
        "seed": SEED,
        "partitions": str(PARTITIONS_CSV),
        "hyperparameters": HYPER,
        "class_weighting_note": "positive class weight multiplied by 1.3 if present",
        "metrics_file": str(METRICS_CSV),
        "thresholds_file": str(THRESHOLDS_JSON),
        "gold_parquet_sha256": sha256_of_file(GOLD_PARQUET) if GOLD_PARQUET.exists() else None,
        "outputs_mtime": {
            "metrics": METRICS_CSV.stat().st_mtime if METRICS_CSV.exists() else None,
            "thresholds": THRESHOLDS_JSON.stat().st_mtime if THRESHOLDS_JSON.exists() else None,
        },
        "timestamp": datetime.now().isoformat(),
        "python": sys.version,
        "platform": platform.platform(),
    }
    line = f"[{datetime.now().isoformat()}] BASELINE_XGB: {json.dumps(prov_summary, default=str)}\n"
    PROV.write_text(PROV.read_text() + line if PROV.exists() else line)
    logging.info(f"Provenance appended: {PROV}")

def main():
    df = load_gold(GOLD_PARQUET)
    feat_map = select_features(df)
    partitions = make_partitions(df, n_splits=5)
    train_and_evaluate(df, feat_map, partitions)

# Execute
main()

INFO: Partitions saved: /home/wrm/BOLSA_2026/gold/IBOV/partitions_ibov.csv
INFO: Metrics saved: /home/wrm/BOLSA_2026/gold/IBOV/metrics_ibov_baseline.csv
INFO: Thresholds saved: /home/wrm/BOLSA_2026/gold/IBOV/thresholds_tau_ibov.json
INFO: Provenance appended: /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt
INFO: Metrics saved: /home/wrm/BOLSA_2026/gold/IBOV/metrics_ibov_baseline.csv
INFO: Thresholds saved: /home/wrm/BOLSA_2026/gold/IBOV/thresholds_tau_ibov.json
INFO: Provenance appended: /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt


In [7]:
# Diagnóstico de classes por partição
# Cola esta célula no final do notebook e execute após ter carregado/definido `df`, `PARTITIONS_CSV` e a função `make_partitions`.
# O objetivo: mostrar por bloco e por horizonte a distribuição de rótulos no conjunto de treino/teste
# e sugerir uma alternativa de particionamento (por exemplo n_splits=4) caso haja blocos com poucas classes.

import pandas as pd
from pathlib import Path

print('Running partition/class diagnostics...')

if not Path(PARTITIONS_CSV).exists():
    print(f'Partitions CSV not found: {PARTITIONS_CSV} — execute make_partitions(df, n_splits=5) first')
else:
    parts = pd.read_csv(PARTITIONS_CSV)
    parts_display = []
    issues = []
    for _, prow in parts.iterrows():
        bid = int(prow['block_id'])
        train_mask = (
            (pd.to_datetime(df['date']).dt.date >= pd.to_datetime(prow['train_start']).date())
            & (pd.to_datetime(df['date']).dt.date <= pd.to_datetime(prow['train_end']).date())
        )
        test_mask = (
            (pd.to_datetime(df['date']).dt.date >= pd.to_datetime(prow['test_start']).date())
            & (pd.to_datetime(df['date']).dt.date <= pd.to_datetime(prow['test_end']).date())
        )
        row = {'block_id': bid, 'train_rows': int(train_mask.sum()), 'test_rows': int(test_mask.sum())}
        for h in (1, 3, 5):
            col = f'y_d{h}_cls'
            if col not in df.columns:
                row[f'train_unique_d{h}'] = None
                row[f'test_unique_d{h}'] = None
                row[f'train_counts_d{h}'] = ''
                row[f'test_counts_d{h}'] = ''
                continue
            train_counts = df.loc[train_mask, col].value_counts(dropna=True).to_dict()
            test_counts = df.loc[test_mask, col].value_counts(dropna=True).to_dict()
            row[f'train_unique_d{h}'] = len(train_counts)
            row[f'test_unique_d{h}'] = len(test_counts)
            row[f'train_counts_d{h}'] = str(train_counts)
            row[f'test_counts_d{h}'] = str(test_counts)

            # detect small-class issues: fewer than 2 classes or any class with < 5 samples
            if row[f'train_unique_d{h}'] < 2:
                issues.append((bid, h, 'few_classes_train'))
            else:
                if any(v < 5 for v in train_counts.values()):
                    issues.append((bid, h, 'small_class_count'))

        parts_display.append(row)

    df_parts = pd.DataFrame(parts_display).sort_values('block_id')
    pd.set_option('display.max_colwidth', 200)
    print('\nPartition overview:')
    display(df_parts)

    if not issues:
        print('\nNo immediate class-balance/coverage issues detected.')
    else:
        print('\nDetected issues (block_id, horizon, problem):')
        for it in issues:
            print(' -', it)

        # Simple suggestion heuristics
        # If many blocks have issues, propose fewer splits (e.g., n_splits=4)
        n_issues = len({(b, h) for b, h, _ in issues})
        if n_issues >= 2:
            print('\nSuggestion: several blocks have insufficient classes. Try reducing the number of splits (e.g. n_splits=4)')
            try:
                proposed = make_partitions(df, n_splits=4)
                outp = Path(PARTITIONS_CSV).with_name('partitions_proposed_k4.csv')
                proposed.to_csv(outp, index=False)
                print(f'Proposed partitions (n_splits=4) saved to: {outp}')
                display(proposed)
            except Exception as e:
                print('Could not generate proposed partitions automatically:', e)
        else:
            print('\nSuggestion: for the affected blocks you can:')
            print(' - expand the training window (increase train_end for previous block)')
            print(' - merge the affected block with adjacent block(s)')
            print(' - reduce embargo (if safe from leakage) to increase available train data')
            print('\nExample: to quickly try fewer splits call:')
            print('  proposed = make_partitions(df, n_splits=4)')
            print('  proposed.to_csv(Path(PARTITIONS_CSV).with_name("partitions_proposed_k4.csv"), index=False)')

print('\nDiagnostics complete.')


Running partition/class diagnostics...


NameError: name 'df' is not defined

In [8]:
# Re-run baseline usando partitions_proposed_k4.csv e salvar artefatos com sufixo _k4
# Use esta célula após ter executado a célula de diagnóstico que salva `partitions_proposed_k4.csv`.
# Ela carrega o parquet GOLD_PARQUET, a partição proposta e executa train_and_evaluate,
# salvando métricas/thresholds/provenance com sufixo _k4 para comparação.

from pathlib import Path
import pandas as pd
import json
import logging

SUFFIX = "_k4"
proposed_csv = Path(PARTITIONS_CSV).with_name('partitions_proposed_k4.csv')
if not proposed_csv.exists():
    raise FileNotFoundError(f'Arquivo de partições proposto não encontrado: {proposed_csv} — execute a célula de diagnóstico primeiro')

# ajusta nomes de saída locais com sufixo
METRICS_K4 = Path(METRICS_CSV).with_name(Path(METRICS_CSV).stem + SUFFIX + Path(METRICS_CSV).suffix)
THRESHOLDS_K4 = Path(THRESHOLDS_JSON).with_name(Path(THRESHOLDS_JSON).stem + SUFFIX + Path(THRESHOLDS_JSON).suffix)
PROV_K4 = PROV.with_name(PROV.stem + SUFFIX + PROV.suffix)

# Carrega dados e partições propostas
print('Loading gold parquet and proposed partitions...')
df = load_gold(GOLD_PARQUET)
partitions_prop = pd.read_csv(proposed_csv)

# Função wrapper que escreve para arquivos sufixados
def train_and_evaluate_k4(df, feat_map, partitions):
    # reusa a função original train_and_evaluate mas captura seus resultados
    # para não duplicar lógica, chamamos a função existente e depois renomeamos os arquivos salvos
    train_and_evaluate(df, feat_map, partitions)
    # os arquivos originais foram escritos em METRICS_CSV e THRESHOLDS_JSON; renomeie/copiar para _k4
    import shutil
    if Path(METRICS_CSV).exists():
        shutil.copy2(METRICS_CSV, METRICS_K4)
        logging.info(f'Copied metrics -> {METRICS_K4}')
    if Path(THRESHOLDS_JSON).exists():
        shutil.copy2(THRESHOLDS_JSON, THRESHOLDS_K4)
        logging.info(f'Copied thresholds -> {THRESHOLDS_K4}')
    # append provenance indicating this was a k4 run
    prov_line = f"[{pd.Timestamp.now().isoformat()}] BASELINE_XGB_K4: partitions={proposed_csv}, metrics={METRICS_K4}, thresholds={THRESHOLDS_K4}\n"
    PROV_K4.write_text(PROV_K4.read_text() + prov_line if PROV_K4.exists() else prov_line)
    logging.info(f'Provenance appended: {PROV_K4}')

# Execute
feat_map = select_features(df)
print('Starting train_and_evaluate with proposed partitions (k=4)...')
train_and_evaluate_k4(df, feat_map, partitions_prop)
print('Finished. Outputs:')
print(' - metrics:', METRICS_K4)
print(' - thresholds:', THRESHOLDS_K4)
print(' - provenance:', PROV_K4)


FileNotFoundError: Arquivo de partições proposto não encontrado: /home/wrm/BOLSA_2026/gold/IBOV/partitions_proposed_k4.csv — execute a célula de diagnóstico primeiro