In [3]:
# ibov_baseline_optA: 3-class XGBoost baseline (argmax decision after calibration)
# Instructions: copy/paste this entire cell into a new notebook and run it with the .venv kernel.
# Outputs: metrics_ibov_optA.csv, partitions_ibov.csv, feature_list_d{h}.txt, calibration_optA_d{h}_block{bid}.png, and one line appended to PROVENANCE_IBOV.txt.

from pathlib import Path
from datetime import datetime
import json, logging, hashlib, platform, sys
import numpy as np
import pandas as pd
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, f1_score
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import matplotlib.pyplot as plt

# Log at INFO so the output-location messages emitted by the pipeline are visible.
# Anything logged at DEBUG is suppressed by this configuration.
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
# One seed shared by NumPy's global RNG, XGBoost (HYPER['random_state']) and
# the calibration train/holdout split.
SEED = 42
np.random.seed(SEED)

# NOTE(review): absolute, machine-specific path; the directory is created as a
# side effect of running this cell. Consider making it configurable.
GOLD_DIR = Path('/home/wrm/BOLSA_2026/gold/IBOV')
GOLD_DIR.mkdir(parents=True, exist_ok=True)
# Input feature table read by load_gold().
GOLD_PARQUET = GOLD_DIR / 'gold_ibov_features.parquet'
# Provenance log; one JSON-bearing line is added per training run.
PROV = GOLD_DIR / 'PROVENANCE_IBOV.txt'
# Partition table written by make_partitions() and read back by run_optA().
PARTITIONS_CSV = GOLD_DIR / 'partitions_ibov.csv'
# Per-horizon, per-block evaluation metrics.
METRICS_OPT_A = GOLD_DIR / 'metrics_ibov_optA.csv'
# Calibration plots are written next to the other artefacts.
CALIB_PNG_DIR = GOLD_DIR

# Keyword arguments passed to every xgb.XGBClassifier built in this cell.
HYPER = {
    'max_depth': 4,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'n_estimators': 400,
    'learning_rate': 0.08,
    'random_state': SEED,
    'tree_method': 'hist',
}
# Length, in unique dates, of the embargo window that make_partitions() records
# immediately before each test block.
EMBARGO = 5
# Forecast horizons h; the target column for each is 'y_d{h}_cls'.
HORIZONS = (1,3,5)

def sha256_of_file(p: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``p``.

    The file is streamed in 8 KiB blocks so it is never held in memory whole.
    """
    digest = hashlib.sha256()
    with p.open('rb') as stream:
        while True:
            block = stream.read(8192)
            if not block:  # b'' signals end of file
                break
            digest.update(block)
    return digest.hexdigest()

def load_gold(p: Path) -> pd.DataFrame:
    """Read the gold feature table stored as parquet at ``p``.

    Raises:
        FileNotFoundError: if ``p`` does not exist; the path is the sole
            exception argument.
    """
    if p.exists():
        return pd.read_parquet(p)
    raise FileNotFoundError(p)

def select_features_and_save(df: pd.DataFrame):
    """Pick the numeric feature columns and write one feature list per horizon.

    A numeric column is kept unless its name starts with ``r_d``, ``y_d`` or
    ``k_d``, equals ``date``, or contains ``ticker`` (case-insensitive).
    The same newline-separated list is written to
    ``GOLD_DIR / 'feature_list_d{h}.txt'`` for every horizon in ``HORIZONS``.

    Returns:
        dict mapping each horizon to the feature list. Every horizon maps to
        the *same* list object, so mutating one entry affects all of them.
    """
    excluded_prefixes = ('r_d', 'y_d', 'k_d')
    excluded_names = {'date'}
    excluded_fragments = {'ticker'}

    def _is_feature(col):
        # Checks run in the same order as the exclusion rules listed above.
        if col.startswith(excluded_prefixes):
            return False
        if col in excluded_names:
            return False
        lowered = col.lower()
        return not any(fragment in lowered for fragment in excluded_fragments)

    candidates = df.select_dtypes(include=[np.number]).columns.tolist()
    features = [col for col in candidates if _is_feature(col)]
    listing = '\n'.join(features)
    for h in HORIZONS:
        (GOLD_DIR / f'feature_list_d{h}.txt').write_text(listing)
    return dict.fromkeys(HORIZONS, features)

def make_partitions(df: pd.DataFrame, n_splits=5):
    """Build expanding-window train/embargo/test partitions and save them to CSV.

    The unique, non-missing dates in ``df['date']`` are cut into ``k``
    contiguous test blocks, where ``k`` is ``n_splits`` clamped to [4, 6] and
    the last block absorbs any remainder. For each test block:

    * the embargo window is the ``EMBARGO`` unique dates immediately before
      the test block;
    * the training window runs from the first date up to the date just before
      the embargo window, so train, embargo and test never overlap.

    Blocks that have no date left for training (normally the first block) are
    skipped with a warning rather than emitted with a training window that
    overlaps the test block. ``block_id`` keeps the positional number of the
    block, so ids of the remaining blocks are unaffected by skipped ones.

    Args:
        df: frame with a ``date`` column parseable by ``pd.to_datetime``.
        n_splits: requested number of blocks (clamped to the range 4..6).

    Returns:
        DataFrame with columns ``block_id, train_start, train_end,
        embargo_start, embargo_end, test_start, test_end`` (dates as
        ``YYYY-MM-DD`` strings). The same table is written to
        ``PARTITIONS_CSV``.

    Raises:
        ValueError: if ``df['date']`` contains no valid date.
    """
    # A DatetimeIndex yields pd.Timestamp on indexing in every pandas version,
    # whereas Series.unique() may return numpy.datetime64 (no .date()).
    # Missing dates are dropped so NaT never becomes a block boundary.
    dates = pd.DatetimeIndex(pd.to_datetime(df['date']).dropna().unique()).sort_values()
    n = len(dates)
    if n == 0:
        raise ValueError("make_partitions: column 'date' contains no valid dates")

    k = min(max(4, n_splits), 6)
    block_size = max(1, n // k)
    columns = ['block_id', 'train_start', 'train_end', 'embargo_start',
               'embargo_end', 'test_start', 'test_end']

    def _day(idx):
        return str(dates[idx].date())

    parts = []
    for i in range(k):
        test_start_idx = i * block_size
        if test_start_idx >= n:
            # Fewer dates than blocks: nothing left to test on.
            break
        test_end_idx = n - 1 if i == k - 1 else min(n - 1, (i + 1) * block_size - 1)

        # Training must stop before the embargo window, otherwise the embargo
        # is only recorded and never actually separates train from test.
        train_end_idx = test_start_idx - EMBARGO - 1
        if train_end_idx < 0:
            logging.warning(
                f'Block {i+1}: no training dates before the {EMBARGO}-date embargo — partition skipped')
            continue

        if EMBARGO > 0:
            embargo_start = _day(test_start_idx - EMBARGO)
            embargo_end = _day(test_start_idx - 1)
        else:
            embargo_start = embargo_end = None

        parts.append({
            'block_id': i + 1,
            'train_start': _day(0),
            'train_end': _day(train_end_idx),
            'embargo_start': embargo_start,
            'embargo_end': embargo_end,
            'test_start': _day(test_start_idx),
            'test_end': _day(test_end_idx),
        })

    # Explicit columns keep the CSV header present even if every block was skipped.
    pdf = pd.DataFrame(parts, columns=columns)
    pdf.to_csv(PARTITIONS_CSV, index=False)
    logging.info(f'Partitions saved: {PARTITIONS_CSV}')
    return pdf

def compute_class_weights(y):
    """Map each observed class label in ``y`` to a training weight.

    Weights come from ``compute_class_weight(class_weight='balanced', ...)``
    on the non-missing labels; if that call fails for any reason every class
    gets weight 1.0 instead. The weight of class ``1``, when present, is then
    multiplied by 1.15.

    Args:
        y: pandas Series of labels; missing values are ignored.

    Returns:
        dict ``{label: weight}`` keyed by the unique non-missing labels.
    """
    labels = y.dropna()
    classes = np.unique(labels)
    try:
        balanced = compute_class_weight(class_weight='balanced', classes=classes, y=labels)
        weights = dict(zip(classes, map(float, balanced)))
    except Exception:
        # Best-effort: fall back to uniform weights rather than abort the run.
        weights = dict.fromkeys(classes, 1.0)
    if 1 in weights:
        weights[1] *= 1.15
    return weights

def choose_calibrator_and_fit(clf, X_cal, y_cal):
    """Fit sigmoid and isotonic calibrators on a prefit model and keep the better one.

    Each method is wrapped around the already-fitted ``clf`` with
    ``CalibratedClassifierCV(cv='prefit')`` and fitted on ``(X_cal, y_cal)``.
    The method with the lowest multiclass log-loss on that same calibration
    data wins.

    NOTE(review): the log-loss is measured on the data the calibrator was
    fitted on, which tends to favour the more flexible isotonic method.
    NOTE(review): ``cv='prefit'`` appears to be deprecated in recent
    scikit-learn releases — confirm against the installed version.

    Args:
        clf: fitted classifier exposing ``predict_proba``.
        X_cal: calibration features.
        y_cal: calibration labels encoded as column indices ``0..K-1`` of the
            probability matrix.

    Returns:
        ``(method_name, fitted_calibrator)``, or ``(None, None)`` if every
        method failed.
    """
    eps = 1e-15
    best_method, best_cal, best_ll = None, None, float('inf')
    for method in ['sigmoid', 'isotonic']:
        try:
            cal = CalibratedClassifierCV(clf, cv='prefit', method=method)
            cal.fit(X_cal, y_cal)
            probs = cal.predict_proba(X_cal)
            # Plain integer array so fancy indexing does not depend on the
            # container type (list, ndarray, Series) of y_cal.
            y_idx = np.asarray(y_cal).astype(int)
            true_class_prob = probs[np.arange(len(y_idx)), y_idx]
            ll = -np.mean(np.log(np.maximum(eps, true_class_prob)))
            if ll < best_ll:
                best_method, best_cal, best_ll = method, cal, ll
        except Exception as e:
            # Logged at WARNING: with the notebook's INFO log level a DEBUG
            # message is invisible and calibration would be dropped silently.
            logging.warning(f'Calibration {method} failed: {e}')
    if best_method is None:
        return None, None
    return best_method, best_cal

def _partition_day(value):
    """Parse a partition-table cell into a ``datetime.date``; missing cells give ``None``."""
    if value is None or pd.isna(value):
        return None
    return pd.to_datetime(value).date()


def _fit_optA_model(X_fit, y_fit, w_fit, n_classes, tag):
    """Fit the class-weighted XGBoost model and, when possible, calibrate it.

    With at least 50 rows, 10% is held out (stratified) for calibration and
    the model is fitted on the remaining 90% using the matching sample
    weights. Otherwise, or if the stratified split is impossible, the model
    is fitted on all rows and used uncalibrated.

    Returns ``(model, calibrator_name)`` where the name is ``'sigmoid'``,
    ``'isotonic'`` or ``'none'``.
    """
    def _new_model():
        return xgb.XGBClassifier(**HYPER, objective='multi:softprob', num_class=n_classes)

    if len(X_fit) >= 50:
        try:
            # Split the weights together with X/y so the inner model keeps
            # the class weighting.
            X_sub, X_cal, y_sub, y_cal, w_sub, _ = train_test_split(
                X_fit, y_fit, w_fit, test_size=0.1, random_state=SEED, stratify=y_fit)
        except ValueError as e:
            # e.g. a class with a single member cannot be stratified.
            logging.warning(f'Calibration split failed for {tag}: {e} — using uncalibrated model')
        else:
            inner = _new_model()
            inner.fit(X_sub, y_sub, sample_weight=w_sub)
            method, cal = choose_calibrator_and_fit(inner, X_cal, y_cal)
            if cal is None:
                return inner, 'none'
            return cal, method

    full = _new_model()
    full.fit(X_fit, y_fit, sample_weight=w_fit)
    return full, 'none'


def _save_up_calibration_plot(calibrated, probs, y_true, up_code, h, bid):
    """Best-effort reliability plot for the 'up' class (label 1); never raises."""
    try:
        model_classes = list(calibrated.classes_)
        if up_code not in model_classes:
            return
        p_up = probs[:, model_classes.index(up_code)]
        frac_pos, mean_pred = calibration_curve((y_true == 1).astype(int), p_up, n_bins=10)
        fig, ax = plt.subplots()
        try:
            ax.plot(mean_pred, frac_pos, marker='o')
            ax.plot([0, 1], [0, 1], linestyle='--', color='k')
            ax.set_xlabel('Mean predicted prob (up)')
            ax.set_ylabel('Fraction of positives')
            ax.set_title(f'Calibration up: h{h} block{bid}')
            fig.savefig(CALIB_PNG_DIR / f'calibration_optA_d{h}_block{bid}.png')
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)
    except Exception as e:
        logging.warning(f'Cal plot failed for h{h} b{bid}: {e}')


def train_and_eval_optA(df, feat_map, partitions):
    """Train, calibrate and evaluate the 3-class XGBoost baseline per horizon and block.

    For every horizon in ``HORIZONS`` and every row of ``partitions``:

    1. rows are selected by date into train and test sets; dates inside the
       partition's embargo window (when the table provides one) are removed
       from training so the embargo is enforced even for partition tables
       whose ``train_end`` reaches up to the test block;
    2. a class-weighted XGBoost model is fitted and calibrated;
    3. predictions are the argmax over calibrated class probabilities,
       decoded back to the original labels;
    4. balanced accuracy, MCC, F1 for labels 1 and -1, and the share of
       predicted zeros are recorded.

    Test rows whose label never occurred in the training set are excluded
    from evaluation. Blocks with empty train/test sets or a single training
    class are skipped with a warning.

    Side effects: writes ``METRICS_OPT_A`` (header is written even when no
    block was evaluated), calibration PNGs under ``CALIB_PNG_DIR`` and appends
    one line to ``PROV``.

    Args:
        df: feature table with ``date``, the feature columns and the
            ``y_d{h}_cls`` targets.
        feat_map: dict ``{horizon: list of feature column names}``.
        partitions: table with ``block_id``, ``train_start``, ``train_end``,
            ``test_start``, ``test_end`` and optionally ``embargo_start`` /
            ``embargo_end``.
    """
    metric_columns = ['horizon', 'block_id', 'acc_bal', 'mcc_macro', 'f1_up',
                      'f1_down', 'n_test', 'prop_zero', 'calibrator']
    rows = []
    # Parse the date column once instead of four times per (horizon, block).
    row_dates = pd.to_datetime(df['date']).dt.date

    for h in HORIZONS:
        features = feat_map[h]
        target = f'y_d{h}_cls'
        for _, prow in partitions.iterrows():
            bid = int(prow['block_id'])
            test_mask = ((row_dates >= pd.to_datetime(prow['test_start']).date())
                         & (row_dates <= pd.to_datetime(prow['test_end']).date()))
            train_mask = ((row_dates >= pd.to_datetime(prow['train_start']).date())
                          & (row_dates <= pd.to_datetime(prow['train_end']).date()))

            # Enforce the embargo: labels near the end of training look ahead
            # into the test block, so those dates must not be trained on.
            emb_start = _partition_day(prow.get('embargo_start'))
            emb_end = _partition_day(prow.get('embargo_end'))
            if emb_start is not None and emb_end is not None:
                train_mask = train_mask & ~((row_dates >= emb_start) & (row_dates <= emb_end))

            X_tr = df.loc[train_mask, features]
            y_tr = df.loc[train_mask, target]
            X_te = df.loc[test_mask, features]
            y_te = df.loc[test_mask, target]
            if y_tr.dropna().empty or y_te.dropna().empty:
                logging.warning(f'Empty train/test for horizon {h} block {bid} — skipping')
                continue

            cw = compute_class_weights(y_tr)
            labelled_tr = y_tr.notna()
            X_tr_enc = X_tr.loc[labelled_tr]
            y_tr_raw = y_tr.loc[labelled_tr]
            le = LabelEncoder()
            y_tr_le = le.fit_transform(y_tr_raw.astype(int))
            if len(le.classes_) < 2:
                logging.warning(f'Not enough classes in train for h{h} b{bid} — skipping')
                continue
            sample_w = y_tr_raw.map(lambda v: cw.get(v, 1.0)).to_numpy()

            calibrated, cal_name = _fit_optA_model(
                X_tr_enc, y_tr_le, sample_w, len(le.classes_), f'h{h} b{bid}')

            # Evaluate only on labelled test rows whose label was seen in training.
            labelled_te = y_te.notna()
            X_te_enc = X_te.loc[labelled_te]
            y_te_orig = y_te.loc[labelled_te].astype(int)
            seen = np.isin(y_te_orig.to_numpy(), le.classes_)
            if not seen.any():
                logging.warning(f'No test rows with seen labels for h{h} b{bid} — skipping')
                continue
            X_te_enc = X_te_enc.loc[seen]
            y_te_aligned = y_te_orig.to_numpy()[seen]

            probs = calibrated.predict_proba(X_te_enc)
            # Columns follow the encoded classes 0..K-1; decode argmax to original labels.
            pred_orig = le.inverse_transform(np.argmax(probs, axis=1))

            prop_zero = float((pred_orig == 0).mean())
            acc_bal = balanced_accuracy_score(y_te_aligned, pred_orig)
            try:
                mcc = matthews_corrcoef(y_te_aligned, pred_orig)
            except Exception:
                mcc = float('nan')
            f1_up = f1_score(y_te_aligned, pred_orig, labels=[1], average='macro', zero_division=0)
            f1_down = f1_score(y_te_aligned, pred_orig, labels=[-1], average='macro', zero_division=0)
            rows.append({'horizon': h, 'block_id': bid, 'acc_bal': float(acc_bal),
                         'mcc_macro': float(mcc), 'f1_up': float(f1_up),
                         'f1_down': float(f1_down), 'n_test': int(len(y_te_aligned)),
                         'prop_zero': prop_zero, 'calibrator': cal_name})

            # Optional calibration plot for the 'up' class if it was seen in training.
            mapping = {orig: enc for enc, orig in enumerate(le.classes_)}
            if 1 in mapping:
                _save_up_calibration_plot(calibrated, probs, y_te_aligned, mapping[1], h, bid)

    # Write metrics and provenance.
    pd.DataFrame(rows, columns=metric_columns).to_csv(METRICS_OPT_A, index=False)
    stamp = datetime.now().isoformat()
    prov = {
        'timestamp': stamp,
        'mode': 'OptA argmax 3-classes',
        'seed': SEED,
        'hyperparameters': HYPER,
        'class_weight_note': 'positive class weight *1.15 if present',
        'metrics_file': str(METRICS_OPT_A),
        'partitions_file': str(PARTITIONS_CSV),
        'gold_sha256': sha256_of_file(GOLD_PARQUET) if GOLD_PARQUET.exists() else None,
        'python': sys.version,
        'platform': platform.platform(),
    }
    line = f"[{stamp}] OPTA_BASELINE: {json.dumps(prov, default=str)}\n"
    # True append: no need to read and rewrite the whole provenance file.
    with PROV.open('a') as prov_file:
        prov_file.write(line)
    logging.info(f'OPT-A metrics saved: {METRICS_OPT_A}')
    logging.info(f'Provenance appended: {PROV}')

# Pipeline entry point
def run_optA(n_splits=5):
    """Run the OptA baseline end to end: load data, partition, select features, train.

    Partitions are generated only when ``PARTITIONS_CSV`` does not exist yet;
    an existing file is reused as is, in which case ``n_splits`` has no
    effect. The partition table is always read back from the CSV.
    """
    gold_df = load_gold(GOLD_PARQUET)
    partitions_missing = not PARTITIONS_CSV.exists()
    if partitions_missing:
        make_partitions(gold_df, n_splits=n_splits)
    partitions = pd.read_csv(PARTITIONS_CSV)
    features_by_horizon = select_features_and_save(gold_df)
    train_and_eval_optA(gold_df, features_by_horizon, partitions)

# Running this cell only defines the pipeline; this message tells the user how to start it.
print('ibov_baseline_optA cell ready. Run run_optA(n_splits=5) to execute.')

ibov_baseline_optA cell ready. Run run_optA(n_splits=5) to execute.


In [5]:
# Execute the pipeline; requires the definitions cell above to have been run in this kernel.
run_optA(n_splits=5) 

INFO: OPT-A metrics saved: /home/wrm/BOLSA_2026/gold/IBOV/metrics_ibov_optA.csv
INFO: Provenance appended: /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt
INFO: OPT-A metrics saved: /home/wrm/BOLSA_2026/gold/IBOV/metrics_ibov_optA.csv
INFO: Provenance appended: /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt
