In [3]:
# AGENTE — Execução Inicial (dry_run)
# Objetivo: Carregar amostra do dataset GOLD (IBOV) para inspeção sem persistência.
# Regras: bloco único, auto-contido, sem efeitos colaterais. dry_run=True.

import json
import os
import sys
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

# ------------------ Configuração ------------------
CONFIG = {
    "dry_run": True,
    "gold_root": "/home/wrm/BOLSA_2026/gold/IBOV_gold.parquet",
    # Year-partitioned read, with a light sample for initialization:
    #   max_years     -> how many of the most recent years to read
    #   rows_per_year -> maximum rows kept per year (approx., applied after reading)
    "sample": {"max_years": 3, "rows_per_year": 5000},
    # Minimum set of expected columns (adjust as the GOLD dataset evolves).
    "expected_columns_any": [
        "date",
        "ticker",
        "open",
        "high",
        "low",
        "close",
        "volume",
    ],
}

# ------------------ Helpers ------------------

def norm_error(msg: str) -> None:
    """Print *msg* to stdout, prefixed with the ``VALIDATION_ERROR:`` tag."""
    print("VALIDATION_ERROR:", msg)

def checklist_error(msg: str) -> None:
    """Print *msg* to stdout, prefixed with the ``CHECKLIST_FAILURE:`` tag."""
    print("CHECKLIST_FAILURE:", msg)


def list_partition_years(root: str) -> List[int]:
    """Return the years of the ``year=<int>`` partition directories in *root*.

    Args:
        root: Directory expected to contain one ``year=<int>`` sub-directory
            per partition.

    Returns:
        The partition years in ascending order. The list is empty when *root*
        is not a directory (a validation error is printed in that case) or
        when no matching sub-directory exists.
    """
    if not os.path.isdir(root):
        norm_error("diretório GOLD inexistente ou inválido.")
        return []

    prefix = "year="
    years: List[int] = []
    with os.scandir(root) as entries:
        for entry in entries:
            # Only directories count as partitions; a stray file that happens
            # to be named "year=..." would later be reported as a missing
            # partition and waste one slot of the sample.
            if not (entry.name.startswith(prefix) and entry.is_dir()):
                continue
            try:
                years.append(int(entry.name[len(prefix):]))
            except ValueError:
                # Partition value is not an integer; ignore it.
                continue
    return sorted(years)


def read_year_partition(root: str, year: int) -> pd.DataFrame:
    """Load the ``year=<year>`` partition directory under *root*.

    The directory is handed to ``pd.read_parquet`` as a whole. A missing
    directory or a read failure prints a validation error and yields an empty
    DataFrame; no exception is propagated to the caller.
    """
    partition_dir = os.path.join(root, "year={}".format(year))
    if os.path.isdir(partition_dir):
        try:
            return pd.read_parquet(partition_dir)
        except Exception as exc:  # best-effort read: report and fall through
            norm_error(f"falha ao ler parquet do ano {year}: {exc}")
    else:
        norm_error(f"partição ausente para ano {year}.")
    return pd.DataFrame()


def validate_structure(
    df: pd.DataFrame, expected: Optional[List[str]] = None
) -> Tuple[bool, List[str]]:
    """Check that *df* contains every expected column.

    Args:
        df: Frame whose columns are inspected.
        expected: Column names that must be present. When omitted, the list in
            ``CONFIG["expected_columns_any"]`` is used.

    Returns:
        A ``(ok, missing)`` pair: ``ok`` is True when nothing is missing, and
        ``missing`` lists the absent columns in the order they were expected.
    """
    required = CONFIG["expected_columns_any"] if expected is None else expected
    missing = [col for col in required if col not in df.columns]
    return (not missing, missing)


def quality_report(df: pd.DataFrame) -> Dict[str, Any]:
    """Build a small quality summary of *df*.

    Always present in the result:
        ``empty``   -- value of ``df.empty``
        ``rows``    -- ``df.shape[0]``
        ``cols``    -- ``df.shape[1]``
        ``columns`` -- column labels as strings

    Added only when the frame is not empty:
        ``date_range`` -- ``{"min", "max"}`` of the ``date`` column as strings,
            or ``date_range_error`` with the failure text if the column could
            not be converted to datetimes
        ``sample_head`` -- first five rows as a list of records
        ``nulls_top``   -- the five columns with the highest null fraction,
            rounded to four decimals
    """
    n_rows, n_cols = df.shape
    # Shape and labels are taken from the frame itself even when it is empty,
    # so a zero-row frame that still carries columns keeps its schema in the
    # report instead of being described as having no columns.
    rep: Dict[str, Any] = {
        "empty": bool(df.empty),
        "rows": int(n_rows),
        "cols": int(n_cols),
        "columns": [str(label) for label in df.columns],
    }
    if df.empty:
        return rep

    # Time span, when a 'date' column exists.
    if "date" in df.columns:
        try:
            dates = pd.to_datetime(df["date"])  # accepts strings and datetimes
            rep["date_range"] = {"min": str(dates.min()), "max": str(dates.max())}
        except Exception as exc:
            # Best-effort: an unparseable column must not abort the report.
            rep["date_range_error"] = str(exc)

    # Leading sample.
    rep["sample_head"] = df.head(5).to_dict(orient="records")

    # Simple completeness metric: fraction of nulls per column, worst first.
    null_share = df.isna().mean().sort_values(ascending=False)
    rep["nulls_top"] = null_share.head(5).round(4).to_dict()
    return rep


def info_like(df: pd.DataFrame) -> str:
    """Return the text that ``df.info()`` would print, captured as a string."""
    import io

    with io.StringIO() as sink:
        df.info(buf=sink)
        return sink.getvalue()


# ------------------ Execução ------------------
def _utc_now_iso() -> str:
    """Return the current UTC time as ISO-8601 text ending in ``Z``."""
    # datetime.utcnow() is deprecated since Python 3.12. An aware datetime
    # renders its offset as "+00:00", which is swapped for "Z" so the report
    # keeps the timestamp format it had before.
    return datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")


if __name__ == "__main__":
    # Script body: load a sample of the most recent year partitions, validate
    # the minimum column set, and print one consolidated JSON report.
    start_ts = _utc_now_iso()

    dry_run = bool(CONFIG.get("dry_run", True))
    root = CONFIG["gold_root"]
    max_years = CONFIG["sample"]["max_years"]
    rows_per_year = CONFIG["sample"]["rows_per_year"]

    # 1) Validate that the GOLD path exists.
    if not os.path.isdir(root):
        checklist_error("caminho GOLD não encontrado: " + root)
        # Minimal report, then stop.
        print(json.dumps({
            "start": start_ts,
            "dry_run": dry_run,
            "gold_root": root,
            "status": "error",
            "error": "GOLD path missing"
        }, ensure_ascii=False, indent=2))
        raise SystemExit(0)

    # 2) List the year partitions and pick the most recent ones.
    years = list_partition_years(root)
    if not years:
        checklist_error("nenhuma partição year=* encontrada no GOLD.")
    # A non-positive max_years means "no limit", mirroring rows_per_year.
    # Slicing with a negative count would otherwise drop the oldest years
    # instead of limiting to the newest ones.
    recent_years = sorted(years)[-max_years:] if max_years > 0 else sorted(years)

    # 3) Read each selected partition and concatenate the samples.
    frames: List[pd.DataFrame] = []
    for y in recent_years:
        df_y = read_year_partition(root, y)
        if df_y.empty:
            continue
        # Cap the rows per year to keep the sample light; the fixed seed keeps
        # the sample reproducible between runs.
        if rows_per_year > 0 and df_y.shape[0] > rows_per_year:
            df_y = df_y.sample(rows_per_year, random_state=42)
        frames.append(df_y)

    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # 4) Minimum structure validation; an empty frame misses every column.
    ok, missing = validate_structure(df) if not df.empty else (False, CONFIG["expected_columns_any"])
    if not ok:
        norm_error("estrutura divergente do esperado. Faltando colunas: " + ", ".join(map(str, missing)))

    # 5) Mandatory reports.
    report = quality_report(df)

    # 6) Structure in the style of DataFrame.info().
    info_text = info_like(df) if not df.empty else "<DataFrame vazio>"

    # 7) Assemble the consolidated output.
    out = {
        "start": start_ts,
        "end": _utc_now_iso(),
        "dry_run": dry_run,
        "gold_root": root,
        "years_available": years,
        "years_loaded": recent_years,
        "status": "ok" if (not df.empty and ok) else ("warning" if not df.empty else "empty"),
        "structure": {
            "info": info_text,
            "columns": report.get("columns", []),
            "shape": [report.get("rows", 0), report.get("cols", 0)]
        },
        "sample": report.get("sample_head", []),
        "range": report.get("date_range", None),
        "count_total": report.get("rows", 0),
        "quality": {
            "nulls_top": report.get("nulls_top", {}),
            "empty": report.get("empty", True)
        }
    }

    # 8) Final print; default=str serializes values json cannot handle
    #    natively (e.g. pandas.Timestamp in the sample rows).
    print(json.dumps(out, ensure_ascii=False, indent=2, default=str))


{
  "start": "2025-09-19T18:28:34.246303Z",
  "end": "2025-09-19T18:28:34.261935Z",
  "dry_run": true,
  "gold_root": "/home/wrm/BOLSA_2026/gold/IBOV_gold.parquet",
  "years_available": [
    2012,
    2013,
    2014,
    2015,
    2016,
    2017,
    2018,
    2019,
    2020,
    2021,
    2022,
    2023,
    2024,
    2025
  ],
  "years_loaded": [
    2023,
    2024,
    2025
  ],
  "status": "ok",
  "structure": {
    "info": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 680 entries, 0 to 679\nData columns (total 23 columns):\n #   Column         Non-Null Count  Dtype                 \n---  ------         --------------  -----                 \n 0   date           680 non-null    timestamp[ns][pyarrow]\n 1   open           680 non-null    double[pyarrow]       \n 2   high           680 non-null    double[pyarrow]       \n 3   low            680 non-null    double[pyarrow]       \n 4   close          680 non-null    double[pyarrow]       \n 5   volume         680 non-null    in

  start_ts = datetime.utcnow().isoformat() + "Z"
  "end": datetime.utcnow().isoformat() + "Z",


---
# MODELOS E MODELAGEM
---

## INSTRUÇÃO 4E — Feature Engineering com Janelas (para XGBoost)

  start_ts = datetime.utcnow().isoformat() + "Z"
  df_raw.groupby("ticker", group_keys=False).apply(per_ticker).reset_index(drop=True)
  df_raw.groupby("ticker", group_keys=False).apply(per_ticker).reset_index(drop=True)


{
  "start": "2025-09-19T18:32:57.498759Z",
  "end": "2025-09-19T18:32:57.708771Z",
  "dry_run": true,
  "gold_root": "/home/wrm/BOLSA_2026/gold/IBOV_gold.parquet",
  "splits": {
    "train": [
      "2012-01-01",
      "2021-12-31"
    ],
    "val": [
      "2022-01-01",
      "2023-12-31"
    ],
    "test": [
      "2024-01-01",
      "2025-12-31"
    ]
  },
  "windows": [
    5,
    10,
    15
  ],
  "lags": [
    1,
    2,
    3,
    5,
    10,
    15
  ],
  "reports": {
    "train": {
      "status": "ok",
      "rows_before": 2470,
      "rows_after": 0,
      "%nan_removed": 100.0,
      "n_features": 81,
      "columns_sample": [
        "date",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "ticker",
        "open_norm",
        "high_norm",
        "low_norm"
      ],
      "top20_corr_train": {
        "y_h1": [],
        "y_h3": [],
        "y_h5": []
      }
    },
    "val": {
      "status": "ok",
      "rows_before": 498,
     

  df_raw.groupby("ticker", group_keys=False).apply(per_ticker).reset_index(drop=True)
  "end": datetime.utcnow().isoformat() + "Z",
