In [9]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Instrução 1A-REV3 — Coleta direta Yahoo Chart → Bronze (dry_run)
# Regras:
# - Bloco único, auto-contido.
# - dry_run=True (sem persistência).
# - Provedores em ordem: Yahoo Chart -> yfinance -> Stooq.
# - Sem dados sintéticos.
# - Mensagens normativas: VALIDATION_ERROR / CHECKLIST_FAILURE.

import sys
import json
import time
import math
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd
import numpy as np
from pandas.tseries.offsets import BusinessDay as BDay

# =========================
# Parâmetros
# =========================
ROOT_DIR = Path("/home/wrm/BOLSA_2026").resolve()
# The script contract is dry-run only: nothing below ever writes to disk and the
# persistence plan is merely simulated, so the reported flag must say so.
DRY_RUN = True
TICKER = "^BVSP"

START_DATE_UTC = pd.Timestamp("2012-01-01", tz="UTC")
NOW_UTC = pd.Timestamp(datetime.now(timezone.utc))
END_DATE_UTC = NOW_UTC.normalize()  # today at 00:00 UTC
PERIOD2_NOW_UTC = NOW_UTC  # the Yahoo Chart request uses the "now" timestamp as period2

# Targets that are only described in the simulated persistence plan.
PARQUET_TARGET = ROOT_DIR / "bronze" / "IBOV.parquet"
MANIFESTO_TARGET = ROOT_DIR / "manifestos" / "bronze_ibov_manifesto.csv"

# Bronze schema: exact column order and the dtype expected for each column.
EXPECTED_COLUMNS = ["date", "open", "high", "low", "close", "volume", "ticker"]
EXPECTED_DTYPES = {
    "date": "datetime64[ns]",
    "open": "float64",
    "high": "float64",
    "low": "float64",
    "close": "float64",
    "volume": "int64",
    "ticker": "string",
}

# Local, timezone-aware execution timestamp used in the manifest and result report.
AGORA = datetime.now().astimezone()

# =========================
# Utils
# =========================
def print_section(title: str):
    """Print a blank line followed by ``title`` framed by eight '=' signs on each side."""
    bar = "=" * 8
    print(f"\n{bar} {title} {bar}")

def dtypes_signature(df: pd.DataFrame) -> Dict[str, str]:
    """Return a mapping of column name to the string form of that column's dtype."""
    dtype_by_column = df.dtypes
    signature: Dict[str, str] = {}
    for column in df.columns:
        signature[column] = str(dtype_by_column[column])
    return signature

def percent_nulls(df: pd.DataFrame) -> Dict[str, float]:
    """Return, per column, the percentage (0-100) of null values.

    A frame with zero rows is reported as 100% null for every column.
    """
    n_rows = len(df)
    if n_rows == 0:
        return dict.fromkeys(df.columns, 100.0)

    result: Dict[str, float] = {}
    for column in df.columns:
        n_missing = df[column].isna().sum()
        result[column] = float(n_missing) * 100.0 / float(n_rows)
    return result

def to_unix_seconds(ts: pd.Timestamp) -> int:
    """Convert a timestamp to whole Unix seconds.

    Naive timestamps are interpreted as UTC; aware ones are converted to UTC.
    """
    utc_ts = ts.tz_localize("UTC") if ts.tzinfo is None else ts.tz_convert("UTC")
    return int(utc_ts.timestamp())

def bronze_normalize(
    df_pre: pd.DataFrame,
    ticker: str,
    start_utc: pd.Timestamp,
    end_utc: pd.Timestamp
) -> Tuple[pd.DataFrame, Dict[str, int]]:
    """
    Normalize a raw OHLCV frame into the Bronze schema.

    Args:
        df_pre: frame with columns ['date', 'open', 'high', 'low', 'close', 'volume'];
            'date' may hold anything ``pd.to_datetime`` can parse.
        ticker: value written to the 'ticker' column of every output row.
        start_utc: inclusive lower bound of the date filter (naive or tz-aware).
        end_utc: inclusive upper bound of the date filter (naive or tz-aware).

    Returns:
        ``(df_final, stats)``. ``df_final`` has columns
        ['date', 'open', 'high', 'low', 'close', 'volume', 'ticker'], sorted by date
        with one row per date. ``stats`` holds the row counts taken before and after
        dropping rows with null OHLC (i.e. before the date filter and deduplication).

    Raises:
        RuntimeError: ``SCHEMA_ERROR`` when a required column is missing.
    """
    ohlc_cols = ["open", "high", "low", "close"]

    # Ensure required columns are present before doing any work.
    for c in ["date", *ohlc_cols, "volume"]:
        if c not in df_pre.columns:
            raise RuntimeError(f"SCHEMA_ERROR: coluna ausente em df_pre: {c}")

    df = df_pre.copy()

    # Date -> naive datetime truncated to 00:00 (interpreted in UTC).
    df["date"] = pd.to_datetime(df["date"], errors="coerce", utc=True).dt.tz_localize(None).dt.normalize()

    # Numeric coercion: unparsable values become NaN and are dropped below.
    for c in ohlc_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    rows_before_cleaning = int(len(df))

    # Drop rows where any OHLC value is null.
    df = df.dropna(subset=ohlc_cols).copy()
    rows_after_cleaning = int(len(df))
    rows_dropped_ohlc = int(rows_before_cleaning - rows_after_cleaning)

    # Volume: NaN -> 0, int64.
    df["volume"] = pd.to_numeric(df["volume"], errors="coerce").fillna(0).astype("int64")

    # Force float64 for OHLC.
    for c in ohlc_cols:
        df[c] = df[c].astype("float64")

    # Scalar assignment broadcasts to every row. Assigning a freshly built Series
    # here would be aligned on the index, which has gaps after the row drop above,
    # and would leave <NA> tickers on the surviving rows.
    df["ticker"] = ticker
    df["ticker"] = df["ticker"].astype("string")

    # Keep only rows inside [start, end]; bounds are compared as naive UTC values.
    start_naive = start_utc.tz_convert(None) if start_utc.tzinfo is not None else start_utc
    end_naive = end_utc.tz_convert(None) if end_utc.tzinfo is not None else end_utc
    df = df[(df["date"] >= start_naive) & (df["date"] <= end_naive)].copy()

    # Sort and keep the last row for each date.
    df = df.sort_values("date").drop_duplicates(subset=["date"], keep="last").reset_index(drop=True)

    # Final column order.
    df = df[["date", "open", "high", "low", "close", "volume", "ticker"]]

    stats = {
        "rows_before_cleaning": rows_before_cleaning,
        "rows_after_cleaning": rows_after_cleaning,
        "rows_dropped_ohlc": rows_dropped_ohlc,
    }
    return df, stats

# =========================
# Provedores
# =========================
def _yahoo_chart_get_json(url: str, params: Dict[str, str]) -> Any:
    """Download and decode the chart JSON, preferring `requests` and falling back to urllib."""
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Python"}
    try:
        import requests  # type: ignore
        r = requests.get(url, params=params, headers=headers, timeout=5)
        if r.status_code < 200 or r.status_code >= 400:
            raise RuntimeError(f"HTTP_STATUS_{r.status_code}")
        return r.json()
    except Exception as e_req:
        # Deliberate best-effort: any failure of the `requests` path (package not
        # installed, HTTP status, invalid JSON) triggers one stdlib attempt.
        try:
            from urllib.parse import urlencode
            from urllib.request import Request, urlopen
            req = Request(url + "?" + urlencode(params), headers=headers)
            with urlopen(req, timeout=6) as resp:
                raw = resp.read()
            return json.loads(raw.decode("utf-8"))
        except Exception as e_url:
            raise RuntimeError(f"HTTP_ERROR: {e_req} | URLLIB_FALLBACK: {e_url}") from e_url


def _yahoo_chart_to_frame(data: Any) -> pd.DataFrame:
    """Turn a decoded chart payload into a raw frame with date/open/high/low/close/volume."""
    if "chart" not in data:
        raise RuntimeError("PARSE_ERROR: chave 'chart' ausente")
    chart = data["chart"]
    if chart.get("error"):
        raise RuntimeError(f"REMOTE_ERROR: {chart.get('error')}")
    results = chart.get("result", [])
    if not results:
        raise RuntimeError("PARSE_ERROR: 'result' vazio")
    res0 = results[0]
    ts = res0.get("timestamp") or []
    quotes = (res0.get("indicators") or {}).get("quote") or []
    if not quotes:
        raise RuntimeError("PARSE_ERROR: 'quote[0]' ausente")
    q0 = quotes[0]
    # A missing or null series is treated as empty so it is reported as
    # DATA_EMPTY_ERROR below instead of a TypeError from len(None).
    series = {name: (q0.get(name) or []) for name in ("open", "high", "low", "close", "volume")}

    # The lists are positional; truncate all of them to the shortest one.
    n = min(len(ts), *(len(values) for values in series.values()))
    if n == 0:
        raise RuntimeError("DATA_EMPTY_ERROR: listas vazias")

    frame: Dict[str, Any] = {"date": pd.to_datetime(ts[:n], unit="s", utc=True)}
    for name, values in series.items():
        frame[name] = values[:n]
    return pd.DataFrame(frame)


def fetch_yahoo_chart_direct(
    ticker: str,
    start_utc: pd.Timestamp,
    period2_now_utc: pd.Timestamp,
    retries: int = 2,
    backoff_seconds: Optional[List[float]] = None
) -> Tuple[Optional[pd.DataFrame], Dict[str, int], List[Dict[str, Any]]]:
    """
    Collect daily bars straight from the Yahoo Chart endpoint.

    Args:
        ticker: symbol to request; it is percent-encoded into the URL path.
        start_utc: start of the requested window, also the lower bound of the
            Bronze date filter.
        period2_now_utc: end of the requested window ("now"), also the upper bound
            of the Bronze date filter.
        retries: maximum number of attempts.
        backoff_seconds: sleep before each retry; the last value is reused when
            there are more retries than values. Defaults to [0.8, 1.6].

    Returns:
        ``(df_final, stats, attempts)``. ``df_final`` is ``None`` when every attempt
        failed; ``attempts`` has one record per attempt, including the error message.
    """
    from urllib.parse import quote

    delays = [0.8, 1.6] if backoff_seconds is None else list(backoff_seconds)
    attempts: List[Dict[str, Any]] = []
    stats: Dict[str, int] = {"rows_before_cleaning": 0, "rows_after_cleaning": 0, "rows_dropped_ohlc": 0}

    # Build the URL from the argument ("^BVSP" -> "%5EBVSP") instead of hard-coding it.
    base_url = "https://query2.finance.yahoo.com/v8/finance/chart/" + quote(ticker, safe="")
    params = {
        "period1": str(to_unix_seconds(start_utc)),
        "period2": str(to_unix_seconds(period2_now_utc)),
        "interval": "1d",
        "events": "history",
        "includeAdjustedClose": "false",
    }

    for i in range(retries):
        try:
            data = _yahoo_chart_get_json(base_url, params)
            df_pre = _yahoo_chart_to_frame(data)
            # Filter with the window that was actually requested. Dates are
            # truncated to midnight, so "now" as upper bound keeps today's bar.
            df_norm, stats = bronze_normalize(df_pre, ticker, start_utc, period2_now_utc)
            attempts.append({"provider": "yahoo-chart", "attempt": i + 1, "ok": True, "rows": int(len(df_norm)), "exception_message": None})
            return df_norm, stats, attempts
        except Exception as e:
            # Every failure is recorded so the caller can report it and fall back.
            attempts.append({"provider": "yahoo-chart", "attempt": i + 1, "ok": False, "rows": 0, "exception_message": str(e)})
            if i < retries - 1 and delays:
                time.sleep(delays[min(i, len(delays) - 1)])

    return None, stats, attempts

def fetch_with_yfinance(
    ticker: str,
    start_utc: pd.Timestamp,
    end_utc: pd.Timestamp,
    retries: int = 2,
    backoff_seconds: Optional[List[float]] = None
) -> Tuple[Optional[pd.DataFrame], Dict[str, int], List[Dict[str, Any]]]:
    """
    Collect daily bars through the optional `yfinance` package.

    Args:
        ticker: symbol passed to ``yf.download``.
        start_utc: first day requested, also the lower bound of the Bronze filter.
        end_utc: last day requested (inclusive), also the upper bound of the filter.
        retries: maximum number of download attempts.
        backoff_seconds: sleep before each retry; the last value is reused when
            there are more retries than values. Defaults to [0.8, 1.6].

    Returns:
        ``(df_final, stats, attempts)``. ``df_final`` is ``None`` when the package
        cannot be imported or every attempt failed.
    """
    delays = [0.8, 1.6] if backoff_seconds is None else list(backoff_seconds)
    attempts: List[Dict[str, Any]] = []
    stats: Dict[str, int] = {"rows_before_cleaning": 0, "rows_after_cleaning": 0, "rows_dropped_ohlc": 0}
    need = {"open", "high", "low", "close", "volume"}

    for i in range(retries):
        try:
            import yfinance as yf  # type: ignore
        except Exception as e_imp:
            # Retrying cannot fix a missing package: record once and give up.
            attempts.append({"provider": "yfinance", "attempt": i + 1, "ok": False, "rows": 0, "exception_message": f"IMPORT_ERROR: {e_imp}"})
            break
        try:
            start_str = start_utc.date().isoformat()
            # The 'end' argument is exclusive, so ask for one extra day.
            end_str = (end_utc + pd.Timedelta(days=1)).date().isoformat()
            df_raw = yf.download(
                tickers=ticker,
                start=start_str,
                end=end_str,
                interval="1d",
                auto_adjust=False,
                progress=False,
                threads=True
            )
            if df_raw is None or df_raw.empty:
                raise RuntimeError("DATA_EMPTY_ERROR: yfinance retornou vazio")
            df_raw = df_raw.copy()

            if isinstance(df_raw.columns, pd.MultiIndex):
                # Flatten on the level that carries the field names. Blindly taking
                # the last level breaks when that level is the ticker, because every
                # column then ends up with the same name. The last level remains the
                # fallback when no level matches.
                field_level = df_raw.columns.nlevels - 1
                for level in range(df_raw.columns.nlevels):
                    level_names = {str(v).lower() for v in df_raw.columns.get_level_values(level)}
                    if need.issubset(level_names):
                        field_level = level
                        break
                df_raw.columns = df_raw.columns.get_level_values(field_level)

            rename_map = {"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"}
            df_raw = df_raw.rename(columns=rename_map)
            if not need.issubset(set(df_raw.columns)):
                missing = sorted(list(need - set(df_raw.columns)))
                raise RuntimeError(f"SCHEMA_ERROR: faltam colunas em yfinance: {missing}")

            df_pre = df_raw.reset_index().rename(columns={"Date": "date", "Datetime": "date"})
            if "date" not in df_pre.columns:
                # The index had some other name: use its values as the date column.
                df_pre = df_raw.copy()
                df_pre["date"] = df_pre.index
                df_pre = df_pre.reset_index(drop=True)
            df_pre = df_pre[["date", "open", "high", "low", "close", "volume"]]

            # Filter with the window that was actually requested.
            df_norm, stats = bronze_normalize(df_pre, ticker, start_utc, end_utc)
            attempts.append({"provider": "yfinance", "attempt": i + 1, "ok": True, "rows": int(len(df_norm)), "exception_message": None})
            return df_norm, stats, attempts
        except Exception as e_dl:
            # Every failure is recorded so the caller can report it and fall back.
            attempts.append({"provider": "yfinance", "attempt": i + 1, "ok": False, "rows": 0, "exception_message": str(e_dl)})
            if i < retries - 1 and delays:
                time.sleep(delays[min(i, len(delays) - 1)])
    return None, stats, attempts

def fetch_with_stooq(
    ticker: str,
    start_utc: pd.Timestamp,
    end_utc: pd.Timestamp,
    retries: int = 1
) -> Tuple[Optional[pd.DataFrame], Dict[str, int], List[Dict[str, Any]]]:
    """
    Collect daily bars from Stooq through the optional `pandas_datareader` package.

    The symbol is tried as given, then without '^', then without '^' in lower case;
    the first non-empty answer wins.

    Args:
        ticker: symbol to look up.
        start_utc: first day requested, also the lower bound of the Bronze filter.
        end_utc: last day requested, also the upper bound of the Bronze filter.
        retries: upper bound on attempts. Any failure ends the loop immediately
            (no backoff is configured for this provider), so at most one attempt
            is ever recorded.

    Returns:
        ``(df_final, stats, attempts)``. ``df_final`` is ``None`` on failure.
    """
    attempts: List[Dict[str, Any]] = []
    stats: Dict[str, int] = {"rows_before_cleaning": 0, "rows_after_cleaning": 0, "rows_dropped_ohlc": 0}
    need = {"open", "high", "low", "close", "volume"}

    for i in range(retries):
        try:
            from pandas_datareader import data as dr  # type: ignore
        except Exception as e_imp:
            attempts.append({"provider": "stooq", "attempt": i + 1, "ok": False, "rows": 0, "exception_message": f"IMPORT_ERROR: {e_imp}"})
            break
        try:
            candidates = [ticker, ticker.replace("^", ""), ticker.replace("^", "").lower()]
            df_raw = None
            last_exc = None
            for tk in candidates:
                try:
                    df_raw = dr.DataReader(tk, "stooq", start=start_utc.tz_localize(None), end=end_utc.tz_localize(None))
                    if df_raw is not None and not df_raw.empty:
                        break
                except Exception as e2:
                    # Remember the failure and move on to the next spelling.
                    last_exc = e2
                    continue
            if df_raw is None or df_raw.empty:
                raise RuntimeError(f"STOOQ_EMPTY: {last_exc}") if last_exc else RuntimeError("STOOQ_EMPTY: retorno vazio")

            # Put the rows in ascending date order before normalizing.
            df_raw = df_raw.sort_index()
            if isinstance(df_raw.columns, pd.MultiIndex):
                # Flatten on the level that carries the field names; the last level
                # is the fallback when no level matches.
                field_level = df_raw.columns.nlevels - 1
                for level in range(df_raw.columns.nlevels):
                    level_names = {str(v).lower() for v in df_raw.columns.get_level_values(level)}
                    if need.issubset(level_names):
                        field_level = level
                        break
                df_raw.columns = df_raw.columns.get_level_values(field_level)

            rename_map = {"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"}
            df_raw = df_raw.rename(columns=rename_map)
            if not need.issubset(set(df_raw.columns)):
                missing = sorted(list(need - set(df_raw.columns)))
                raise RuntimeError(f"SCHEMA_ERROR: faltam colunas em stooq: {missing}")

            df_pre = df_raw.copy()
            df_pre["date"] = df_pre.index
            df_pre = df_pre.reset_index(drop=True)
            df_pre = df_pre[["date", "open", "high", "low", "close", "volume"]]

            # Filter with the window that was actually requested.
            df_norm, stats = bronze_normalize(df_pre, ticker, start_utc, end_utc)
            attempts.append({"provider": "stooq", "attempt": i + 1, "ok": True, "rows": int(len(df_norm)), "exception_message": None})
            return df_norm, stats, attempts
        except Exception as e_dl:
            attempts.append({"provider": "stooq", "attempt": i + 1, "ok": False, "rows": 0, "exception_message": str(e_dl)})
            break
    return None, stats, attempts

# =========================
# Validações & Plano
# =========================
def validate_schema(df: pd.DataFrame) -> List[str]:
    """
    Check the frame against the Bronze schema.

    Verifies the exact column order (EXPECTED_COLUMNS), the dtype of every column
    (EXPECTED_DTYPES; any dtype whose name starts with 'string' is accepted for
    'ticker') and that 'ticker' has no nulls.

    Returns:
        A list of ``VALIDATION_ERROR`` messages; empty when the schema is correct.
    """
    erros = []
    if list(df.columns) != EXPECTED_COLUMNS:
        erros.append(f"VALIDATION_ERROR: schema de colunas incorreto. Esperado={EXPECTED_COLUMNS} Obtido={list(df.columns)}")
    dts = dtypes_signature(df)
    for c, dt_expected in EXPECTED_DTYPES.items():
        if c not in dts:
            erros.append(f"VALIDATION_ERROR: coluna ausente no DataFrame: {c}")
            continue
        got = dts[c]
        if c == "ticker":
            if not got.startswith("string"):
                erros.append(f"VALIDATION_ERROR: dtype incorreto para ticker. Esperado=string Obtido={got}")
        else:
            if got != dt_expected:
                erros.append(f"VALIDATION_ERROR: dtype incorreto para {c}. Esperado={dt_expected} Obtido={got}")
    # A missing 'ticker' column has already been reported above; do not turn that
    # report into a KeyError by indexing the column unconditionally.
    if "ticker" in df.columns and df["ticker"].isna().any():
        erros.append("VALIDATION_ERROR: ticker contém valores nulos (deve ser 0%).")
    return erros

def validate_quality(df: pd.DataFrame) -> List[str]:
    """
    Run the data-quality checks on a Bronze frame.

    Checks: at least 2500 rows; 0% nulls in 'date', 'close' and 'ticker';
    no duplicated dates; 'date' monotonically increasing.

    Returns:
        A list of ``VALIDATION_ERROR`` messages; empty when every check passes.
    """
    problems = []

    n_rows = len(df)
    if n_rows < 2500:
        problems.append(f"VALIDATION_ERROR: cobertura insuficiente — linhas={n_rows} (< 2500)")

    null_pct = percent_nulls(df)
    for col in ("date", "close", "ticker"):
        # A column absent from the report counts as fully null.
        pct = null_pct.get(col, 100.0)
        if round(pct, 6) != 0.0:
            problems.append(f"VALIDATION_ERROR: % nulos em {col} deve ser 0%, obtido={pct:.6f}%")

    dup_count = int(df.duplicated(subset=["date"]).sum())
    if dup_count != 0:
        problems.append(f"VALIDATION_ERROR: duplicatas por date detectadas (= {dup_count})")

    if not df["date"].is_monotonic_increasing:
        problems.append("VALIDATION_ERROR: coluna date não é monotônica crescente.")

    return problems

def validate_interval_with_tolerance(df: pd.DataFrame, start_utc: pd.Timestamp) -> Tuple[List[str], Dict[str, Any]]:
    """
    Check that the data covers the expected period, within tolerances.

    Start: the first date must be no later than ``start_utc`` plus 5 business days.
    End: the last date must be no earlier than today (UTC, midnight) minus 3 days.

    Returns:
        ``(errors, info)``. ``errors`` lists ``VALIDATION_ERROR`` messages. ``info``
        carries the observed bounds, the computed limits and an OK/FAIL verdict for
        each end; for an empty frame it only carries the bounds (``None``) and the
        two FAIL verdicts.
    """
    if df.empty:
        empty_info = {"date_min": None, "date_max": None, "start_verdict": "FAIL", "end_verdict": "FAIL"}
        return ["VALIDATION_ERROR: DataFrame vazio após ingestão."], empty_info

    problems = []
    first_date = pd.to_datetime(df["date"].min())
    last_date = pd.to_datetime(df["date"].max())

    # Start side: compare as naive values.
    if start_utc.tzinfo:
        required_start = start_utc.tz_convert(None).tz_localize(None)
    else:
        required_start = start_utc
    tolerance_day = (required_start + BDay(5)).to_pydatetime().date()
    tolerance_ts = pd.Timestamp(tolerance_day)
    start_ok = first_date <= tolerance_ts.to_pydatetime()
    if not start_ok:
        problems.append(f"VALIDATION_ERROR: date.min ({first_date.date().isoformat()}) > tolerância de início ({tolerance_day.isoformat()})")

    # End side: measured against the current UTC day at call time.
    today_utc = pd.Timestamp(datetime.now(timezone.utc)).normalize()
    required_end_min = (today_utc - pd.Timedelta(days=3)).tz_localize(None)
    end_ok = last_date >= required_end_min
    if not end_ok:
        problems.append(f"VALIDATION_ERROR: date.max ({last_date.date().isoformat()}) < requerido mínimo ({required_end_min.date().isoformat()}) (tolerância 3 dias)")

    summary = {
        "date_min": first_date,
        "date_max": last_date,
        "required_start": required_start,
        "start_tolerance_max": tolerance_ts,
        "required_end_min": required_end_min,
        "start_verdict": "OK" if start_ok else "FAIL",
        "end_verdict": "OK" if end_ok else "FAIL",
    }
    return problems, summary

def build_persistence_plan(df: pd.DataFrame) -> Dict[str, Any]:
    """
    Describe, without writing anything, how the frame would be persisted.

    Returns:
        A dict with the parquet target path, one ``year=YYYY`` partition label per
        year present in 'date', the manifest path, the manifest CSV header and a
        sample manifest row.
    """
    import csv
    import io

    def _csv_line(fields: List[Any]) -> str:
        """Format one CSV record, quoting fields that contain commas or quotes."""
        buf = io.StringIO()
        csv.writer(buf, lineterminator="").writerow(fields)
        return buf.getvalue()

    dates = pd.to_datetime(df["date"])
    years = sorted(dates.dt.year.unique().tolist())
    partitions = [f"year={y}" for y in years]
    manifesto_header = ["timestamp", "ticker", "rows_total", "date_min", "date_max", "columns_json", "partitions_json", "target_path"]
    manifesto_row = [
        AGORA.isoformat(),
        TICKER,
        int(len(df)),
        str(dates.min()),
        str(dates.max()),
        json.dumps(EXPECTED_COLUMNS, ensure_ascii=False),
        json.dumps(partitions, ensure_ascii=False),
        str(PARQUET_TARGET),
    ]
    return {
        "parquet_target": str(PARQUET_TARGET),
        "partitions": partitions,
        "manifesto_path": str(MANIFESTO_TARGET),
        "manifesto_header": _csv_line(manifesto_header),
        # The JSON fields contain commas and double quotes; a plain ",".join would
        # yield a row that no longer parses into the 8 columns of the header.
        "manifesto_row_sample": _csv_line(manifesto_row),
    }

# =========================
# Execução Principal
# =========================
def main():
    """
    Run the whole collection: fetch, normalize, validate and report.

    Providers are tried in order (Yahoo Chart, yfinance, Stooq) and the first
    non-empty result is used. Everything is reported on stdout: attempts, schema,
    interval verdicts, quality metrics, head/tail samples, counts, the simulated
    persistence plan, normative errors, the checklist and a result summary.
    Nothing is written to disk.
    """
    provider_attempts: List[Dict[str, Any]] = []

    bronze_ibov: Optional[pd.DataFrame] = None
    used_provider: Optional[str] = None
    cleaning_stats: Dict[str, int] = {"rows_before_cleaning": 0, "rows_after_cleaning": 0, "rows_dropped_ohlc": 0}

    # Provider cascade: each entry is only tried when all previous ones failed.
    provider_chain = (
        ("yahoo-chart", fetch_yahoo_chart_direct, PERIOD2_NOW_UTC),
        ("yfinance", fetch_with_yfinance, END_DATE_UTC),
        ("stooq", fetch_with_stooq, END_DATE_UTC),
    )
    for provider_name, fetch, end_ts in provider_chain:
        df_candidate, stats_candidate, attempts = fetch(TICKER, START_DATE_UTC, end_ts)
        provider_attempts.extend(attempts)
        if df_candidate is not None and not df_candidate.empty:
            bronze_ibov = df_candidate
            used_provider = provider_name
            cleaning_stats = stats_candidate
            break

    # Every provider failed: report attempts, a failed checklist and stop.
    if bronze_ibov is None or bronze_ibov.empty:
        print_section("PROVEDORES E TENTATIVAS")
        print(json.dumps(provider_attempts, ensure_ascii=False, indent=2))
        # Most informative error: the last failed attempt that carries a message.
        last_err = next(
            (
                att.get("exception_message")
                for att in reversed(provider_attempts)
                if not att.get("ok") and att.get("exception_message")
            ),
            None,
        )
        print(f"VALIDATION_ERROR: PROVIDERS_EXHAUSTED — {last_err if last_err else 'sem mensagem detalhada.'}")
        print_section("CHECKLIST")
        checklist = {
            "provider_attempts_listed": "ok",
            "schema_columns_and_dtypes_exact": "falha",
            "interval_tolerance_verdicts": "falha",
            "quality_nulls_and_duplicates": "falha",
            "sample_head_tail_presented": "falha",
            "counts_included": "falha",
            "persistence_plan_simulated": "ok",
        }
        print(json.dumps(checklist, ensure_ascii=False, indent=2))
        for k, v in checklist.items():
            if v != "ok":
                print(f"CHECKLIST_FAILURE: {k} não atendido.")
        print_section("DÚVIDAS OBJETIVAS")
        print("- Rede pode estar bloqueada para Yahoo/Stooq? Há Proxy que devamos configurar?")
        print("- Deseja fornecer outro provedor (AlphaVantage/Polygon) com chave?")
        print("- Autoriza aumentar timeouts/backoff e tentar novamente?")
        return

    # Re-assert dtypes, ordering and ticker regardless of which provider answered.
    bronze_ibov = bronze_ibov.copy()
    bronze_ibov["date"] = pd.to_datetime(bronze_ibov["date"], errors="coerce").dt.normalize()
    for c in ["open", "high", "low", "close"]:
        bronze_ibov[c] = pd.to_numeric(bronze_ibov[c], errors="coerce").astype("float64")
    bronze_ibov["volume"] = pd.to_numeric(bronze_ibov["volume"], errors="coerce").fillna(0).astype("int64")
    bronze_ibov["ticker"] = pd.Series([TICKER] * len(bronze_ibov), dtype="string").astype("string")
    bronze_ibov = bronze_ibov[EXPECTED_COLUMNS].sort_values("date").drop_duplicates(subset=["date"], keep="last").reset_index(drop=True)

    # Validations
    schema_errors = validate_schema(bronze_ibov)
    qual_errors = validate_quality(bronze_ibov)
    interval_errors, interval_info = validate_interval_with_tolerance(bronze_ibov, START_DATE_UTC)
    erros_normativos: List[str] = schema_errors + qual_errors + interval_errors

    # Metrics
    total_linhas = int(len(bronze_ibov))
    dias_unicos = int(bronze_ibov["date"].nunique()) if total_linhas > 0 else 0
    dias_vol_zero = int((bronze_ibov["volume"] == 0).sum()) if total_linhas > 0 else 0
    pct_nulos = percent_nulls(bronze_ibov)
    dups_by_date = int(bronze_ibov.duplicated(subset=["date"]).sum())

    # Persistence plan (simulated)
    persist_plan = build_persistence_plan(bronze_ibov)

    # Reports
    print_section("PROVEDOR E TENTATIVAS")
    print(json.dumps({"provider_used": used_provider, "rows_returned": total_linhas}, ensure_ascii=False, indent=2))
    print(json.dumps(provider_attempts, ensure_ascii=False, indent=2))

    print_section("SCHEMA (EXATO)")
    schema_out = {
        "columns_expected": EXPECTED_COLUMNS,
        "columns_obtained": list(bronze_ibov.columns),
        "dtypes_obtained": dtypes_signature(bronze_ibov),
        "nulls_percent": {k: round(v, 6) for k, v in pct_nulos.items()},
        "ticker_dtype_is_string": str(bronze_ibov.dtypes["ticker"]).startswith("string"),
        "ticker_nulls_percent": round(pct_nulos.get("ticker", 100.0), 6),
    }
    print(json.dumps(schema_out, ensure_ascii=False, indent=2))

    print_section("INTERVALO TEMPORAL (com tolerâncias)")
    # The limit keys only exist in interval_info when the bounds were computed.
    has_min = interval_info["date_min"] is not None
    has_max = interval_info["date_max"] is not None
    interval_out = {
        "required_start": str(interval_info["required_start"]) if has_min else None,
        "start_tolerance_max": str(interval_info["start_tolerance_max"]) if has_min else None,
        "required_end_min": str(interval_info["required_end_min"]) if has_max else None,
        "date_min": str(pd.to_datetime(interval_info["date_min"])) if has_min else None,
        "date_max": str(pd.to_datetime(interval_info["date_max"])) if has_max else None,
        "start_verdict": interval_info.get("start_verdict", "FAIL"),
        "end_verdict": interval_info.get("end_verdict", "FAIL"),
    }
    print(json.dumps(interval_out, ensure_ascii=False, indent=2))

    print_section("QUALIDADE")
    qualidade_out = {
        "percent_nulls": {k: round(v, 6) for k, v in pct_nulos.items()},
        "duplicates_by_date": dups_by_date,
        "constraints": {
            "nulls_must_be_zero_in": {"date": True, "close": True, "ticker": True},
            "duplicates_by_date_must_be_zero": True,
            "min_rows_required": 2500,
            "date_monotonic_increasing": True
        }
    }
    print(json.dumps(qualidade_out, ensure_ascii=False, indent=2))

    print_section("AMOSTRA — HEAD(10)")
    print(bronze_ibov[["date", "close", "volume", "ticker"]].head(10).to_string(index=False))

    print_section("AMOSTRA — TAIL(10)")
    print(bronze_ibov[["date", "close", "volume", "ticker"]].tail(10).to_string(index=False))

    print_section("CONTAGENS")
    print(json.dumps({
        "rows_before_cleaning": cleaning_stats.get("rows_before_cleaning", 0),
        "rows_dropped_ohlc": cleaning_stats.get("rows_dropped_ohlc", 0),
        "rows_after_cleaning": cleaning_stats.get("rows_after_cleaning", 0),
        "unique_days": dias_unicos,
        "days_with_volume_zero": dias_vol_zero,
        "final_rows": total_linhas
    }, ensure_ascii=False, indent=2))

    print_section("PLANO DE PERSISTÊNCIA (SIMULADO)")
    print(json.dumps({
        "dry_run": DRY_RUN,
        "parquet_target": persist_plan["parquet_target"],
        "partitions": persist_plan["partitions"],
        "manifesto_path": persist_plan["manifesto_path"],
        "manifesto_header": persist_plan["manifesto_header"],
        "manifesto_row_sample": persist_plan["manifesto_row_sample"],
        "nota": "Nenhuma escrita realizada em dry_run=True."
    }, ensure_ascii=False, indent=2))

    # Normative errors (if any): printed once each, in first-seen order.
    if erros_normativos:
        print_section("ERROS NORMATIVOS")
        for e in dict.fromkeys(erros_normativos):
            if str(e).startswith(("VALIDATION_ERROR", "CHECKLIST_FAILURE")):
                print(e)
            else:
                print(f"VALIDATION_ERROR: {e}")

    # Checklist: derived from the validation results computed above.
    print_section("CHECKLIST")
    schema_ok = not schema_errors
    interval_ok = (not interval_errors and interval_info.get("start_verdict") == "OK" and interval_info.get("end_verdict") == "OK")
    quality_ok = not qual_errors
    sample_ok = (total_linhas > 0)
    counts_ok = True  # counts are always printed
    attempts_ok = True
    plan_ok = True

    checklist = {
        "provider_attempts_listed": "ok" if attempts_ok else "falha",
        "schema_columns_and_dtypes_exact": "ok" if schema_ok else "falha",
        "interval_tolerance_verdicts": "ok" if interval_ok else "falha",
        "quality_nulls_and_duplicates": "ok" if quality_ok else "falha",
        "sample_head_tail_presented": "ok" if sample_ok else "falha",
        "counts_included": "ok" if counts_ok else "falha",
        "persistence_plan_simulated": "ok" if plan_ok else "falha",
    }
    print(json.dumps(checklist, ensure_ascii=False, indent=2))
    for k, v in checklist.items():
        if v != "ok":
            print(f"CHECKLIST_FAILURE: {k} não atendido.")

    # Result structure (info)
    print_section("ESTRUTURA DO RESULTADO (info)")
    resultado = {
        "ticker": TICKER,
        "periodo": {"start": str(START_DATE_UTC.tz_localize(None)), "end": str(END_DATE_UTC.tz_localize(None))},
        "dry_run": DRY_RUN,
        "timestamp_execucao": AGORA.isoformat(),
        "dataframe_name": "bronze_ibov",
        "columns": EXPECTED_COLUMNS,
        "dtypes": dtypes_signature(bronze_ibov),
        "provider_used": used_provider,
        "status": "sucesso" if not erros_normativos and all(v == "ok" for v in checklist.values()) else "falha"
    }
    print(json.dumps(resultado, ensure_ascii=False, indent=2))

if __name__ == "__main__":
    # Contract
    # - Direct Yahoo Chart collection (requests/stdlib) → yfinance → stooq (no synthetic data)
    # - Bronze normalization and validations: schema, quality, calendar tolerances
    # - Persistence plans (simulated), checklist and normative messages
    main()


{
  "provider_used": "yahoo-chart",
  "rows_returned": 3399
}
[
  {
    "provider": "yahoo-chart",
    "attempt": 1,
    "ok": true,
    "rows": 3399,
    "exception_message": null
  }
]

{
  "columns_expected": [
    "date",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "ticker"
  ],
  "columns_obtained": [
    "date",
    "open",
    "high",
    "low",
    "close",
    "volume",
    "ticker"
  ],
  "dtypes_obtained": {
    "date": "datetime64[ns]",
    "open": "float64",
    "high": "float64",
    "low": "float64",
    "close": "float64",
    "volume": "int64",
    "ticker": "string"
  },
  "nulls_percent": {
    "date": 0.0,
    "open": 0.0,
    "high": 0.0,
    "low": 0.0,
    "close": 0.0,
    "volume": 0.0,
    "ticker": 0.0
  },
  "ticker_dtype_is_string": true,
  "ticker_nulls_percent": 0.0
}

{
  "required_start": "2012-01-01 00:00:00",
  "start_tolerance_max": "2012-01-06 00:00:00",
  "required_end_min": "2025-09-15 00:00:00",
  "date_min": "2012-01-03 00