In [4]:
# célula Jupyter atualizada — Baixa IBOV (^BVSP) desde 2012-01-01, salva Bronze parquet e atualiza manifesto
# Mudanças principais:
# - Achata MultiIndex de colunas (junta níveis com "_") e depois normaliza para snake_case (1a).
# - Se coluna 'date' inexistente ou inteiramente NaT, PARA e imprime diagnóstico sem salvar (2a).
# - Adiciona prints diagnósticos de df.columns e tipo antes da deduplicação (3sim).
# - Se parquet alvo existir, adiciona sufixo timestamp para não sobrescrever (4b).
# - Pode ser inserida no notebook (5sim).
import os
import json
import hashlib
from datetime import datetime, timezone
from pathlib import Path

import pandas as pd
import yfinance as yf

# ---- Helpers ----
def sha256_of_file(path, chunk_size=65536):
    """Return the hex SHA-256 digest of the file at *path*.

    The file is read in binary mode, *chunk_size* bytes at a time, so the
    whole content is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(chunk_size)
            if not block:
                # An empty read marks end of file.
                break
            digest.update(block)
    return digest.hexdigest()

def ensure_dir(path):
    """Create directory *path*, including missing parents.

    Does nothing when the directory already exists.
    """
    os.makedirs(name=path, exist_ok=True)

def flatten_multiindex_columns(df):
    """Give *df* flat, string-only column labels and return it.

    MultiIndex columns are collapsed by joining the non-empty, non-None
    levels of each label with "_". Plain columns are simply cast to ``str``.
    The frame is modified in place; the same object is returned.
    """
    if not isinstance(df.columns, pd.MultiIndex):
        # Single-level columns: only guarantee that every label is a string.
        df.columns = [str(label) for label in df.columns]
        return df

    flattened = []
    for levels in df.columns:
        kept = [
            str(level)
            for level in levels
            if level is not None and str(level) != ""
        ]
        # Joining an empty list yields "", the label used when no level survives.
        flattened.append("_".join(kept))
    df.columns = flattened
    return df

def to_snake_case_name(s):
    """Return a minimal snake_case version of the string *s*.

    Surrounding whitespace is stripped; spaces, hyphens, colons, slashes and
    backslashes become "_"; runs of "_" collapse to one; the result is
    lower-cased. Other characters (e.g. "^") are left untouched.
    """
    import re

    underscored = re.sub(r"[ \-:/\\]", "_", s.strip())
    collapsed = re.sub(r"_{2,}", "_", underscored)
    return collapsed.lower()

def to_snake_case_cols(df):
    """Return a new DataFrame whose column labels are snake_cased.

    Each label is passed through ``to_snake_case_name``. Labels are coerced
    to ``str`` first so non-string labels (e.g. integers) do not fail.
    The input frame is not modified: ``DataFrame.rename`` returns a new
    object, so no explicit copy is needed.
    """
    renamed = {label: to_snake_case_name(str(label)) for label in df.columns}
    return df.rename(columns=renamed)

def csv_quote(value):
    """Return ``str(value)`` formatted as a single CSV field.

    The text is wrapped in double quotes (with embedded quotes doubled) when
    it contains a double quote, a comma, a line feed or a carriage return;
    otherwise it is returned unchanged. A bare carriage return must be quoted
    too, because CSV readers treat it as a record separator.
    """
    text = str(value)
    if any(ch in text for ch in ('"', ",", "\n", "\r")):
        return '"' + text.replace('"', '""') + '"'
    return text

def timestamp_suffix():
    """Return the current UTC time as ``YYYYMMDD_HHMMSS`` for use in filenames.

    Uses a timezone-aware ``datetime.now(timezone.utc)`` instead of the
    deprecated ``datetime.utcnow()``; the formatted text is the same.
    """
    return datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")

# ---- Params / Paths ----
# TARGET_DIR receives the parquet output; MANIFEST_PATH is the CSV that gets
# one appended row per successful run.
ROOT = Path("/home/wrm/BOLSA_2026")
TARGET_DIR = ROOT / "dados_originais"
MANIFEST_PATH = TARGET_DIR / "manifesto_dados_originais_bronze_ibov.csv"
TICKER = "^BVSP"
REQ_START = "2012-01-01"
# Midnight (UTC) of the current day. Timestamp.now(tz="UTC") is the
# replacement for the deprecated Timestamp.utcnow() and is likewise tz-aware.
today_utc = pd.Timestamp.now(tz="UTC").normalize()
REQ_END = (today_utc + pd.Timedelta(days=1)).strftime("%Y-%m-%d")  # exclusive end

# The output directory must exist before anything is written into it.
ensure_dir(TARGET_DIR)

# ---- Download ----
# Daily history for TICKER over [REQ_START, REQ_END). auto_adjust=False is
# passed so that the adjusted close arrives as its own column next to the raw
# close (see the 'adj_close' handling further down).
df_raw = yf.download(
    TICKER,
    start=REQ_START,
    end=REQ_END,
    auto_adjust=False,
    progress=False,
)

# ---- Process & Validate ----
# Shapes the raw download into the Bronze schema
# (date, open, high, low, close, adj_close, volume, ticker), writes it as a
# snappy parquet file and appends one audit row to the manifest CSV.
# Nothing is written when the download is empty or no usable date survives.
if isinstance(df_raw, pd.DataFrame) and not df_raw.empty:
    # Work on a copy: flatten_multiindex_columns relabels its argument in
    # place, and df_raw should stay as downloaded.
    df = flatten_multiindex_columns(df_raw.copy())

    # When no column is datetime-typed, surface the index as a regular column
    # (the dates are expected to live there).
    if not any(pd.api.types.is_datetime64_any_dtype(dt) for dt in df.dtypes):
        df = df.reset_index()

    # Normalize column names to snake_case after flattening.
    df = to_snake_case_cols(df)

    # Diagnostics: column labels and a small sample before any schema work.
    print("DEBUG df.columns:", list(df.columns))
    print("DEBUG type(df.columns):", type(df.columns))
    print("DEBUG sample rows (up to 3):")
    try:
        print(df.head(3).to_dict(orient="records"))
    except Exception:
        # Diagnostics are best-effort and must never abort the run.
        print("DEBUG: cannot show sample rows")

    # Make sure a 'date' column exists, trying progressively weaker sources.
    if "date" not in df.columns:
        if "index" in df.columns:
            # Name given by reset_index() to an unnamed index.
            df = df.rename(columns={"index": "date"})
        elif "date_local" in df.columns:
            df = df.rename(columns={"date_local": "date"})
        else:
            # First datetime-typed column, if any.
            datetime_col = next(
                (
                    name
                    for name, dt in df.dtypes.items()
                    if pd.api.types.is_datetime64_any_dtype(dt)
                ),
                None,
            )
            if datetime_col is not None:
                df = df.rename(columns={datetime_col: "date"})
            else:
                # Last resort: parse the original index; an unparseable index
                # yields an all-NaT column, which aborts the write below.
                try:
                    date_values = pd.to_datetime(df_raw.index)
                except (ValueError, TypeError, OverflowError):
                    date_values = pd.NaT
                df.insert(0, "date", date_values)

    # Coerce date dtype (unparseable values become NaT).
    df["date"] = pd.to_datetime(df["date"], errors="coerce")

    if not df["date"].notna().any():
        # No usable date at all: report and stop without writing anything.
        print("ERROR: 'date' column missing or all NaT after processing. Aborting write.")
        print("DIAGNOSTIC df.columns:", list(df.columns))
        print("DIAGNOSTIC head (5):")
        try:
            print(df.head(5).to_dict(orient="records"))
        except Exception:
            print("DIAGNOSTIC: cannot show head")
        print("Manifest path (no write):", MANIFEST_PATH)
    else:
        # Flattened MultiIndex labels carry the ticker as a suffix
        # (e.g. 'close_^bvsp'). Strip it so the real price/volume columns
        # match the schema names instead of being replaced by all-NA
        # placeholders below.
        ticker_suffix = "_" + to_snake_case_name(TICKER)
        df = df.rename(
            columns={
                col: col[: -len(ticker_suffix)]
                for col in df.columns
                if col.endswith(ticker_suffix) and col != ticker_suffix
            }
        )

        # Columns genuinely absent from the download are added as all-NA.
        for c in ["open", "high", "low", "close", "volume"]:
            if c not in df.columns:
                df[c] = pd.NA

        # Ensure adj_close exists, accepting any column that mentions both
        # 'adj' and 'close' as a fallback.
        if "adj_close" not in df.columns:
            adj_candidates = [col for col in df.columns if "adj" in col and "close" in col]
            if adj_candidates:
                df = df.rename(columns={adj_candidates[0]: "adj_close"})
            else:
                df["adj_close"] = pd.NA

        df["ticker"] = TICKER

        # Keep only the expected schema, in this order (all columns exist by now).
        cols_required = ["date", "open", "high", "low", "close", "adj_close", "volume", "ticker"]
        df = df[cols_required]

        # Sort by date ascending (NaT to the end).
        df = df.sort_values("date", ascending=True, na_position="last").reset_index(drop=True)

        # Count rows sharing a date with a later row, then keep the last of each.
        duplicates_count = int(df.duplicated(subset=["date"], keep="last").sum())
        df = df.drop_duplicates(subset=["date"], keep="last").reset_index(drop=True)
        after_rows = len(df)

        # NaNs per column
        nan_counts = {col: int(df[col].isna().sum()) for col in df.columns}

        # Date range over valid dates only; "na" is a defensive fallback.
        valid_dates = df["date"].dropna()
        if valid_dates.empty:
            date_min_str = date_max_str = "na"
        else:
            date_min_str = valid_dates.min().strftime("%Y-%m-%d")
            date_max_str = valid_dates.max().strftime("%Y-%m-%d")

        # Never overwrite an existing parquet: add a timestamp suffix instead.
        parquet_path = TARGET_DIR / f"IBOV_{date_min_str}_{date_max_str}.parquet"
        if parquet_path.exists():
            parquet_path = TARGET_DIR / f"IBOV_{date_min_str}_{date_max_str}_{timestamp_suffix()}.parquet"

        df.to_parquet(parquet_path, engine="pyarrow", compression="snappy", index=False)

        # SHA256 of the file as written, recorded in the manifest.
        file_sha256 = sha256_of_file(parquet_path)

        # Manifest entry. datetime.now(timezone.utc) replaces the deprecated
        # datetime.utcnow() and produces the same ISO string with +00:00.
        manifest_row = {
            "run_ts": datetime.now(timezone.utc).isoformat(),
            "ticker": TICKER,
            "start_req": REQ_START,
            "end_req": today_utc.strftime("%Y-%m-%d"),
            "date_min": date_min_str,
            "date_max": date_max_str,
            "rows": after_rows,
            "cols": len(df.columns),
            "duplicates_dropped": duplicates_count,
            "nan_counts_json": json.dumps(nan_counts, ensure_ascii=False),
            "parquet_path": str(parquet_path),
            "parquet_sha256": file_sha256,
        }
        # Column order of the manifest is the insertion order of the dict above.
        manifest_fields = list(manifest_row)

        # Append to manifest, writing the header first when the file is new.
        write_header = not MANIFEST_PATH.exists()
        with open(MANIFEST_PATH, "a", encoding="utf-8") as f:
            if write_header:
                f.write(",".join(manifest_fields) + "\n")
            f.write(",".join(csv_quote(manifest_row[field]) for field in manifest_fields) + "\n")

        # Required prints (exact lines)
        print(f"Bronze target parquet: {parquet_path}")
        print(f"Exists: {parquet_path.exists()}")
        print(f"Rows, cols: ({after_rows}, {len(df.columns)})")
        print(f"Columns: {list(df.columns)}")
        print(f"Dataframe date range: {date_min_str} -> {date_max_str}")
        print(f"Duplicate rows by date: {duplicates_count}")
        print(f"NaNs per column: {nan_counts}")
        print(f"File sha256: {file_sha256}")
        print(f"Manifest path: {MANIFEST_PATH}")
        print(f"Saved bronze parquet and updated manifest: {parquet_path} {MANIFEST_PATH}")

else:
    # No data downloaded: emit the same report lines with empty values.
    print("Bronze target parquet: None")
    print("Exists: False")
    print("Rows, cols: (0, 0)")
    print("Columns: []")
    print("Dataframe date range: None -> None")
    print("Duplicate rows by date: 0")
    print("NaNs per column: {}")
    print("File sha256: None")
    print(f"Manifest path: {MANIFEST_PATH}")
    print("Saved bronze parquet and updated manifest: None None")

DEBUG df.columns: ['date', 'adj_close_^bvsp', 'close_^bvsp', 'high_^bvsp', 'low_^bvsp', 'open_^bvsp', 'volume_^bvsp']
DEBUG type(df.columns): <class 'pandas.core.indexes.base.Index'>
DEBUG sample rows (up to 3):
[{'date': Timestamp('2012-01-03 00:00:00'), 'adj_close_^bvsp': 59265.0, 'close_^bvsp': 59265.0, 'high_^bvsp': 59288.0, 'low_^bvsp': 57836.0, 'open_^bvsp': 57836.0, 'volume_^bvsp': 3083000}, {'date': Timestamp('2012-01-04 00:00:00'), 'adj_close_^bvsp': 59365.0, 'close_^bvsp': 59365.0, 'high_^bvsp': 59519.0, 'low_^bvsp': 58558.0, 'open_^bvsp': 59263.0, 'volume_^bvsp': 2252000}, {'date': Timestamp('2012-01-05 00:00:00'), 'adj_close_^bvsp': 58546.0, 'close_^bvsp': 58546.0, 'high_^bvsp': 59354.0, 'low_^bvsp': 57963.0, 'open_^bvsp': 59354.0, 'volume_^bvsp': 2351200}]
Bronze target parquet: /home/wrm/BOLSA_2026/dados_originais/IBOV_2012-01-03_2025-09-17.parquet
Exists: True
Rows, cols: (3398, 8)
Columns: ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume', 'ticker']
Datafra

  run_ts = datetime.utcnow().replace(tzinfo=timezone.utc).isoformat()
