In [1]:
# ETAPA: DOWNLOAD DE 3 ANOS (OHLCV DIÁRIO) — 24 TICKERS + IBOV
# Objetivo: baixar 3 anos de dados para a carteira base e salvar Parquets + manifesto.

%pip install tqdm
from pathlib import Path
from datetime import datetime, timedelta, timezone
import pandas as pd
from tqdm import tqdm
import yfinance as yf

# Diretório SSOT local
BASE_DIR = Path("/home/wrm/BOLSA_2026/dados_originais")
BASE_DIR.mkdir(parents=True, exist_ok=True)

# Seleção interna balanceada (24)
TICKERS_24 = [
    # Financeiro (4)
    "ITUB4.SA", "BBAS3.SA", "B3SA3.SA", "PSSA3.SA",
    # Materiais / Papel & Celulose / Metais (4)
    "VALE3.SA", "GGBR4.SA", "CSNA3.SA", "SUZB3.SA",
    # Energia / Óleo & Gás / Combustíveis (3)
    "PETR4.SA", "PRIO3.SA", "UGPA3.SA",
    # Utilidades Elétricas (3)
    "ELET3.SA", "TAEE11.SA", "CPLE6.SA",
    # Saneamento (1)
    "SBSP3.SA",
    # Telecom (2)
    "VIVT3.SA", "TIMS3.SA",
    # Saúde (2)
    "RDOR3.SA", "HAPV3.SA",
    # Consumo / Industrial / Tecnologia (4)
    "ABEV3.SA", "WEGE3.SA", "TOTS3.SA", "LREN3.SA",
    # Transporte / Infra (1)
    "RAIL3.SA",
]

# Índice Ibovespa
IBOV_TICKER = "^BVSP"

# Janela temporal (3 anos até hoje, com folga)
agora = datetime.now(timezone.utc).astimezone()
data_fim = agora.date()
data_inicio = data_fim - timedelta(days=365*3 + 10)

# Validações
assert len(TICKERS_24) == 24, f"Esperados 24 tickers, obtidos {len(TICKERS_24)}"
for tk in TICKERS_24:
    if not tk.endswith(".SA"):
        raise ValueError(f"Ticker sem sufixo .SA: {tk}")

def baixar_e_salvar(ticker: str, start_date: str, end_date: str, base_dir: Path) -> dict:
    df = yf.download(
        ticker, start=start_date, end=end_date,
        progress=False, interval="1d", auto_adjust=False, threads=True
    )
    if not df.empty:
        df = (df.rename(columns={
                "Open": "open", "High": "high", "Low": "low",
                "Close": "close", "Adj Close": "adj_close", "Volume": "volume"
             })
             .reset_index()
             .rename(columns={"Date": "date"}))
        df["ticker"] = ticker
        out_path = base_dir / f"{ticker.replace('^','').replace('.','_')}_3y.parquet"
        df.to_parquet(out_path, index=False)
        return {
            "ticker": ticker, "rows": len(df),
            "first_date": df["date"].min().strftime("%Y-%m-%d"),
            "last_date": df["date"].max().strftime("%Y-%m-%d"),
            "file_path": str(out_path), "status": "OK"
        }
    else:
        return {"ticker": ticker, "rows": 0, "first_date": None, "last_date": None,
                "file_path": None, "status": "VAZIO"}

metas = []
for tk in tqdm(TICKERS_24 + [IBOV_TICKER], desc="Baixando 3 anos (OHLCV)"):
    metas.append(baixar_e_salvar(
        ticker=tk,
        start_date=data_inicio.strftime("%Y-%m-%d"),
        end_date=data_fim.strftime("%Y-%m-%d"),
        base_dir=BASE_DIR
    ))

manifesto = pd.DataFrame(metas)
manifesto_path = BASE_DIR / "manifesto_dados_originais_3y.csv"
manifesto.to_csv(manifesto_path, index=False)

print("\nResumo do manifesto:")
print(manifesto[["ticker", "rows", "first_date", "last_date", "status", "file_path"]]
      .to_string(index=False))
print(f"\nArquivos salvos em: {BASE_DIR}")
print(f"Manifesto: {manifesto_path}")


Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)


Installing collected packages: tqdm


Successfully installed tqdm-4.67.1


Note: you may need to restart the kernel to use updated packages.



Baixando 3 anos (OHLCV):   0%|                                       | 0/25 [00:00<?, ?it/s]


Baixando 3 anos (OHLCV):   4%|█▏                             | 1/25 [00:01<00:29,  1.23s/it]


Baixando 3 anos (OHLCV):   8%|██▍                            | 2/25 [00:01<00:19,  1.18it/s]


Baixando 3 anos (OHLCV):  12%|███▋                           | 3/25 [00:02<00:15,  1.43it/s]


Baixando 3 anos (OHLCV):  16%|████▉                          | 4/25 [00:02<00:12,  1.62it/s]


Baixando 3 anos (OHLCV):  20%|██████▏                        | 5/25 [00:03<00:12,  1.61it/s]


Baixando 3 anos (OHLCV):  24%|███████▍                       | 6/25 [00:04<00:11,  1.62it/s]


Baixando 3 anos (OHLCV):  28%|████████▋                      | 7/25 [00:04<00:11,  1.62it/s]


Baixando 3 anos (OHLCV):  32%|█████████▉                     | 8/25 [00:05<00:10,  1.63it/s]


Baixando 3 anos (OHLCV):  36%|███████████▏                   | 9/25 [00:05<00:09,  1.72it/s]


Baixando 3 anos (OHLCV):  40%|████████████                  | 10/25 [00:06<00:09,  1.61it/s]


Baixando 3 anos (OHLCV):  44%|█████████████▏                | 11/25 [00:07<00:08,  1.69it/s]


Baixando 3 anos (OHLCV):  48%|██████████████▍               | 12/25 [00:07<00:08,  1.52it/s]


Baixando 3 anos (OHLCV):  52%|███████████████▌              | 13/25 [00:08<00:07,  1.54it/s]


Baixando 3 anos (OHLCV):  56%|████████████████▊             | 14/25 [00:09<00:07,  1.57it/s]


Baixando 3 anos (OHLCV):  60%|██████████████████            | 15/25 [00:09<00:06,  1.57it/s]


Baixando 3 anos (OHLCV):  64%|███████████████████▏          | 16/25 [00:10<00:05,  1.61it/s]


Baixando 3 anos (OHLCV):  68%|████████████████████▍         | 17/25 [00:10<00:04,  1.70it/s]


Baixando 3 anos (OHLCV):  72%|█████████████████████▌        | 18/25 [00:11<00:03,  1.76it/s]


Baixando 3 anos (OHLCV):  76%|██████████████████████▊       | 19/25 [00:11<00:03,  1.81it/s]


Baixando 3 anos (OHLCV):  80%|████████████████████████      | 20/25 [00:12<00:02,  1.77it/s]


Baixando 3 anos (OHLCV):  84%|█████████████████████████▏    | 21/25 [00:13<00:02,  1.72it/s]


Baixando 3 anos (OHLCV):  88%|██████████████████████████▍   | 22/25 [00:13<00:01,  1.69it/s]


Baixando 3 anos (OHLCV):  92%|███████████████████████████▌  | 23/25 [00:14<00:01,  1.60it/s]


Baixando 3 anos (OHLCV):  96%|████████████████████████████▊ | 24/25 [00:14<00:00,  1.72it/s]


Baixando 3 anos (OHLCV): 100%|██████████████████████████████| 25/25 [00:15<00:00,  1.76it/s]


Baixando 3 anos (OHLCV): 100%|██████████████████████████████| 25/25 [00:15<00:00,  1.62it/s]


Resumo do manifesto:
   ticker  rows first_date  last_date status                                                 file_path
 ITUB4.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/ITUB4_SA_3y.parquet
 BBAS3.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/BBAS3_SA_3y.parquet
 B3SA3.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/B3SA3_SA_3y.parquet
 PSSA3.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/PSSA3_SA_3y.parquet
 VALE3.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/VALE3_SA_3y.parquet
 GGBR4.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/GGBR4_SA_3y.parquet
 CSNA3.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/CSNA3_SA_3y.parquet
 SUZB3.SA   754 2022-09-06 2025-09-12     OK  /home/wrm/BOLSA_2026/dados_originais/SUZB3_SA_3y.parquet
 PETR4.SA   754 2022-09-06 2025-09-12     OK  /home




In [2]:
# Objetivo: definir diretórios, imports e funções utilitárias (hash e escrita segura).

from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
import hashlib
import pyarrow  # garante engine parquet
import pyarrow.parquet as pq

# === Diretórios ===
BRONZE_DIR = Path("/home/wrm/BOLSA_2026/dados_originais")            # já existente
SILVER_DIR = Path("/home/wrm/BOLSA_2026/intermediarios/silver")      # destino Silver
SILVER_DIR.mkdir(parents=True, exist_ok=True)

# Arquivo manifesto do Bronze (gerado na ingesta)
MANIFESTO_BRONZE = BRONZE_DIR / "manifesto_dados_originais_3y.csv"
MANIFESTO_SILVER = SILVER_DIR / "manifesto_silver.csv"

# === Utilitários ===
def sha256_file(path: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(path, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk: break
            h.update(chunk)
    return h.hexdigest()

def enforce_schema_flat(df: pd.DataFrame) -> pd.DataFrame:
    """
    Garante colunas planas e nomes padronizados: 
    date, open, high, low, close, adj_close, volume, ticker
    Converte dtypes e remove MultiIndex se houver.
    """
    # Se houver MultiIndex, flaten
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ["_".join([str(c) for c in col if c != ""]).strip("_") for col in df.columns]

    # Mapear nomes usuais para padrão
    rename_map = {
        "Date": "date", "Datetime": "date", "date": "date",
        "Open": "open", "open": "open",
        "High": "high", "high": "high",
        "Low": "low", "low": "low",
        "Close": "close", "close": "close",
        "Adj Close": "adj_close", "Adj_Close": "adj_close", "adj_close": "adj_close",
        "Volume": "volume", "volume": "volume",
        "Ticker": "ticker", "ticker": "ticker"
    }
    # Se existem colunas com prefixos (ex.: "Price_Open"), tentar reduzir
    for c in list(df.columns):
        base = c.split("_")[-1] if "_" in c else c
        if base in rename_map:
            rename_map[c] = rename_map[base]
    df = df.rename(columns=rename_map)

    # Manter apenas colunas do schema alvo se existirem
    cols_alvo = ["date", "open", "high", "low", "close", "adj_close", "volume", "ticker"]
    keep = [c for c in cols_alvo if c in df.columns]
    df = df[keep].copy()

    # Conversões de tipo
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], utc=False, errors="coerce")
    for c in ["open", "high", "low", "close", "adj_close"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float64")
    if "volume" in df.columns:
        df["volume"] = pd.to_numeric(df["volume"], errors="coerce").fillna(0).astype("int64")
    if "ticker" in df.columns:
        # garantir string simples
        df["ticker"] = df["ticker"].astype("string")

    # Ordenação básica e drop de nulos críticos (date/ticker)
    if "date" in df.columns:
        df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)

    return df

def write_parquet(df: pd.DataFrame, out_path: Path):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_path, index=False)


In [3]:
# Objetivo: carregar o manifesto do Bronze, validar existência de 25 entradas e a presença de arquivos.

assert MANIFESTO_BRONZE.exists(), f"Manifesto bronze não encontrado: {MANIFESTO_BRONZE}"

man_bronze = pd.read_csv(MANIFESTO_BRONZE)
# Esperado: 24 tickers + IBOV = 25 linhas
assert len(man_bronze) == 25, f"Manifesto deveria ter 25 entradas; encontrado {len(man_bronze)}"

# Validar caminhos de arquivos
missing = []
for i, row in man_bronze.iterrows():
    fp = row.get("file_path", None)
    if not fp or not Path(fp).exists():
        missing.append((row.get("ticker", "?"), fp))
if missing:
    raise FileNotFoundError(f"Arquivos ausentes no Bronze: {missing}")

print("Manifesto bronze validado com sucesso. Total de arquivos:", len(man_bronze))


Manifesto bronze validado com sucesso. Total de arquivos: 25


In [4]:
# Objetivo: ler cada parquet do Bronze, normalizar schema e garantir unicidade por (ticker, date).
#           Aqui ainda NÃO alinhamos o calendário; apenas garantimos schema consistente e dedup.

bronze_frames = {}
stats_norm = []

for _, row in tqdm(man_bronze.iterrows(), total=len(man_bronze), desc="Lendo/normalizando Bronze"):
    ticker = row["ticker"]
    fpath = Path(row["file_path"])
    df = pd.read_parquet(fpath)

    # Normaliza schema
    df = enforce_schema_flat(df)

    # Garantir coluna 'ticker' válida
    if "ticker" not in df.columns or df["ticker"].isna().any():
        # tenta inferir do nome do arquivo, fallback para o valor do manifesto
        inferred = ticker
        df["ticker"] = str(inferred)

    # Deduplicar chave (ticker, date)
    before = len(df)
    df = df.drop_duplicates(subset=["ticker", "date"], keep="last").reset_index(drop=True)
    after = len(df)

    # Guardar
    bronze_frames[ticker] = df
    stats_norm.append({
        "ticker": ticker,
        "rows_before": before,
        "rows_after": after,
        "dups_removed": before - after,
        "date_min": df["date"].min(),
        "date_max": df["date"].max()
    })

stats_norm = pd.DataFrame(stats_norm)
print("Resumo normalização e deduplicação (Bronze → Silver preliminar):")
display(stats_norm.sort_values("ticker"))


Lendo/normalizando Bronze: 100%|██████████| 25/25 [00:00<00:00, 251.81it/s]

Resumo normalização e deduplicação (Bronze → Silver preliminar):





Unnamed: 0,ticker,rows_before,rows_after,dups_removed,date_min,date_max
19,ABEV3.SA,754,754,0,2022-09-06,2025-09-12
2,B3SA3.SA,754,754,0,2022-09-06,2025-09-12
1,BBAS3.SA,754,754,0,2022-09-06,2025-09-12
13,CPLE6.SA,753,753,0,2022-09-06,2025-09-12
6,CSNA3.SA,754,754,0,2022-09-06,2025-09-12
11,ELET3.SA,754,754,0,2022-09-06,2025-09-12
5,GGBR4.SA,754,754,0,2022-09-06,2025-09-12
18,HAPV3.SA,754,754,0,2022-09-06,2025-09-12
0,ITUB4.SA,754,754,0,2022-09-06,2025-09-12
22,LREN3.SA,754,754,0,2022-09-06,2025-09-12


In [5]:
# Objetivo: alinhar todos os 25 conjuntos para as MESMAS datas de pregão (interseção).
# Racional: Silver precisa garantir "mesma data de leilão/pregão" para todos.
# Estratégia: interseção de datas presentes em TODOS os tickers (evita imputações).

# 1) Obter conjunto de datas por ticker
date_sets = {tk: set(df["date"].dt.normalize().unique()) for tk, df in bronze_frames.items()}

# 2) Interseção de todas as datas
tickers = sorted(bronze_frames.keys())
common_dates = None
for tk in tickers:
    ds = date_sets[tk]
    common_dates = ds if common_dates is None else (common_dates & ds)

common_dates = sorted(list(common_dates))
common_dates = pd.to_datetime(pd.Series(common_dates)).dt.normalize().unique()

assert len(common_dates) > 0, "Interseção de datas vazia — verificar dados do Bronze."

print("Datas comuns contabilizadas:", len(common_dates))
print("Intervalo comum:",
      pd.to_datetime(common_dates.min()).date(), "→", pd.to_datetime(common_dates.max()).date())

# 3) Alinhar cada ticker à interseção (drop de datas fora do conjunto comum)
aligned_frames = {}
drop_stats = []

for tk in tqdm(tickers, desc="Alinhando ao calendário comum"):
    df = bronze_frames[tk].copy()
    df["d_norm"] = df["date"].dt.normalize()
    before = len(df)
    df = df[df["d_norm"].isin(common_dates)].copy()
    df = df.drop(columns=["d_norm"]).sort_values("date").reset_index(drop=True)
    after = len(df)

    # Sanidade: todas as datas presentes e ordenadas
    assert after == len(common_dates), f"{tk}: linhas {after} != datas comuns {len(common_dates)}"
    assert df["date"].is_monotonic_increasing, f"{tk}: datas não ordenadas"

    aligned_frames[tk] = df
    drop_stats.append({
        "ticker": tk,
        "kept_rows": after,
        "dropped_rows": before - after,
        "date_min_common": df['date'].min(),
        "date_max_common": df['date'].max()
    })

drop_stats = pd.DataFrame(drop_stats).sort_values("ticker")
print("Resumo de alinhamento (datas fora do comum removidas):")
display(drop_stats)


Datas comuns contabilizadas: 753
Intervalo comum: 2022-09-06 → 2025-09-12


Alinhando ao calendário comum: 100%|██████████| 25/25 [00:00<00:00, 975.98it/s]

Resumo de alinhamento (datas fora do comum removidas):





Unnamed: 0,ticker,kept_rows,dropped_rows,date_min_common,date_max_common
0,ABEV3.SA,753,1,2022-09-06,2025-09-12
1,B3SA3.SA,753,1,2022-09-06,2025-09-12
2,BBAS3.SA,753,1,2022-09-06,2025-09-12
3,CPLE6.SA,753,0,2022-09-06,2025-09-12
4,CSNA3.SA,753,1,2022-09-06,2025-09-12
5,ELET3.SA,753,1,2022-09-06,2025-09-12
6,GGBR4.SA,753,1,2022-09-06,2025-09-12
7,HAPV3.SA,753,1,2022-09-06,2025-09-12
8,ITUB4.SA,753,1,2022-09-06,2025-09-12
9,LREN3.SA,753,1,2022-09-06,2025-09-12


In [6]:
# Objetivo: escrever arquivos Silver individuais (um por ticker) e gerar manifesto_silver.csv
# Formato: Parquet, schema plano, datas alinhadas (interseção), sem duplicidades.

silver_meta = []
RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_SUBDIR = SILVER_DIR / f"run_{RUN_TS}"
OUT_SUBDIR.mkdir(parents=True, exist_ok=True)

for tk, df in tqdm(aligned_frames.items(), desc="Gravando Silver por ticker"):
    out_path = OUT_SUBDIR / f"{tk.replace('^','').replace('.','_')}_silver.parquet"
    write_parquet(df, out_path)
    silver_meta.append({
        "ticker": tk,
        "rows": len(df),
        "date_min": df["date"].min().strftime("%Y-%m-%d"),
        "date_max": df["date"].max().strftime("%Y-%m-%d"),
        "file_path": str(out_path)
    })

# Criar DataFrame de manifesto
man_silver = pd.DataFrame(silver_meta)
# Adicionar hash sha256 por arquivo
hashes = []
for _, row in tqdm(man_silver.iterrows(), total=len(man_silver), desc="Calculando SHA256"):
    hashes.append(sha256_file(Path(row["file_path"])))
man_silver["sha256"] = hashes

# Estatísticas globais
man_silver.attrs["common_dates_count"] = int(len(aligned_frames[tickers[0]]))
man_silver.attrs["run_ts"] = RUN_TS

# Salvar manifesto
man_silver.to_csv(MANIFESTO_SILVER, index=False)

print("Manifesto Silver salvo em:", MANIFESTO_SILVER)
display(man_silver.sort_values("ticker"))


Gravando Silver por ticker: 100%|██████████| 25/25 [00:00<00:00, 823.59it/s]
Calculando SHA256: 100%|██████████| 25/25 [00:00<00:00, 15678.47it/s]

Manifesto Silver salvo em: /home/wrm/BOLSA_2026/intermediarios/silver/manifesto_silver.csv





Unnamed: 0,ticker,rows,date_min,date_max,file_path,sha256
0,ABEV3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,901983fc2609f9aaa348b0a6ff3579b9fc79669126a9de...
1,B3SA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,d21e9e650743ce8dadbb0ed6ad2486c18c817d465d833b...
2,BBAS3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,da557d97eea93fd3c9f77527097dfa583947adb7d99d79...
3,CPLE6.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0166b7451f6018e9efd5620b91752fd97f39e8769157f1...
4,CSNA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0824896e45b6e24fe360e98d21e628b865a820585617e4...
5,ELET3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,816595502ea741ecd93da1f468a5bae5debaa47dfd0ae7...
6,GGBR4.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,83ea0e714f2d36a9963314c1b855fcc7f4ed8f967becf8...
7,HAPV3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,1eb3299498898c88aff9017aef676636c80f608f9a6f55...
8,ITUB4.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,bced5a81871eaa95f8cfd1675467f8237686ed60988c0a...
9,LREN3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,c62fd6bb6abfe6db791fd399abb22f6631ea65fa3e2753...


In [7]:
# Objetivo: garantir integridade mínima: 
# - todos os 25 arquivos existem,
# - mesmo número de linhas em todos,
# - mesmos date_min/date_max,
# - schema e dtypes esperados.

# 1) Recarregar e validar coesão básica
sizes = []
date_min_set, date_max_set = set(), set()

for _, row in man_silver.iterrows():
    fp = Path(row["file_path"])
    assert fp.exists(), f"Arquivo Silver inexistente: {fp}"
    df = pd.read_parquet(fp)

    # Schema esperado
    expected_cols = ["date", "open", "high", "low", "close", "adj_close", "volume", "ticker"]
    assert all(c in df.columns for c in expected_cols), f"Schema faltando colunas em {fp}"

    sizes.append(len(df))
    date_min_set.add(pd.to_datetime(df["date"].min()).normalize())
    date_max_set.add(pd.to_datetime(df["date"].max()).normalize())

# 2) Mesma contagem de linhas para todos
sizes = np.array(sizes)
assert sizes.min() == sizes.max(), f"Tamanhos divergentes: {sizes.min()}..{sizes.max()}"

# 3) Mesmos limites de datas para todos
assert len(date_min_set) == 1 and len(date_max_set) == 1, \
    f"Datas min/max divergentes: mins={date_min_set}, maxs={date_max_set}"

print("Checagens Silver concluídas: integridade OK.")
print(f"Linhas por arquivo: {sizes[0]}")
print(f"Período comum: {list(date_min_set)[0].date()} → {list(date_max_set)[0].date()}")


AssertionError: Schema faltando colunas em /home/wrm/BOLSA_2026/intermediarios/silver/run_20250915_114709/ABEV3_SA_silver.parquet

In [8]:
# Inspecionar Silver problemático, inspecionar Bronze correspondente e tentar reparar (não sobrescreve)
from pathlib import Path
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import traceback

# Ajuste estes caminhos se necessário
SILVER_PROBLEM = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_20250915_114709/ABEV3_SA_silver.parquet")
MANIFESTO_BRONZE = Path("/home/wrm/BOLSA_2026/dados_originais/manifesto_dados_originais_3y.csv")

assert SILVER_PROBLEM.exists(), f"Arquivo Silver não encontrado: {SILVER_PROBLEM}"
assert MANIFESTO_BRONZE.exists(), f"Manifesto Bronze não encontrado: {MANIFESTO_BRONZE}"

print("== INSPEÇÃO: Silver problemático (pandas) ==")
try:
    df_s = pd.read_parquet(SILVER_PROBLEM)
    print("shape:", df_s.shape)
    print("columns repr:", [repr(c) for c in df_s.columns])
    print("dtypes:\n", df_s.dtypes)
    print("nulls per column:\n", df_s.isna().sum())
    print("\nhead:")
    display(df_s.head(6))
except Exception as e:
    print("pandas.read_parquet falhou:", repr(e))
    traceback.print_exc()

print("\n== INSPEÇÃO: Silver problemático (pyarrow schema) ==")
try:
    tbl = pq.read_table(SILVER_PROBLEM)
    print("pyarrow schema:")
    print(tbl.schema)
    print("pyarrow column names:", tbl.column_names)
except Exception as e:
    print("pyarrow.read_table falhou:", repr(e))
    traceback.print_exc()

print("\n== LOCALIZANDO Bronze correspondente via manifesto ==")
man = pd.read_csv(MANIFESTO_BRONZE)
# Tentar localizar entrada que contenha 'ABEV3' no ticker ou no path
candidate = None
for _, r in man.iterrows():
    tk = str(r.get("ticker", ""))
    fp = r.get("file_path", "")
    if "ABEV3" in tk or (isinstance(fp, str) and "ABEV3" in fp):
        candidate = Path(fp)
        break

if candidate is None:
    # fallback: procurar arquivo no diretório Bronze
    bronze_dir = MANIFESTO_BRONZE.parent
    found = list(bronze_dir.glob("*ABEV3*.parquet"))
    if found:
        candidate = found[0]

assert candidate is not None and candidate.exists(), f"Não encontrei Bronze correspondente a ABEV3 (candidates: {candidate})"
print("Bronze correspondente:", candidate)

print("\n== INSPEÇÃO: Bronze (pyarrow schema + pandas head) ==")
try:
    tbronze = pq.read_table(candidate)
    print(tbronze.schema)
    print("column names:", tbronze.column_names)
    display(tbronze.to_pandas().head(6))
except Exception as e:
    print("pyarrow.read_table falhou no Bronze:", repr(e))
    traceback.print_exc()
    try:
        df_b = pd.read_parquet(candidate)
        print("pandas leu Bronze shape:", df_b.shape)
        print(df_b.dtypes)
        display(df_b.head(6))
    except Exception as e2:
        print("Também falhou pd.read_parquet no Bronze:", repr(e2))
        traceback.print_exc()
        raise

# Agora tentar reparar: aplicar enforce_schema_flat se disponível, senão fazer reparo básico
print("\n== TENTANDO REPARO A PARTIR DO BRONZE ==")
try:
    dfb = pd.read_parquet(candidate)
except Exception as e:
    raise RuntimeError(f"Falha ao ler Bronze {candidate}: {e!r}")

# preferir usar enforce_schema_flat se a função estiver definida no notebook
if 'enforce_schema_flat' in globals() and callable(globals()['enforce_schema_flat']):
    repaired = globals()['enforce_schema_flat'](dfb)
    print("Usando enforce_schema_flat definida no notebook.")
else:
    print("Função enforce_schema_flat não encontrada; aplicando reparo heurístico.")
    # heurística: flatten MultiIndex columns, map common names, coercões
    df = dfb.copy()
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ["_".join([str(p) for p in col if p != ""]).strip("_") for col in df.columns]
    # map simples
    rename_map = {}
    lower_map = {c.lower(): c for c in df.columns}
    for want in ["date","open","high","low","close","adj_close","volume","ticker"]:
        if want in df.columns:
            continue
        for c_low, c_orig in lower_map.items():
            if c_low.endswith(want) or c_low == want:
                rename_map[c_orig] = want
                break
    if rename_map:
        df = df.rename(columns=rename_map)
    # ensure date exists or try to find a date-like column
    if "date" not in df.columns:
        for c in df.columns:
            if "date" in c.lower() or "datetime" in c.lower():
                df = df.rename(columns={c: "date"})
                break
    # basic coercions
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    for col in ["open","high","low","close","adj_close"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("float64")
    if "volume" in df.columns:
        df["volume"] = pd.to_numeric(df["volume"], errors="coerce").fillna(0).astype("int64")
    if "ticker" in df.columns:
        df["ticker"] = df["ticker"].astype("string")
    # keep only target cols that exist
    cols_target = ["date","open","high","low","close","adj_close","volume","ticker"]
    keep = [c for c in cols_target if c in df.columns]
    repaired = df[keep].copy()

print("Reparado shape:", repaired.shape)
print("Cols reparado:", repaired.columns.tolist())
print("Dtypes:\n", repaired.dtypes)
print("Nulls per column:\n", repaired.isna().sum())
display(repaired.head(6))

# escrever reparado sem sobrescrever (sufixo _repaired.parquet)
outp = SILVER_PROBLEM.with_name(SILVER_PROBLEM.stem + "_repaired.parquet")
if outp.exists():
    print("Arquivo reparado já existe:", outp)
else:
    outp.parent.mkdir(parents=True, exist_ok=True)
    repaired.to_parquet(outp, index=False)
    print("Escrito reparado em:", outp)

# mostrar hashes
import hashlib
def sha256_file(p: Path):
    h = hashlib.sha256()
    with open(p, "rb") as f:
        for chunk in iter(lambda: f.read(1<<20), b""):
            h.update(chunk)
    return h.hexdigest()

print("sha256(silver_original) =", sha256_file(SILVER_PROBLEM) if SILVER_PROBLEM.exists() else None)
print("sha256(silver_repaired) =", sha256_file(outp))

== INSPEÇÃO: Silver problemático (pandas) ==
shape: (753, 2)
columns repr: ["'date'", "'ticker'"]
dtypes:
 date      datetime64[ns]
ticker    string[python]
dtype: object
nulls per column:
 date      0
ticker    0
dtype: int64

head:


Unnamed: 0,date,ticker
0,2022-09-06,ABEV3.SA
1,2022-09-08,ABEV3.SA
2,2022-09-09,ABEV3.SA
3,2022-09-12,ABEV3.SA
4,2022-09-13,ABEV3.SA
5,2022-09-14,ABEV3.SA



== INSPEÇÃO: Silver problemático (pyarrow schema) ==
pyarrow schema:
date: timestamp[ns]
ticker: string
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 304
pyarrow column names: ['date', 'ticker']

== LOCALIZANDO Bronze correspondente via manifesto ==
Bronze correspondente: /home/wrm/BOLSA_2026/dados_originais/ABEV3_SA_3y.parquet

== INSPEÇÃO: Bronze (pyarrow schema + pandas head) ==
('date', ''): timestamp[ns]
('adj_close', 'ABEV3.SA'): double
('close', 'ABEV3.SA'): double
('high', 'ABEV3.SA'): double
('low', 'ABEV3.SA'): double
('open', 'ABEV3.SA'): double
('volume', 'ABEV3.SA'): int64
('ticker', ''): string
-- schema metadata --
pandas: '{"column_indexes": [{"field_name": "Price", "metadata": null, "n' + 1444
column names: ["('date', '')", "('adj_close', 'ABEV3.SA')", "('close', 'ABEV3.SA')", "('high', 'ABEV3.SA')", "('low', 'ABEV3.SA')", "('open', 'ABEV3.SA')", "('volume', 'ABEV3.SA')", "('ticker', '')"]


Price,date,adj_close,close,high,low,open,volume,ticker
Ticker,Unnamed: 1_level_1,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,ABEV3.SA,Unnamed: 8_level_1
0,2022-09-06,12.937738,15.56,15.66,15.39,15.44,20160400,ABEV3.SA
1,2022-09-08,12.88785,15.5,15.74,15.35,15.6,22311100,ABEV3.SA
2,2022-09-09,13.070773,15.72,15.76,15.48,15.5,13422600,ABEV3.SA
3,2022-09-12,13.054143,15.7,15.8,15.58,15.79,18048600,ABEV3.SA
4,2022-09-13,12.962682,15.59,15.88,15.49,15.6,27453000,ABEV3.SA
5,2022-09-14,12.788073,15.38,15.59,15.27,15.59,17680200,ABEV3.SA



== TENTANDO REPARO A PARTIR DO BRONZE ==
Usando enforce_schema_flat definida no notebook.
Reparado shape: (754, 2)
Cols reparado: ['date', 'ticker']
Dtypes:
 date      datetime64[ns]
ticker    string[python]
dtype: object
Nulls per column:
 date      0
ticker    0
dtype: int64


Unnamed: 0,date,ticker
0,2022-09-06,ABEV3.SA
1,2022-09-08,ABEV3.SA
2,2022-09-09,ABEV3.SA
3,2022-09-12,ABEV3.SA
4,2022-09-13,ABEV3.SA
5,2022-09-14,ABEV3.SA


Escrito reparado em: /home/wrm/BOLSA_2026/intermediarios/silver/run_20250915_114709/ABEV3_SA_silver_repaired.parquet
sha256(silver_original) = 901983fc2609f9aaa348b0a6ff3579b9fc79669126a9de710055f1d5b261439d
sha256(silver_repaired) = 0ce36e231927dfddba48f79dde9c8eb03f21720624460b7700e0830d19d4b0aa


In [11]:
# Reprocessar Bronze -> Silver (fix global)
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
import hashlib

# Configurações (ajuste se necessário)
BRONZE_DIR = Path("/home/wrm/BOLSA_2026/dados_originais")
MANIFESTO_BRONZE = BRONZE_DIR / "manifesto_dados_originais_3y.csv"
SILVER_ROOT = Path("/home/wrm/BOLSA_2026/intermediarios/silver")
SILVER_ROOT.mkdir(parents=True, exist_ok=True)

# Utilitários
def sha256_file(p: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with open(p, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            h.update(chunk)
    return h.hexdigest()

def flatten_columns_handle_tuples(df: pd.DataFrame) -> pd.DataFrame:
    """
    Converte MultiIndex/tuple columns em nomes simples.
    Heurística:
      - se coluna é tupla e algum componente for um nome conhecido (date/open/close/etc),
        seleciona esse componente.
      - caso contrário junta as partes não-vazias por '_' como fallback.
    """
    cols = df.columns
    if isinstance(cols, pd.MultiIndex):
        new_cols = []
        for col in cols:
            parts = [str(p) for p in col]
            parts_clean = [p.strip() for p in parts if str(p).strip() != ""]
            # procurar candidato óbvio
            picked = None
            for candidate in ["date","open","high","low","close","adj close","adj_close","adjclose","volume","ticker"]:
                for p in parts_clean:
                    if candidate == p.strip().lower():
                        picked = candidate.replace(" ", "_")
                        break
                if picked:
                    break
            if not picked:
                # evitar pegar ticker-like (ex: 'ABEV3.SA')
                non_ticker = [p for p in parts_clean if not (p.upper().endswith(".SA") or p.startswith("^"))]
                if non_ticker:
                    picked = non_ticker[0]
                else:
                    picked = "_".join(parts_clean) if parts_clean else "_".join([str(x) for x in col])
            new_cols.append(picked)
        df = df.copy()
        df.columns = new_cols
    else:
        df = df.copy()
    return df

def normalize_schema(df: pd.DataFrame) -> pd.DataFrame:
    """
    Mapeia variantes para o schema canônico:
    date, open, high, low, close, adj_close, volume, ticker
    Faz coerções de tipos e retorna DataFrame com colunas canônicas presentes.
    """
    df = flatten_columns_handle_tuples(df)

    # construir mapa simples de renomeação
    rename_map = {}
    for c in df.columns:
        c0 = str(c).strip()
        key = c0.lower().replace(".", "_").replace(" ", "_")
        if key in {"date", "datetime"}:
            rename_map[c] = "date"
        elif "open" == key or key.endswith("_open") or key.endswith("open"):
            rename_map[c] = "open"
        elif "high" == key or key.endswith("_high") or key.endswith("high"):
            rename_map[c] = "high"
        elif "low" == key or key.endswith("_low") or key.endswith("low"):
            rename_map[c] = "low"
        elif key == "close" or key.endswith("_close") or "close" in key:
            # cuidado com adj_close
            if "adj" in key:
                rename_map[c] = "adj_close"
            else:
                # prefer close if não é adj
                rename_map[c] = "close"
        elif key in {"adj_close", "adjclose"} or ("adj" in key and "close" in key):
            rename_map[c] = "adj_close"
        elif key == "volume" or key.endswith("_volume") or "volume" in key:
            rename_map[c] = "volume"
        elif key == "ticker":
            rename_map[c] = "ticker"
    df = df.rename(columns=rename_map)

    # se não há 'date', tentar inferir por heurística
    if "date" not in df.columns:
        for c in df.columns:
            cl = c.lower()
            if "date" in cl or "datetime" in cl:
                df = df.rename(columns={c: "date"})
                break

    # coerções
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"], errors="coerce")
    for c in ["open","high","low","close","adj_close"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("float64")
    if "volume" in df.columns:
        df["volume"] = pd.to_numeric(df["volume"], errors="coerce")
        # se todos finitos, cast para int
        if df["volume"].notna().all():
            try:
                df["volume"] = df["volume"].astype("int64")
            except Exception:
                pass
    if "ticker" in df.columns:
        df["ticker"] = df["ticker"].astype("string")

    # manter apenas colunas canônicas que existem
    cols_target = ["date","open","high","low","close","adj_close","volume","ticker"]
    keep = [c for c in cols_target if c in df.columns]
    df = df[keep].copy()

    # drop linhas sem date (crítico)
    if "date" in df.columns:
        df = df.dropna(subset=["date"]).sort_values("date").reset_index(drop=True)
    return df

# Início do fluxo
assert MANIFESTO_BRONZE.exists(), f"Manifesto bronze não encontrado: {MANIFESTO_BRONZE}"
man = pd.read_csv(MANIFESTO_BRONZE)

frames = {}
stats = []

for _, row in tqdm(man.iterrows(), total=len(man), desc="Normalizando Bronze (todos)"):
    tk = row["ticker"]
    fp = row.get("file_path", None)
    if not fp or not isinstance(fp, str):
        print(f"[WARN] Pulando {tk}: caminho inválido no manifesto -> {fp}")
        continue
    p = Path(fp)
    if not p.exists():
        print(f"[WARN] Pulando {tk}: arquivo não encontrado -> {p}")
        continue
    try:
        dfb = pd.read_parquet(p)
    except Exception as e:
        print(f"[ERROR] Falha lendo {p}: {e!r}")
        continue

    dfn = normalize_schema(dfb)

    # garantir ticker
    if "ticker" not in dfn.columns or dfn["ticker"].isna().all():
        dfn["ticker"] = str(tk)

    before = len(dfn)
    if "date" in dfn.columns:
        dfn = dfn.drop_duplicates(subset=["ticker","date"], keep="last").reset_index(drop=True)
    else:
        dfn = dfn.drop_duplicates(keep="last").reset_index(drop=True)
    after = len(dfn)

    stats.append({
        "ticker": tk,
        "bronze_rows_manifest": int(row.get("rows", -1)) if not pd.isna(row.get("rows", np.nan)) else -1,
        "rows_after_norm": after,
        "date_min": dfn["date"].min() if "date" in dfn.columns else None,
        "date_max": dfn["date"].max() if "date" in dfn.columns else None,
        "path_bronze": str(p)
    })
    frames[tk] = dfn

stats_df = pd.DataFrame(stats)
print("Resumo pré-alinhamento:")
display(stats_df.sort_values("ticker"))

# montar interseção de datas (normalizando)
tickers_with_dates = [tk for tk, df in frames.items() if "date" in df.columns and len(df) > 0]
if len(tickers_with_dates) == 0:
    raise RuntimeError("Nenhum ticker com coluna 'date' válida encontrada.")

date_sets = {tk: set(df["date"].dt.normalize().unique()) for tk, df in frames.items() if "date" in df.columns}
common_dates = None
for tk in sorted(date_sets.keys()):
    ds = date_sets[tk]
    common_dates = ds if common_dates is None else (common_dates & ds)
if not common_dates:
    raise RuntimeError("Interseção de datas vazia — não é possível alinhar.")
# garantir tipo consistente (set de Timestamp normalizados)
common_dates_set = set(pd.to_datetime(list(common_dates)).normalize())

print(f"Datas comuns: {len(common_dates_set)} (intervalo aproximado: {min(common_dates_set).date()} → {max(common_dates_set).date()})")

# Alinhar cada ticker
aligned = {}
drop_stats = []
for tk in tqdm(sorted(frames.keys()), desc="Alinhando ao calendário comum"):
    df = frames[tk].copy()
    if "date" not in df.columns:
        raise RuntimeError(f"{tk} não possui coluna 'date' após normalização; abortando.")
    df["d_norm"] = df["date"].dt.normalize()
    before = len(df)
    df = df[df["d_norm"].isin(common_dates_set)].copy()
    df = df.drop(columns=["d_norm"]).sort_values("date").reset_index(drop=True)
    after = len(df)
    if after != len(common_dates_set):
        raise AssertionError(f"{tk}: linhas mantidas {after} != datas comuns {len(common_dates_set)}")
    aligned[tk] = df
    drop_stats.append({"ticker": tk, "kept": after, "dropped": before - after, "date_min": df["date"].min(), "date_max": df["date"].max()})

drop_stats_df = pd.DataFrame(drop_stats).sort_values("ticker")
print("Resumo de alinhamento:")
display(drop_stats_df)

# Escrever run_fixed
RUN_TS = datetime.now().strftime("%Y%m%d_%H%M%S")
OUT_DIR = SILVER_ROOT / f"run_fixed_{RUN_TS}"
OUT_DIR.mkdir(parents=True, exist_ok=True)

man_silver = []
for tk, df in tqdm(sorted(aligned.items()), desc="Gravando Silver por ticker"):
    outp = OUT_DIR / f"{tk.replace('^','').replace('.','_')}_silver.parquet"
    cols_order = ["date","open","high","low","close","adj_close","volume","ticker"]
    cols_present = [c for c in cols_order if c in df.columns]
    df_to_write = df[cols_present].copy()
    df_to_write.to_parquet(outp, index=False)
    man_silver.append({
        "ticker": tk,
        "rows": len(df_to_write),
        "date_min": df_to_write["date"].min().strftime("%Y-%m-%d"),
        "date_max": df_to_write["date"].max().strftime("%Y-%m-%d"),
        "file_path": str(outp)
    })

man_silver_df = pd.DataFrame(man_silver)

# Hashes
hashes = []
for _, r in tqdm(man_silver_df.iterrows(), total=len(man_silver_df), desc="Calculando SHA256"):
    p = Path(r["file_path"])
    hashes.append(sha256_file(p))
man_silver_df["sha256"] = hashes

# Salvar manifesto do run
manifest_fixed_path = OUT_DIR / "manifesto_silver_fixed.csv"
man_silver_df.to_csv(manifest_fixed_path, index=False)

print("Run_fixed concluído. Saída em:", OUT_DIR)
display(man_silver_df.sort_values("ticker"))

# Checagem rápida final
sizes = []
for _, r in man_silver_df.iterrows():
    p = Path(r["file_path"])
    assert p.exists(), f"Arquivo faltando: {p}"
    df = pd.read_parquet(p)
    assert "date" in df.columns, f"'date' ausente em {p}"
    price_cols = [c for c in ["open","high","low","close","adj_close"] if c in df.columns]
    assert len(price_cols) > 0, f"Nenhuma coluna de preço em {p}"
    sizes.append(len(df))
sizes = np.array(sizes)
assert sizes.min() == sizes.max(), f"Tamanhos divergentes entre arquivos Silver: {sizes.min()}..{sizes.max()}"

print("Checagem básica final: OK — arquivos consistentes e alinhados.")

Normalizando Bronze (todos): 100%|██████████| 25/25 [00:00<00:00, 200.25it/s]

Resumo pré-alinhamento:





Unnamed: 0,ticker,bronze_rows_manifest,rows_after_norm,date_min,date_max,path_bronze
19,ABEV3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/ABEV3_SA_...
2,B3SA3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/B3SA3_SA_...
1,BBAS3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/BBAS3_SA_...
13,CPLE6.SA,753,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/CPLE6_SA_...
6,CSNA3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/CSNA3_SA_...
11,ELET3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/ELET3_SA_...
5,GGBR4.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/GGBR4_SA_...
18,HAPV3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/HAPV3_SA_...
0,ITUB4.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/ITUB4_SA_...
22,LREN3.SA,754,754,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/dados_originais/LREN3_SA_...


Datas comuns: 753 (intervalo aproximado: 2022-09-06 → 2025-09-12)


Alinhando ao calendário comum: 100%|██████████| 25/25 [00:00<00:00, 630.37it/s]

Resumo de alinhamento:





Unnamed: 0,ticker,kept,dropped,date_min,date_max
0,ABEV3.SA,753,1,2022-09-06,2025-09-12
1,B3SA3.SA,753,1,2022-09-06,2025-09-12
2,BBAS3.SA,753,1,2022-09-06,2025-09-12
3,CPLE6.SA,753,0,2022-09-06,2025-09-12
4,CSNA3.SA,753,1,2022-09-06,2025-09-12
5,ELET3.SA,753,1,2022-09-06,2025-09-12
6,GGBR4.SA,753,1,2022-09-06,2025-09-12
7,HAPV3.SA,753,1,2022-09-06,2025-09-12
8,ITUB4.SA,753,1,2022-09-06,2025-09-12
9,LREN3.SA,753,1,2022-09-06,2025-09-12


Gravando Silver por ticker: 100%|██████████| 25/25 [00:00<00:00, 605.33it/s]
Calculando SHA256: 100%|██████████| 25/25 [00:00<00:00, 14285.78it/s]

Run_fixed concluído. Saída em: /home/wrm/BOLSA_2026/intermediarios/silver/run_fixed_20250915_121821





Unnamed: 0,ticker,rows,date_min,date_max,file_path,sha256
0,ABEV3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,fb74a882ea4417a67eb2df085b5ce8fb70f25e988e5454...
1,B3SA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,71b1d261cfd0611376a42900527786caa831ce94f8231b...
2,BBAS3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0f3ea873f631a27c8432ca82cb106911e954d855fbbf5f...
3,CPLE6.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,c234fc06657bc85102ebd77b601d3ef665a7ea4761b7ed...
4,CSNA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0d59f1f5600063b4c5d120ac7c70a9ab8124ea016eda4e...
5,ELET3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,a0b83af8acc67719eb3ab9a4167a1b39651bedff687298...
6,GGBR4.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,cddd942cf1c482c80535abe4d22460ab95a46e0112bd8f...
7,HAPV3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,ae079a04f09669026c524724183c982de0ba558bce5510...
8,ITUB4.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,173530cac25a04cd79c7298be3f81f5854f02697090170...
9,LREN3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,ecd2fdfc556da0e16d03c342dbe4c6f2030a5d773feb16...


Checagem básica final: OK — arquivos consistentes e alinhados.


In [1]:
# Validação de Parquets Silver — gera relatório por arquivo
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np

# Config
SILVER_ROOT = Path("/home/wrm/BOLSA_2026/intermediarios/silver")
# Se quiser testar um manifesto específico, defina MANIFEST_PATH = Path(".../manifesto_silver_fixed.csv")
MANIFEST_PATH = None
NA_THRESHOLD = 0.01  # se >1% de NaNs em coluna crítica -> WARN/FAIL
REQUIRED_COLS = ["date", "open", "high", "low", "close", "adj_close", "volume", "ticker"]

def find_latest_run_manifest(silver_root: Path) -> Path:
    # procura por run_fixed_* e usa o manifesto_silver_fixed.csv do mais recente
    runs = sorted([d for d in silver_root.iterdir() if d.is_dir() and d.name.startswith("run_fixed_")], reverse=True)
    for r in runs:
        m = r / "manifesto_silver_fixed.csv"
        if m.exists():
            return m
    # fallback: procurar manifesto_silver.csv no root
    m2 = silver_root / "manifesto_silver.csv"
    if m2.exists():
        return m2
    raise FileNotFoundError(f"Nenhum manifesto encontrado em {silver_root}")

if MANIFEST_PATH is None:
    MANIFEST_PATH = find_latest_run_manifest(SILVER_ROOT)

print("Usando manifesto:", MANIFEST_PATH)
manifest = pd.read_csv(MANIFEST_PATH)

reports = []
for _, row in manifest.iterrows():
    fp = Path(row["file_path"])
    tick = row.get("ticker", fp.stem)
    report = {"ticker": tick, "file_path": str(fp), "exists": fp.exists(), "ok": True, "messages": []}
    if not fp.exists():
        report["ok"] = False
        report["messages"].append("Arquivo não encontrado")
        reports.append(report)
        continue
    try:
        df = pd.read_parquet(fp)
    except Exception as e:
        report["ok"] = False
        report["messages"].append(f"Falha leitura parquet: {e!r}")
        reports.append(report)
        continue

    # Colunas presentes
    cols_present = list(df.columns)
    missing = [c for c in ["date", "ticker"] if c not in cols_present]
    if missing:
        report["ok"] = False
        report["messages"].append(f"Colunas críticas ausentes: {missing}")

    # At least one price column
    price_cols = [c for c in ["open","high","low","close","adj_close"] if c in cols_present]
    if len(price_cols) == 0:
        report["ok"] = False
        report["messages"].append("Nenhuma coluna de preço presente (open/high/low/close/adj_close)")

    # dtypes checks
    if "date" in df.columns:
        try:
            dates = pd.to_datetime(df["date"], errors="coerce")
            n_bad_dates = dates.isna().sum()
            if n_bad_dates > 0:
                pct = n_bad_dates / len(df)
                report["ok"] = False
                report["messages"].append(f"{n_bad_dates} datas inválidas ({pct:.2%})")
        except Exception as e:
            report["ok"] = False
            report["messages"].append(f"Erro parsing date: {e!r}")

    # price columns numeric?
    for c in price_cols:
        n_nonnum = pd.to_numeric(df[c], errors="coerce").isna().sum()
        pct = n_nonnum / max(1, len(df))
        if pct > NA_THRESHOLD:
            report["ok"] = False
            report["messages"].append(f"Coluna {c} tem {n_nonnum} valores não-numéricos ({pct:.2%})")

    # volume check
    if "volume" in df.columns:
        n_nonnum = pd.to_numeric(df["volume"], errors="coerce").isna().sum()
        pct = n_nonnum / max(1, len(df))
        if pct > NA_THRESHOLD:
            report["ok"] = False
            report["messages"].append(f"Coluna volume tem {n_nonnum} valores não-numéricos ({pct:.2%})")

    # ticker type
    if "ticker" in df.columns:
        n_bad_ticker = df["ticker"].isna().sum()
        if n_bad_ticker > 0:
            report["ok"] = False
            report["messages"].append(f"{n_bad_ticker} tickers vazios")

    # duplicates check
    if "date" in df.columns and "ticker" in df.columns:
        dup = df.duplicated(subset=["ticker","date"]).sum()
        if dup > 0:
            report["ok"] = False
            report["messages"].append(f"{dup} duplicatas por (ticker,date)")

    # monotonic dates
    if "date" in df.columns:
        if not df["date"].is_monotonic_increasing:
            report["ok"] = False
            report["messages"].append("Datas não estão ordenadas ascendentemente")

    # NaN percent per critical column
    for c in ["open","high","low","close","adj_close","volume"]:
        if c in df.columns:
            n_na = df[c].isna().sum()
            pct = n_na / max(1, len(df))
            if pct > NA_THRESHOLD:
                report["ok"] = False
                report["messages"].append(f"Coluna {c} tem {n_na} NaNs ({pct:.2%})")

    if report["ok"]:
        report["messages"].append("OK")

    reports.append(report)

rep_df = pd.DataFrame(reports)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_csv = (Path(MANIFEST_PATH).parent / f"report_parquet_validation_{ts}.csv")
rep_df.to_csv(out_csv, index=False)

# Sumário
n_ok = rep_df["ok"].sum()
n_total = len(rep_df)
print(f"Validação concluída: {n_ok}/{n_total} arquivos OK. Relatório salvo em: {out_csv}")
display(rep_df.sort_values(["ok","ticker"], ascending=[False, True]))

Usando manifesto: /home/wrm/BOLSA_2026/intermediarios/silver/run_fixed_20250915_121821/manifesto_silver_fixed.csv
Validação concluída: 25/25 arquivos OK. Relatório salvo em: /home/wrm/BOLSA_2026/intermediarios/silver/run_fixed_20250915_121821/report_parquet_validation_20250915_151906.csv


Unnamed: 0,ticker,file_path,exists,ok,messages
0,ABEV3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
1,B3SA3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
2,BBAS3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
3,CPLE6.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
4,CSNA3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
5,ELET3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
6,GGBR4.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
7,HAPV3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
8,ITUB4.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
9,LREN3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]


In [1]:
from pathlib import Path
import pandas as pd

runA = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_20250915_114709")
runB = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_fixed_20250915_121821")

mA = runA / "manifesto_silver.csv"
mB = runB / "manifesto_silver_fixed.csv"

print("Manifests exist:", mA.exists(), mB.exists())

dfA = pd.read_csv(mA) if mA.exists() else pd.DataFrame()
dfB = pd.read_csv(mB) if mB.exists() else pd.DataFrame()

def summary(df):
    if df.empty: 
        return {}
    return {
        "n_files": len(df),
        "rows_min": int(df["rows"].min()),
        "rows_max": int(df["rows"].max()),
        "date_min": df["date_min"].min(),
        "date_max": df["date_max"].max(),
    }

print("Resumo run A:", summary(dfA))
print("Resumo run B:", summary(dfB))

# comparar tickers presentes
setA = set(dfA["ticker"].astype(str)) if not dfA.empty else set()
setB = set(dfB["ticker"].astype(str)) if not dfB.empty else set()
print("Tickers only in A:", sorted(setA - setB))
print("Tickers only in B:", sorted(setB - setA))
print("Common tickers:", len(setA & setB))

# comparar hashes por ticker (se coluna sha256 existir)
if "sha256" in dfA.columns and "sha256" in dfB.columns:
    merged = dfA[["ticker","sha256"]].merge(dfB[["ticker","sha256"]], on="ticker", how="outer", suffixes=("_A","_B"))
    diffs = merged[merged["sha256_A"] != merged["sha256_B"]]
    print("Arquivos com hash diferente entre A e B:", len(diffs))
    display(diffs.head())
else:
    print("Um dos manifests não tem coluna 'sha256'.")

Manifests exist: False True
Resumo run A: {}
Resumo run B: {'n_files': 25, 'rows_min': 753, 'rows_max': 753, 'date_min': '2022-09-06', 'date_max': '2025-09-12'}
Tickers only in A: []
Tickers only in B: ['ABEV3.SA', 'B3SA3.SA', 'BBAS3.SA', 'CPLE6.SA', 'CSNA3.SA', 'ELET3.SA', 'GGBR4.SA', 'HAPV3.SA', 'ITUB4.SA', 'LREN3.SA', 'PETR4.SA', 'PRIO3.SA', 'PSSA3.SA', 'RAIL3.SA', 'RDOR3.SA', 'SBSP3.SA', 'SUZB3.SA', 'TAEE11.SA', 'TIMS3.SA', 'TOTS3.SA', 'UGPA3.SA', 'VALE3.SA', 'VIVT3.SA', 'WEGE3.SA', '^BVSP']
Common tickers: 0
Um dos manifests não tem coluna 'sha256'.


In [2]:
import shutil
from pathlib import Path
from datetime import datetime

old = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_20250915_114709")
new = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_fixed_20250915_121821")
archive_root = Path("/home/wrm/BOLSA_2026/intermediarios/silver/runs_archive")
archive_root.mkdir(parents=True, exist_ok=True)

# 1) arquivar old
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
if old.exists():
    archive_name = str(archive_root / f"{old.name}_{ts}")
    shutil.make_archive(archive_name, "gztar", root_dir=old)
    print("Arquivado:", archive_name + ".tar.gz")
else:
    print("Run antigo não encontrado (pulando arquivamento).")

# 2) promover new -> criar run_stable copy (opcional: move em vez de copy)
stable = new.parent / f"run_stable_{ts}"
if not stable.exists():
    shutil.copytree(new, stable)
    print("Copiado run_fixed para run_stable:", stable)

# 3) atualizar manifesto global (opcional: sobrescrever manifesto_silver.csv com o manifesto do stable)
manifest_src = stable / "manifesto_silver_fixed.csv"
manifest_dst = new.parent / ".." / "manifesto_silver.csv"  # ou Path("/home/wrm/BOLSA_2026/intermediarios/silver/manifesto_silver.csv")
manifest_dst = manifest_dst.resolve()
if manifest_src.exists():
    shutil.copy(manifest_src, manifest_dst)
    print("Manifesto global atualizado:", manifest_dst)
else:
    print("Manifesto fonte não encontrado em:", manifest_src)

Arquivado: /home/wrm/BOLSA_2026/intermediarios/silver/runs_archive/run_20250915_114709_20250915_152222.tar.gz
Copiado run_fixed para run_stable: /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222
Manifesto global atualizado: /home/wrm/BOLSA_2026/intermediarios/manifesto_silver.csv


In [3]:
from pathlib import Path
import pandas as pd

run_stable = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222")
manifest_global = Path("/home/wrm/BOLSA_2026/intermediarios/manifesto_silver.csv")
manifest_run = run_stable / "manifesto_silver_fixed.csv"

print("run_stable exists:", run_stable.exists())
print("manifest_global exists:", manifest_global.exists())
print("manifest_run exists:", manifest_run.exists())

if manifest_global.exists():
    mg = pd.read_csv(manifest_global)
    print("Global manifest rows:", len(mg))
    print("Sample (first 5):")
    display(mg.head())
else:
    print("Manifesto global não encontrado:", manifest_global)

if manifest_run.exists():
    mr = pd.read_csv(manifest_run)
    print("Run manifest rows:", len(mr))
    print("Sample (first 5):")
    display(mr.head())
else:
    print("Manifesto do run_stable não encontrado:", manifest_run)

# Compare tickers
if manifest_global.exists() and manifest_run.exists():
    set_g = set(mg["ticker"].astype(str))
    set_r = set(mr["ticker"].astype(str))
    print("Tickers equal?:", set_g == set_r)
    print("Only in global:", sorted(set_g - set_r))
    print("Only in run:", sorted(set_r - set_g))

run_stable exists: True
manifest_global exists: True
manifest_run exists: True
Global manifest rows: 25
Sample (first 5):


Unnamed: 0,ticker,rows,date_min,date_max,file_path,sha256
0,ABEV3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,fb74a882ea4417a67eb2df085b5ce8fb70f25e988e5454...
1,B3SA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,71b1d261cfd0611376a42900527786caa831ce94f8231b...
2,BBAS3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0f3ea873f631a27c8432ca82cb106911e954d855fbbf5f...
3,CPLE6.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,c234fc06657bc85102ebd77b601d3ef665a7ea4761b7ed...
4,CSNA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0d59f1f5600063b4c5d120ac7c70a9ab8124ea016eda4e...


Run manifest rows: 25
Sample (first 5):


Unnamed: 0,ticker,rows,date_min,date_max,file_path,sha256
0,ABEV3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,fb74a882ea4417a67eb2df085b5ce8fb70f25e988e5454...
1,B3SA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,71b1d261cfd0611376a42900527786caa831ce94f8231b...
2,BBAS3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0f3ea873f631a27c8432ca82cb106911e954d855fbbf5f...
3,CPLE6.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,c234fc06657bc85102ebd77b601d3ef665a7ea4761b7ed...
4,CSNA3.SA,753,2022-09-06,2025-09-12,/home/wrm/BOLSA_2026/intermediarios/silver/run...,0d59f1f5600063b4c5d120ac7c70a9ab8124ea016eda4e...


Tickers equal?: True
Only in global: []
Only in run: []


In [4]:
# Validação completa (gera CSV de relatório)
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np

MANIFEST_PATH = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/manifesto_silver_fixed.csv")
NA_THRESHOLD = 0.01  # tolerância

print("Usando manifesto:", MANIFEST_PATH)
manifest = pd.read_csv(MANIFEST_PATH)

def sha256_file_dummy(p):  # placeholder se quiser calcular hashes depois
    return None

reports = []
for _, row in manifest.iterrows():
    fp = Path(row["file_path"])
    tick = row.get("ticker", fp.stem)
    report = {"ticker": tick, "file_path": str(fp), "exists": fp.exists(), "ok": True, "messages": []}
    if not fp.exists():
        report["ok"] = False
        report["messages"].append("Arquivo não encontrado")
        reports.append(report)
        continue
    try:
        df = pd.read_parquet(fp)
    except Exception as e:
        report["ok"] = False
        report["messages"].append(f"Falha leitura parquet: {e!r}")
        reports.append(report)
        continue

    # checks
    missing_crit = [c for c in ["date","ticker"] if c not in df.columns]
    if missing_crit:
        report["ok"] = False
        report["messages"].append(f"Colunas críticas ausentes: {missing_crit}")

    price_cols = [c for c in ["open","high","low","close","adj_close"] if c in df.columns]
    if not price_cols:
        report["ok"] = False
        report["messages"].append("Nenhuma coluna de preço presente")

    if "date" in df.columns:
        dates = pd.to_datetime(df["date"], errors="coerce")
        n_bad_dates = int(dates.isna().sum())
        if n_bad_dates > 0:
            pct = n_bad_dates / max(1, len(df))
            report["ok"] = False
            report["messages"].append(f"{n_bad_dates} datas inválidas ({pct:.2%})")

    for c in price_cols:
        n_nonnum = int(pd.to_numeric(df[c], errors="coerce").isna().sum())
        pct = n_nonnum / max(1, len(df))
        if pct > NA_THRESHOLD:
            report["ok"] = False
            report["messages"].append(f"Coluna {c} tem {n_nonnum} valores não-numéricos ({pct:.2%})")

    if "volume" in df.columns:
        n_nonnum = int(pd.to_numeric(df["volume"], errors="coerce").isna().sum())
        pct = n_nonnum / max(1, len(df))
        if pct > NA_THRESHOLD:
            report["ok"] = False
            report["messages"].append(f"volume tem {n_nonnum} valores não-numéricos ({pct:.2%})")

    if "ticker" in df.columns:
        n_bad_ticker = int(df["ticker"].isna().sum())
        if n_bad_ticker > 0:
            report["ok"] = False
            report["messages"].append(f"{n_bad_ticker} tickers vazios")

    if "date" in df.columns and "ticker" in df.columns:
        dup = int(df.duplicated(subset=["ticker","date"]).sum())
        if dup > 0:
            report["ok"] = False
            report["messages"].append(f"{dup} duplicatas por (ticker,date)")

    if "date" in df.columns:
        if not df["date"].is_monotonic_increasing:
            report["ok"] = False
            report["messages"].append("Datas não ordenadas ascendentemente")

    for c in ["open","high","low","close","adj_close","volume"]:
        if c in df.columns:
            n_na = int(df[c].isna().sum())
            pct = n_na / max(1, len(df))
            if pct > NA_THRESHOLD:
                report["ok"] = False
                report["messages"].append(f"{c} tem {n_na} NaNs ({pct:.2%})")

    if report["ok"]:
        report["messages"].append("OK")
    reports.append(report)

rep_df = pd.DataFrame(reports)
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
out_csv = MANIFEST_PATH.parent / f"report_parquet_validation_{ts}.csv"
rep_df.to_csv(out_csv, index=False)
print(f"Relatório salvo em: {out_csv}")
display(rep_df.sort_values(["ok","ticker"], ascending=[False, True]))

Usando manifesto: /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/manifesto_silver_fixed.csv
Relatório salvo em: /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/report_parquet_validation_20250915_152433.csv


Unnamed: 0,ticker,file_path,exists,ok,messages
0,ABEV3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
1,B3SA3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
2,BBAS3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
3,CPLE6.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
4,CSNA3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
5,ELET3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
6,GGBR4.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
7,HAPV3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
8,ITUB4.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]
9,LREN3.SA,/home/wrm/BOLSA_2026/intermediarios/silver/run...,True,True,[OK]


In [5]:
from pathlib import Path

stable_dir = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222")
prov = stable_dir / "PROVENANCE.txt"

print("run_stable dir:", stable_dir.exists())
print("PROVENANCE exists:", prov.exists())
if prov.exists():
    print("--- PROVENANCE.txt ---")
    print(prov.read_text())
else:
    print("PROVENANCE.txt não encontrado em", prov)

run_stable dir: True
PROVENANCE exists: False
PROVENANCE.txt não encontrado em /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/PROVENANCE.txt


In [6]:
from pathlib import Path
from datetime import datetime
import getpass
import platform
import hashlib

stable_dir = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222")
stable_dir.mkdir(parents=True, exist_ok=True)

origin_run = "run_fixed_20250915_121821"
ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S %Z")
user = getpass.getuser()
host = platform.node()
note = f"""run_id: {stable_dir.name}
origin: {origin_run}
promoted_at: {ts}
promoted_by: {user}@{host}
manifest_source: {stable_dir / 'manifesto_silver_fixed.csv'}
note: promoted after validation; original run archived in intermediarios/silver/runs_archive
"""
prov_path = stable_dir / "PROVENANCE.txt"
prov_path.write_text(note)
print("Escrito:", prov_path)
print(note)

Escrito: /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/PROVENANCE.txt
run_id: run_stable_20250915_152222
origin: run_fixed_20250915_121821
promoted_at: 2025-09-15 15:27:00 
promoted_by: wrm@zen
manifest_source: /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/manifesto_silver_fixed.csv
note: promoted after validation; original run archived in intermediarios/silver/runs_archive



In [8]:
from pathlib import Path
from datetime import datetime
import re
import logging
import pandas as pd
import numpy as np
from typing import Tuple, Dict, Any

# Config logging básico
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# ====== Config ======
RUN_STABLE_DIR = Path("/home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222")
# O manifesto do run_stable pode ter nome "manifesto_silver.csv" ou "manifesto_silver_fixed.csv"
MANIFEST_CANDIDATES = ["manifesto_silver_fixed.csv", "manifesto_silver.csv"]
GOLD_DIR = Path("/home/wrm/BOLSA_2026/gold/IBOV")
GOLD_DIR.mkdir(parents=True, exist_ok=True)


# ====== Helpers ======
def load_manifest(run_dir: Path) -> pd.DataFrame:
    """Carrega o manifesto do run_dir procurando por nomes conhecidos."""
    for name in MANIFEST_CANDIDATES:
        f = run_dir / name
        if f.exists():
            logging.info(f"Usando manifesto: {f}")
            return pd.read_csv(f)
    raise FileNotFoundError("Manifesto do run_stable não encontrado (nem *_fixed nem padrão).")


def find_ibov_row(df_manifest: pd.DataFrame) -> pd.Series:
    """
    Localiza a linha do índice IBOV/BVSP no manifesto.
    Retorna a primeira linha que parece representar o IBOV.
    """
    # Procura colunas plausíveis que contenham ticker/symbol
    candidates = [c for c in df_manifest.columns if c.lower() in ("ticker", "symbol", "asset", "nome")]
    # Se não encontrou, tenta heurística: qualquer coluna string que contenha "ticker" ou "symbol"
    if not candidates:
        candidates = [c for c in df_manifest.columns if "ticker" in c.lower() or "symbol" in c.lower()]

    # Se ainda nada, procura em qualquer coluna string por valores que soem como tickers
    if not candidates:
        string_cols = [c for c in df_manifest.columns if df_manifest[c].dtype == object]
        candidates = string_cols

    if not candidates:
        raise KeyError("Manifesto não tem colunas plausíveis para identificar ticker/symbol.")

    pattern = re.compile(r"\b(IBOV|BVSP|IBOV11|^IBOV$)\b", flags=re.IGNORECASE)
    hits = []
    for c in candidates:
        # converte para string e busca
        ser = df_manifest[c].astype(str).fillna("")
        mask = ser.str.contains(pattern)
        if mask.any():
            hits.append((c, df_manifest[mask]))

    if not hits:
        # última tentativa: buscar por 'IBOV' no nome do arquivo (colunas com 'path'/'file')
        filecols = [c for c in df_manifest.columns if "path" in c.lower() or "file" in c.lower()]
        for fc in filecols:
            ser = df_manifest[fc].astype(str).fillna("")
            mask = ser.str.contains(pattern)
            if mask.any():
                return df_manifest[mask].iloc[0]

        raise ValueError("Não encontrei IBOV no manifesto do Silver (procurei por 'BVSP' ou 'IBOV').")

    # Retorna a primeira correspondência encontrada (preferível a uma escolha ambígua)
    chosen_col, chosen_df = hits[0]
    logging.info(f"Encontrado IBOV pela coluna '{chosen_col}' - usando primeira ocorrência.")
    return chosen_df.iloc[0]


def rolling_z(x: pd.Series, window: int) -> pd.Series:
    r = (x - x.rolling(window, min_periods=1).mean()) / x.rolling(window, min_periods=1).std(ddof=0)
    return r


def compute_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Recebe df com colunas mínimas ['date','open','high','low','close'] (datetime em 'date').
    Retorna df com features técnicas adicionadas.
    """
    df = df.sort_values("date").reset_index(drop=True)
    # Retornos
    df["ret_1d"] = df["close"].pct_change(1)
    df["ret_3d"] = df["close"].pct_change(3)
    df["ret_5d"] = df["close"].pct_change(5)
    df["ret_10d"] = df["close"].pct_change(10)

    # Médias móveis e z-score
    df["ma_5"] = df["close"].rolling(5, min_periods=1).mean()
    df["ma_10"] = df["close"].rolling(10, min_periods=1).mean()
    df["ma_20"] = df["close"].rolling(20, min_periods=1).mean()
    df["ma_5_z"] = rolling_z(df["close"], 5)

    # Bandas de Bollinger (20, 2) com proteção divisão por zero
    ma20 = df["close"].rolling(20, min_periods=1).mean()
    sd20 = df["close"].rolling(20, min_periods=1).std(ddof=0)
    up = ma20 + 2 * sd20
    lo = ma20 - 2 * sd20
    denom = up - lo
    df["bb_pos"] = np.where(denom > 0, (df["close"] - lo) / denom, np.nan)

    # Volatilidades de retornos
    df["std_ret_10"] = df["ret_1d"].rolling(10, min_periods=1).std(ddof=0)
    df["std_ret_15"] = df["ret_1d"].rolling(15, min_periods=1).std(ddof=0)
    df["std_ret_20"] = df["ret_1d"].rolling(20, min_periods=1).std(ddof=0)

    # ATR% aproximado (Wilder 14): usa True Range simplificada
    high_low = (df["high"] - df["low"]).abs()
    high_close = (df["high"] - df["close"].shift(1)).abs()
    low_close = (df["low"] - df["close"].shift(1)).abs()
    tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
    atr14 = tr.ewm(alpha=1 / 14, adjust=False).mean()
    df["atr_pct_14"] = atr14 / df["close"]

    # Osciladores: RSI(14)
    delta = df["close"].diff()
    up_ = delta.clip(lower=0)
    down_ = -delta.clip(upper=0)
    roll_up = up_.ewm(alpha=1 / 14, adjust=False).mean()
    roll_down = down_.ewm(alpha=1 / 14, adjust=False).mean()
    # evita divisão por zero em roll_down
    rs = roll_up / roll_down.replace(0, np.nan)
    df["rsi_14"] = 100 - (100 / (1 + rs))

    # %K/%D (renomeado para campos sem '%')
    ll14 = df["low"].rolling(14, min_periods=1).min()
    hh14 = df["high"].rolling(14, min_periods=1).max()
    df["pct_k_14"] = np.where((hh14 - ll14) > 0, (df["close"] - ll14) / (hh14 - ll14), np.nan)
    df["pct_d_3"] = df["pct_k_14"].rolling(3, min_periods=1).mean()

    # Calendário (exige coluna date em datetime)
    if not np.issubdtype(df["date"].dtype, np.datetime64):
        df["date"] = pd.to_datetime(df["date"])
    df["dow"] = df["date"].dt.weekday  # 0=Mon
    df["dom"] = df["date"].dt.day
    df["month"] = df["date"].dt.month
    df["is_month_end"] = df["date"].dt.is_month_end.astype(int)
    df["is_quarter_end"] = df["date"].dt.is_quarter_end.astype(int)

    return df


def make_labels(
    df: pd.DataFrame,
    h_set=(1, 3, 5),
    target_neutral: Dict[int, Tuple[float, float]] = {1: (0.45, 0.55), 3: (0.38, 0.45), 5: (0.30, 0.38)},
    sigma_window: int = 15,
) -> Tuple[pd.DataFrame, Dict[int, float]]:
    """
    Cria retornos futuros e rotulagem calibrada por k.
    Retorna (df_with_labels, ks_dict).
    """
    out = df.copy()
    # Retornos futuros
    for h in h_set:
        out[f"r_d{h}"] = (out["close"].shift(-h) - out["close"]) / out["close"]

    # σ rolante do retorno 1d (proxy de volatilidade de curto prazo)
    sig = out["ret_1d"].rolling(sigma_window, min_periods=1).std(ddof=0)
    sig.name = "sigma"

    def calibrate_k(h: int, lo_hi: Tuple[float, float]) -> float:
        lo, hi = lo_hi
        candidates = np.linspace(0.1, 3.0, 120)
        best_k, best_gap = None, float("inf")
        rt = out[f"r_d{h}"]
        for k in candidates:
            neutral = (rt.abs() <= k * sig)
            # neutral é Series boolean com NaNs quando sig NaN ou rt NaN -> mean(skipna=True)
            frac = neutral.mean(skipna=True)
            # se frac está dentro do intervalo alvo, gap=0 e podemos retornar k imediatamente
            if lo <= frac <= hi:
                return float(k)
            # senão, medimos distância mínima ao intervalo [lo,hi]
            if frac < lo:
                gap = lo - frac
            else:
                gap = frac - hi
            if gap < best_gap:
                best_gap = gap
                best_k = float(k)
        return best_k

    ks: Dict[int, float] = {}
    for h in h_set:
        # Se série muito curta (sig NaN majoritariamente), devolve NaN para k
        if sig.dropna().size < max(5, sigma_window // 2):
            logging.warning(f"Dados insuficientes para calibrar k para horizonte {h} (poucos valores de sigma).")
            ks[h] = float("nan")
            continue
        ks[h] = calibrate_k(h, target_neutral[h])

    # aplica rótulos
    for h in h_set:
        thr = ks[h] * sig
        # thr pode ser NaN -> comparações com NaN retornam False -> label 0 (neutro)
        y = np.where(out[f"r_d{h}"] > +thr, 1, np.where(out[f"r_d{h}"] < -thr, -1, 0))
        out[f"y_d{h}_cls"] = y
        out[f"k_d{h}"] = ks[h]

    return out, ks


# ====== Execução (fluxo principal) ======
def build_gold_ibov(run_dir: Path, gold_dir: Path) -> Dict[str, Any]:
    """
    Executa o pipeline Gold IBOV: localiza IBOV no manifesto do run_dir, carrega parquet,
    gera features, labels, escreve parquet e manifesto. Retorna um dicionário com metadados.
    """
    man = load_manifest(run_dir)
    row_ibov = find_ibov_row(man)

    # tenta obter caminho do parquet (procura colunas com 'path' ou 'file')
    cand_cols = [c for c in man.columns if "path" in c.lower() or "file" in c.lower()]
    ibov_path = None
    for c in cand_cols:
        v = row_ibov.get(c)
        if isinstance(v, str) and v.strip():
            s = v.strip()
            if s.lower().endswith((".parquet", ".pq")):
                ibov_path = s
                break
            # se a célula aponta para um diretório ou nome sem extensão, ainda aceita e tentará abrir
            if ".parquet" in s.lower() or "parquet" in s.lower():
                ibov_path = s
                break

    if ibov_path is None:
        raise RuntimeError("Não encontrei 'file_path' (ou coluna equivalente) para o IBOV no manifesto.")

    p = Path(ibov_path)
    if not p.is_absolute():
        p = run_dir / p
    if not p.exists():
        raise FileNotFoundError(f"Arquivo Parquet do IBOV não encontrado: {p}")

    logging.info(f"Lendo Parquet IBOV de: {p}")
    df = pd.read_parquet(p)

    # sanity: colunas mínimas
    required = {"date", "open", "high", "low", "close"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Colunas mínimas ausentes no IBOV: {missing} (esperado: {required})")

    # converte date, ordena e computa features
    df["date"] = pd.to_datetime(df["date"])
    df = df.sort_values("date").reset_index(drop=True)

    df_feat = compute_features(df)

    # Labels
    df_gold, ks = make_labels(
        df_feat,
        h_set=(1, 3, 5),
        target_neutral={1: (0.45, 0.55), 3: (0.38, 0.45), 5: (0.30, 0.38)},
        sigma_window=15,
    )

    # Limpeza de linhas iniciais (onde rolling cria NaN) - usar a maior janela relevante
    min_w = max(20, 15, 14)  # ma_20, sigma_window, rsi/llhh
    if df_gold.shape[0] <= min_w:
        raise ValueError("Série muito curta para gerar features/labels após rollings.")
    df_gold = df_gold.iloc[min_w:].reset_index(drop=True)

    # Persistência
    out_parquet = gold_dir / "gold_ibov_features.parquet"
    # garante diretório
    gold_dir.mkdir(parents=True, exist_ok=True)
    df_gold.to_parquet(out_parquet, index=False)
    logging.info(f"Gold salvo em: {out_parquet}")

    # Manifesto simples do Gold (IBOV)
    def class_share(s: pd.Series) -> Dict[str, float]:
        return {
            "p_neg1": float((s == -1).mean()),
            "p_zero": float((s == 0).mean()),
            "p_pos1": float((s == 1).mean()),
        }

    manifest = {
        "rows": int(len(df_gold)),
        "date_min": df_gold["date"].min().strftime("%Y-%m-%d"),
        "date_max": df_gold["date"].max().strftime("%Y-%m-%d"),
        "file_path": str(out_parquet),
        "k_d1": float(ks.get(1, np.nan)),
        "k_d3": float(ks.get(3, np.nan)),
        "k_d5": float(ks.get(5, np.nan)),
    }
    m = pd.DataFrame([manifest])
    for h in (1, 3, 5):
        shares = class_share(df_gold[f"y_d{h}_cls"])
        for kk, vv in shares.items():
            m[f"{kk}_d{h}"] = vv

    m_path = gold_dir / "gold_ibov_manifest.csv"
    m.to_csv(m_path, index=False)
    logging.info(f"Manifesto Gold salvo em: {m_path}")

    # PROVENANCE
    prov = gold_dir / "PROVENANCE_IBOV.txt"
    prov.write_text(
        "\n".join(
            [
                f"gold_source: {run_dir}",
                f"ibov_source_file: {p}",
                f"built_at: {datetime.now().isoformat()}",
                f"label_targets: D+1 45–55%, D+3 38–45%, D+5 30–38%",
                f"sigma_window: 15",
            ]
        )
    )
    logging.info(f"Provenance salvo em: {prov}")

    return {
        "parquet": str(out_parquet),
        "manifest": str(m_path),
        "provenance": str(prov),
        "ks": ks,
        "rows": len(df_gold),
    }


if __name__ == "__main__":
    # Executa a construção do GOLD para IBOV
    result = build_gold_ibov(RUN_STABLE_DIR, GOLD_DIR)
    logging.info("OK - Gold IBOV salvo.")
    logging.info(f"Parquet: {result['parquet']}")
    logging.info(f"Manifest: {result['manifest']}")
    logging.info(f"K calibrados: {result['ks']}")

INFO: Usando manifesto: /home/wrm/BOLSA_2026/intermediarios/silver/run_stable_20250915_152222/manifesto_silver_fixed.csv
  mask = ser.str.contains(pattern)
INFO: Encontrado IBOV pela coluna 'ticker' - usando primeira ocorrência.
INFO: Lendo Parquet IBOV de: /home/wrm/BOLSA_2026/intermediarios/silver/run_fixed_20250915_121821/BVSP_silver.parquet
INFO: Gold salvo em: /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_features.parquet
INFO: Manifesto Gold salvo em: /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_manifest.csv
INFO: Provenance salvo em: /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt
INFO: OK - Gold IBOV salvo.
INFO: Parquet: /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_features.parquet
INFO: Manifest: /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_manifest.csv
INFO: K calibrados: {1: 0.6117647058823529, 3: 0.9529411764705882, 5: 0.8798319327731092}


In [9]:
# Objetivo: Validar o Gold IBOV recentemente gerado, produzir relatórios e atualizar PROVENANCE.
# Inputs: /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_features.parquet
#         /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_manifest.csv
# Outputs: /home/wrm/BOLSA_2026/gold/IBOV/validation_report_ibov.csv
#          /home/wrm/BOLSA_2026/gold/IBOV/label_stats_ibov.csv
#          Append em /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt com resumo e timestamp
# Premissas: janela mínima de rollings = max(20,15,14); linhas iniciais são descartadas (min_window).
from pathlib import Path
from datetime import datetime
import json
import pandas as pd
import numpy as np
import logging

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

GOLD_DIR = Path("/home/wrm/BOLSA_2026/gold/IBOV")
GOLD_PARQUET = GOLD_DIR / "gold_ibov_features.parquet"
GOLD_MANIFEST = GOLD_DIR / "gold_ibov_manifest.csv"
VALIDATION_REPORT = GOLD_DIR / "validation_report_ibov.csv"
LABEL_STATS = GOLD_DIR / "label_stats_ibov.csv"
PROV = GOLD_DIR / "PROVENANCE_IBOV.txt"

def load_gold(p: Path) -> pd.DataFrame:
    if not p.exists():
        raise FileNotFoundError(f"Gold parquet não encontrado: {p}")
    logging.info(f"Carregando gold parquet: {p}")
    return pd.read_parquet(p)

def basic_schema_checks(df: pd.DataFrame) -> dict:
    expected = ["date", "close", "volume"]
    res = {
        "present_columns": list(df.columns),
        "missing_columns": [],
        "n_rows": int(len(df)),
    }
    for c in expected:
        if c not in df.columns:
            res["missing_columns"].append(c)

    try:
        dmin = pd.to_datetime(df["date"]).min()
        dmax = pd.to_datetime(df["date"]).max()
        res["date_min"] = dmin.strftime("%Y-%m-%d")
        res["date_max"] = dmax.strftime("%Y-%m-%d")
    except Exception:
        res["date_min"] = None
        res["date_max"] = None

    nan_counts = df.isna().sum().to_dict()
    res["nan_counts"] = {k: int(v) for k, v in nan_counts.items()}

    res["dtypes"] = {k: str(v) for k, v in df.dtypes.items()}

    return res

def check_label_cols(df: pd.DataFrame, h_set=(1, 3, 5)) -> dict:
    stats = {}
    for h in h_set:
        label_col = f"y_d{h}_cls"
        rcol = f"r_d{h}"
        stats[label_col] = {
            "present": label_col in df.columns,
            "missing": int(df[label_col].isna().sum()) if label_col in df.columns else None,
        }
        stats[rcol] = {
            "present": rcol in df.columns,
            "missing": int(df[rcol].isna().sum()) if rcol in df.columns else None,
        }
    return stats

def write_validation_report(report: dict, out: Path):
    flat = {}
    flat["n_rows"] = report.get("n_rows")
    flat["date_min"] = report.get("date_min")
    flat["date_max"] = report.get("date_max")
    flat["missing_columns"] = ";".join(report.get("missing_columns", []))
    flat["nan_counts_json"] = json.dumps(report.get("nan_counts", {}))
    flat["dtypes_json"] = json.dumps(report.get("dtypes", {}))
    df = pd.DataFrame([flat])
    df.to_csv(out, index=False)
    logging.info(f"Validation report salvo: {out}")

def write_label_stats(df: pd.DataFrame, out: Path, h_set=(1, 3, 5)):
    rows = []
    for h in h_set:
        label_col = f"y_d{h}_cls"
        if label_col not in df.columns:
            rows.append({"h": h, "label": None, "count": None, "frac": None})
            continue
        counts = df[label_col].value_counts(dropna=False).to_dict()
        total = float(len(df))
        for lab, cnt in counts.items():
            rows.append({"h": h, "label": int(lab) if not pd.isna(lab) else None, "count": int(cnt), "frac": float(cnt / total)})
    outdf = pd.DataFrame(rows)
    outdf.to_csv(out, index=False)
    logging.info(f"Label stats salvo: {out}")

def append_provenance(prov_path: Path, summary: dict):
    line = f"[{datetime.now().isoformat()}] VALIDATION: {json.dumps(summary, default=str)}\n"
    prov_path.write_text(prov_path.read_text() + line if prov_path.exists() else line)
    logging.info(f"Provenance atualizado: {prov_path}")

def run_validation():
    df = load_gold(GOLD_PARQUET)

    report = basic_schema_checks(df)

    label_checks = check_label_cols(df)
    report["label_checks"] = label_checks

    min_window = max(20, 15, 14)
    if df.shape[0] <= min_window:
        raise ValueError("Série muito curta para validação após trim (len <= min_window).")
    df_trim = df.iloc[min_window:].reset_index(drop=True)

    critical = ["date", "close", "volume"] + [f"r_d{h}" for h in (1, 3, 5)] + [f"y_d{h}_cls" for h in (1, 3, 5)]
    nan_crit = {c: (int(df_trim[c].isna().sum()) if c in df_trim.columns else None) for c in critical}
    report["nan_critical_after_trim"] = nan_crit

    write_validation_report(report, VALIDATION_REPORT)

    write_label_stats(df_trim, LABEL_STATS)

    label_df = pd.read_csv(LABEL_STATS)
    summary = {"rows_trimmed": int(len(df_trim))}
    for h in (1, 3, 5):
        sub = label_df[label_df["h"] == h]
        zero_row = sub[sub["label"] == 0]
        frac_zero = float(zero_row["frac"].iloc[0]) if not zero_row.empty else 0.0
        summary[f"p_zero_d{h}"] = frac_zero

    append_provenance(PROV, summary)

    logging.info("Validação concluída.")

# Executa
run_validation()

INFO: Carregando gold parquet: /home/wrm/BOLSA_2026/gold/IBOV/gold_ibov_features.parquet
INFO: Validation report salvo: /home/wrm/BOLSA_2026/gold/IBOV/validation_report_ibov.csv
INFO: Label stats salvo: /home/wrm/BOLSA_2026/gold/IBOV/label_stats_ibov.csv
INFO: Provenance atualizado: /home/wrm/BOLSA_2026/gold/IBOV/PROVENANCE_IBOV.txt
INFO: Validação concluída.
