# Pipeline Bronze: IBOV download (Jupyter)

Este notebook baixa o histórico diário do IBOV (ticker `^BVSP`) cobrindo pelo menos 8 anos (a configuração atual, `MIN_YEARS = 12`, cobre cerca de 12 anos), salva um arquivo Parquet em `dados_originais/` e escreve o manifesto `manifesto_dados_originais_bronze_ibov.csv` no mesmo diretório.

Siga a política do projeto: não sobrescrever arquivos existentes — o notebook criará novos arquivos quando necessário.

In [4]:
# Cell 1: Imports e configurações iniciais
from pathlib import Path
from datetime import date, timedelta
import hashlib
import pandas as pd
import yfinance as yf

# Project root and the bronze ("original data") directory; the directory is
# created on first run and left untouched if it already exists.
ROOT = Path('/home/wrm/BOLSA_2026')
DADOS = ROOT.joinpath('dados_originais')
DADOS.mkdir(parents=True, exist_ok=True)

# Download parameters: ticker symbol and the minimum history length in years.
TICKER = '^BVSP'
MIN_YEARS = 12

# Date window: ends today and reaches back MIN_YEARS (365.25-day years)
# plus a 30-day buffer.
END = date.today()
_lookback_days = int(MIN_YEARS * 365.25) + 30
START = END - timedelta(days=_lookback_days)
print('Prepared paths and date range:', START, '->', END)

Prepared paths and date range: 2013-08-18 -> 2025-09-17


In [5]:
# Cell 2: Build the bronze output/manifest paths and check whether the target
# parquet already exists (project policy: existing files are never overwritten).
out_parquet = DADOS / f'IBOV_{START.isoformat()}_{END.isoformat()}.parquet'
manifest_csv = DADOS / 'manifesto_dados_originais_bronze_ibov.csv'

# Expose the existence check as a flag so any later write step can branch on it
# instead of silently replacing a previously downloaded file.
bronze_exists = out_parquet.exists()

print('Bronze target parquet:', out_parquet)
print('Manifest path:', manifest_csv)
print('Bronze target already exists:', bronze_exists)

Bronze target parquet: /home/wrm/BOLSA_2026/dados_originais/IBOV_2013-08-18_2025-09-17.parquet
Manifest path: /home/wrm/BOLSA_2026/dados_originais/manifesto_dados_originais_bronze_ibov.csv


In [6]:
# Cell 3: Create Silver from Bronze (Instrução 02)
from pathlib import Path
from datetime import datetime
import hashlib, json
import pandas as pd

# Locations used by the silver step. The bronze input is pinned to one specific
# dated file, together with the SHA-256 digest expected for it.
ROOT = Path('/home/wrm/BOLSA_2026')
BRONZE_PATH = ROOT.joinpath('dados_originais', 'IBOV_2013-08-18_2025-09-17.parquet')
BRONZE_SHA256_EXPECTED = 'c18ab2ee0d4ffbc4e969bcfc79fa2c31445c739096ea75a1e3a4968b038eaafa'

# Silver outputs: one sub-directory per run plus a shared, append-only manifest.
SILVER_ROOT = ROOT.joinpath('intermediarios', 'silver')
MANIFESTO_SILVER = SILVER_ROOT.joinpath('manifesto_silver_ibov.csv')

# Exact column set (and order) of the silver schema.
REQUIRED_COLS = [
    'date',
    'open',
    'high',
    'low',
    'close',
    'adj_close',
    'volume',
]

def compute_sha256(path: Path, chunk_size: int = 8192) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so the
    whole file never has to be held in memory.
    """
    digest = hashlib.sha256()
    with path.open('rb') as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for block in iter(lambda: stream.read(chunk_size), b''):
            digest.update(block)
    return digest.hexdigest()

def flatten_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse tuple column labels to their first element.

    Labels that are not tuples are kept as they are. The column index of
    ``df`` is replaced in place and the same frame object is returned.
    """
    df.columns = [
        label[0] if isinstance(label, tuple) else label
        for label in df.columns
    ]
    return df

def snake_standardize(col: str) -> str:
    """Normalize a column label to lower snake_case.

    Leading/trailing whitespace is removed, the label is lower-cased and every
    internal run of whitespace becomes a single underscore, so a label such as
    ``'Adj Close'`` maps to ``'adj_close'`` (previously it became
    ``'adj close'`` and never matched the required ``adj_close`` column).

    Args:
        col: Column label; non-string labels are converted with ``str()``.

    Returns:
        The normalized label.
    """
    import re  # local import: keeps this cell self-contained

    return re.sub(r'\s+', '_', str(col).strip().lower())

def ensure_date_and_order(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with ``date`` parsed as datetime, sorted ascending.

    The input frame is left untouched (the previous in-place assignment
    modified the caller's frame and could emit ``SettingWithCopyWarning`` when
    given a column slice). A stable sort is used so rows sharing the same date
    keep their original relative order, which makes a later
    ``drop_duplicates(keep='last')`` deterministic.

    Args:
        df: Frame containing a ``date`` column.

    Returns:
        A new frame ordered by ``date`` (original index labels are kept).

    Raises:
        KeyError: If ``df`` has no ``date`` column.
    """
    if 'date' not in df.columns:
        raise KeyError("Input data must contain a 'date' column")
    # assign() builds a new frame instead of writing into the caller's object.
    parsed = df.assign(date=pd.to_datetime(df['date']))
    # mergesort is the stable algorithm offered by sort_values.
    return parsed.sort_values('date', ascending=True, kind='mergesort')

def create_silver(dry_run: bool = False):
    """Build the silver IBOV dataset from the pinned bronze parquet.

    Steps: verify the bronze file against ``BRONZE_SHA256_EXPECTED``, load it,
    flatten and normalize column names, restrict to ``REQUIRED_COLS``, drop rows
    with no price at all or with missing volume, drop duplicated dates (keeping
    the last), and fill ``adj_close`` from ``close`` when it is missing on every
    row. Unless ``dry_run`` is set, the result is written to a new timestamped
    run directory under ``SILVER_ROOT`` and one row is appended to
    ``MANIFESTO_SILVER``.

    Changes relative to the previous implementation:
      * a dry run no longer creates the run directory;
      * the bronze digest is actually computed and checked, and the computed
        value (not the expected constant) is recorded in the manifest;
      * the manifest row is written with the ``csv`` module, so the JSON fields
        (which contain commas) are quoted instead of breaking the row apart;
      * a bronze file without a ``date`` column raises instead of producing an
        all-NaT column;
      * an existing silver file is never overwritten.

    Args:
        dry_run: When True, only report what would be written.

    Returns:
        dict: The manifest row (normal run) or a summary dict (dry run).

    Raises:
        FileNotFoundError: If the bronze file does not exist.
        ValueError: If the bronze file digest differs from the expected one.
        KeyError: If the bronze data has no ``date`` column.
        FileExistsError: If the target silver file already exists.
    """
    import csv  # local import: only needed for the manifest row

    run_ts = datetime.now().strftime('%Y%m%d_%H%M%S')
    run_dir = SILVER_ROOT / f'run_{run_ts}'
    silver_path = run_dir / 'IBOV_silver.parquet'

    if not BRONZE_PATH.exists():
        raise FileNotFoundError(f'Bronze file not found: {BRONZE_PATH}')

    # Verify provenance before doing any work on the data.
    bronze_sha256 = compute_sha256(BRONZE_PATH)
    if bronze_sha256 != BRONZE_SHA256_EXPECTED:
        raise ValueError(
            f'Bronze sha256 mismatch for {BRONZE_PATH}: '
            f'expected {BRONZE_SHA256_EXPECTED}, got {bronze_sha256}'
        )

    df = pd.read_parquet(BRONZE_PATH)
    df = flatten_columns(df)
    df = df.rename(columns={c: snake_standardize(c) for c in df.columns})

    # 'date' cannot be synthesized; fail loudly rather than fill it with NA.
    if 'date' not in df.columns:
        raise KeyError("Input data must contain a 'date' column")

    # Any other required column that is absent is added as all-NA.
    for c in REQUIRED_COLS:
        if c not in df.columns:
            df[c] = pd.NA

    # Explicit copy so later column assignments never target a slice.
    df = df[REQUIRED_COLS].copy()
    df = ensure_date_and_order(df)

    # Cleaning rules
    price_cols = ['open', 'high', 'low', 'close', 'adj_close']
    mask_all_prices_nan = df[price_cols].isna().all(axis=1)
    df_clean = df.loc[~mask_all_prices_nan].copy()
    df_clean = df_clean.loc[~df_clean['volume'].isna()].copy()

    n_before_dups = len(df_clean)
    df_clean = df_clean.drop_duplicates(subset=['date'], keep='last')
    duplicates_dropped = n_before_dups - len(df_clean)

    df_clean = df_clean.sort_values('date', ascending=True).reset_index(drop=True)

    total_rows = len(df_clean)
    if total_rows == 0:
        adj_close_rule = 'no_rows_after_cleaning'
    elif int(df_clean['adj_close'].isna().sum()) == total_rows:
        # adj_close missing everywhere: fall back to the unadjusted close.
        df_clean['adj_close'] = df_clean['close']
        adj_close_rule = 'filled_from_close_all_rows'
    else:
        adj_close_rule = 'as_is'

    nan_counts = df_clean.isna().sum().to_dict()
    nan_counts_json = json.dumps({k: int(v) for k, v in nan_counts.items()})

    # Enforce the final schema (column set and order).
    if list(df_clean.columns) != REQUIRED_COLS:
        df_clean = df_clean.reindex(columns=REQUIRED_COLS)
    final_cols = list(df_clean.columns)
    if final_cols != REQUIRED_COLS:
        raise AssertionError(f'Final schema mismatch. Expected {REQUIRED_COLS}, got {final_cols}')

    if len(df_clean) > 0:
        date_min = df_clean['date'].min().strftime('%Y-%m-%d')
        date_max = df_clean['date'].max().strftime('%Y-%m-%d')
    else:
        date_min = None
        date_max = None

    if dry_run:
        print('Dry run mode - no files written.')
        print(f'Would write silver to: {silver_path}')
        print(f'Rows after cleaning: {len(df_clean)} | Date range: {date_min} -> {date_max}')
        print(f'Duplicates dropped: {duplicates_dropped}')
        print(f'NaNs after cleaning: {nan_counts_json}')
        print(f'Columns: {REQUIRED_COLS}')
        print(f'AdjClose rule: {adj_close_rule}')
        return {
            'run_ts': run_ts,
            'rows': int(len(df_clean)),
            'date_min': date_min,
            'date_max': date_max,
            'duplicates_dropped': int(duplicates_dropped),
            'nan_counts_json': nan_counts_json,
            'adj_close_rule': adj_close_rule,
            'silver_path': str(silver_path),
        }

    # Directories are created only when something is really going to be written.
    run_dir.mkdir(parents=True, exist_ok=True)
    if silver_path.exists():
        # run_ts has one-second resolution; never replace an earlier run's file.
        raise FileExistsError(f'Silver file already exists: {silver_path}')

    df_clean.to_parquet(silver_path, index=False, compression='snappy')
    silver_sha256 = compute_sha256(silver_path)

    manifest_row = {
        'run_ts': run_ts,
        'source_parquet_path': str(BRONZE_PATH),
        'source_sha256': bronze_sha256,
        'rows': int(len(df_clean)),
        'date_min': date_min,
        'date_max': date_max,
        'duplicates_dropped': int(duplicates_dropped),
        'nan_counts_json': nan_counts_json,
        'adj_close_rule': adj_close_rule,
        'columns_json': json.dumps(REQUIRED_COLS),
        'silver_parquet_path': str(silver_path),
        'silver_sha256': silver_sha256
    }

    # The header is needed for a new (or empty) manifest only.
    write_header = (not MANIFESTO_SILVER.exists()) or MANIFESTO_SILVER.stat().st_size == 0
    with MANIFESTO_SILVER.open('a', encoding='utf-8', newline='') as f:
        # csv handles quoting of fields that contain commas or double quotes.
        writer = csv.DictWriter(f, fieldnames=list(manifest_row.keys()), lineterminator='\n')
        if write_header:
            writer.writeheader()
        writer.writerow(manifest_row)

    # Prints required by spec (all properly-terminated strings)
    print(f'Silver path: {silver_path}')
    print(f'Silver rows: {len(df_clean)} | Date range: {date_min} -> {date_max}')
    print(f'Duplicates dropped: {duplicates_dropped}')
    print(f'NaNs after cleaning: {nan_counts_json}')
    print(f'Columns: {REQUIRED_COLS}')
    print(f'AdjClose rule: {adj_close_rule}')
    print(f'Manifest updated: {MANIFESTO_SILVER}')
    print(f'Silver sha256: {silver_sha256}')
    return manifest_row

# Execute the silver creation when this cell runs. With dry_run=False the call
# writes a new parquet under a timestamped run directory and appends a row to
# the silver manifest; the returned manifest row is kept in `_result`.
_result = create_silver(dry_run=False)


Silver path: /home/wrm/BOLSA_2026/intermediarios/silver/run_20250917_144039/IBOV_silver.parquet
Silver rows: 2996 | Date range: 2013-08-19 -> 2025-09-17
Duplicates dropped: 0
NaNs after cleaning: {"date": 0, "open": 0, "high": 0, "low": 0, "close": 0, "adj_close": 0, "volume": 0}
Columns: ['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']
AdjClose rule: filled_from_close_all_rows
Manifest updated: /home/wrm/BOLSA_2026/intermediarios/silver/manifesto_silver_ibov.csv
Silver sha256: 350216dfaa30971bb83a75e9f3364b25397b33aeebfa2eb7e80d805a2279b7ca


In [3]:
# Instrução 03B — Calibração de k por horizonte (dry-run)
from pathlib import Path
from datetime import datetime
import hashlib, json
import pandas as pd
import numpy as np

# CONFIG: silver input and calibration parameters.
# The silver file is pinned to one specific run directory, together with the
# SHA-256 digest expected for it; change these only deliberately.
ROOT = Path("/home/wrm/BOLSA_2026")
SILVER_PATH = ROOT.joinpath("intermediarios", "silver", "run_20250917_114743", "IBOV_silver.parquet")
SILVER_SHA256_EXPECTED = "350216dfaa30971bb83a75e9f3364b25397b33aeebfa2eb7e80d805a2279b7ca"

# Number of rows in the rolling window used for the volatility estimate.
SIGMA_WINDOW = 252

# Horizon label -> number of rows to look ahead.
HORIZONS = {
    "d1": 1,
    "d3": 3,
    "d5": 5,
}

def compute_sha256(path: Path, chunk_size: int = 8192) -> str:
    """Compute the SHA-256 hex digest of a file, reading it in chunks.

    ``path`` is opened in binary mode and consumed ``chunk_size`` bytes at a
    time until a read returns no data.
    """
    hasher = hashlib.sha256()
    with path.open("rb") as handle:
        piece = handle.read(chunk_size)
        while piece:
            hasher.update(piece)
            piece = handle.read(chunk_size)
    return hasher.hexdigest()

def load_silver(path: Path) -> pd.DataFrame:
    """Load the silver parquet and return it in canonical form.

    Column labels are flattened (first element of tuple labels), stripped and
    lower-cased. The result holds exactly the expected columns, with ``date``
    parsed as datetime, sorted by ``date`` and re-indexed from zero.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: For the first expected column that is missing.
    """
    if not path.exists():
        raise FileNotFoundError(f"Silver not found: {path}")

    frame = pd.read_parquet(path)

    # Flatten tuple labels, then normalize the spelling of every label.
    normalized = []
    for label in frame.columns:
        base = label[0] if isinstance(label, tuple) else label
        normalized.append(str(base).strip().lower())
    frame.columns = normalized

    expected = ["date", "open", "high", "low", "close", "adj_close", "volume"]
    first_missing = next((name for name in expected if name not in frame.columns), None)
    if first_missing is not None:
        raise KeyError(f"Silver missing required column: {first_missing}")

    frame = frame[expected].copy()
    frame["date"] = pd.to_datetime(frame["date"])
    return frame.sort_values("date").reset_index(drop=True)

def compute_logret_and_sigma(df: pd.DataFrame, sigma_window: int) -> pd.DataFrame:
    """Return a copy of ``df`` with ``logret`` and ``sigma_rolling`` columns.

    ``logret`` is the log of ``adj_close`` over its previous row;
    ``sigma_rolling`` is the rolling standard deviation of ``logret`` over
    ``sigma_window`` rows, defined only once a full window is available.
    The input frame is not modified.
    """
    prices = df["adj_close"]
    step_logret = np.log(prices / prices.shift(1))
    rolling_sigma = step_logret.rolling(window=sigma_window, min_periods=sigma_window).std()
    return df.copy().assign(logret=step_logret, sigma_rolling=rolling_sigma)

def horizon_logret(df: pd.DataFrame, h: int) -> pd.Series:
    """Log return of ``adj_close`` from each row to the row ``h`` positions later.

    The last ``h`` rows have no later value and therefore yield NaN.
    """
    current = df["adj_close"]
    later = current.shift(-h)
    return np.log(later / current)

def neutral_prop_for_k(df: pd.DataFrame, h: int, k: float) -> float:
    """Share of rows whose ``h``-row log return lies within ``±k * sigma_rolling``.

    Only rows where both the horizon return and the threshold are defined are
    counted; bounds are inclusive. Returns NaN when no row qualifies.
    """
    forward_ret = horizon_logret(df, h)
    half_width = k * df["sigma_rolling"]

    usable = forward_ret.notna() & half_width.notna()
    n_usable = usable.sum()
    if n_usable == 0:
        return float('nan')

    bound = half_width[usable]
    # between() is inclusive on both ends by default.
    inside = forward_ret[usable].between(-bound, bound)
    return float(inside.sum()) / float(n_usable)

# Candidate k multipliers: 0.40 through 1.20 in steps of 0.05. The stop value
# sits slightly above 1.20 so the end point is included, and each value is
# rounded to two decimals to remove floating-point noise from the stepping.
ks = list(map(lambda value: round(value, 2), np.arange(0.40, 1.201, 0.05)))

def calibrate_k_dryrun(silver_path: Path, ks: list, sigma_window: int, horizons: dict, target_bands: dict = None):
    """Evaluate a grid of k values per horizon and report the first k in band.

    For every horizon and every k, the neutral proportion (share of rows whose
    horizon log return lies within ``±k * sigma_rolling``) is computed. For each
    horizon the smallest k whose neutral proportion falls inside that horizon's
    target band is selected. Nothing is written to disk; results are printed.

    Args:
        silver_path: Path of the silver parquet to load.
        ks: Candidate k multipliers.
        sigma_window: Rolling window length for the volatility estimate.
        horizons: Mapping of horizon label to look-ahead in rows.
        target_bands: Optional mapping of horizon label to an inclusive
            ``(low, high)`` neutral-proportion band. Defaults to the bands
            previously hard-coded for ``d1``, ``d3`` and ``d5``.

    Returns:
        tuple: ``(res_df, chosen, chosen_props, coverage)`` where ``res_df`` is
        the grid of results, ``chosen`` maps horizon -> selected k (or None),
        ``chosen_props`` maps horizon -> its neutral proportion (or None) and
        ``coverage`` describes the rows that have a defined ``sigma_rolling``.

    Raises:
        KeyError: If a horizon has no target band.
    """
    default_bands = {"d1": (0.45, 0.55), "d3": (0.38, 0.45), "d5": (0.30, 0.38)}
    bands = default_bands if target_bands is None else target_bands

    # Fail before any computation if a horizon cannot be evaluated.
    unbanded = [name for name in horizons if name not in bands]
    if unbanded:
        raise KeyError(f"No target band defined for horizon(s): {unbanded}")

    df = load_silver(silver_path)
    df = compute_logret_and_sigma(df, sigma_window)

    # coverage after windows
    cov_df = df.loc[df["sigma_rolling"].notna()].reset_index(drop=True)
    rows = len(cov_df)
    date_min = cov_df["date"].min().strftime("%Y-%m-%d") if rows > 0 else None
    date_max = cov_df["date"].max().strftime("%Y-%m-%d") if rows > 0 else None

    results = [
        {"horizon": h_name, "h": h, "k": k, "neutral_prop": neutral_prop_for_k(df, h, k)}
        for h_name, h in horizons.items()
        for k in ks
    ]

    # Explicit columns keep the frame well-formed even when the grid is empty.
    res_df = pd.DataFrame(results, columns=["horizon", "h", "k", "neutral_prop"])
    # drop nan neutral_prop rows
    res_df = res_df.dropna(subset=["neutral_prop"]).reset_index(drop=True)

    # For reporting: find ks that first enter the acceptable band per horizon
    chosen = {}
    chosen_props = {}
    for h_name in horizons:
        low, high = bands[h_name]
        cand = res_df[res_df["horizon"] == h_name].sort_values("k").reset_index(drop=True)
        sel = cand[(cand["neutral_prop"] >= low) & (cand["neutral_prop"] <= high)]
        if sel.empty:
            chosen[h_name] = None
            chosen_props[h_name] = None
        else:
            # pick smallest k that satisfies
            chosen[h_name] = float(sel.iloc[0]["k"])
            chosen_props[h_name] = float(sel.iloc[0]["neutral_prop"])

    # Print mandated outputs
    print("Calib grid head (amostra):")
    # show for each horizon the 10 rows closest to the center of band
    for h_name in horizons:
        low, high = bands[h_name]
        center = (low + high) / 2
        cand = res_df[res_df["horizon"] == h_name].copy()
        cand["dist"] = (cand["neutral_prop"] - center).abs()
        cand = cand.sort_values("dist").head(10)[["horizon", "k", "neutral_prop"]]
        print(cand.to_string(index=False))
        print("---")

    print("Chosen ks: " + json.dumps(chosen))
    # format percents
    chosen_perc = {k: (v if v is None else round(v * 100, 2)) for k, v in chosen_props.items()}
    print("Chosen neutral proportions: " + json.dumps(chosen_perc))
    print(f"Coverage after windows (preview): rows={rows}, range={date_min} -> {date_max}")

    return res_df, chosen, chosen_props, {"rows": rows, "date_min": date_min, "date_max": date_max}

# Verify that the silver input is the validated artifact before calibrating:
# the expected digest is declared in the config but was never checked.
# A missing file is left to calibrate_k_dryrun, which reports it itself.
if SILVER_PATH.exists():
    _silver_sha256 = compute_sha256(SILVER_PATH)
    if _silver_sha256 != SILVER_SHA256_EXPECTED:
        raise ValueError(
            f"Silver sha256 mismatch for {SILVER_PATH}: "
            f"expected {SILVER_SHA256_EXPECTED}, got {_silver_sha256}"
        )

# Run dry-run calibration and print outputs
res_df, chosen, chosen_props, coverage = calibrate_k_dryrun(SILVER_PATH, ks, SIGMA_WINDOW, HORIZONS)

# Closing marker: this cell only reports results and writes no files
print("End of cell - dry run only (no files written)")

Calib grid head (amostra):
horizon    k  neutral_prop
     d1 0.60      0.519869
     d1 0.55      0.480131
     d1 0.50      0.448414
     d1 0.65      0.554138
     d1 0.70      0.584761
     d1 0.45      0.404666
     d1 0.75      0.610645
     d1 0.40      0.362012
     d1 0.80      0.639081
     d1 0.85      0.664601
---
horizon    k  neutral_prop
     d3 0.80      0.405327
     d3 0.85      0.429040
     d3 0.75      0.383072
     d3 0.90      0.447282
     d3 0.70      0.359358
     d3 0.95      0.473185
     d3 1.00      0.495804
     d3 0.65      0.332360
     d3 1.05      0.516600
     d3 0.60      0.311200
---
horizon    k  neutral_prop
     d5 0.85      0.339540
     d5 0.90      0.353414
     d5 0.80      0.323476
     d5 0.75      0.308507
     d5 0.95      0.376050
     d5 0.70      0.292808
     d5 1.00      0.393939
     d5 0.65      0.275283
     d5 1.05      0.410004
     d5 1.10      0.425703
---
Chosen ks: {"d1": 0.55, "d3": 0.75, "d5": 0.75}
Chosen neutral proport