# Pipeline Bronze: IBOV download (Jupyter)

Este notebook baixa o histórico diário do IBOV (ticker `^BVSP`) cobrindo pelo menos 8 anos (a configuração atual, `MIN_YEARS = 12`, solicita cerca de 12 anos mais uma margem de 30 dias), salva um arquivo Parquet em `dados_originais/` e escreve o manifesto `manifesto_dados_originais_bronze_ibov.csv`.

Siga a política do projeto: não sobrescrever arquivos existentes — o notebook criará novos arquivos quando necessário.

In [6]:
# Cell 1: Imports e configurações iniciais
from pathlib import Path
from datetime import date, timedelta
import hashlib
import pandas as pd
import yfinance as yf

# Project layout: downloaded files are stored under ROOT/dados_originais,
# which is created on first run if it does not exist yet.
ROOT = Path('/home/wrm/BOLSA_2026')
DADOS = ROOT.joinpath('dados_originais')
DADOS.mkdir(parents=True, exist_ok=True)

# Instrument and look-back window.
TICKER = '^BVSP'
MIN_YEARS = 12
_BUFFER_DAYS = 30  # extra days requested on top of the MIN_YEARS look-back
_lookback_days = int(MIN_YEARS * 365.25) + _BUFFER_DAYS

# The window always ends on the day the cell is executed.
END = date.today()
START = END - timedelta(days=_lookback_days)
print('Prepared paths and date range:', START, '->', END)

Prepared paths and date range: 2013-08-18 -> 2025-09-17


In [7]:
# Cell 2: Derive the bronze parquet and manifest paths.
# This cell only builds and prints the paths; it does not read or write files.
# The requested date range is embedded in the parquet file name.
_bronze_name = f'IBOV_{START.isoformat()}_{END.isoformat()}.parquet'
out_parquet = DADOS / _bronze_name
manifest_csv = DADOS.joinpath('manifesto_dados_originais_bronze_ibov.csv')
for _label, _target in (
    ('Bronze target parquet:', out_parquet),
    ('Manifest path:', manifest_csv),
):
    print(_label, _target)

Bronze target parquet: /home/wrm/BOLSA_2026/dados_originais/IBOV_2013-08-18_2025-09-17.parquet
Manifest path: /home/wrm/BOLSA_2026/dados_originais/manifesto_dados_originais_bronze_ibov.csv


In [None]:
# Cell 4: Promote Bronze -> Silver (flatten schema, remove duplicates, align B3 calendar)
#
# Reads the bronze parquet at `out_parquet`, normalizes it to a flat
# date/open/high/low/close/adj_close/volume/ticker schema, keeps one row per
# calendar date, restricts rows to B3 (BVMF) trading days when
# pandas_market_calendars is available, and writes a silver parquet plus one
# manifest row under ROOT/intermediarios. An existing silver file with the
# same name is never overwritten.
#
# Raises SystemExit when the bronze file is missing or when no dated rows
# survive the cleaning steps.
INTERMED = ROOT / 'intermediarios'
INTERMED.mkdir(parents=True, exist_ok=True)

# NOTE: out_parquet embeds START/END, which are derived from date.today();
# a bronze file produced on a different day has a different name and will
# not be found here.
bronze_path = out_parquet
if not bronze_path.exists():
    raise SystemExit(f'Bronze parquet not found: {bronze_path}')
dfb = pd.read_parquet(bronze_path)

# Flatten MultiIndex columns if present by joining the non-empty levels.
if isinstance(dfb.columns, pd.MultiIndex):
    dfb.columns = [
        '_'.join(str(part) for part in col if part not in (None, '')).strip()
        for col in dfb.columns
    ]


def _canonical_name(raw_name):
    """Return the flat schema name matching *raw_name*, or None if unrecognized."""
    lowered = str(raw_name).lower()
    # 'adj close' must be tested first because it also contains 'close'.
    if 'adj' in lowered and 'close' in lowered:
        return 'adj_close'
    for key in ('close', 'open', 'high', 'low', 'volume', 'date', 'ticker'):
        if key in lowered:
            return key
    return None


# Normalize column names to the expected flat schema.
col_map = {}
for col in dfb.columns:
    target = _canonical_name(col)
    if target is not None:
        col_map[col] = target
dfb = dfb.rename(columns=col_map)

# If the dates live in the index rather than in a column, lift them into a
# 'date' column. Otherwise the column would be created as all-NA below and
# de-duplication would collapse the whole frame to a single row.
if 'date' not in dfb.columns and isinstance(dfb.index, pd.DatetimeIndex):
    dfb.insert(0, 'date', dfb.index)
    dfb = dfb.reset_index(drop=True)

# Ensure required columns exist and fill adj_close from close if necessary.
for required in ('date', 'open', 'high', 'low', 'close', 'adj_close', 'volume', 'ticker'):
    if required not in dfb.columns:
        dfb[required] = pd.NA
if dfb['adj_close'].isna().all():
    dfb['adj_close'] = dfb['close']

# Normalize the date column to tz-naive midnight timestamps. Timezone-aware
# values would never match the tz-naive trading-day index used below.
dates = pd.to_datetime(dfb['date'])
if dates.dt.tz is not None:
    dates = dates.dt.tz_localize(None)
dfb['date'] = dates.dt.normalize()

# Rows without a usable date cannot be keyed in the silver table.
undated_rows = int(dfb['date'].isna().sum())
if undated_rows:
    print('Dropping rows without a valid date:', undated_rows)
    dfb = dfb[dfb['date'].notna()]

# Remove duplicate dates (first occurrence wins) and sort chronologically.
pre_rows = len(dfb)
dfb = dfb.sort_values('date').drop_duplicates('date', keep='first').reset_index(drop=True)
dups_removed = pre_rows - len(dfb)
if dfb.empty:
    raise SystemExit(f'No dated rows found in bronze parquet: {bronze_path}')

# Try to align with the B3 calendar (pandas_market_calendars). This step is
# best-effort: only the calendar lookup is guarded, so a missing package or a
# calendar error skips the alignment without hiding errors in the filtering.
missing_trading_days = None
trading_days = None
try:
    import pandas_market_calendars as mcal
    schedule = mcal.get_calendar('BVMF').schedule(
        start_date=dfb['date'].min().date(),
        end_date=dfb['date'].max().date(),
    )
    trading_days = pd.to_datetime(schedule.index).normalize()
except Exception as e:
    print('B3 calendar alignment skipped:', e)

if trading_days is not None:
    # Keep only trading sessions, then report sessions absent from the data.
    dfb = dfb[dfb['date'].isin(trading_days)].reset_index(drop=True)
    missing_trading_days = sorted(set(trading_days) - set(dfb['date']))
    if dfb.empty:
        raise SystemExit(f'No rows left after B3 calendar alignment: {bronze_path}')

# Write silver parquet and manifest (do not overwrite existing silver file).
# The date bounds are computed outside the f-string: reusing the same quote
# character inside an f-string is a SyntaxError before Python 3.12.
date_min = dfb['date'].min()
date_max = dfb['date'].max()
silver_path = INTERMED / f'IBOV_silver_{date_min.date().isoformat()}_{date_max.date().isoformat()}.parquet'
manifest_silver = INTERMED / 'manifesto_silver.csv'
if silver_path.exists():
    print('Silver parquet already exists, skipping:', silver_path)
else:
    dfb.to_parquet(silver_path)
    row = {
        'file': silver_path.name,
        'ticker': TICKER,
        'date_min': date_min.isoformat(),
        'date_max': date_max.isoformat(),
        'rows': len(dfb),
        'source': 'derived_from_bronze',
    }
    # Hash the bytes actually written to disk.
    row['sha256'] = hashlib.sha256(silver_path.read_bytes()).hexdigest()
    dfm = pd.DataFrame([row])
    # Append to the manifest; the header is written only when creating it.
    dfm.to_csv(manifest_silver, mode='a', header=not manifest_silver.exists(), index=False)
    print('Saved silver parquet and manifest:', silver_path, manifest_silver)
    print('Rows after cleaning:', len(dfb), 'duplicates removed:', dups_removed)
    if missing_trading_days:
        print('Warning: missing trading days (present in B3 calendar but not in data):', len(missing_trading_days))
    print('NaNs per column:\n', dfb.isna().sum())