# Coleta limpa: 24 B3 (.SA) + 7 indicadores (2012-01-01 ↔ 2025-09-20)

- Persistência: Parquets individuais por série em `00_data/01_bruto`.
- Janela: do mais recente (2025-09-20) até o mais antigo, cortando em 2012-01-01.
- Fontes:
  - Ações B3: investpy (com fallback yfinance para listagens mais recentes)
  - Indicadores/ETF/commodities: yfinance


In [None]:
# 1) Instalação e imports essenciais
%pip install --quiet investpy==1.0.8 yfinance==0.2.58 lxml==4.9.3 html5lib==1.1 tqdm>=4.66.0 pyarrow>=14.0.0 minio>=7.2.7

import os, time
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm
from zoneinfo import ZoneInfo
import investpy, yfinance as yf

TZ_SP = ZoneInfo("America/Sao_Paulo")
CUT_MAX = datetime(2025, 9, 20, tzinfo=TZ_SP).date()   # mais recente
CUT_MIN = datetime(2012, 1, 1, tzinfo=TZ_SP).date()    # mais antigo
OUT_DIR = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/01_bruto")
OUT_DIR.mkdir(parents=True, exist_ok=True)
print("OUT_DIR =", OUT_DIR)



[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


  import pkg_resources


OUT_DIR = G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto


In [2]:
# 2) Lista dos 24 B3 (.SA) e 7 indicadores
TICKERS_B3 = [
    "ABEV3.SA", "B3SA3.SA", "BBAS3.SA", "CSNA3.SA", "CPLE6.SA", "ELET3.SA", "GGBR4.SA",
    "HAPV3.SA", "ITUB4.SA", "LREN3.SA", "PETR4.SA", "PRIO3.SA", "PSSA3.SA", "RAIL3.SA",
    "RDOR3.SA", "SBSP3.SA", "SUZB3.SA", "TAEE11.SA", "TIMS3.SA", "UGPA3.SA", "VALE3.SA",
    "VIVT3.SA", "WEGE3.SA", "TOTS3.SA"
]
INDICATORS = {
    "^BVSP": "_bvsp",
    "EWZ": "ewz",
    "^GSPC": "_gspc",
    "^VIX": "_vix",
    "DX-Y.NYB": "dx-y.nyb",
    "^TNX": "_tnx",
    "BZ=F": "bz=f",
}
print(len(TICKERS_B3), "B3 +", len(INDICATORS), "indicadores")

24 B3 + 7 indicadores


In [5]:
# 3) Utilitários de datas/normalização e coletores (investpy + yfinance)

def sp_date(d: datetime | str):
    if isinstance(d, str):
        return datetime.fromisoformat(d).date()
    return d.date() if isinstance(d, datetime) else d

def clamp_window(min_str="2012-01-01", max_str="2025-09-20"):
    dmin = datetime.fromisoformat(min_str).date()
    dmax = datetime.fromisoformat(max_str).date()
    return dmin, dmax

CUT_MIN, CUT_MAX = clamp_window("2012-01-01", "2025-09-20")

def to_investing_symbol(b3_symbol: str) -> str:
    return b3_symbol.upper().replace('.SA','')


def _to_dtindex_sp(values) -> pd.DatetimeIndex:
    idx = pd.DatetimeIndex(pd.to_datetime(values))
    if idx.tz is None:
        idx = idx.tz_localize(TZ_SP)
    else:
        idx = idx.tz_convert(TZ_SP)
    return idx


def normalize_df(df: pd.DataFrame, ticker_label: str) -> pd.DataFrame:
    rename = {"Date":"date","Open":"open","High":"high","Low":"low","Close":"close","Adj Close":"adj_close","Volume":"volume"}
    df = df.rename(columns=rename)
    # construir datetime_sp em TZ_SP
    if 'date' in df.columns:
        dt_idx = _to_dtindex_sp(df['date'])
    elif 'Date' in df.columns:
        dt_idx = _to_dtindex_sp(df['Date'])
    else:
        dt_idx = _to_dtindex_sp(df.index)
    out = df.reset_index(drop=True)
    ts = pd.Series(dt_idx)
    out['datetime_sp'] = ts.values
    out['date'] = pd.to_datetime(ts).dt.date.astype(str)
    out['ticker'] = ticker_label
    # recorte da janela
    out = out[(out['date'] >= CUT_MIN.strftime('%Y-%m-%d')) & (out['date'] <= CUT_MAX.strftime('%Y-%m-%d'))]
    # garantir colunas essenciais
    for c in ["open","high","low","close","volume"]:
        if c not in out.columns:
            out[c] = pd.NA
    out = out[["ticker","date","open","high","low","close","volume","datetime_sp"]]
    out.drop_duplicates(subset=["ticker","date"], inplace=True)
    return out


def fetch_b3(b3_symbol: str) -> pd.DataFrame:
    """Tenta investpy; se falhar, cai para yfinance (útil para TIMS3/RDOR3)."""
    sym = to_investing_symbol(b3_symbol)
    f_str = CUT_MIN.strftime('%d/%m/%Y'); t_str = CUT_MAX.strftime('%d/%m/%Y')
    try:
        # tenta via search_quotes
        res = investpy.search_quotes(text=sym, products=['stocks'], countries=['brazil'])
        qlist = res if isinstance(res, list) else ([res] if res else [])
        if qlist:
            pick = next((q for q in qlist if getattr(q,'symbol','').upper()==sym.upper()), qlist[0])
            df = pick.retrieve_historical_data(from_date=f_str, to_date=t_str, as_json=False, order='descending')
            if df is not None and not df.empty:
                return normalize_df(df, b3_symbol.upper())
        # fallback investpy direto
        df = investpy.get_stock_historical_data(stock=sym, country='brazil', from_date=f_str, to_date=t_str, as_json=False, order='descending')
        if df is not None and not df.empty:
            return normalize_df(df, b3_symbol.upper())
    except Exception:
        pass
    # yfinance fallback
    tkr = yf.Ticker(b3_symbol)
    df = tkr.history(start=CUT_MIN.strftime('%Y-%m-%d'), end=(CUT_MAX + timedelta(days=1)).strftime('%Y-%m-%d'), interval='1d', auto_adjust=False)
    if df is None or df.empty:
        raise RuntimeError(f"Sem dados para {b3_symbol}")
    return normalize_df(df, b3_symbol.upper())


def fetch_indicator(yf_symbol: str, db_symbol: str) -> pd.DataFrame:
    tkr = yf.Ticker(yf_symbol)
    df = tkr.history(start=CUT_MIN.strftime('%Y-%m-%d'), end=(CUT_MAX + timedelta(days=1)).strftime('%Y-%m-%d'), interval='1d', auto_adjust=False)
    if df is None or df.empty:
        raise RuntimeError(f"Sem dados para {yf_symbol}")
    return normalize_df(df, db_symbol)


def save_parquet(df: pd.DataFrame, out_path: Path):
    if df is None or df.empty:
        raise ValueError("DF vazio.")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(out_path, index=False)
    return out_path

In [6]:
# 4) Coleta B3 (24 .SA) — do mais recente (20/09/2025) ao mais antigo (>= 01/01/2012)
results = []
errors = []
for tk in tqdm(TICKERS_B3, desc='B3 .SA'):
    try:
        df = fetch_b3(tk)
        out = OUT_DIR / f"{tk.replace('.','_').lower()}_1d.parquet"
        save_parquet(df, out)
        results.append((tk, len(df), str(out)))
    except Exception as e:
        errors.append((tk, str(e)))
        time.sleep(0.5)

print('B3 OK:', len(results), 'Falhas:', len(errors))
results[:5], errors[:5]

B3 .SA: 100%|██████████| 24/24 [00:15<00:00,  1.56it/s]

B3 OK: 24 Falhas: 0





([('ABEV3.SA',
   3409,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\abev3_sa_1d.parquet'),
  ('B3SA3.SA',
   3409,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\b3sa3_sa_1d.parquet'),
  ('BBAS3.SA',
   3409,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\bbas3_sa_1d.parquet'),
  ('CSNA3.SA',
   3409,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\csna3_sa_1d.parquet'),
  ('CPLE6.SA',
   3408,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\cple6_sa_1d.parquet')],
 [])

In [7]:
# 5) Coleta indicadores/ETF/commodities (7) via yfinance
res_i = []
err_i = []
for yf_sym, db_sym in tqdm(list(INDICATORS.items()), desc='Indicadores'):
    try:
        df = fetch_indicator(yf_sym, db_sym)
        out = OUT_DIR / f"{db_sym.replace('.','_').lower()}_1d.parquet"
        save_parquet(df, out)
        res_i.append((yf_sym, db_sym, len(df), str(out)))
    except Exception as e:
        err_i.append((yf_sym, str(e)))
        time.sleep(0.5)

print('Indicadores OK:', len(res_i), 'Falhas:', len(err_i))
res_i[:5], err_i[:5]

Indicadores: 100%|██████████| 7/7 [00:03<00:00,  1.95it/s]

Indicadores OK: 7 Falhas: 0





([('^BVSP',
   '_bvsp',
   3400,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\_bvsp_1d.parquet'),
  ('EWZ',
   'ewz',
   3449,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\ewz_1d.parquet'),
  ('^GSPC',
   '_gspc',
   3449,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\_gspc_1d.parquet'),
  ('^VIX',
   '_vix',
   3449,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\_vix_1d.parquet'),
  ('DX-Y.NYB',
   'dx-y.nyb',
   3450,
   'G:\\Drives compartilhados\\BOLSA_2026\\a_bolsa2026_gemini\\00_data\\01_bruto\\dx-y_nyb_1d.parquet')],
 [])

In [8]:
# 6) Inspeção de estrutura/detalhes em 01_bruto e 02_adequado (3 ações + 3 indicadores)
from pathlib import Path
import pandas as pd

BASE = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data")
SRC_FOLDERS = ["01_bruto", "02_adequado"]

# escolha das séries para amostra
STOCKS = ["ABEV3.SA", "ITUB4.SA", "PETR4.SA"]
INDICS = ["_bvsp", "ewz", "_gspc"]

def fname_for_stock(sym: str) -> str:
    return f"{sym.lower().replace('.', '_')}_1d.parquet"

def fname_for_indicator(db_label: str) -> str:
    return f"{db_label.replace('.', '_').lower()}_1d.parquet"

def summarize_parquet(folder: str, filename: str) -> dict:
    p = BASE / folder / filename
    if not p.exists():
        return {
            "folder": folder, "file": filename, "exists": False,
            "rows": 0, "date_min": None, "date_max": None,
            "columns": None, "dtypes": None, "tz_datetime_sp": None,
            "path": str(p),
        }
    df = pd.read_parquet(p)
    info = {
        "folder": folder,
        "file": filename,
        "exists": True,
        "rows": int(df.shape[0]),
        "columns": list(df.columns),
        "dtypes": {c: str(df[c].dtype) for c in df.columns},
        "date_min": str(pd.to_datetime(df["date"]).min().date()) if "date" in df.columns and not df.empty else None,
        "date_max": str(pd.to_datetime(df["date"]).max().date()) if "date" in df.columns and not df.empty else None,
        "tz_datetime_sp": None,
        "path": str(p),
    }
    if "datetime_sp" in df.columns and not df.empty:
        try:
            v = df["datetime_sp"].iloc[0]
            tz = getattr(getattr(v, "tz", None), "key", None) or str(getattr(v, "tz", None))
            info["tz_datetime_sp"] = tz
        except Exception:
            info["tz_datetime_sp"] = None
    return info

def pretty_print(info: dict):
    print(f"[{info['folder']}] {info['file']} -> exists={info['exists']}")
    print(" path:", info["path"])
    if not info["exists"]:
        print()
        return
    print(" rows:", info["rows"], "date:", info["date_min"], "->", info["date_max"])
    print(" tz(datetime_sp):", info["tz_datetime_sp"])
    print(" columns:", info["columns"])
    print(" dtypes:", info["dtypes"])
    # amostra rápida
    try:
        df = pd.read_parquet(info["path"])
        display(df.head(2))
    except Exception:
        pass
    print()

targets = []
for s in STOCKS:
    targets.append(fname_for_stock(s))
for i in INDICS:
    targets.append(fname_for_indicator(i))

for folder in SRC_FOLDERS:
    print("===", folder, "===")
    for fn in targets:
        pretty_print(summarize_parquet(folder, fn))

=== 01_bruto ===
[01_bruto] abev3_sa_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto\abev3_sa_1d.parquet
 rows: 3414 date: 2012-01-02 -> 2025-09-26
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,ABEV3.SA,2012-01-02,10.890463,10.9804,10.746562,10.872475,119582,2012-01-02 02:00:00
1,ABEV3.SA,2012-01-03,10.892461,10.946424,10.654626,10.748561,2099952,2012-01-03 02:00:00



[01_bruto] itub4_sa_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto\itub4_sa_1d.parquet
 rows: 3415 date: 2012-01-02 -> 2025-09-29
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,ITUB4.SA,2012-01-02,14.165285,14.177703,13.867242,14.086635,8201763,2012-01-02 02:00:00
1,ITUB4.SA,2012-01-03,14.107332,14.43849,14.107332,14.43849,15453407,2012-01-03 02:00:00



[01_bruto] petr4_sa_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto\petr4_sa_1d.parquet
 rows: 3415 date: 2012-01-02 -> 2025-09-29
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,PETR4.SA,2012-01-02,21.51,22.120001,21.26,21.73,20391300,2012-01-02 02:00:00
1,PETR4.SA,2012-01-03,21.83,22.41,21.809999,22.41,22940500,2012-01-03 02:00:00



[01_bruto] _bvsp_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto\_bvsp_1d.parquet
 rows: 3407 date: 2012-01-03 -> 2025-09-30
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,_bvsp,2012-01-03,57836.0,59288.0,57836.0,59265.0,3083000,2012-01-03 02:00:00
1,_bvsp,2012-01-04,59263.0,59519.0,58558.0,59365.0,2252000,2012-01-04 02:00:00



[01_bruto] ewz_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto\ewz_1d.parquet
 rows: 3456 date: 2012-01-03 -> 2025-09-30
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,ewz,2012-01-03,59.099998,60.110001,59.060001,59.700001,20052500,2012-01-03 05:00:00
1,ewz,2012-01-04,59.580002,60.470001,59.560001,59.919998,11113200,2012-01-04 05:00:00



[01_bruto] _gspc_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_bruto\_gspc_1d.parquet
 rows: 3456 date: 2012-01-03 -> 2025-09-30
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,_gspc,2012-01-03,1258.859985,1284.619995,1258.859985,1277.060059,3943710000,2012-01-03 05:00:00
1,_gspc,2012-01-04,1277.030029,1278.72998,1268.099976,1277.300049,3592580000,2012-01-04 05:00:00



=== 02_adequado ===
[02_adequado] abev3_sa_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\abev3_sa_1d.parquet
 rows: 3409 date: 2012-01-02 -> 2025-09-19
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,ABEV3.SA,2012-01-02,10.890463,10.9804,10.746562,10.872475,119582,2012-01-02 04:00:00
1,ABEV3.SA,2012-01-03,10.892461,10.946424,10.654626,10.748561,2099952,2012-01-03 04:00:00



[02_adequado] itub4_sa_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\itub4_sa_1d.parquet
 rows: 3409 date: 2012-01-02 -> 2025-09-19
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,ITUB4.SA,2012-01-02,14.165285,14.177703,13.867242,14.086635,8201763,2012-01-02 04:00:00
1,ITUB4.SA,2012-01-03,14.107332,14.43849,14.107332,14.43849,15453407,2012-01-03 04:00:00



[02_adequado] petr4_sa_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\petr4_sa_1d.parquet
 rows: 3409 date: 2012-01-02 -> 2025-09-19
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,PETR4.SA,2012-01-02,21.51,22.120001,21.26,21.73,20391300,2012-01-02 04:00:00
1,PETR4.SA,2012-01-03,21.83,22.41,21.809999,22.41,22940500,2012-01-03 04:00:00



[02_adequado] _bvsp_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\_bvsp_1d.parquet
 rows: 3400 date: 2012-01-03 -> 2025-09-19
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,_bvsp,2012-01-03,57836.0,59288.0,57836.0,59265.0,3083000,2012-01-03 04:00:00
1,_bvsp,2012-01-04,59263.0,59519.0,58558.0,59365.0,2252000,2012-01-04 04:00:00



[02_adequado] ewz_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\ewz_1d.parquet
 rows: 3449 date: 2012-01-03 -> 2025-09-19
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,ewz,2012-01-03,59.099998,60.110001,59.060001,59.700001,20052500,2012-01-03 07:00:00
1,ewz,2012-01-04,59.580002,60.470001,59.560001,59.919998,11113200,2012-01-04 07:00:00



[02_adequado] _gspc_1d.parquet -> exists=True
 path: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\_gspc_1d.parquet
 rows: 3449 date: 2012-01-03 -> 2025-09-19
 tz(datetime_sp): None
 columns: ['ticker', 'date', 'open', 'high', 'low', 'close', 'volume', 'datetime_sp']
 dtypes: {'ticker': 'object', 'date': 'object', 'open': 'float64', 'high': 'float64', 'low': 'float64', 'close': 'float64', 'volume': 'int64', 'datetime_sp': 'datetime64[ns]'}


Unnamed: 0,ticker,date,open,high,low,close,volume,datetime_sp
0,_gspc,2012-01-03,1258.859985,1284.619995,1258.859985,1277.060059,3943710000,2012-01-03 07:00:00
1,_gspc,2012-01-04,1277.030029,1278.72998,1268.099976,1277.300049,3592580000,2012-01-04 07:00:00





## Validação leve — 02_adequado

> Varrer `00_data/02_adequado`, checar duplicatas por (ticker, date), resumir linhas e intervalo de datas, e salvar um mini-manifesto CSV na própria pasta.

In [18]:
# Validação leve de 02_adequado: manifesto e duplicatas
from pathlib import Path
import pandas as pd

BASE = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_adequado")
OUT_MAN = BASE / "manifest_02_adequado.csv"

rows = []
problems = []

for p in sorted(BASE.glob("*.parquet")):
    try:
        df = pd.read_parquet(p)
        # normalizar colunas esperadas
        ren = {"Date":"date","Open":"open","High":"high","Low":"low","Close":"close","Adj Close":"adj_close","Volume":"volume"}
        df = df.rename(columns=ren)
        if "datetime_sp" in df.columns:
            s = pd.to_datetime(df["datetime_sp"], errors="coerce")
            df["date"] = pd.to_datetime(df.get("date", s), errors="coerce").dt.date.astype(str)
        else:
            df["date"] = pd.to_datetime(df.get("date"), errors="coerce").dt.date.astype(str)
        if "ticker" not in df.columns or df["ticker"].isna().any():
            # inferir do nome do arquivo
            name = p.name
            if name.endswith("_1d.parquet"):
                tk = name[:-len("_1d.parquet")].replace("_sa"," ").replace("_"," ").strip().replace(" ", "").upper()
            else:
                tk = name.upper()
            df["ticker"] = tk
        # checar duplicatas
        dup = int(df.duplicated(subset=["ticker","date"], keep=False).sum()) if not df.empty else 0
        if dup > 0:
            problems.append(f"[DUP] {p.name} duplicatas={dup}")
        # resumo
        rows.append({
            "file": p.name,
            "path": str(p),
            "ticker": str(df["ticker"].iloc[0]) if not df.empty else None,
            "rows": int(df.shape[0]),
            "date_min": None if df.empty else str(pd.to_datetime(df["date"]).min().date()),
            "date_max": None if df.empty else str(pd.to_datetime(df["date"]).max().date()),
            "has_datetime_sp": "datetime_sp" in df.columns,
            "dup_count": dup,
        })
    except Exception as e:
        problems.append(f"[ERRO] {p.name}: {e}")

mf = pd.DataFrame(rows).sort_values(["ticker"]).reset_index(drop=True)
mf.to_csv(OUT_MAN, index=False)
print("Manifesto salvo em:", OUT_MAN)
print("Arquivos:", len(mf), "Linhas totais:", int(mf["rows"].sum()) if not mf.empty else 0)
if not mf.empty:
    display(mf.head(10))
if problems:
    print("Problemas:")
    for x in problems[:100]:
        print(" ", x)
else:
    print("Sem problemas detectados.")

Manifesto salvo em: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_adequado\manifest_02_adequado.csv
Arquivos: 31 Linhas totais: 101298


Unnamed: 0,file,path,ticker,rows,date_min,date_max,has_datetime_sp,dup_count
0,abev3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,ABEV3.SA,3409,2012-01-02,2025-09-19,True,0
1,b3sa3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,B3SA3.SA,3409,2012-01-02,2025-09-19,True,0
2,bbas3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,BBAS3.SA,3409,2012-01-02,2025-09-19,True,0
3,cple6_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,CPLE6.SA,3408,2012-01-02,2025-09-19,True,0
4,csna3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,CSNA3.SA,3409,2012-01-02,2025-09-19,True,0
5,elet3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,ELET3.SA,3409,2012-01-02,2025-09-19,True,0
6,ggbr4_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,GGBR4.SA,3409,2012-01-02,2025-09-19,True,0
7,hapv3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,HAPV3.SA,1840,2018-04-26,2025-09-19,True,0
8,itub4_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,ITUB4.SA,3409,2012-01-02,2025-09-19,True,0
9,lren3_sa_1d.parquet,G:\Drives compartilhados\BOLSA_2026\a_bolsa202...,LREN3.SA,3409,2012-01-02,2025-09-19,True,0


Sem problemas detectados.


## Painéis wide 1D (curado) → MinIO

> Gera matrizes diárias (linhas = datas, colunas = 31 tickers) para métricas selecionadas a partir de `00_data/02_adequado` e salva em `00_data/minio_panels_stage`. Se variáveis `MINIO_*` estiverem definidas, faz upload para um bucket MinIO.

## Silver: panel_close (SSOT)

> Consolida todos os painéis de fechamento diário em uma única tabela ampla (`date` + colunas=tickers) e salva em `00_data/02_curado/panel_close.parquet`.

> Preferência de fonte:
- Se existirem CSVs por ano em `00_data/minio_panels_stage/panel_close_Y*.csv`, iremos concatenar todos (fonte: stage_csv).
- Caso contrário, geramos o painel a partir de `00_data/02_adequado` por pivot (fonte: from_02_adequado).

> Também gravamos um manifesto JSON com metadados (linhas, colunas, intervalo de datas, tickers, origem).

## Silver: panel_volume (SSOT)

> Consolida os painéis de volume diário em uma única tabela ampla (`date` + colunas=tickers) e salva em `00_data/02_curado/panel_volume.parquet`.

> Preferência de fonte:
- Se existirem CSVs por ano em `00_data/minio_panels_stage/panel_volume_Y*.csv`, concatenamos todos (fonte: stage_csv).
- Caso contrário, geramos o painel a partir de `00_data/02_adequado` por pivot (fonte: from_02_adequado).

> Também gravamos um manifesto JSON (linhas, colunas, intervalo de datas, tickers, origem).

In [23]:
# Construir e materializar o Silver SSOT: panel_volume.parquet
from pathlib import Path
import json
import pandas as pd

BASE_ADEQ = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_adequado")
STAGE = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/minio_panels_stage")
CURADO = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_curado")
CURADO.mkdir(parents=True, exist_ok=True)

out_parquet = CURADO / "panel_volume.parquet"
out_manifest = CURADO / "panel_volume_manifest.json"

# 1) Se houver CSVs por ano em STAGE, concatenar
csvs = sorted(STAGE.glob("panel_volume_Y*.csv"))
source = None
if csvs:
    pieces = []
    for p in csvs:
        df = pd.read_csv(p, parse_dates=[0])
        if df.columns[0].lower() in ("date", "data"):
            df = df.rename(columns={df.columns[0]: "date"})
        else:
            df = df.rename(columns={df.columns[0]: "date"})
        pieces.append(df)
    panel = pd.concat(pieces, ignore_index=True)
    panel["date"] = pd.to_datetime(panel["date"]).dt.normalize()
    panel = panel.drop_duplicates(subset=["date"]).sort_values("date").reset_index(drop=True)
    source = "stage_csv"
else:
    # 2) Caso contrário, construir a partir de 02_adequado
    files = sorted(BASE_ADEQ.glob("*.parquet"))
    if not files:
        raise FileNotFoundError("Sem arquivos em 02_adequado para construir o painel de volume.")
    all_rows = []
    for p in files:
        df = pd.read_parquet(p)
        ren = {"Date":"date","Open":"open","High":"high","Low":"low","Close":"close","Adj Close":"adj_close","Volume":"volume"}
        df = df.rename(columns=ren)
        # ticker
        if "ticker" not in df.columns or df["ticker"].isna().any():
            name = p.name
            if name.endswith("_1d.parquet"):
                tk = name[:-len("_1d.parquet")].replace("_sa"," ").replace("_"," ").strip().replace(" ", "").upper()
            else:
                tk = name.upper()
            df["ticker"] = tk
        # date -> datetime64[ns]
        if "datetime_sp" in df.columns:
            s = pd.to_datetime(df["datetime_sp"], errors="coerce")
            df["date"] = pd.to_datetime(df.get("date", s), errors="coerce").dt.normalize()
        else:
            df["date"] = pd.to_datetime(df.get("date"), errors="coerce").dt.normalize()
        keep = ["ticker","date","volume"]
        for c in keep:
            if c not in df.columns:
                df[c] = pd.NA
        all_rows.append(df[keep])
    tidy = pd.concat(all_rows, ignore_index=True)
    tidy = tidy.drop_duplicates(subset=["ticker","date"]).sort_values(["date","ticker"]) 
    panel = tidy.pivot(index="date", columns="ticker", values="volume").sort_index().reset_index()
    source = "from_02_adequado"

# Escrever Parquet
panel.to_parquet(out_parquet, index=False, engine="pyarrow", compression="snappy")

# Manifesto
meta = {
    "path": str(out_parquet),
    "rows": int(panel.shape[0]),
    "cols": int(panel.shape[1]),
    "date_min": None if panel.empty else str(pd.to_datetime(panel["date"]).min().date()),
    "date_max": None if panel.empty else str(pd.to_datetime(panel["date"]).max().date()),
    "tickers": [] if panel.empty else [c for c in panel.columns if c != "date"],
    "source": source,
}
with open(out_manifest, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Silver panel_volume salvo em:", out_parquet)
print("Manifesto:", out_manifest)
print("Linhas:", meta["rows"], "Colunas:", meta["cols"])

Silver panel_volume salvo em: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_curado\panel_volume.parquet
Manifesto: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_curado\panel_volume_manifest.json
Linhas: 3539 Colunas: 32


In [None]:
# Construir e materializar o Silver SSOT: panel_close.parquet
from pathlib import Path
import json
import pandas as pd

BASE_ADEQ = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_adequado")
STAGE = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/minio_panels_stage")
CURADO = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_curado")
CURADO.mkdir(parents=True, exist_ok=True)

out_parquet = CURADO / "panel_close.parquet"
out_manifest = CURADO / "panel_close_manifest.json"

# 1) Se houver CSVs por ano em STAGE, concatenar
csvs = sorted(STAGE.glob("panel_close_Y*.csv"))
source = None
if csvs:
    pieces = []
    for p in csvs:
        df = pd.read_csv(p, parse_dates=[0])
        # primeira coluna é data (index ou coluna)
        if df.columns[0].lower() in ("date", "data"):
            df = df.rename(columns={df.columns[0]: "date"})
        else:
            df = df.rename(columns={df.columns[0]: "date"})
        pieces.append(df)
    panel = pd.concat(pieces, ignore_index=True)
    panel["date"] = pd.to_datetime(panel["date"]).dt.normalize()  # datetime64[ns], meia-noite
    panel = panel.drop_duplicates(subset=["date"]).sort_values("date").reset_index(drop=True)
    source = "stage_csv"
else:
    # 2) Caso contrário, construir a partir de 02_adequado
    files = sorted(BASE_ADEQ.glob("*.parquet"))
    if not files:
        raise FileNotFoundError("Sem arquivos em 02_adequado para construir o painel.")
    all_rows = []
    for p in files:
        df = pd.read_parquet(p)
        ren = {"Date":"date","Open":"open","High":"high","Low":"low","Close":"close","Adj Close":"adj_close","Volume":"volume"}
        df = df.rename(columns=ren)
        # ticker
        if "ticker" not in df.columns or df["ticker"].isna().any():
            name = p.name
            if name.endswith("_1d.parquet"):
                tk = name[:-len("_1d.parquet")].replace("_sa"," ").replace("_"," ").strip().replace(" ", "").upper()
            else:
                tk = name.upper()
            df["ticker"] = tk
        # date -> datetime64[ns]
        if "datetime_sp" in df.columns:
            s = pd.to_datetime(df["datetime_sp"], errors="coerce")
            df["date"] = pd.to_datetime(df.get("date", s), errors="coerce").dt.normalize()
        else:
            df["date"] = pd.to_datetime(df.get("date"), errors="coerce").dt.normalize()
        keep = ["ticker","date","close"]
        for c in keep:
            if c not in df.columns:
                df[c] = pd.NA
        all_rows.append(df[keep])
    tidy = pd.concat(all_rows, ignore_index=True)
    tidy = tidy.drop_duplicates(subset=["ticker","date"]).sort_values(["date","ticker"]) 
    panel = tidy.pivot(index="date", columns="ticker", values="close").sort_index().reset_index()  # date + wide
    source = "from_02_adequado"

# Escrever Parquet (colunas= date + tickers) com pyarrow/snappy
panel.to_parquet(out_parquet, index=False, engine="pyarrow", compression="snappy")

# Manifesto
meta = {
    "path": str(out_parquet),
    "rows": int(panel.shape[0]),
    "cols": int(panel.shape[1]),
    "date_min": None if panel.empty else str(pd.to_datetime(panel["date"]).min().date()),
    "date_max": None if panel.empty else str(pd.to_datetime(panel["date"]).max().date()),
    "tickers": [] if panel.empty else [c for c in panel.columns if c != "date"],
    "source": source,
}
with open(out_manifest, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)

print("Silver panel_close salvo em:", out_parquet)
print("Manifesto:", out_manifest)
print("Linhas:", meta["rows"], "Colunas:", meta["cols"])

Silver panel_close salvo em: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_curado\panel_close.parquet
Manifesto: G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_curado\panel_close_manifest.json
Linhas: 3539 Colunas: 32


### Verificação dos SSOT (close/volume)

> Para cada ticker: primeira/última data com valor e contagem de NaNs; além de resumo global por arquivo.

In [26]:
from pathlib import Path
import pandas as pd

def summarize_ssot(parquet_path: str | Path) -> pd.DataFrame:
    p = Path(parquet_path)
    if not p.exists():
        raise FileNotFoundError(f"Arquivo não encontrado: {p}")
    df = pd.read_parquet(p)
    if "date" not in df.columns:
        raise ValueError("Coluna 'date' não encontrada no SSOT.")
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df = df.sort_values("date").reset_index(drop=True)

    tickers = [c for c in df.columns if c != "date"]
    total_rows = len(df)

    rows = []
    for tk in sorted(tickers):
        s = df[tk]
        non_null_mask = ~s.isna()
        if non_null_mask.any():
            first_date = df.loc[non_null_mask, "date"].iloc[0].date()
            last_date  = df.loc[non_null_mask, "date"].iloc[-1].date()
        else:
            first_date = None
            last_date  = None
        nan_count = int(s.isna().sum())
        coverage = 0.0 if total_rows == 0 else 100.0 * (total_rows - nan_count) / total_rows
        rows.append({
            "ticker": tk,
            "date_first_non_null": None if first_date is None else str(first_date),
            "date_last_non_null":  None if last_date  is None else str(last_date),
            "nan_count": nan_count,
            "total_rows": total_rows,
            "coverage_pct": round(coverage, 2),
        })

    out = pd.DataFrame(rows).sort_values("ticker").reset_index(drop=True)
    print("Datas SSOT (globais):", str(df["date"].min().date()), "→", str(df["date"].max().date()))
    print("Tickers:", len(tickers), "| Linhas totais:", total_rows)
    return out

BASE = Path(r"G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_curado")
summary_close = summarize_ssot(BASE / "panel_close.parquet")
summary_volume = summarize_ssot(BASE / "panel_volume.parquet")

print("\nClose — primeiras linhas:")
display(summary_close.head(10))

print("\nVolume — primeiras linhas:")
display(summary_volume.head(10))

Datas SSOT (globais): 2012-01-02 → 2025-10-01
Tickers: 31 | Linhas totais: 3547
Datas SSOT (globais): 2012-01-02 → 2025-10-01
Tickers: 31 | Linhas totais: 3547

Close — primeiras linhas:


Unnamed: 0,ticker,date_first_non_null,date_last_non_null,nan_count,total_rows,coverage_pct
0,ABEV3.SA,2012-01-02,2025-10-01,132,3547,96.28
1,B3SA3.SA,2012-01-02,2025-10-01,131,3547,96.31
2,BBAS3.SA,2012-01-02,2025-10-01,131,3547,96.31
3,CPLE6.SA,2012-01-02,2025-10-01,133,3547,96.25
4,CSNA3.SA,2012-01-02,2025-10-01,132,3547,96.28
5,ELET3.SA,2012-01-02,2025-10-01,132,3547,96.28
6,GGBR4.SA,2012-01-02,2025-10-01,132,3547,96.28
7,HAPV3.SA,2018-04-26,2025-10-01,1701,3547,52.04
8,ITUB4.SA,2012-01-02,2025-10-01,131,3547,96.31
9,LREN3.SA,2012-01-02,2025-10-01,131,3547,96.31



Volume — primeiras linhas:


Unnamed: 0,ticker,date_first_non_null,date_last_non_null,nan_count,total_rows,coverage_pct
0,ABEV3.SA,2012-01-02,2025-10-01,132,3547,96.28
1,B3SA3.SA,2012-01-02,2025-10-01,131,3547,96.31
2,BBAS3.SA,2012-01-02,2025-10-01,131,3547,96.31
3,CPLE6.SA,2012-01-02,2025-10-01,133,3547,96.25
4,CSNA3.SA,2012-01-02,2025-10-01,132,3547,96.28
5,ELET3.SA,2012-01-02,2025-10-01,132,3547,96.28
6,GGBR4.SA,2012-01-02,2025-10-01,132,3547,96.28
7,HAPV3.SA,2018-04-26,2025-10-01,1701,3547,52.04
8,ITUB4.SA,2012-01-02,2025-10-01,131,3547,96.31
9,LREN3.SA,2012-01-02,2025-10-01,131,3547,96.31
