# Montar ou identificar os diretórios locais

In [2]:
# Montar Google Drive (Colab) ou detectar Google Drive Desktop (local)
# - Em Colab: monta em /content/drive
# - Local (Windows): usa Google Drive for Desktop (ex.: G:\Drives compartilhados)

import os
from pathlib import Path

IN_COLAB = False
try:
    from google.colab import drive as _gdrive  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

DRIVE_ROOT = None  # caminho raiz do Drive

if IN_COLAB:
    # Monta o Google Drive no Colab
    _gdrive.mount('/content/drive', force_remount=False)
    DRIVE_ROOT = '/content/drive'
    print(f"[OK] Google Drive montado no Colab em: {DRIVE_ROOT}")
else:
    # Detecta Google Drive for Desktop no Windows
    candidates = [
        r'G:\\Drives compartilhados',   # PT-BR: Shared drives
        r'G:\\Shared drives',           # EN: Shared drives
        r'G:\\My Drive',                # EN: Meu Drive (conta pessoal)
        os.path.expandvars(r'%USERPROFILE%\\Google Drive'),  # legado
        os.path.expandvars(r'%USERPROFILE%\\Google Drive (Shared drives)'),
    ]
    DRIVE_ROOT = next((p for p in candidates if os.path.exists(p)), None)
    if DRIVE_ROOT:
        print(f"[OK] Google Drive detectado localmente em: {DRIVE_ROOT}")
    else:
        print('[AVISO] Google Drive não encontrado automaticamente.\n'
              '       Verifique se o Google Drive for Desktop está instalado e o caminho correto (ex.: G:\\Drives compartilhados).')

# Variáveis úteis para uso posterior
SHARED_DRIVES = str(Path(DRIVE_ROOT) if DRIVE_ROOT else '')
print('IN_COLAB =', IN_COLAB)
print('DRIVE_ROOT =', DRIVE_ROOT)
print('SHARED_DRIVES =', SHARED_DRIVES)

[OK] Google Drive detectado localmente em: G:\\Drives compartilhados
IN_COLAB = False
DRIVE_ROOT = G:\\Drives compartilhados
SHARED_DRIVES = G:\Drives compartilhados


In [3]:
# Verificar parquet em 00_data/01_raw e resumir esquema
import os
from pathlib import Path
import json

RAW_DIR = Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/01_raw')
assert RAW_DIR.exists(), f"Diretório não encontrado: {RAW_DIR}"

# Garantir pyarrow para leitura robusta
try:
    import pyarrow.parquet as pq
    import pyarrow as pa
except Exception:
    import sys, subprocess
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'pyarrow'])
    import pyarrow.parquet as pq
    import pyarrow as pa

from collections import Counter, defaultdict

files = sorted([p for p in RAW_DIR.glob('*.parquet')])
print(f"Encontrados {len(files)} arquivos parquet em {RAW_DIR}")

schemas = {}
errors = {}
row_counts = {}
example_values = defaultdict(dict)

for p in files:
    try:
        meta = pq.ParquetFile(p)
        schema = meta.schema_arrow
        schemas[p.name] = schema
        row_counts[p.name] = meta.metadata.num_rows
        # coletar alguns valores de exemplo das primeiras 5 linhas
        table = meta.read_row_groups([0], columns=[f.name for f in schema]) if meta.metadata.num_row_groups>0 else meta.read()
        head = table.slice(0, min(5, table.num_rows))
        for f in schema:
            col = head.column(f.name) if f.name in head.column_names else None
            if col is not None:
                vals = col.to_pylist()
                example_values[p.name][f.name] = vals
    except Exception as e:
        errors[p.name] = str(e)

# Determinar colunas comuns e tipos
all_columns = [set(s.names) for s in schemas.values()]
common_cols = set.intersection(*all_columns) if all_columns else set()

# mapa de tipos por coluna (contagem)
col_type_counter = defaultdict(Counter)
for fname, schema in schemas.items():
    for f in schema:
        col_type_counter[f.name][str(f.type)] += 1

print("\nErros de leitura:")
if errors:
    for k,v in errors.items():
        print(f"- {k}: {v}")
else:
    print("Nenhum erro")

print("\nColunas comuns (presentes em todos):")
print(sorted(common_cols))

print("\nTipos por coluna (top-1):")
for col, counter in sorted(col_type_counter.items()):
    top = counter.most_common(1)[0]
    print(f"- {col}: {top[0]} (freq {top[1]}/{len(schemas)})")

print("\nAmostras de valores (primeiras 5 linhas) para 2 arquivos:")
for fname in list(schemas.keys())[:2]:
    print(f"\n== {fname} (linhas={row_counts.get(fname)}) ==")
    samples = example_values.get(fname, {})
    for col, vals in list(samples.items())[:10]:
        print(f"  {col}: {vals}")

Encontrados 31 arquivos parquet em G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\01_raw

Erros de leitura:
Nenhum erro

Colunas comuns (presentes em todos):
['Adj Close', 'Close', 'Date', 'Dividends', 'High', 'Low', 'Open', 'Stock Splits', 'Volume']

Tipos por coluna (top-1):
- Adj Close: double (freq 31/31)
- Capital Gains: double (freq 1/31)
- Close: double (freq 31/31)
- Date: timestamp[ns, tz=America/Sao_Paulo] (freq 25/31)
- Dividends: double (freq 31/31)
- High: double (freq 31/31)
- Low: double (freq 31/31)
- Open: double (freq 31/31)
- Stock Splits: double (freq 31/31)
- Volume: int64 (freq 31/31)

Amostras de valores (primeiras 5 linhas) para 2 arquivos:

== _bvsp_ohlcv_actions_20120101_20250922.parquet (linhas=3400) ==
  Open: [57836.0, 59263.0, 59354.0, 58565.0, 58601.0]
  High: [59288.0, 59519.0, 59354.0, 59261.0, 59220.0]
  Low: [57836.0, 58558.0, 57963.0, 58355.0, 58599.0]
  Close: [59265.0, 59365.0, 58546.0, 58600.0, 59083.0]
  Adj Close: [59265.0, 59365

In [4]:
# Resumo compacto (JSON) das estruturas parquet
import json, re

def top_types_for(cols):
    out = {}
    for c in cols:
        cnt = col_type_counter.get(c, None)
        if cnt:
            t, n = cnt.most_common(1)[0]
            out[c] = {"type": t, "freq": n, "total": len(schemas)}
    return out

summary = {}
summary["file_count"] = len(files)
summary["error_count"] = len(errors)
summary["error_files"] = sorted(list(errors.keys()))[:5]
summary["has_ohlcv_actions_pattern"] = sum(1 for p in files if "_ohlcv_actions_" in p.name)

names = [p.name for p in files]
summary["sample_files"] = names[:5]

# Colunas comuns e tipos
cc = sorted(list(common_cols))
summary["common_columns"] = cc
summary["common_columns_types"] = top_types_for(cc)

# Amostra de colunas de 2 arquivos
sample_cols = {}
for fname in names[:2]:
    sch = schemas.get(fname)
    if sch:
        sample_cols[fname] = sch.names
summary["sample_columns_by_file"] = sample_cols

# Padrões por nome: conta tags tipo '_sa_', '_nyb_', etc
patterns = {
    "_sa_": sum(1 for n in names if "_sa_" in n),
    "_nyb_": sum(1 for n in names if "_nyb_" in n),
    "_metadata_": sum(1 for n in names if "_metadata_" in n),
    "_ohlcv_actions_": sum(1 for n in names if "_ohlcv_actions_" in n),
}
summary["name_patterns_counts"] = patterns

print(json.dumps(summary, ensure_ascii=False, indent=2))

{
  "file_count": 31,
  "error_count": 0,
  "error_files": [],
  "has_ohlcv_actions_pattern": 31,
  "sample_files": [
    "_bvsp_ohlcv_actions_20120101_20250922.parquet",
    "_gspc_ohlcv_actions_20120101_20250922.parquet",
    "_tnx_ohlcv_actions_20120101_20250922.parquet",
    "_vix_ohlcv_actions_20120101_20250922.parquet",
    "abev3_sa_ohlcv_actions_20120101_20250922.parquet"
  ],
  "common_columns": [
    "Adj Close",
    "Close",
    "Date",
    "Dividends",
    "High",
    "Low",
    "Open",
    "Stock Splits",
    "Volume"
  ],
  "common_columns_types": {
    "Adj Close": {
      "type": "double",
      "freq": 31,
      "total": 31
    },
    "Close": {
      "type": "double",
      "freq": 31,
      "total": 31
    },
    "Date": {
      "type": "timestamp[ns, tz=America/Sao_Paulo]",
      "freq": 25,
      "total": 31
    },
    "Dividends": {
      "type": "double",
      "freq": 31,
      "total": 31
    },
    "High": {
      "type": "double",
      "freq": 31,
      "tot

In [6]:
# Ajustar timezone para America/Sao_Paulo nos 6 arquivos divergentes e salvar em 00_data/02_processed
import os
from pathlib import Path
import pandas as pd

RAW_DIR = Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/01_raw')
PROC_DIR = Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_processed')
PROC_DIR.mkdir(parents=True, exist_ok=True)

# Utilizar resultados computados previamente (schemas, files) se existirem no kernel; caso contrário, recomputar leve
try:
    files
    schemas
except NameError:
    from collections import Counter, defaultdict
    import pyarrow.parquet as pq
    files = sorted([p for p in RAW_DIR.glob('*.parquet')])
    schemas = {}
    for p in files:
        meta = pq.ParquetFile(p)
        schemas[p.name] = meta.schema_arrow

# Identificar quais não têm tz America/Sao_Paulo
not_sp = []
for p in files:
    sch = schemas.get(p.name)
    if not sch:
        continue
    if 'Date' in sch.names:
        f = next((f for f in sch if f.name == 'Date'), None)
        t = str(f.type) if f else ''
        if 'timestamp' in t and 'tz=America/Sao_Paulo' in t:
            continue
        if 'timestamp' in t:
            not_sp.append(p)
    else:
        # Pode estar como index no parquet (sem coluna explícita)
        not_sp.append(p)

print(f"Arquivos a ajustar (esperados 6): {len(not_sp)}")
for p in not_sp:
    print('-', p.name)

# Função de ajuste: garantir Date com tz America/Sao_Paulo
import pytz
sp_tz = pytz.timezone('America/Sao_Paulo')

fixed = []
for p in not_sp:
    df = pd.read_parquet(p)  # pandas com pyarrow engine

    # Descobrir a coluna/índice de data
    date_col = None
    for cand in ['Date', 'date', 'DATE', 'Datetime', 'datetime', 'Timestamp', 'timestamp']:
        if cand in df.columns:
            date_col = cand
            break
    if date_col is None:
        # Tentar índice
        if isinstance(df.index, pd.DatetimeIndex):
            df = df.reset_index().rename(columns={'index':'Date'})
            date_col = 'Date'
        else:
            print(f"[SKIP] Sem coluna/índice de data reconhecida: {p.name}")
            continue

    d = pd.to_datetime(df[date_col], errors='coerce', utc=False)

    if pd.api.types.is_datetime64tz_dtype(d):
        # já tem tz, converter
        df[date_col] = d.dt.tz_convert('America/Sao_Paulo')
    else:
        # naive -> assumir horário local SP
        df[date_col] = d.dt.tz_localize('America/Sao_Paulo')

    # Garantir nome padronizado 'Date'
    if date_col != 'Date':
        df = df.rename(columns={date_col: 'Date'})

    out_path = PROC_DIR / p.name
    df.to_parquet(out_path, engine='pyarrow', index=False)
    fixed.append(out_path)

print(f"Ajustados e salvos: {len(fixed)} em {PROC_DIR}")
for q in fixed:
    print('  ->', q.name)

# Verificar tipos após escrita
import pyarrow.parquet as pq
ok = 0
for q in fixed:
    meta = pq.ParquetFile(q)
    sch = meta.schema_arrow
    f = next((f for f in sch if f.name == 'Date'), None)
    if f and 'tz=America/Sao_Paulo' in str(f.type):
        ok += 1
print(f"Verificação: {ok}/{len(fixed)} com Date em tz=America/Sao_Paulo")

Arquivos a ajustar (esperados 6): 6
- _gspc_ohlcv_actions_20120101_20250922.parquet
- _tnx_ohlcv_actions_20120101_20250922.parquet
- _vix_ohlcv_actions_20120101_20250922.parquet
- bz=f_ohlcv_actions_20120101_20250922.parquet
- dx-y.nyb_ohlcv_actions_20120101_20250922.parquet
- ewz_ohlcv_actions_20120101_20250922.parquet
Ajustados e salvos: 6 em G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_processed
  -> _gspc_ohlcv_actions_20120101_20250922.parquet
  -> _tnx_ohlcv_actions_20120101_20250922.parquet
  -> _vix_ohlcv_actions_20120101_20250922.parquet
  -> bz=f_ohlcv_actions_20120101_20250922.parquet
  -> dx-y.nyb_ohlcv_actions_20120101_20250922.parquet
  -> ewz_ohlcv_actions_20120101_20250922.parquet


  if pd.api.types.is_datetime64tz_dtype(d):
  if pd.api.types.is_datetime64tz_dtype(d):
  if pd.api.types.is_datetime64tz_dtype(d):
  if pd.api.types.is_datetime64tz_dtype(d):
  if pd.api.types.is_datetime64tz_dtype(d):
  if pd.api.types.is_datetime64tz_dtype(d):


Verificação: 6/6 com Date em tz=America/Sao_Paulo


In [7]:
# Padronizar camada SILVER: schema consistente, tz America/Sao_Paulo, coluna Symbol, salvar em 00_data/02_processed
from pathlib import Path
import re
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

RAW_DIR = Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/01_raw')
SILVER_DIR = Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_processed')
SILVER_DIR.mkdir(parents=True, exist_ok=True)

# Lista alvo de colunas na ordem final
TARGET_COLS = ['Symbol','Date','Open','High','Low','Close','Adj Close','Volume','Dividends','Stock Splits']

# Extrair símbolo a partir do nome do arquivo
# Exemplos: "abev3_sa_ohlcv_actions_...", "_gspc_ohlcv_actions_..."
# Regra: pegar token inicial até o primeiro "_ohlcv_actions_"; remover prefixo '_' se houver; manter sufixo de mercado (ex.: _sa_)
SYMBOL_RE = re.compile(r"^(.*?)_ohlcv_actions_", re.IGNORECASE)


def infer_symbol(fname: str) -> str:
    m = SYMBOL_RE.match(fname)
    raw = m.group(1) if m else fname.split('.')[0]
    # normalizar: tirar duplicidade de underscores, remover prefixo '_'
    raw = raw.lstrip('_')
    return raw.lower()


def ensure_tz_sp(s: pd.Series) -> pd.Series:
    s = pd.to_datetime(s, errors='coerce', utc=False)
    if isinstance(s.dtype, pd.DatetimeTZDtype):
        return s.dt.tz_convert('America/Sao_Paulo')
    else:
        return s.dt.tz_localize('America/Sao_Paulo')


files = sorted(RAW_DIR.glob('*.parquet'))
print(f"Total arquivos a processar: {len(files)}")

written = []
for p in files:
    df = pd.read_parquet(p)

    # Descobrir/Padronizar Date
    date_col = None
    for cand in ['Date','date','DATE','Datetime','datetime','Timestamp','timestamp']:
        if cand in df.columns:
            date_col = cand
            break
    if date_col is None and isinstance(df.index, pd.DatetimeIndex):
        df = df.reset_index().rename(columns={'index':'Date'})
        date_col = 'Date'
    if date_col is None:
        print(f"[SKIP] {p.name}: sem coluna/índice de data.")
        continue

    # Tipos numéricos / faltantes
    for col in ['Open','High','Low','Close','Adj Close','Volume','Dividends','Stock Splits']:
        if col in df.columns:
            if col == 'Volume':
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')  # permitir NA, salva como int64 no parquet
            else:
                df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')
        else:
            # coluna ausente vira NA
            df[col] = pd.Series([pd.NA]*len(df)) if col=='Volume' else pd.Series([float('nan')]*len(df))

    # Garantir timezone America/Sao_Paulo
    df[date_col] = ensure_tz_sp(df[date_col])
    if date_col != 'Date':
        df = df.rename(columns={date_col: 'Date'})

    # Adicionar Symbol
    symbol = infer_symbol(p.name)
    df.insert(0, 'Symbol', symbol)

    # Reordenar colunas
    missing = [c for c in TARGET_COLS if c not in df.columns]
    for c in missing:
        if c == 'Volume' and c not in df.columns:
            df[c] = pd.Series([pd.NA]*len(df))
        elif c not in df.columns:
            df[c] = float('nan')
    df = df[TARGET_COLS]

    # Escrever com pyarrow, mantendo tz na Date
    out = SILVER_DIR / p.name
    df.to_parquet(out, engine='pyarrow', index=False)
    written.append(out)

print(f"Gravados {len(written)} arquivos em {SILVER_DIR}")

# Verificação: todos com schema idêntico e tz correta
schemas = []
for q in written:
    meta = pq.ParquetFile(q)
    sch = meta.schema_arrow
    schemas.append((q.name, sch))

# verificar nomes/ordem das colunas
ok_cols = all(s.names == TARGET_COLS for _, s in schemas)
# verificar tz da Date
ok_tz = all('tz=America/Sao_Paulo' in str(next(f for f in s if f.name=='Date').type) for _, s in schemas)

print('OK_COLS =', ok_cols)
print('OK_TZ_SP =', ok_tz)
if not ok_cols:
    bad = [n for n,s in schemas if s.names != TARGET_COLS]
    print('Arquivos com colunas fora do padrão:', bad[:5])
if not ok_tz:
    bad = []
    for n, s in schemas:
        f = next((f for f in s if f.name=='Date'), None)
        if 'tz=America/Sao_Paulo' not in str(f.type):
            bad.append(n)
    print('Arquivos com timezone fora do padrão:', bad[:5])

Total arquivos a processar: 31
Gravados 31 arquivos em G:\Drives compartilhados\BOLSA_2026\a_bolsa2026_gemini\00_data\02_processed
OK_COLS = True
OK_TZ_SP = True


In [8]:
# Mostrar dados de PETR4 em setembro/2022 (camada silver)
from pathlib import Path
import pandas as pd

# Montar caminho para 02_processed usando SHARED_DRIVES se disponível
try:
    base = Path(SHARED_DRIVES) / 'BOLSA_2026' / 'a_bolsa2026_gemini'
    proc_dir = base / '00_data' / '02_processed'
    assert proc_dir.exists()
except Exception:
    proc_dir = Path(r'G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_processed')

# Localizar arquivo silver de PETR4
petr4_file = next(proc_dir.glob('petr4_sa_ohlcv_actions_*.parquet'))

# Ler dados
df = pd.read_parquet(petr4_file)

# Garantir Date com tz America/Sao_Paulo (deveria já estar, mas deixamos robusto)
if not isinstance(df['Date'].dtype, pd.DatetimeTZDtype):
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.tz_localize('America/Sao_Paulo')

# Filtro de setembro de 2022 (intervalo fechado-aberto)
start = pd.Timestamp('2022-09-01', tz='America/Sao_Paulo')
end = pd.Timestamp('2022-10-01', tz='America/Sao_Paulo')
mask = (df['Date'] >= start) & (df['Date'] < end)
sep22 = df.loc[mask].sort_values('Date').reset_index(drop=True)

print('Arquivo:', petr4_file.name)
print('Período:', start.date(), 'a', (end - pd.Timedelta(seconds=1)).date())
print('Linhas:', len(sep22))
sep22

Arquivo: petr4_sa_ohlcv_actions_20120101_20250922.parquet
Período: 2022-09-01 a 2022-09-30
Linhas: 21


Unnamed: 0,Symbol,Date,Open,High,Low,Close,Adj Close,Volume,Dividends,Stock Splits
0,petr4_sa,2022-09-01 00:00:00-03:00,33.360001,33.950001,32.759998,33.849998,18.471289,69184700,0.0,0.0
1,petr4_sa,2022-09-02 00:00:00-03:00,34.439999,34.57,33.259998,33.419998,18.236647,81792300,0.0,0.0
2,petr4_sa,2022-09-05 00:00:00-03:00,33.849998,34.110001,33.150002,33.34,18.192995,76337600,0.0,0.0
3,petr4_sa,2022-09-06 00:00:00-03:00,32.27,32.59,31.51,32.099998,17.516346,111813500,0.0,0.0
4,petr4_sa,2022-09-08 00:00:00-03:00,32.310001,32.779999,31.4,31.799999,17.352642,63159800,0.0,0.0
5,petr4_sa,2022-09-09 00:00:00-03:00,32.490002,32.669998,31.700001,31.790001,17.347187,49871900,0.0,0.0
6,petr4_sa,2022-09-12 00:00:00-03:00,32.150002,32.73,31.459999,31.48,17.178028,71374500,0.0,0.0
7,petr4_sa,2022-09-13 00:00:00-03:00,30.940001,31.440001,30.52,30.65,16.725109,88327800,0.0,0.0
8,petr4_sa,2022-09-14 00:00:00-03:00,30.75,31.43,30.629999,31.120001,16.981581,49039700,0.0,0.0
9,petr4_sa,2022-09-15 00:00:00-03:00,30.92,31.23,30.799999,31.059999,16.948839,45553200,0.0,0.0


## Banco de Dados (Postgres/SQLite) – Camada Silver → SQL

Vamos criar um schema SQL para persistir os dados deduplicados e com timezone America/Sao_Paulo, e carregar todos os 31 Parquets da pasta 02_processed.

In [9]:
# Configuração de conexão (usa Postgres via env DATABASE_URL; se não setado, cai para SQLite local)
import os
from pathlib import Path

# Ex.: DATABASE_URL=postgresql+psycopg2://user:pass@host:5432/dbname
DB_URL = os.getenv('DATABASE_URL')
if not DB_URL:
    # fallback local para validação
    DB_URL = 'sqlite:///G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/03_final/ohlcv.db'
    Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/03_final').mkdir(parents=True, exist_ok=True)

print('DB_URL =', DB_URL)

DB_URL = sqlite:///G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/03_final/ohlcv.db


In [10]:
# Criar engine SQLAlchemy e schema da tabela
import sqlalchemy as sa
from sqlalchemy import text

engine = sa.create_engine(DB_URL, future=True)

with engine.begin() as conn:
    # Postgres: timestamptz; SQLite: armazena texto ISO8601 com tz ou timestamp naive (vamos manter texto ISO para portabilidade)
    # Chave primária: (symbol, date)
    conn.execute(text("""
    CREATE TABLE IF NOT EXISTS ohlcv_daily (
        symbol TEXT NOT NULL,
        date   TEXT NOT NULL,
        open   DOUBLE PRECISION,
        high   DOUBLE PRECISION,
        low    DOUBLE PRECISION,
        close  DOUBLE PRECISION,
        adj_close DOUBLE PRECISION,
        volume BIGINT,
        dividends DOUBLE PRECISION,
        stock_splits DOUBLE PRECISION,
        PRIMARY KEY (symbol, date)
    )
    """))

print('Tabela ohlcv_daily pronta.')

Tabela ohlcv_daily pronta.


In [12]:
# Carregar todos os Parquets de 02_processed para o banco (upsert deduplicado)
from pathlib import Path
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import text

SILVER_DIR = Path('G:/Drives compartilhados/BOLSA_2026/a_bolsa2026_gemini/00_data/02_processed')
files = sorted(SILVER_DIR.glob('*.parquet'))
print('Arquivos a carregar:', len(files))

dialect = sa.create_engine(DB_URL).dialect.name
is_pg = (dialect == 'postgresql')

# Preparar comandos de upsert específicos por SGBD
if is_pg:
    upsert_sql = text("""
        INSERT INTO ohlcv_daily (symbol, date, open, high, low, close, adj_close, volume, dividends, stock_splits)
        VALUES (:symbol, :date, :open, :high, :low, :close, :adj_close, :volume, :dividends, :stock_splits)
        ON CONFLICT (symbol, date) DO UPDATE SET
            open=EXCLUDED.open,
            high=EXCLUDED.high,
            low=EXCLUDED.low,
            close=EXCLUDED.close,
            adj_close=EXCLUDED.adj_close,
            volume=EXCLUDED.volume,
            dividends=EXCLUDED.dividends,
            stock_splits=EXCLUDED.stock_splits
    """)
else:
    # SQLite
    upsert_sql = text("""
        INSERT OR REPLACE INTO ohlcv_daily (symbol, date, open, high, low, close, adj_close, volume, dividends, stock_splits)
        VALUES (:symbol, :date, :open, :high, :low, :close, :adj_close, :volume, :dividends, :stock_splits)
    """)

engine = sa.create_engine(DB_URL, future=True)

def to_rows(df: pd.DataFrame):
    # Garantir colunas e tipos
    cols = ['Symbol','Date','Open','High','Low','Close','Adj Close','Volume','Dividends','Stock Splits']
    for c in cols:
        if c not in df.columns:
            df[c] = pd.NA
    # Timezone: persistir ISO 8601 com offset (-03:00) mantendo America/Sao_Paulo
    dt = df['Date']
    if not isinstance(dt.dtype, pd.DatetimeTZDtype):
        dt = pd.to_datetime(dt, errors='coerce').dt.tz_localize('America/Sao_Paulo')
    else:
        dt = dt.dt.tz_convert('America/Sao_Paulo')
    date_iso = dt.apply(lambda x: x.isoformat() if pd.notnull(x) else None)

    # Deduplicar por (Symbol, Date)
    tmp = df.copy()
    tmp['__date_iso__'] = date_iso
    tmp = tmp.dropna(subset=['__date_iso__'])
    tmp = tmp.sort_values(['Symbol','__date_iso__']).drop_duplicates(['Symbol','__date_iso__'], keep='last')

    # Construir DataFrame final com nomes de colunas do SQL
    out = pd.DataFrame({
        'symbol': tmp['Symbol'].astype(str).str.lower(),
        'date': tmp['__date_iso__'],
        'open': pd.to_numeric(tmp['Open'], errors='coerce'),
        'high': pd.to_numeric(tmp['High'], errors='coerce'),
        'low': pd.to_numeric(tmp['Low'], errors='coerce'),
        'close': pd.to_numeric(tmp['Close'], errors='coerce'),
        'adj_close': pd.to_numeric(tmp['Adj Close'], errors='coerce'),
        'volume': pd.to_numeric(tmp['Volume'], errors='coerce').astype('Int64'),
        'dividends': pd.to_numeric(tmp['Dividends'], errors='coerce'),
        'stock_splits': pd.to_numeric(tmp['Stock Splits'], errors='coerce'),
    })

    # Converter NaN/<NA> para None (para bind SQL)
    out = out.astype(object).where(pd.notnull(out), None)
    return out.to_dict(orient='records')

loaded = 0
with engine.begin() as conn:
    for p in files:
        df = pd.read_parquet(p)
        # Assegurar presença da coluna Symbol no silver
        if 'Symbol' not in df.columns:
            # Inferir do nome, fallback
            sym = p.name.split('_ohlcv_actions_')[0].lstrip('_').lower()
            df.insert(0, 'Symbol', sym)
        rows = to_rows(df)
        if rows:
            conn.execute(upsert_sql, rows)
            loaded += len(rows)
        print(f"{p.name}: upsert {len(rows)} linhas")

print('Total de linhas upsertadas:', loaded)

Arquivos a carregar: 31
_bvsp_ohlcv_actions_20120101_20250922.parquet: upsert 3400 linhas
_gspc_ohlcv_actions_20120101_20250922.parquet: upsert 3449 linhas
_tnx_ohlcv_actions_20120101_20250922.parquet: upsert 3448 linhas
_vix_ohlcv_actions_20120101_20250922.parquet: upsert 3449 linhas
abev3_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3409 linhas
b3sa3_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3409 linhas
bbas3_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3409 linhas
bz=f_ohlcv_actions_20120101_20250922.parquet: upsert 3432 linhas
cple6_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3408 linhas
csna3_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3409 linhas
dx-y.nyb_ohlcv_actions_20120101_20250922.parquet: upsert 3450 linhas
elet3_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3409 linhas
ewz_ohlcv_actions_20120101_20250922.parquet: upsert 3449 linhas
ggbr4_sa_ohlcv_actions_20120101_20250922.parquet: upsert 3409 linhas
hapv3_sa_ohlcv_actions_20120101_2

In [13]:
# Verificações rápidas no banco: contagem total e amostra por símbolo
import sqlalchemy as sa
from sqlalchemy import text

engine = sa.create_engine(DB_URL, future=True)
with engine.begin() as conn:
    total = conn.execute(text('SELECT COUNT(*) FROM ohlcv_daily')).scalar_one()
    print('Total linhas:', total)
    rows = conn.execute(text('SELECT symbol, MIN(date), MAX(date), COUNT(*) FROM ohlcv_daily GROUP BY symbol ORDER BY symbol LIMIT 10')).fetchall()
    for r in rows:
        print(r)

Total linhas: 101298
('abev3_sa', '2012-01-02T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3409)
('b3sa3_sa', '2012-01-02T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3409)
('bbas3_sa', '2012-01-02T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3409)
('bvsp', '2012-01-03T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3400)
('bz=f', '2012-01-03T03:00:00-02:00', '2025-09-19T01:00:00-03:00', 3432)
('cple6_sa', '2012-01-02T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3408)
('csna3_sa', '2012-01-02T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3409)
('dx-y.nyb', '2012-01-03T03:00:00-02:00', '2025-09-19T01:00:00-03:00', 3450)
('elet3_sa', '2012-01-02T00:00:00-02:00', '2025-09-19T00:00:00-03:00', 3409)
('ewz', '2012-01-03T03:00:00-02:00', '2025-09-19T01:00:00-03:00', 3449)
