# Debug do Extract (yfinance)

Este notebook replica o fluxo de `functions/extract.py` para debugar problemas do `yfinance` (retorno vazio, bloqueio/HTML, `No timezone found`, etc.).

Obs.: aqui roda localmente (seu PC). O comportamento em AWS pode variar por IP/rede/bloqueios do Yahoo.

In [1]:
# (Opcional) Instale dependências se necessário
# !pip install -r ../requirements.txt

import os
import platform
import sys
import tempfile
import time
from datetime import datetime, timedelta

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests
import yfinance as yf
from dateutil.relativedelta import relativedelta

print('Python:', sys.version)
print('Platform:', platform.platform())
print('yfinance:', getattr(yf, '__version__', 'unknown'))
print('pandas:', pd.__version__)
print('requests:', requests.__version__)
print('pyarrow:', pa.__version__)

Python: 3.13.0 (tags/v3.13.0:60403a5, Oct  7 2024, 09:38:07) [MSC v.1941 64 bit (AMD64)]
Platform: Windows-11-10.0.26200-SP0
yfinance: 1.0
pandas: 2.3.3
requests: 2.32.5
pyarrow: 22.0.0


In [10]:
# Fix de SSL/CA no Windows (especialmente em paths com acentos/OneDrive ou redes corporativas).
# Objetivo: fazer requests/curl_cffi conseguirem validar HTTPS nos endpoints do Yahoo.

import shutil
import tempfile
from pathlib import Path

import certifi

src_ca = Path(certifi.where())
dst_ca = Path(tempfile.gettempdir()) / "certifi-cacert.pem"
try:
    shutil.copyfile(src_ca, dst_ca)
    os.environ["SSL_CERT_FILE"] = str(dst_ca)
    os.environ["REQUESTS_CA_BUNDLE"] = str(dst_ca)
    os.environ["CURL_CA_BUNDLE"] = str(dst_ca)
    print("CA bundle copiado para:", dst_ca)
except Exception as e:
    print("Falha ao copiar CA bundle:", type(e).__name__, e)

# Smoke test (se falhar, você pode testar verify=False só para diagnóstico)
test_url = "https://query1.finance.yahoo.com/"
try:
    rr = requests.get(test_url, timeout=10)
    print("requests smoke:", rr.status_code, rr.headers.get("content-type"))
except Exception as e:
    print("requests smoke: FAIL ->", type(e).__name__, e)

CA bundle copiado para: C:\Users\WILLIA~1.MEN\AppData\Local\Temp\certifi-cacert.pem
requests smoke: FAIL -> SSLError HTTPSConnectionPool(host='query1.finance.yahoo.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1020)')))


## Configuração (datas/tickers)

In [2]:
# Mesmo cálculo do extract.py (D-1 e 6 meses atrás)
today = datetime.now()
end_date_obj = today - timedelta(days=1)
end_date = end_date_obj.strftime('%Y-%m-%d')
start_date_obj = end_date_obj - relativedelta(months=6)
start_date = start_date_obj.strftime('%Y-%m-%d')

tickers = ['TOTS3.SA', 'LWSA3.SA', 'POSI3.SA', 'INTB3.SA', 'WEGE3.SA']

print('start_date:', start_date)
print('end_date  :', end_date)
print('tickers   :', tickers)

start_date: 2025-07-11
end_date  : 2026-01-11
tickers   : ['TOTS3.SA', 'LWSA3.SA', 'POSI3.SA', 'INTB3.SA', 'WEGE3.SA']


## Health check de conectividade (Yahoo)

Aqui a ideia é só verificar se existe saída HTTPS e se o host responde. HTTP 401/403 ainda indica conectividade.

In [3]:
from urllib.error import HTTPError, URLError
from urllib.request import Request, urlopen

def check_outbound_https(url: str, timeout_seconds: float = 3.0):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        with urlopen(req, timeout=timeout_seconds) as resp:
            return True, f'HTTP {resp.status}'
    except HTTPError as e:
        return True, f'HTTPError {e.code}: {e.reason}'
    except URLError as e:
        return False, f'URLError: {getattr(e, "reason", str(e))}'
    except Exception as e:
        return False, f'Exception: {type(e).__name__}: {e}'

ok, detail = check_outbound_https('https://finance.yahoo.com/robots.txt', timeout_seconds=3.0)
print('Outbound HTTPS check (Yahoo):', 'ok=' + str(ok), 'detail=' + detail)

Outbound HTTPS check (Yahoo): ok=True detail=HTTP 200


In [11]:
# Diagnóstico: verificar se os endpoints do Yahoo que o yfinance usa estão respondendo JSON
# (às vezes robots.txt dá 200 mas as APIs de dados retornam bloqueio/HTML/vazio).

import json as _json

quote_url = "https://query1.finance.yahoo.com/v7/finance/quote?symbols=WEGE3.SA"
print('GET', quote_url)

def _try_parse_json(text: str, label: str):
    try:
        _ = _json.loads(text)
        print(label, 'json: OK')
    except Exception as e:
        print(label, 'json: FAIL ->', type(e).__name__, e)

# 1) requests (normal)
try:
    r = requests.get(quote_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'})
    print('requests:', r.status_code, r.headers.get('content-type'))
    print('requests body head:', repr(r.text[:200]))
    _try_parse_json(r.text, 'requests')
except Exception as e:
    print('requests: FAIL ->', type(e).__name__, e)
    print('requests: tentando verify=False (apenas diagnóstico!)')
    try:
        r = requests.get(quote_url, timeout=10, headers={'User-Agent': 'Mozilla/5.0'}, verify=False)
        print('requests (no-verify):', r.status_code, r.headers.get('content-type'))
        print('requests (no-verify) body head:', repr(r.text[:200]))
        _try_parse_json(r.text, 'requests (no-verify)')
    except Exception as e2:
        print('requests (no-verify): FAIL ->', type(e2).__name__, e2)

# 2) curl_cffi (muito mais parecido com o que o yfinance 1.0 usa)
try:
    from curl_cffi import requests as crequests
    rc = crequests.get(quote_url, timeout=10, impersonate='chrome')
    print('curl_cffi:', rc.status_code, rc.headers.get('content-type'))
    print('curl_cffi body head:', repr(rc.text[:200]))
    _try_parse_json(rc.text, 'curl_cffi')
except Exception as e:
    print('curl_cffi: FAIL ->', type(e).__name__, e)
    print('curl_cffi: tentando verify=False (apenas diagnóstico!)')
    try:
        rc = crequests.get(quote_url, timeout=10, impersonate='chrome', verify=False)
        print('curl_cffi (no-verify):', rc.status_code, rc.headers.get('content-type'))
        print('curl_cffi (no-verify) body head:', repr(rc.text[:200]))
        _try_parse_json(rc.text, 'curl_cffi (no-verify)')
    except Exception as e2:
        print('curl_cffi (no-verify): FAIL ->', type(e2).__name__, e2)

GET https://query1.finance.yahoo.com/v7/finance/quote?symbols=WEGE3.SA
requests: FAIL -> SSLError HTTPSConnectionPool(host='query1.finance.yahoo.com', port=443): Max retries exceeded with url: /v7/finance/quote?symbols=WEGE3.SA (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1020)')))
requests: tentando verify=False (apenas diagnóstico!)




requests (no-verify): 401 application/json;charset=utf-8
requests (no-verify) body head: '{"finance":{"result":null,"error":{"code":"Unauthorized","description":"User is unable to access this feature - https://bit.ly/yahoo-finance-api-feedback"}}}'
requests (no-verify) json: OK
curl_cffi: FAIL -> CertificateVerifyError Failed to perform, curl: (60) SSL certificate problem: unable to get local issuer certificate. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
curl_cffi: tentando verify=False (apenas diagnóstico!)
curl_cffi (no-verify): 429 text/html
curl_cffi (no-verify) body head: 'Too Many Requests\r\n'
curl_cffi (no-verify) json: FAIL -> JSONDecodeError Expecting value: line 1 column 1 (char 0)


## Download com User-Agent + fallback ticker-a-ticker

Isso tenta reduzir erros como `Expecting value: line 1 column 1` e `No timezone found` (geralmente resposta não-JSON/HTML/bloqueio).

In [7]:
def download_yfinance_with_fallback(tickers, start_date, end_date, max_attempts=3, sleep_seconds=1.5):
    """
    Importante: yfinance (versões recentes) usa curl_cffi internamente e pode falhar se você
    passar um requests.Session "normal" via parâmetro session.

    Por isso, aqui NÃO setamos session; deixamos o próprio yfinance gerenciar.
    """
    last_exc = None

    # 1) tenta multi-ticker (menos agressivo: threads=False)
    for attempt in range(1, max_attempts + 1):
        try:
            df = yf.download(
                tickers=tickers,
                start=start_date,
                end=end_date,
                group_by='ticker',
                progress=False,
                timeout=20,
                threads=False,
            )
            if df is not None and not df.empty:
                print(f'multi-ticker OK: shape={df.shape}')
                return df
            print(f'multi-ticker vazio (attempt={attempt}/{max_attempts})')
        except Exception as e:
            last_exc = e
            print(f'multi-ticker falhou (attempt={attempt}/{max_attempts}): {type(e).__name__}: {e}')
        time.sleep(sleep_seconds)

    # 2) fallback 1 por 1
    print('Fallback: ticker-a-ticker...')
    frames = []
    for ticker in tickers:
        for attempt in range(1, max_attempts + 1):
            try:
                df_one = yf.download(
                    tickers=ticker,
                    start=start_date,
                    end=end_date,
                    progress=False,
                    timeout=20,
                    threads=False,
                )
                if df_one is None or df_one.empty:
                    print(f'{ticker}: vazio (attempt={attempt}/{max_attempts})')
                else:
                    print(f'{ticker}: OK shape={df_one.shape}')
                    df_one.columns = pd.MultiIndex.from_product([[ticker], df_one.columns])
                    frames.append(df_one)
                    break
            except Exception as e:
                print(f'{ticker}: falha (attempt={attempt}/{max_attempts}): {type(e).__name__}: {e}')
            time.sleep(sleep_seconds)

    if not frames:
        if last_exc is not None:
            raise last_exc
        return pd.DataFrame()

    return pd.concat(frames, axis=1).sort_index()

In [8]:
df = download_yfinance_with_fallback(tickers, start_date, end_date)
print('df empty?', df is None or df.empty)
df.tail() if df is not None and not df.empty else None


5 Failed downloads:
['LWSA3.SA', 'INTB3.SA', 'TOTS3.SA', 'POSI3.SA', 'WEGE3.SA']: TypeError("'NoneType' object is not subscriptable")


multi-ticker vazio (attempt=1/3)



5 Failed downloads:
['LWSA3.SA', 'INTB3.SA', 'TOTS3.SA', 'POSI3.SA', 'WEGE3.SA']: TypeError("'NoneType' object is not subscriptable")


multi-ticker vazio (attempt=2/3)



5 Failed downloads:
['LWSA3.SA', 'INTB3.SA', 'TOTS3.SA', 'POSI3.SA', 'WEGE3.SA']: TypeError("'NoneType' object is not subscriptable")


multi-ticker vazio (attempt=3/3)



1 Failed download:
['TOTS3.SA']: TypeError("'NoneType' object is not subscriptable")


Fallback: ticker-a-ticker...
TOTS3.SA: vazio (attempt=1/3)



1 Failed download:
['TOTS3.SA']: TypeError("'NoneType' object is not subscriptable")


TOTS3.SA: vazio (attempt=2/3)



1 Failed download:
['TOTS3.SA']: TypeError("'NoneType' object is not subscriptable")


TOTS3.SA: vazio (attempt=3/3)



1 Failed download:
['LWSA3.SA']: TypeError("'NoneType' object is not subscriptable")


LWSA3.SA: vazio (attempt=1/3)



1 Failed download:
['LWSA3.SA']: TypeError("'NoneType' object is not subscriptable")


LWSA3.SA: vazio (attempt=2/3)



1 Failed download:
['LWSA3.SA']: TypeError("'NoneType' object is not subscriptable")


LWSA3.SA: vazio (attempt=3/3)



1 Failed download:
['POSI3.SA']: TypeError("'NoneType' object is not subscriptable")


POSI3.SA: vazio (attempt=1/3)



1 Failed download:
['POSI3.SA']: TypeError("'NoneType' object is not subscriptable")


POSI3.SA: vazio (attempt=2/3)



1 Failed download:
['POSI3.SA']: TypeError("'NoneType' object is not subscriptable")


POSI3.SA: vazio (attempt=3/3)



1 Failed download:
['INTB3.SA']: TypeError("'NoneType' object is not subscriptable")


INTB3.SA: vazio (attempt=1/3)



1 Failed download:
['INTB3.SA']: TypeError("'NoneType' object is not subscriptable")


INTB3.SA: vazio (attempt=2/3)



1 Failed download:
['INTB3.SA']: TypeError("'NoneType' object is not subscriptable")


INTB3.SA: vazio (attempt=3/3)



1 Failed download:
['WEGE3.SA']: TypeError("'NoneType' object is not subscriptable")


WEGE3.SA: vazio (attempt=1/3)



1 Failed download:
['WEGE3.SA']: TypeError("'NoneType' object is not subscriptable")


WEGE3.SA: vazio (attempt=2/3)



1 Failed download:
['WEGE3.SA']: TypeError("'NoneType' object is not subscriptable")


WEGE3.SA: vazio (attempt=3/3)
df empty? True


In [13]:
# Teste alternativo: história via Ticker.history (às vezes o caminho/endpoint difere do multi.download)
one = 'WEGE3.SA'
try:
    hist = yf.Ticker(one).history(period='6mo', interval='1d')
    print('Ticker.history shape:', getattr(hist, 'shape', None))
    print('empty?', hist is None or hist.empty)
    hist.tail() if hist is not None and not hist.empty else None
except Exception as e:
    print('Ticker.history FAIL ->', type(e).__name__, e)

Ticker.history FAIL -> TypeError 'NoneType' object is not subscriptable


## Transformação (stack + data_pregao) + escrita parquet em dataset local

In [12]:
if df is None or df.empty:
    print('Sem dados para transformar.')
else:
    df_stack = df.stack(level=0).reset_index().rename(columns={'level_1': 'Ticker'})
    df_stack['Ticker'] = df_stack['Ticker'].astype('string')
    df_stack['data_pregao'] = pd.to_datetime(df_stack['Date'], errors='coerce').dt.date

    print('df_stack shape:', df_stack.shape)
    df_stack.tail()

    with tempfile.TemporaryDirectory() as tmp_dir:
        dataset_dir = os.path.join(tmp_dir, 'raw_dataset')
        table = pa.Table.from_pandas(df_stack, preserve_index=False)
        pq.write_to_dataset(table, root_path=dataset_dir, partition_cols=['data_pregao'])
        # Lista alguns arquivos gerados
        shown = 0
        for root, _, files in os.walk(dataset_dir):
            for fn in files:
                if not fn.endswith('.parquet'):
                    continue
                print('parquet:', os.path.join(root, fn))
                shown += 1
                if shown >= 10:
                    break
            if shown >= 10:
                break

Sem dados para transformar.
