In [11]:
# -*- coding: utf-8 -*-
"""
News → impactos (30m, 1D, 3D, 5D) e 1 rótulo principal (1D) à la FinMarBa,
com diagnósticos para entender "por que ficou NaN?" e contagens de etapas.

- Lê TODA a sua base de notícias (parquet/csv), filtra ticker AAPL e calcula impactos
  para TODO o histórico de notícias desse ticker.
- Carrega preços US (ADR) como no seu ETL: Excel "Hist_Origem_BDRs.xlsx",
  aba "AAPL", colunas "Date" e "Last Price".
- Não agrega por horário. Remove apenas duplicatas exatas por headline normalizada (opcional).
- Saída: mesma base + colunas:
    impact_30m, impact_1d, impact_3d, impact_5d, label
  (label = classificação por quantis do impacto de 1D: -1/0/+1)

REQUISITOS:
- pandas, numpy, openpyxl (para ler xlsx) ou equivalente.
"""

from __future__ import annotations
from pathlib import Path
import numpy as np
import pandas as pd
import re, unicodedata
import pytz

# ========================= CONFIG =========================
DEFAULT_OUT_NEWS = Path("../data/investing_news.parquet")   # sua base de notícias (parquet ou csv)
PATH_US_XLSX     = Path("../data/Hist_Origem_BDRs.xlsx")    # mesmo arquivo do seu ETL
US_SHEET         = "AAPL"                                   # olhando só AAPL por enquanto
US_COL_DATE      = "Date"
US_COL_LAST      = "Last Price"

TZ_NEWS = "America/Sao_Paulo"    # timestamps das notícias (BR)
TZ_US   = "America/New_York"     # timestamps do preço US (sem shift, igual seu ETL)
US_CLOSE_HHMM = (16, 0)          # 16:00 ET ~ fechamento regular

# merge_asof intraday: tolerância e "staleness" para 30min
ASOF_TOL_30M        = "2H"                       # aceita tick até 2h antes do alvo
MAX_STALENESS_30M   = pd.Timedelta("2H")         # se casado >2h longe do alvo → invalida

# saída
OUTPUT_CSV   = Path("../data/news_impact_labels_AAPL.csv")
TRY_PARQUET  = True

# debug e dedupe
DEBUG        = True              # liga prints e colunas de diagnóstico
DEDUP_STRICT = True              # remove duplicatas exatas de headline normalizada por ticker

# ========================= HELPERS =========================

def _is_parquet(path: Path) -> bool:
    return path.suffix.lower() in {".parquet", ".pq", ".parq"}

def _strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

def _normalize_headline(txt: str) -> str:
    if pd.isna(txt): return ""
    s = str(txt).strip().lower()
    s = _strip_accents(s)
    s = s.replace("…", " ").replace("...", " ")
    s = re.sub(r"[\s\-\–\—\_\/\\\|\.\,\;\:\!\?\“\”\"\'\(\)\[\]\{\}]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

from pathlib import Path
import pandas as pd
import numpy as np
import re
import unicodedata
import glob

TZ_NEWS = "America/Sao_Paulo"
DEBUG = True  # use o mesmo DEBUG do resto do script

def _strip_accents(s: str) -> str:
    s = unicodedata.normalize("NFKD", s)
    return "".join(ch for ch in s if not unicodedata.combining(ch))

def _normalize_headline(txt: str) -> str:
    if pd.isna(txt): return ""
    s = str(txt).strip().lower()
    s = _strip_accents(s)
    s = s.replace("…", " ").replace("...", " ")
    s = re.sub(r"[\s\-\–\—\_\/\\\|\.\,\;\:\!\?\“\”\"\'\(\)\[\]\{\}]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _load_news(pathlike) -> pd.DataFrame:
    """
    Lê notícias de:
      - arquivo único (parquet/csv),
      - diretório (lendo todos .parquet/.csv recursivamente),
      - pattern com wildcard (ex.: "../data/investing_news_*.parquet").

    Normaliza colunas e parseia 'datetime' (com e sem offset de fuso).
    """

    # --------- resolver lista de arquivos ----------
    p = Path(str(pathlike))
    if "*" in str(p) or "?" in str(p):
        paths = sorted(glob.glob(str(p)))
    elif p.is_dir():
        paths = sorted([str(x) for x in p.rglob("*.parquet")]) + \
                sorted([str(x) for x in p.rglob("*.csv")])
    elif p.is_file():
        paths = [str(p)]
    else:
        raise FileNotFoundError(f"Nenhum arquivo/padrão encontrado em: {pathlike}")

    if not paths:
        raise FileNotFoundError(f"Nenhum arquivo encontrado em: {pathlike}")

    if DEBUG:
        print(f"[NEWS] Arquivos encontrados: {len(paths)}")
        if len(paths) <= 10:
            for fp in paths: print("  -", fp)

    # --------- aliases de colunas ----------
    # mapeia nomes alternativos para aumentar recall
    aliases = {
        "id":        ["id", "uuid", "news_id"],
        "datetime":  ["datetime","date","timestamp","time","published_at","published_at_utc","published","pub_time"],
        "source":    ["source","provider","origem","fonte"],
        "headline":  ["headline","title","titulo","título","news_title"],
        "ticker":    ["ticker","symbol","sigla","ativo","codigo","código"],
        "sector":    ["sector","setor","categoria"],
        "country":   ["country","pais","país","region"],
        "url":       ["url","link","href"],
        "language":  ["language","lang","idioma"],
    }

    def pick_col(df, names):
        lower = {c.lower(): c for c in df.columns}
        for nm in names:
            if nm in lower:
                return lower[nm]
        return None

    dfs = []
    kept = 0
    skipped = 0

    for fp in paths:
        try:
            if fp.lower().endswith((".parquet", ".pq", ".parq")):
                df = pd.read_parquet(fp)
            else:
                df = pd.read_csv(fp)
        except Exception as e:
            print(f"[WARN] Falha ao ler {fp}: {e}")
            skipped += 1
            continue

        # escolhe colunas (algumas podem faltar; preenchidas em branco)
        col_id   = pick_col(df, aliases["id"])
        col_dt   = pick_col(df, aliases["datetime"])
        col_src  = pick_col(df, aliases["source"])
        col_hl   = pick_col(df, aliases["headline"])
        col_tk   = pick_col(df, aliases["ticker"])
        col_sec  = pick_col(df, aliases["sector"])
        col_ctry = pick_col(df, aliases["country"])
        col_url  = pick_col(df, aliases["url"])
        col_lang = pick_col(df, aliases["language"])

        # exigência mínima para manter o arquivo: datetime + headline + ticker + url
        if any(x is None for x in [col_dt, col_hl, col_tk, col_url]):
            if DEBUG:
                print(f"[NEWS][skip] {fp} sem cols mínimas (precisa de datetime/headline/ticker/url). Tem: {list(df.columns)[:8]}...")
            skipped += 1
            continue

        out = pd.DataFrame({
            "id":       df[col_id]   if col_id   else "",
            "datetime": df[col_dt],
            "source":   df[col_src]  if col_src  else "",
            "headline": df[col_hl],
            "ticker":   df[col_tk],
            "sector":   df[col_sec]  if col_sec  else "",
            "country":  df[col_ctry] if col_ctry else "",
            "url":      df[col_url],
            "language": df[col_lang] if col_lang else "",
        })
        dfs.append(out)
        kept += 1

    if not dfs:
        raise ValueError("Não foi possível ler nenhum dataframe válido (verifique colunas mínimas e caminhos).")

    news = pd.concat(dfs, ignore_index=True)

    # --------- limpeza básica ----------
    news["ticker"] = news["ticker"].astype(str).str.upper().str.strip()
    news["headline"] = news["headline"].astype(str)

    # --------- parse robusto de datetime ----------
    s = news["datetime"].astype(str).str.strip()

    # Heurística: tem offset? (Z / +hh:mm / -hh:mm)
    has_offset = s.str.contains(r'(Z|[+-]\d{2}:\d{2})$', regex=True, na=False)

    # 1) com offset → parse com utc=True e converte para TZ_NEWS
    dt_offset = pd.to_datetime(s.where(has_offset), errors="coerce", utc=True)
    dt_offset = dt_offset.dt.tz_convert(TZ_NEWS)

    # 2) sem offset → parse naive e localiza TZ_NEWS
    dt_naive  = pd.to_datetime(s.where(~has_offset), errors="coerce")
    try:
        dt_naive = dt_naive.dt.tz_localize(TZ_NEWS, nonexistent="shift_forward", ambiguous="NaT")
    except Exception:
        # fallback: se der erro de ambiguidade, tenta sem parâmetros
        dt_naive = dt_naive.dt.tz_localize(TZ_NEWS)

    # combina as duas metades
    news["news_dt"] = dt_offset.fillna(dt_naive)

    # dropa apenas as linhas sem data válida
    before = len(news)
    news = news.dropna(subset=["news_dt"]).copy()
    after = len(news)
    if DEBUG:
        print(f"[NEWS] Linhas totais concatenadas: {before} | após parse de datetime: {after} | descartadas: {before-after}")

    return news

def dedupe_exact_headlines(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["headline_norm"] = df["headline"].map(_normalize_headline)
    df = df.sort_values("news_dt").drop_duplicates(subset=["ticker","headline_norm"], keep="first")
    return df

def load_us_prices_xlsx(path: Path, sheet: str, col_date: str, col_last: str) -> pd.DataFrame:
    raw = pd.read_excel(path, sheet_name=sheet)
    if col_date not in raw.columns or col_last not in raw.columns:
        raise ValueError(f"{path.name} precisa ter colunas '{col_date}' e '{col_last}' na aba '{sheet}'.")
    df = raw.rename(columns={col_date: "datetime", col_last: "price"})
    dt = pd.to_datetime(df["datetime"], errors="coerce")
    df["dt_us"] = pd.DatetimeIndex(dt).tz_localize(TZ_US, nonexistent="shift_forward", ambiguous="NaT")
    df["price"] = pd.to_numeric(df["price"], errors="coerce")
    df = (df.dropna(subset=["dt_us","price"])
            .sort_values("dt_us")
            .set_index("dt_us")[["price"]])
    return df

def is_intraday(index: pd.DatetimeIndex) -> bool:
    """Heurística: mais de 1 ponto no mesmo dia → intraday."""
    if len(index) < 3:
        return False
    counts = pd.Series(index.normalize()).value_counts()
    return bool((counts > 1).any())

def merge_asof_value(series: pd.Series, when: pd.Timestamp, direction="backward",
                     tolerance: str | None = None, return_ts: bool = False):
    """
    Faz um merge_asof simples sobre a série (indexada por timestamp).
    Retorna o preço; se return_ts=True, retorna também o timestamp casado.
    """
    if series.empty:
        return (np.nan, pd.NaT) if return_ts else np.nan

    s = series.dropna().sort_index()
    left  = pd.DataFrame({"key":[when]})
    right = s.reset_index()
    # padroniza nome do índice para 'key'
    idx_col = right.columns[0]
    right   = right.rename(columns={idx_col:"key", "price":"price"})
    right["ts_match"] = right["key"]

    tol = pd.to_timedelta(tolerance) if tolerance else None
    m = pd.merge_asof(
        left.sort_values("key"),
        right[["key","price","ts_match"]].sort_values("key"),
        on="key", direction=direction, tolerance=tol
    )
    price = m.iloc[0]["price"]
    ts    = m.iloc[0]["ts_match"]
    if return_ts:
        return (float(price) if pd.notna(price) else np.nan, ts)
    return float(price) if pd.notna(price) else np.nan

def first_close_after_event(close_daily: pd.Series, event_dt_us: pd.Timestamp) -> tuple[pd.Timestamp|None, float]:
    """
    Fechamento do dia 'âncora':
      - se a notícia sair até 16:00 ET → usa o FECHAMENTO do mesmo dia (se houver);
      - senão → usa o FECHAMENTO do próximo dia útil disponível.
    """
    close_daily = close_daily.sort_index()
    hh, mm = US_CLOSE_HHMM
    cutoff = event_dt_us.replace(hour=hh, minute=mm, second=0, microsecond=0)
    event_date = event_dt_us.date()
    idx_dates = close_daily.index

    def next_trading_after(d):
        pos = idx_dates.searchsorted(d, side="right")
        if pos >= len(idx_dates): return None
        return idx_dates[pos]

    if event_dt_us <= cutoff and (event_date in idx_dates):
        d0 = event_date
    else:
        d0 = next_trading_after(event_date)
    if d0 is None:
        return None, np.nan
    return d0, float(close_daily.loc[d0])

def daily_log_returns(close_daily: pd.Series, h: int) -> pd.Series:
    s = close_daily.sort_index()
    return np.log(s.shift(-h) / s)

def label_quantile(ret_hist: pd.Series, value: float, qlo=0.30, qhi=0.60, min_obs=60) -> float:
    rh = ret_hist.dropna()
    if pd.isna(value):
        return np.nan
    if len(rh) >= min_obs:
        q30, q60 = rh.quantile(qlo), rh.quantile(qhi)
        if value > q60: return 1.0
        if value < q30: return -1.0
        return 0.0
    # fallback: somente sinal
    return float(np.sign(value))

# ========================= PIPELINE =========================

def run_pipeline_aapl(
    default_out_news: Path = DEFAULT_OUT_NEWS,
    path_us_xlsx: Path = PATH_US_XLSX,
    us_sheet: str = US_SHEET,
    out_csv: Path = OUTPUT_CSV
) -> pd.DataFrame:

    # ---- notícias (TODO o histórico do arquivo) ----
    news = _load_news(default_out_news)
    n0 = len(news)
    news = news[news["ticker"] == "AAPL"].copy()
    n_aapl = len(news)
    if DEDUP_STRICT:
        news = dedupe_exact_headlines(news)
    n_after_dedup = len(news)

    # ---- preços US ----
    px = load_us_prices_xlsx(path_us_xlsx, us_sheet, US_COL_DATE, US_COL_LAST)
    if px.empty:
        raise SystemExit("Série de preços US vazia para AAPL.")
    intraday = is_intraday(px.index)

    # diário (fechamento por data US)
    px_daily = (
        px["price"].copy()
        .tz_convert(TZ_US)
        .to_frame("price")
        .assign(date=lambda d: d.index.date)
        .groupby("date")["price"].last()
    )

    # históricas para labels (1/3/5D)
    ret_hist = {h: daily_log_returns(px_daily, h) for h in (1,3,5)}

    # ---- DIAGNÓSTICOS INICIAIS ----
    if DEBUG:
        print(f"[DEBUG] Notícias totais: {n0}")
        print(f"[DEBUG] Notícias AAPL:   {n_aapl}")
        print(f"[DEBUG] Após dedupe:     {n_after_dedup}  (DEDUP_STRICT={DEDUP_STRICT})")
        if n_after_dedup:
            print(f"[DEBUG] Janela notícias: {news['news_dt'].min()}  →  {news['news_dt'].max()}")
        print(f"[DEBUG] px intraday? {intraday}")
        if len(px.index):
            print(f"[DEBUG] Janela px:    {px.index.min()}  →  {px.index.max()}")
        if len(px_daily.index):
            print(f"[DEBUG] Datas px_daily: {px_daily.index.min()} → {px_daily.index.max()}")
        if not intraday:
            print("[WARN] Preço US parece ser DIÁRIO (sem intraday). impact_30m ficará NaN (normal).")

    # ---- impactos por notícia (todo o histórico do arquivo para AAPL) ----
    recs, reasons = [], {
        "no_price_coverage": 0,   # notícia fora da janela de px_daily
        "no_anchor_close":   0,   # não achou fechamento âncora
        "no_30m_intraday":   0,   # não há intraday (ou stale)
        "ok":                0
    }

    for _, row in news.iterrows():
        dt_sp = row["news_dt"]
        dt_us = dt_sp.tz_convert(TZ_US)

        # 30m (intraday)
        if intraday:
            p_anchor_30, ts_anchor = merge_asof_value(
                px["price"], dt_us,
                direction="backward",
                tolerance=ASOF_TOL_30M,
                return_ts=True
            )
            p_30, ts_30 = merge_asof_value(
                px["price"], dt_us + pd.Timedelta(minutes=30),
                direction="backward",
                tolerance=ASOF_TOL_30M,
                return_ts=True
            )
            stale_anchor = pd.isna(ts_anchor) or (abs(dt_us - ts_anchor) > MAX_STALENESS_30M)
            stale_30     = pd.isna(ts_30)     or (abs((dt_us + pd.Timedelta(minutes=30)) - ts_30) > MAX_STALENESS_30M)

            if stale_anchor or stale_30 or not (np.isfinite(p_anchor_30) and np.isfinite(p_30)):
                lr_30m = np.nan
                dbg30  = "stale_or_missing"
                reasons["no_30m_intraday"] += 1
            else:
                lr_30m = np.log(p_30 / p_anchor_30)
                dbg30  = ""
        else:
            lr_30m = np.nan
            dbg30  = "no_intraday"
            reasons["no_30m_intraday"] += 1

        # âncora diária (1º fechamento após o evento)
        if (not len(px_daily.index)) or (dt_us.date() > px_daily.index.max()) or (dt_us.date() < px_daily.index.min()):
            reasons["no_price_coverage"] += 1
            anchor_date, p_anchor_d = None, np.nan
        else:
            anchor_date, p_anchor_d = first_close_after_event(px_daily, dt_us)
            if anchor_date is None:
                reasons["no_anchor_close"] += 1

        def lr_days(h: int):
            if anchor_date is None: return np.nan
            idx = px_daily.index.tolist()
            pos = idx.index(anchor_date)
            pos_h = pos + h
            if pos_h >= len(idx): return np.nan
            p_h = float(px_daily.iloc[pos_h])
            return np.log(p_h / p_anchor_d)

        lr_1d, lr_3d, lr_5d = lr_days(1), lr_days(3), lr_days(5)

        # label principal (1D) por quantis do histórico ANTERIOR ao anchor_date
        def hist_before(h):
            if anchor_date is None: return pd.Series([], dtype=float)
            return ret_hist[h].loc[ret_hist[h].index < anchor_date]

        label_main = label_quantile(hist_before(1), lr_1d)

        rec = {
            # base original
            "id": row["id"],
            "datetime": row["datetime"],
            "source": row["source"],
            "headline": row["headline"],
            "ticker": row["ticker"],
            "sector": row.get("sector",""),
            "country": row.get("country",""),
            "url": row["url"],
            "language": row.get("language",""),
            # impactos
            "impact_30m": lr_30m,
            "impact_1d":  lr_1d,
            "impact_3d":  lr_3d,
            "impact_5d":  lr_5d,
            # rótulo principal (1D)
            "label":      label_main,
        }

        if DEBUG:
            rec["debug_dt_us"] = dt_us
            rec["debug_anchor_date"] = anchor_date
            rec["debug_why_30m"] = dbg30
            rec["debug_has_1d"] = pd.notna(lr_1d)
            rec["debug_has_3d"] = pd.notna(lr_3d)
            rec["debug_has_5d"] = pd.notna(lr_5d)

        if (anchor_date is not None) and any(pd.notna(x) for x in [lr_1d, lr_3d, lr_5d]):
            reasons["ok"] += 1

        recs.append(rec)

    out = pd.DataFrame.from_records(recs)

    # ---- PRINTS FINAIS DE DEBUG ----
    if DEBUG:
        print("\n[DEBUG] Motivos agregados:")
        for k,v in reasons.items():
            print(f"  - {k}: {v}")
        # amostra de linhas sem 1D
        if ("debug_has_1d" in out) and (not out.empty):
            probl = out[~out["debug_has_1d"].fillna(False)]
            if len(probl):
                print(f"[DEBUG] Linhas sem 1D: {len(probl)} (mostrando até 5)")
                cols_show = ["datetime","headline","debug_dt_us","debug_anchor_date","impact_1d","impact_3d","impact_5d","debug_why_30m"]
                print(probl.head(5)[cols_show].to_string(index=False))

    # ---- salva ----
    out.to_csv(out_csv, index=False, encoding="utf-8")
    if TRY_PARQUET:
        try:
            out.to_parquet(out_csv.with_suffix(".parquet"), index=False)
        except Exception:
            pass

    return out

# -------------------- EXEC --------------------
if __name__ == "__main__":
    df_out = run_pipeline_aapl()
    print("\nLinhas geradas:", len(df_out))
    print("Preview:")
    print(df_out.head(10).to_string(index=False))


  MAX_STALENESS_30M   = pd.Timedelta("2H")         # se casado >2h longe do alvo → invalida


[NEWS] Arquivos encontrados: 1
  - ../data/investing_news.parquet


  has_offset = s.str.contains(r'(Z|[+-]\d{2}:\d{2})$', regex=True, na=False)


[NEWS] Linhas totais concatenadas: 7754 | após parse de datetime: 15 | descartadas: 7739
[DEBUG] Notícias totais: 15
[DEBUG] Notícias AAPL:   15
[DEBUG] Após dedupe:     15  (DEDUP_STRICT=True)
[DEBUG] Janela notícias: 2025-10-15 00:28:57.981822-03:00  →  2025-10-15 17:28:56.072530-03:00
[DEBUG] px intraday? True
[DEBUG] Janela px:    2010-10-25 11:30:00-04:00  →  2025-08-12 16:30:00-04:00
[DEBUG] Datas px_daily: 2010-10-25 → 2025-08-12

[DEBUG] Motivos agregados:
  - no_price_coverage: 15
  - no_anchor_close: 0
  - no_30m_intraday: 15
  - ok: 0
[DEBUG] Linhas sem 1D: 15 (mostrando até 5)
                        datetime                                                                                      headline                      debug_dt_us debug_anchor_date  impact_1d  impact_3d  impact_5d    debug_why_30m
2025-10-15T00:28:57.981822-03:00                             Oferta da Foxconn por participação em unidade do Grupo ZF estagna 2025-10-14 23:28:57.981822-04:00              Non

  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_timedelta(tolerance) if tolerance else None
  tol = pd.to_

In [12]:
dados = pd.read_csv(OUTPUT_CSV)
dados

Unnamed: 0,id,datetime,source,headline,ticker,sector,country,url,language,impact_30m,impact_1d,impact_3d,impact_5d,label,debug_dt_us,debug_anchor_date,debug_why_30m,debug_has_1d,debug_has_3d,debug_has_5d
0,cf4c4abd-c5ea-5cd0-9e3f-e7fcfcea20af,2025-10-15T00:28:57.981822-03:00,Investing.com,Oferta da Foxconn por participação em unidade ...,AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR,,,,,,2025-10-14 23:28:57.981822-04:00,,stale_or_missing,False,False,False
1,140dc159-df5a-5cfb-adc2-632c97101a3a,2025-10-15T04:28:57.981730-03:00,Investing.com,Globalstar Inc recebe atualização de avaliação...,AAPL,Tecnologia,BR,https://br.investing.com/news/pro/clear-street...,pt-BR,,,,,,2025-10-15 03:28:57.981730-04:00,,stale_or_missing,False,False,False
2,e87dbcce-327e-50ee-af29-a8bc1ec14812,2025-10-15T05:28:57.981622-03:00,Investing.com,Clear Street inicia cobertura da Globalstar co...,AAPL,Tecnologia,BR,https://br.investing.com/news/analyst-ratings/...,pt-BR,,,,,,2025-10-15 04:28:57.981622-04:00,,stale_or_missing,False,False,False
3,74ee13f8-122f-585c-941b-5541c819f4a6,2025-10-15T06:28:57.981358-03:00,Investing.com,Samsung lidera mercado global de smartphones c...,AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR,,,,,,2025-10-15 05:28:57.981358-04:00,,stale_or_missing,False,False,False
4,19692d8f-aa08-5862-97c3-dd10cf53fdee,2025-10-15T06:28:57.981522-03:00,Investing.com,Apple busca mudanças na lei tributária da Índi...,AAPL,Tecnologia,BR,https://br.investing.com/news/economy-news/app...,pt-BR,,,,,,2025-10-15 05:28:57.981522-04:00,,stale_or_missing,False,False,False
5,255b5cb9-b785-55a0-a5b4-41b111471094,2025-10-15T08:28:56.073568-03:00,Reuters,"Apple aumentará investimentos na China, segund...",AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR,,,,,,2025-10-15 07:28:56.073568-04:00,,stale_or_missing,False,False,False
6,158700b4-73dc-57eb-aa2b-26e3c6185e46,2025-10-15T10:28:56.072886-03:00,Investing.com,Apple apresenta chip M5 com desempenho de IA 4...,AAPL,Tecnologia,BR,https://br.investing.com/news/company-news/app...,pt-BR,,,,,,2025-10-15 09:28:56.072886-04:00,,stale_or_missing,False,False,False
7,2924842a-f1f7-5992-8de8-e9a0d4a8d43e,2025-10-15T10:28:56.072985-03:00,Investing.com,BofA Securities mantém recomendação de compra ...,AAPL,Tecnologia,BR,https://br.investing.com/news/analyst-ratings/...,pt-BR,,,,,,2025-10-15 09:28:56.072985-04:00,,stale_or_missing,False,False,False
8,221b6c48-5c6a-5290-a827-fb2660f24ccf,2025-10-15T10:28:56.073083-03:00,Investing.com,Apple Vision Pro com chip M5 e banda de malha ...,AAPL,Tecnologia,BR,https://br.investing.com/news/company-news/app...,pt-BR,,,,,,2025-10-15 09:28:56.073083-04:00,,stale_or_missing,False,False,False
9,ea0af311-5fcb-5299-beab-8aa347d24a12,2025-10-15T10:28:56.073173-03:00,Investing.com,Apple lança novo iPad Pro com chip M5 e desemp...,AAPL,Tecnologia,BR,https://br.investing.com/news/assorted/apple-l...,pt-BR,,,,,,2025-10-15 09:28:56.073173-04:00,,stale_or_missing,False,False,False


In [8]:
#Ver tamanho do dataframe
dados.shape

(15, 20)

In [10]:
dados = pd.read_parquet(DEFAULT_OUT_NEWS)
dados

Unnamed: 0,id,datetime,source,headline,ticker,sector,country,url,language
0,be5e51fd-5ce5-5f20-a5fc-7a5783813a05,2025-10-15T17:28:56.072530-03:00,Reuters,"Apple lança novas versões de MacBook, iPad e h...",AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR
1,56dc0ac0-edb7-57ea-a6e6-2052f91d0e60,2025-10-15T11:28:56.072778-03:00,Investing.com,UBS: Tempos de espera para iPhone 17 continuam...,AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR
2,238d7af7-c297-56c8-a865-de1b58394c29,2025-10-15T10:28:56.073472-03:00,Investing.com,Apple recebe atualização de classificação de a...,AAPL,Tecnologia,BR,https://br.investing.com/news/pro/bofa-sobre-a...,pt-BR
3,21111957-bafa-575b-9913-7125f7f2782f,2025-10-15T10:28:56.073367-03:00,Investing.com,Apple anuncia chip M5 com desempenho de IA 4 v...,AAPL,Tecnologia,BR,https://br.investing.com/news/assorted/apple-a...,pt-BR
4,7af568fb-59b5-501b-9c3b-8030d4373300,2025-10-15T10:28:56.073270-03:00,Investing.com,Apple lança MacBook Pro de 14 polegadas com ch...,AAPL,Tecnologia,BR,https://br.investing.com/news/assorted/apple-l...,pt-BR
...,...,...,...,...,...,...,...,...,...
7749,64952411-9287-5126-8329-1f9ca3897b15,2014-04-02T10:38:00-03:00,Investing.com,Bolsas dos EUA sobem antes de dados; Dow Jones...,AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR
7750,a0d552a5-007a-5492-90dd-35409b592920,2014-03-31T13:39:00-03:00,Investing.com,"Bolsas dos EUA abrem em alta, discurso de Yell...",AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR
7751,b860f070-bf1a-547a-a739-735c04969fe8,2014-03-31T10:39:00-03:00,Investing.com,Bolsas dos EUA sobem com especulações de estím...,AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR
7752,a2e68f61-9668-53f7-8402-945bb287279f,2014-03-25T13:38:00-03:00,Investing.com,Bolsas dos EUA abrem em alta após relatório im...,AAPL,Tecnologia,BR,https://br.investing.com/news/stock-market-new...,pt-BR
