In [4]:

from __future__ import annotations
import re
import time
import uuid
import os
import random
import argparse
from pathlib import Path
from typing import Optional, List, Dict
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo

import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil import parser as du
from tqdm import tqdm
import logging

# ---------------- Config & Globals ----------------
# Timezone used to interpret and normalise every scraped timestamp.
SP_TZ = ZoneInfo("America/Sao_Paulo")

# Default input spreadsheet and output Parquet file (relative to the CWD).
DEFAULT_EXCEL = "../data/BaseRefAtivos.xlsx"
DEFAULT_OUT = "../data/investing_news.parquet"
DEFAULT_MAX_PAGES = 1400  # default per-asset page cap; lower it for quick debugging runs

# Browser-like request headers sent with every request made through SESSION.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/127.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Referer": "https://br.investing.com/", # tells the server where the request "came from"
    "DNT": "1", # Do Not Track
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
    "Sec-Fetch-User": "?1",
}

# One shared HTTP session so the headers above apply to every request.
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

# Matches Portuguese relative timestamps such as "5 minutos atrás", "2 horas atrás"
# or "3 dias atrás"; `num` captures the amount and `unit` the time unit.
RELATIVE_REGEX = re.compile(
    r"(?P<num>\d+)\s*(?P<unit>min|mins|minutos|minuto|hora|horas|dia|dias)\s*atr[aá]s",
    flags=re.IGNORECASE,
)

# ---------------- Logging ----------------
def setup_logging(verbose: bool = False, debug: bool = False) -> None:
    """Configure root logging via ``logging.basicConfig``.

    The level is DEBUG when either ``verbose`` or ``debug`` is truthy and
    INFO otherwise. Records are formatted as ``HH:MM:SS | LEVEL | message``.
    """
    wants_debug = verbose or debug
    logging.basicConfig(
        level=logging.DEBUG if wants_debug else logging.INFO,
        format="%(asctime)s | %(levelname)-7s | %(message)s",
        datefmt="%H:%M:%S",
    )

# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)

# ---------------- Utils ----------------
def normalize_news_url(url: str) -> str:
    """Return the base ``...-news`` URL with no page suffix or trailing slash.

    Pagination is done by appending ``/<n>`` to the result, so any existing
    ``/<n>`` suffix and trailing slashes are removed first. A URL truncated to
    ``...-new`` is then repaired to ``...-news``. The repair runs after the
    stripping so inputs such as ``...-new/`` or ``...-new/2`` are fixed too.

    Args:
        url: Raw "Link News" value; ``None`` or blank yields ``""``.

    Returns:
        The normalised base URL, or ``""`` when there is nothing to normalise.
    """
    url = (url or "").strip()
    if not url:
        return url
    url = url.rstrip("/")
    # Drop one trailing "/<page>" segment, plus any slash left before it.
    url = re.sub(r"/\d+$", "", url).rstrip("/")
    if url.endswith("-new"):
        url += "s"
    return url

def page_url(url_base: str, page: int) -> str:
    """Build the URL of page number ``page`` by appending ``/<page>`` to ``url_base``."""
    return "/".join((url_base, str(page)))

# Portuguese month names/abbreviations mapped to the English abbreviations
# that dateutil's default parser understands.
_PT_MONTHS_TO_EN = {
    "janeiro": "jan", "jan": "jan",
    "fevereiro": "feb", "fev": "feb",
    "março": "mar", "marco": "mar", "mar": "mar",
    "abril": "apr", "abr": "apr",
    "maio": "may", "mai": "may",
    "junho": "jun", "jun": "jun",
    "julho": "jul", "jul": "jul",
    "agosto": "aug", "ago": "aug",
    "setembro": "sep", "set": "sep",
    "outubro": "oct", "out": "oct",
    "novembro": "nov", "nov": "nov",
    "dezembro": "dec", "dez": "dec",
}
# Longest alternatives first so "janeiro" is preferred over "jan".
_PT_MONTH_REGEX = re.compile(
    r"\b(" + "|".join(sorted(_PT_MONTHS_TO_EN, key=len, reverse=True)) + r")\b",
    flags=re.IGNORECASE,
)


def _translate_pt_months(text: str) -> str:
    """Replace Portuguese month words in ``text`` with English abbreviations."""
    return _PT_MONTH_REGEX.sub(lambda m: _PT_MONTHS_TO_EN[m.group(1).lower()], text)


def _to_sao_paulo(dt: datetime) -> datetime:
    """Treat a naive ``dt`` as São Paulo time and return it in that zone."""
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=SP_TZ)
    return dt.astimezone(SP_TZ)


def parse_datetime_from_time_tag(time_tag) -> Optional[datetime]:
    """Extract a São Paulo-aware datetime from a ``<time>`` tag.

    Three strategies are tried in order:

    1. A relative text such as "5 minutos atrás" (see ``RELATIVE_REGEX``),
       resolved against the current São Paulo time.
    2. The tag's ``datetime`` attribute, parsed with dateutil.
    3. The tag text as an absolute date, e.g. "12 de out. de 2025".

    Args:
        time_tag: A BeautifulSoup tag (or ``None``).

    Returns:
        A timezone-aware datetime in ``SP_TZ``, or ``None`` when ``time_tag``
        is ``None`` or nothing could be parsed.
    """
    if time_tag is None:
        return None

    text = (time_tag.get_text() or "").strip().lower()
    m = RELATIVE_REGEX.search(text)
    if m:
        num = int(m.group("num"))
        unit = m.group("unit")
        now_sp = datetime.now(SP_TZ)

        if unit.startswith("min"):
            return now_sp - timedelta(minutes=num)
        if unit.startswith("hora"):
            return now_sp - timedelta(hours=num)
        # Remaining units accepted by the regex are "dia"/"dias".
        return now_sp - timedelta(days=num)

    dt_attr = time_tag.get("datetime")
    if dt_attr:
        try:
            return _to_sao_paulo(du.parse(dt_attr))
        except Exception as e:  # best effort: fall through to the text fallback
            log.debug(f"Falha parse datetime attr '{dt_attr}': {e}")

    # Fallback: absolute date in the tag text, e.g. "12 de out. de 2025".
    # dateutil has no `languages` option (passing one raises TypeError, so this
    # fallback could never succeed); translate the month names instead and let
    # fuzzy parsing skip connective words such as "de".
    try:
        return _to_sao_paulo(
            du.parse(_translate_pt_months(text), dayfirst=True, fuzzy=True)
        )
    except Exception as e:  # best effort: an unparseable date yields None
        log.debug(f"Falha parse texto de data '{text}': {e}")

    return None

def deduce_country_language_from_url(url: str) -> tuple[str, str]:
    """Return ``(country, language)`` inferred from the URL's host.

    Only ``br.investing.com`` is recognised (``("BR", "pt-BR")``); any other
    URL yields a pair of empty strings.
    """
    is_brazilian_site = "br.investing.com" in url
    return ("BR", "pt-BR") if is_brazilian_site else ("", "")

def make_news_id(url: str, title: str) -> str:
    """Return a deterministic UUIDv5 string for a news item.

    The id is derived from the stripped URL and title joined by ``"||"``;
    ``None`` or empty values are treated as empty strings.
    """
    parts = [(value or "").strip() for value in (url, title)]
    return str(uuid.uuid5(uuid.NAMESPACE_URL, "||".join(parts)))

def html_debug_dump(html: str, company: str, page: int, outdir: Path) -> None:
    """Write a page's raw HTML to ``outdir`` for later inspection.

    The file is named ``<sanitised company>_p<page, 4 digits>.html``. The
    directory is created if needed. A failure to write is logged as a warning
    and never raised.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    target = outdir / (safe_filename(company) + f"_p{page:04d}.html")
    try:
        target.write_text(html, encoding="utf-8")
    except Exception as e:
        log.warning(f"Falha ao salvar HTML de debug: {e}")
    else:
        log.debug(f"HTML salvo para debug: {target}")

def safe_filename(s: str) -> str:
    """Return ``s`` stripped, with each run of unsafe characters replaced by ``_``.

    Only ASCII letters, digits, ``.``, ``_`` and ``-`` are kept as-is.
    """
    trimmed = s.strip()
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", trimmed)

# ---------------- HTTP com backoff ----------------
def fetch(url: str, max_retries: int = 5, timeout: int = 25) -> Optional[requests.Response]:
    """GET ``url`` through the shared session, retrying transient failures.

    Args:
        url: Address to request.
        max_retries: Maximum number of attempts.
        timeout: Per-request timeout passed to ``requests``.

    Returns:
        The response on HTTP 200. ``None`` on 404/410 (treated as end of
        pagination), on any other non-retryable status, or once all attempts
        are exhausted. HTTP 429/5xx and network errors are retried after a
        linearly growing wait with random jitter.
    """
    gone_statuses = (404, 410)
    transient_statuses = (429, 500, 502, 503, 504)

    for attempt in range(1, max_retries + 1):
        try:
            response = SESSION.get(url, timeout=timeout)
        except requests.RequestException as e:
            wait = (1.2 * attempt) + random.random()
            log.warning(f"Erro rede: {e}; retry em {wait:.1f}s")
            time.sleep(wait)
            continue

        status = response.status_code
        if status == 200:
            return response
        if status in gone_statuses:
            log.info(f"HTTP {status} em {url} (provável fim).")
            return None
        if status not in transient_statuses:
            log.warning(f"HTTP {status} em {url}; sem retry programado.")
            return None

        wait = (1.5 * attempt) + random.random()
        log.warning(f"HTTP {status} em {url}; retry em {wait:.1f}s")
        time.sleep(wait)

    log.error(f"Falhou após {max_retries} tentativas: {url}")
    return None

# ---------------- Parsing ----------------
def parse_news_items(html: str, ticker: str, sector: str) -> List[Dict]:
    """Extract news records from one listing page.

    Looks for ``ul[data-test='news-list']`` and, inside it, every
    ``article[data-test='article-item']``. Articles without a title link are
    skipped. Site-relative links are made absolute against
    ``https://br.investing.com``.

    Args:
        html: Raw HTML of the listing page.
        ticker: Ticker stored on every record.
        sector: Sector stored on every record.

    Returns:
        A list of dicts with keys ``id``, ``datetime`` (ISO string or ``None``),
        ``source``, ``headline``, ``ticker``, ``sector``, ``country``, ``url``
        and ``language``; empty when the expected markup is missing.
    """
    soup = BeautifulSoup(html, "html.parser")

    news_list = soup.find("ul", attrs={"data-test": "news-list"})
    if not news_list:
        log.debug("Ul[data-test='news-list'] não encontrada — página pode ter mudado.")
        return []

    articles = news_list.select("article[data-test='article-item']")
    if not articles:
        log.debug("Nenhum article[data-test='article-item'] encontrado nesta página.")
        return []

    records: List[Dict] = []
    for article in articles:
        title_link = article.select_one("a[data-test='article-title-link']")
        if not title_link:
            log.debug("a[data-test='article-title-link'] ausente em um article; pulando.")
            continue

        headline = (title_link.get_text() or "").strip()
        href = title_link.get("href") or ""
        url = "https://br.investing.com" + href if href.startswith("/") else href

        provider_link = article.select_one("a[data-test='article-provider-link']")
        source = provider_link.get_text().strip() if provider_link else ""

        published = parse_datetime_from_time_tag(
            article.select_one("time[data-test='article-publish-date']")
        )
        country, language = deduce_country_language_from_url(url)

        record = {
            "id": make_news_id(url, headline),
            "datetime": published.isoformat() if published else None,
            "source": source,
            "headline": headline,
            "ticker": ticker,
            "sector": sector,
            "country": country,
            "url": url,
            "language": language,
        }
        records.append(record)
    return records

def guess_last_page(html: str) -> Optional[int]:
    """Return the largest numeric label among the pagination links.

    Links are taken from ``div.flex.items-center.gap-2 a``; ``None`` is
    returned when none of them has an all-digit text.
    """
    soup = BeautifulSoup(html, "html.parser")
    labels = (
        (link.get_text() or "").strip()
        for link in soup.select("div.flex.items-center.gap-2 a")
    )
    page_numbers = [int(label) for label in labels if label.isdigit()]
    if not page_numbers:
        return None
    return max(page_numbers)

# ---------------- Scraper por empresa ----------------
def scrape_company(
    link_news: str,
    ticker_orig: str,
    sector: str,
    company_label: str,
    polite_sleep: float = 0.7,
    max_pages: Optional[int] = DEFAULT_MAX_PAGES,
    save_html_debug: bool = False,
    html_debug_dir: Path = Path("./_html_debug"),
) -> List[Dict]:
    """Scrape every news listing page of one company.

    Page 1 is always fetched. If the last page number can be read from its
    pagination links, pages 2..min(last page, max_pages) are fetched and failed
    pages are skipped. Otherwise pages are fetched one by one from page 2 until
    ``max_pages`` is exceeded or three consecutive pages fail or yield no items.

    Args:
        link_news: Raw news URL of the company (normalised internally).
        ticker_orig: Ticker stored on every returned record.
        sector: Sector stored on every returned record.
        company_label: Label used only in log messages and debug file names.
        polite_sleep: Seconds slept before each request after page 1.
        max_pages: Upper bound on the page number; a falsy value (``None`` or
            ``0``) disables the cap.
        save_html_debug: When true, each fetched page's HTML is written to
            ``html_debug_dir``.
        html_debug_dir: Directory for the debug HTML dumps.

    Returns:
        The records from all parsed pages (not de-duplicated); empty when the
        URL is blank or page 1 cannot be loaded.
    """
    url_base = normalize_news_url(link_news)
    if not url_base:
        log.warning(f"[{company_label}] Link News vazio.")
        return []

    first_url = page_url(url_base, 1)
    r1 = fetch(first_url)
    if not r1:
        log.warning(f"[{company_label}] Não foi possível carregar a página 1: {first_url}")
        return []

    if save_html_debug:
        html_debug_dump(r1.text, company_label, 1, html_debug_dir)

    items = parse_news_items(r1.text, ticker_orig, sector)
    log.info(f"[{company_label}] p1: {len(items)} items.")

    last_page = guess_last_page(r1.text)
    if last_page is None:
        # Fallback: keep iterating while pages return news, stopping after
        # three consecutive empty (or failed) pages.
        page = 2
        empty_streak = 0
        while True:
            if max_pages and page > max_pages:
                log.info(f"[{company_label}] max_pages atingido ({max_pages}).")
                break

            url = page_url(url_base, page)
            # Pause before every request to avoid hammering the site.
            time.sleep(polite_sleep)
            r = fetch(url)
            if not r:
                # A failed fetch counts towards the empty streak as well.
                empty_streak += 1
                log.debug(f"[{company_label}] página {page} falhou ({empty_streak} vazias).")
                if empty_streak >= 3:
                    log.info(f"[{company_label}] 3 páginas vazias seguidas; encerrando.")
                    break
                page += 1
                continue

            if save_html_debug:
                html_debug_dump(r.text, company_label, page, html_debug_dir)

            chunk = parse_news_items(r.text, ticker_orig, sector)
            log.info(f"[{company_label}] p{page}: {len(chunk)} items.")
            if not chunk:
                empty_streak += 1
                if empty_streak >= 3:
                    log.info(f"[{company_label}] 3 páginas sem itens; encerrando.")
                    break
            else:
                items.extend(chunk)
                # Any page with items resets the streak.
                empty_streak = 0
            page += 1
    else:
        # Last page is known: iterate a fixed range, capped by max_pages.
        total_pages = last_page
        if max_pages:
            total_pages = min(total_pages, max_pages)
        for page in range(2, total_pages + 1):
            url = page_url(url_base, page)
            time.sleep(polite_sleep)
            r = fetch(url)
            if not r:
                # A failed page is skipped; later pages are still attempted.
                log.debug(f"[{company_label}] Falha ao carregar p{page}.")
                continue

            if save_html_debug:
                html_debug_dump(r.text, company_label, page, html_debug_dir)

            chunk = parse_news_items(r.text, ticker_orig, sector)
            log.info(f"[{company_label}] p{page}: {len(chunk)} items.")
            items.extend(chunk)

    return items

# ---------------- Execução principal ----------------
def read_excel(excel_path: Path) -> pd.DataFrame:
    """Load the reference spreadsheet and check that the required columns exist.

    Raises:
        ValueError: If any required column is absent; the message lists the
            missing names in sorted order.
    """
    required = (
        "Empresa",
        "Setor",
        "Ticker BDR",
        "Ticker Original (EUA)",
        "Bolsa (EUA)",
        "Link News",
    )
    frame = pd.read_excel(excel_path)
    missing = set(required).difference(frame.columns)
    if not missing:
        return frame
    raise ValueError(f"Colunas faltantes no Excel: {sorted(missing)}")

def merge_incremental(df_new: pd.DataFrame, out_parquet: Path) -> pd.DataFrame:
    """Combine ``df_new`` with the rows already stored in ``out_parquet``.

    When the file does not exist, ``df_new`` is returned as-is. Otherwise the
    stored rows come first, ``df_new`` is appended, and rows with a repeated
    ``id`` are dropped, keeping the first occurrence.
    """
    if not out_parquet.exists():
        return df_new

    previous = pd.read_parquet(out_parquet)
    combined = pd.concat([previous, df_new], ignore_index=True)
    return combined.drop_duplicates(subset=["id"]).reset_index(drop=True)

def sort_by_datetime(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` ordered from newest to oldest by its ``datetime`` column.

    The column holds ISO-8601 strings (or missing values). Values that cannot
    be parsed are treated as missing, which sorts them last. The input frame
    is never modified: the temporary sort key lives only on an internal copy.

    Args:
        df: Frame to sort; returned unchanged if it has no ``datetime`` column.

    Returns:
        A new, sorted DataFrame with the same columns as ``df``.
    """
    def _safe_parse_iso(value):
        if value is None:
            return None
        if isinstance(value, str):
            # Fast path for the ISO strings produced by datetime.isoformat().
            try:
                return datetime.fromisoformat(value)
            except ValueError:
                pass
        # Lenient fallback for any other representation.
        try:
            return du.parse(value)
        except Exception:
            return None

    if "datetime" not in df.columns:
        return df

    # assign() builds a new frame, so the caller's df never gains `_dt_sort`.
    keyed = df.assign(_dt_sort=df["datetime"].map(_safe_parse_iso))
    return keyed.sort_values("_dt_sort", ascending=False).drop(columns=["_dt_sort"])

def run(
    excel: Path,
    out_parquet: Path,
    only: Optional[str],
    max_pages: Optional[int],
    resume: bool,
    save_html_debug: bool,
    workers: int,
):
    """Scrape news for every company in the spreadsheet and write a Parquet file.

    Companies are processed sequentially. In resume mode the output file is
    rewritten after each company (merged with whatever it already contains),
    so an interrupted run keeps the companies finished so far. Otherwise all
    rows are kept in memory and written once at the end.

    Args:
        excel: Path of the reference spreadsheet.
        out_parquet: Path of the Parquet file to write.
        only: Optional pattern matched (case-insensitive, "contains") against
            the original ticker or the company name.
        max_pages: Per-company page cap; ``None`` falls back to
            ``DEFAULT_MAX_PAGES``.
        resume: Enables the incremental, per-company checkpoint mode.
        save_html_debug: Forwarded to ``scrape_company``.
        workers: Reserved; currently unused (processing is sequential).
    """
    df_ref = read_excel(excel)

    # --only filter: original ticker OR company name (case-insensitive, contains).
    if only:
        mask = (
            df_ref["Ticker Original (EUA)"].astype(str).str.contains(only, case=False, na=False) |
            df_ref["Empresa"].astype(str).str.contains(only, case=False, na=False)
        )
        df_ref = df_ref[mask].copy()
        log.info(f"Filtrando --only '{only}'. {len(df_ref)} linha(s) no Excel após filtro.")

    all_rows: List[Dict] = []

    # Sequential processing: simple and easy to debug.
    for _, row in tqdm(df_ref.iterrows(), total=len(df_ref), desc="Empresas"):
        empresa = str(row["Empresa"]).strip()
        setor = str(row["Setor"]).strip()
        ticker_orig = str(row["Ticker Original (EUA)"]).strip()
        link_news = str(row["Link News"]).strip()

        # An empty Excel cell becomes the string "nan" after str().
        if not link_news or link_news.lower() == "nan":
            log.warning(f"[{empresa}] Link News vazio; pulando.")
            continue

        label = ticker_orig or empresa
        items = scrape_company(
            link_news=link_news,
            ticker_orig=ticker_orig or empresa,
            sector=setor,
            company_label=label,
            polite_sleep=0.7,
            max_pages=max_pages if max_pages is not None else DEFAULT_MAX_PAGES,
            save_html_debug=save_html_debug,
        )
        if not items:
            log.info(f"[{label}] Nenhuma notícia encontrada.")
            continue

        df_company = pd.DataFrame(items).drop_duplicates(subset=["id"]).reset_index(drop=True)

        if resume:
            # Checkpoint after every company, including the very first one when
            # the output file does not exist yet (merge_incremental then returns
            # df_company unchanged). Requiring the file to exist here would park
            # the rows in all_rows, which is never flushed in resume mode.
            df_merged = merge_incremental(df_company, out_parquet)
            df_merged = sort_by_datetime(df_merged)
            df_merged.to_parquet(out_parquet, index=False)
            log.info(f"[{label}] Merge incremental -> {len(df_merged)} linhas em {out_parquet}")
        else:
            all_rows.extend(df_company.to_dict("records"))

    # Final flush when not in resume mode.
    if not resume and all_rows:
        df = pd.DataFrame(all_rows).drop_duplicates(subset=["id"]).reset_index(drop=True)
        df = sort_by_datetime(df)
        df.to_parquet(out_parquet, index=False)
        log.info(f"Salvo {len(df):,} notícias em {out_parquet}")
    elif not all_rows and not out_parquet.exists():
        log.warning("Nenhuma notícia encontrada e arquivo de saída ainda não existe.")

# ---------------- CLI ----------------
def parse_args():
    """Parse the command-line options of the scraper.

    Unknown arguments are ignored rather than rejected, so the script can also
    run inside Jupyter/VS Code kernels that inject their own argv entries.

    Returns:
        The ``argparse.Namespace`` holding the recognised options.
    """
    p = argparse.ArgumentParser(description="Scraper de notícias do Investing.com (PT-BR)")
    p.add_argument("--excel", default=DEFAULT_EXCEL, help="Caminho do Excel de referência")
    p.add_argument("--out", default=DEFAULT_OUT, help="Arquivo Parquet de saída")
    p.add_argument("--only", default=None, help="Filtra por Ticker Original (EUA) ou Empresa (contém, case-insensitive)")
    # %(default)s keeps the help text in sync with DEFAULT_MAX_PAGES (it used to
    # advertise a hard-coded, outdated default).
    p.add_argument("--max-pages", type=int, default=DEFAULT_MAX_PAGES, help="Limite máx. de páginas por ativo (padrão %(default)s)")
    p.add_argument("--resume", action="store_true", help="Mescla incremental com parquet existente (checkpoint por empresa)")
    p.add_argument("--save-html-debug", action="store_true", help="Salva HTML das páginas em ./_html_debug")
    p.add_argument("--workers", type=int, default=1, help="(reservado) Nº de workers em paralelo (mantido sequencial por debug)")
    p.add_argument("--verbose", action="store_true", help="Logs detalhados (DEBUG)")
    p.add_argument("--debug", action="store_true", help="Equivalente a --verbose")

    # Tolerate the extra args injected by Jupyter/VS Code.
    args, _unknown = p.parse_known_args()
    return args

def main():
    """CLI entry point: parse options, configure logging and run the scraper.

    Any exception raised by ``run`` is logged with its traceback instead of
    propagating.
    """
    # No argv sanitising is needed for notebooks: parse_args() uses
    # parse_known_args(), which already ignores ipykernel's extra arguments.
    args = parse_args()
    setup_logging(verbose=args.verbose or args.debug, debug=args.debug)

    excel = Path(args.excel)
    out_parquet = Path(args.out)

    log.info(f"Excel: {excel.resolve()}")
    log.info(f"Saída: {out_parquet.resolve()}")
    if args.only:
        log.info(f"Filtro --only: {args.only}")
    log.info(f"Limite de páginas: {args.max_pages}")
    if args.resume:
        log.info("Modo incremental: ON")
    if args.save_html_debug:
        log.info("Salvar HTML debug: ON")

    try:
        run(
            excel=excel,
            out_parquet=out_parquet,
            only=args.only,
            max_pages=args.max_pages,
            resume=args.resume,
            save_html_debug=args.save_html_debug,
            workers=args.workers,
        )
    except Exception as e:
        log.exception(f"Falha fatal: {e}")

# Run the scraper only when executed as a script (or notebook cell), not on import.
if __name__ == "__main__":
    main()

00:49:44 | INFO    | Excel: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/BaseRefAtivos.xlsx
00:49:44 | INFO    | Saída: /Users/emanuelgandra/Desktop/Projetos /TesteQuant/QuantumSpreadHunters---Quantamental/data/investing_news.parquet
00:49:44 | INFO    | Limite de páginas: 1400


Empresas:   0%|          | 0/43 [00:46<?, ?it/s]


KeyboardInterrupt: 