In [None]:
#!/usr/bin/env python3
"""
BRACIS 2024 crawler (SOL · OJS) — writes a CSV of NLP-related articles.
Extracts every field from the article pages' citation meta tags and shows
progress with tqdm.notebook (falling back to plain tqdm).
Requirements:
    pip install requests beautifulsoup4 tqdm pandas
"""

import csv
import re
import time
import sys
import requests
import pandas as pd

from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List, Dict

# tqdm notebook ou fallback CLI
try:
    from tqdm.notebook import tqdm
except ImportError:
    from tqdm import tqdm

BASE  = "https://sol.sbc.org.br"
INDEX = f"{BASE}/index.php/bracis"

# Optional NLP title filter (drop its use in main() to keep every article).
# Each alternative is matched as a case-insensitive word *prefix*: the leading
# \b anchors it to the start of a word, and there is deliberately no trailing
# \b so that stems such as "summariz", "resum", "tradu[cç]", "opini" and
# "classifica" match their inflected forms ("summarization", "tradução", ...)
# and plurals such as "LLMs" / "Transformers" are caught as well.
KW = re.compile(
    r"\b(nlp|linguagem|language|text|texto|corpus|bert|transformer|llm|"
    r"summariz|resum|translation|tradu[cç]|sentiment|opini|"
    r"classifica|parsing|tagging|entity|question|answer|dialog|speech)",
    flags=re.I,
)

def get_index_links() -> List[Dict[str,str]]:
    """
    Collect the article links present on the index page.

    Every anchor whose href contains ``/article/view/<id>`` is truncated right
    after the numeric id (dropping any trailing suffix), resolved against
    ``BASE`` and de-duplicated, keeping the order of first appearance.

    Returns:
        A list of dicts of the form ``{"url": <article base URL>}``.

    Raises:
        requests.HTTPError: if the index page answers with a 4xx/5xx status.
        requests.RequestException: on any other request failure (20 s timeout).
    """
    resp = requests.get(INDEX, timeout=20)
    # Fail loudly instead of parsing an error page and returning zero links.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    links: List[Dict[str,str]] = []
    seen = set()
    for anchor in soup.find_all("a", href=True):
        # Keep only the ".../article/view/<id>" prefix of the href.
        m = re.match(r"(.*?/article/view/\d+)", anchor["href"])
        if not m:
            continue
        url = urljoin(BASE, m.group(1))
        if url in seen:
            continue
        seen.add(url)
        links.append({"url": url})
    return links

def _meta_content(soup, name: str, default: str = "") -> str:
    """Return the stripped ``content`` of the first ``<meta name=...>`` tag, or *default*."""
    tag = soup.find("meta", {"name": name})
    if tag and tag.get("content"):
        return tag["content"].strip()
    return default


def parse_article(url: str) -> Dict[str,str]:
    """
    Fetch an article page and extract its metadata from the citation meta tags.

    Args:
        url: Address of the article page.

    Returns:
        A dict with the keys ``title``, ``authors``, ``pages``, ``doi``,
        ``article_url`` and ``pdf_url``. Missing values fall back to ``"N/A"``
        (title, authors), ``""`` (pages, doi) or *url* itself (pdf_url).

    Raises:
        requests.HTTPError: if the page answers with a 4xx/5xx status.
        requests.RequestException: on any other request failure (20 s timeout).
    """
    resp = requests.get(url, timeout=20)
    # Surface HTTP errors to the caller rather than returning an all-"N/A" row.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # One citation_author meta tag per author; join them into a single field.
    author_names = [
        tag["content"].strip()
        for tag in soup.find_all("meta", {"name": "citation_author"})
        if tag.get("content")
    ]

    # The page range is only reported when both ends are available.
    first = _meta_content(soup, "citation_firstpage")
    last = _meta_content(soup, "citation_lastpage")

    return {
        "title": _meta_content(soup, "citation_title", "N/A"),
        "authors": ", ".join(author_names) if author_names else "N/A",
        "pages": f"{first}-{last}" if first and last else "",
        "doi": _meta_content(soup, "citation_doi"),
        "article_url": url,
        "pdf_url": _meta_content(soup, "citation_pdf_url", url),
    }

def main() -> None:
    """
    Crawl the index, scrape each article, keep the ones whose title matches
    ``KW`` and write them to ``lista_artigos_bracis_crawler.csv``.

    A failure on a single article is reported on stderr and does not stop the
    run. The pause between requests is applied after every article, whether it
    was kept, filtered out or failed.
    """
    print("[INFO] Coletando links do BRACIS 2024 …")
    entries = get_index_links()
    print(f"[INFO] {len(entries)} artigos encontrados.")

    rows: List[Dict[str,str]] = []
    for item in tqdm(entries, desc="Raspando artigos"):
        try:
            art = parse_article(item["url"])
        except Exception as e:
            # Best-effort crawl: log the failure and move on to the next article.
            print(f"[WARN] Falha em {item['url']}: {e}", file=sys.stderr)
        else:
            # Optional NLP filter on the title.
            if KW.search(art["title"]):
                rows.append(art)
        # Throttle every request; no `continue` above, so this is never skipped.
        time.sleep(0.4)

    # Write the CSV.
    outfile = "lista_artigos_bracis_crawler.csv"
    cols = ["title", "authors", "pages", "doi", "article_url", "pdf_url"]
    with open(outfile, "w", newline="", encoding="utf-8") as fp:
        writer = csv.DictWriter(fp, fieldnames=cols)
        writer.writeheader()
        writer.writerows(rows)

    print(f"[INFO] CSV '{outfile}' criado com {len(rows)} artigos.")

if __name__ == "__main__":
    main()


[INFO] Coletando links do BRACIS 2024 …
[INFO] 116 artigos encontrados.


Raspando artigos:   0%|          | 0/116 [00:00<?, ?it/s]

[INFO] CSV 'bracis2024_nlp.csv' criado com 22 artigos.
