In [None]:
# %% [markdown]
# ## Web Scraping EMSERH - Versão Final Corrigida
#
# 1. Executar todas as células em ordem
# 2. Configurar parâmetros no bloco CONFIG

# %% [code]
%pip install requests beautifulsoup4 pandas tqdm --quiet

# %% [code]
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
from google.colab import files
import warnings
warnings.filterwarnings('ignore')

# Tuned scraper settings, read by every function in this notebook.
CONFIG = {
    "base_url": "https://www.emserh.ma.gov.br",
    # Number of listing pages to walk; raise to 579 once validated.
    "max_pages": 576,
    # Delay between requests, in seconds.
    "request_delay": 0.5,
    "timeout": 30,
    # Number of parallel threads.
    "max_workers": 3,
    # Write a backup every N scraped articles.
    "backup_interval": 50,
    # Keyword arguments forwarded to json.dump; ensure_ascii=False keeps
    # special (non-ASCII) characters as-is in the output.
    "json_params": {"indent": 2, "ensure_ascii": False},
}

# %% [code]
def get_news_links():
    """Collect article URLs from the paginated news listing.

    Walks ``{base_url}/noticias/page/N/`` for N in 1..CONFIG['max_pages'],
    takes the first ``<a href>`` of every ``div.row.mt-3`` found inside the
    page's first ``div.card-body``, and resolves it against the site root.

    A failing page is reported and skipped. The delay between requests is
    applied after every page, including failed ones, so repeated errors do
    not turn into an unthrottled burst of requests.

    Returns:
        The collected URLs, de-duplicated, in first-seen order. If an
        unexpected error aborts the walk, the links gathered so far are
        returned instead of being thrown away.
    """
    from urllib.parse import urljoin

    site_root = f"{CONFIG['base_url']}/"
    links = []
    print(f'🔍 Coletando links de {CONFIG["max_pages"]} páginas...')

    try:
        # One session for the whole walk so the HTTP connection is reused.
        with requests.Session() as session:
            for page in tqdm(range(1, CONFIG['max_pages'] + 1), desc='Páginas'):
                try:
                    url = f"{CONFIG['base_url']}/noticias/page/{page}/"
                    response = session.get(url, timeout=CONFIG['timeout'])
                    response.raise_for_status()

                    soup = BeautifulSoup(response.text, 'html.parser')
                    card_body = soup.find('div', class_='card-body')

                    if card_body:
                        # NOTE: a multi-word class_ string is matched against
                        # the exact class attribute value ("row mt-3").
                        for article in card_body.find_all('div', class_='row mt-3'):
                            if (a := article.find('a', href=True)):
                                # urljoin leaves absolute URLs untouched and
                                # resolves relative ones against the site root.
                                links.append(urljoin(site_root, a['href']))

                except Exception as e:
                    tqdm.write(f'🚨 Erro na página {page}: {str(e)}')

                finally:
                    # Throttle on success and on failure alike.
                    time.sleep(CONFIG['request_delay'])

    except Exception as e:
        print(f'❌ Falha crítica na coleta de links: {str(e)}')

    # dict.fromkeys drops duplicates while preserving insertion order.
    return list(dict.fromkeys(links))

# %% [code]
def _markup_without_images(tag):
    """Return the HTML markup of *tag* with every ``<img>`` removed."""
    # Re-parse the tag's markup so the original tree is left untouched.
    clone = BeautifulSoup(str(tag), 'html.parser')
    for img in clone.find_all('img'):
        img.decompose()
    return clone.decode_contents()


def scrape_article(link):
    """Download one article page and extract its title, body and date.

    Args:
        link: URL of the article page.

    Returns:
        A dict with the keys:

        * ``'title'``: markup of the first ``<h1>`` inside the first
          ``div.card-header`` with images stripped, or ``None`` if absent.
        * ``'text'``: markup of every ``<p>`` inside the first
          ``div.card-body`` that is not nested in a ``div.wp-block-image``,
          images stripped, joined with single spaces (``''`` if none).
        * ``'pub_date'``: stripped text of the first ``span.data-post``,
          or ``None`` if absent.

        ``None`` is returned when the request or the parsing fails; the
        error is reported through ``tqdm.write``.
    """
    try:
        response = requests.get(link, timeout=CONFIG['timeout'])
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Title
        titulo = None
        if (card_header := soup.find('div', class_='card-header')):
            if (h1 := card_header.find('h1')):
                titulo = _markup_without_images(h1)

        # Body paragraphs, skipping those that belong to image blocks
        conteudo = []
        if (card_body := soup.find('div', class_='card-body')):
            conteudo = [
                _markup_without_images(p)
                for p in card_body.find_all('p')
                if not p.find_parent('div', class_='wp-block-image')
            ]

        # Publication date
        data_publicacao = None
        if (data_tag := soup.find('span', class_='data-post')):
            data_publicacao = data_tag.get_text(strip=True)

        return {
            'title': titulo,
            'text': ' '.join(conteudo),
            'pub_date': data_publicacao
        }

    except Exception as e:
        tqdm.write(f'⚠️ Erro no artigo {link}: {str(e)}')
        return None

# %% [code]
def main():
    """Run the whole scrape: collect links, fetch articles, save JSON.

    Phase 1 collects the article links with ``get_news_links``. Phase 2
    fetches them on a thread pool of CONFIG['max_workers'] workers; each
    worker waits CONFIG['request_delay'] seconds before every request, so
    the delay actually limits the request rate. Results are gathered in
    link order, and a ``backup_<HHMMSS>.json`` snapshot of everything
    scraped so far is written every CONFIG['backup_interval'] articles.
    Phase 3 writes ``noticias_emserh_<timestamp>.json`` and triggers the
    Colab download.

    On ``KeyboardInterrupt`` the tasks that have not started yet are
    cancelled, so only the requests already in flight are awaited before
    the partial data is saved.
    """
    print('🚀 Iniciando processo de scraping...')

    # Phase 1: link collection
    links = get_news_links()
    if not links:
        print('Nenhum link encontrado.')
        return

    print(f' {len(links)} links coletados com sucesso')

    def _throttled_scrape(link):
        # Sleeping inside the worker paces the requests themselves; sleeping
        # in the main thread after submission would not slow the pool down.
        time.sleep(CONFIG['request_delay'])
        return scrape_article(link)

    # Phase 2: parallel scraping
    dados = []

    with ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
        futures = [executor.submit(_throttled_scrape, link) for link in links]

        try:
            with tqdm(total=len(links), desc='📦 Processando artigos') as pbar:
                # Iterate in submission order so the output keeps link order.
                for future in futures:
                    result = future.result()
                    pbar.update(1)
                    if not result:
                        continue
                    dados.append(result)

                    # Periodic backup
                    if len(dados) % CONFIG['backup_interval'] == 0:
                        timestamp = pd.Timestamp.now().strftime("%H%M%S")
                        backup_file = f"backup_{timestamp}.json"
                        with open(backup_file, 'w', encoding='utf-8') as f:
                            json.dump(dados, f, **CONFIG['json_params'])
                        tqdm.write(f' Backup salvo: {backup_file}')

        except KeyboardInterrupt:
            print('\n⚠️ Interrupção do usuário! Salvando dados parciais...')
            # Leaving the executor's with-block waits for every queued task,
            # so drop the ones that have not started yet.
            for future in futures:
                future.cancel()

    # Phase 3: final save
    if dados:
        timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
        filename = f"noticias_emserh_{timestamp}.json"

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(
                dados,
                f,
                **CONFIG['json_params']
            )

        files.download(filename)
        print(f'\n Dados salvos em {filename}')
    else:
        print(' Nenhum dado foi coletado')

# %% [code]
# Entry point: start the scrape when this file is the running module.
if __name__ == "__main__":
    main()