In [1]:
import os
from collections import deque
from typing import Set, Tuple, Dict
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup

In [2]:
def fetch_page_content_with_retries(url: str, retries: int = 3) -> str:
    """Download ``url`` with an HTTP GET and return the response body as text.

    Request-level failures are retried; any other failure aborts immediately.
    The function never returns ``None``: every failure path raises
    ``RuntimeError`` so callers that catch it see all failures.

    Args:
        url: Address to fetch. Each attempt uses a 10-second timeout.
        retries: Maximum number of attempts.

    Returns:
        The response body (``response.text``) of the first successful attempt.

    Raises:
        RuntimeError: If every attempt fails with a
            ``requests.RequestException`` (this includes HTTP error statuses
            reported by ``raise_for_status``), if an unexpected exception
            interrupts an attempt, or if ``retries`` is smaller than 1 so that
            no attempt is made at all.
    """
    last_error = None

    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            print(f"Tentativa {attempt} falhou para {url}: {e}")
            # Keep a reference: the name bound by ``except ... as`` is
            # cleared when the handler exits.
            last_error = e
        except Exception as e:
            # Unexpected errors are not retried, but they must still surface
            # as an exception instead of an implicit ``None`` return.
            print(f"Erro inesperado ao acessar {url}: {e}")
            raise RuntimeError(f"Erro inesperado ao acessar {url}: {e}") from e

    # Reached when all attempts failed, or when ``retries`` < 1.
    raise RuntimeError(f"Falha ao acessar {url} após {retries} tentativas.") from last_error

def extract_links(html_content: str, base_url: str) -> Set[str]:
    """Collect the in-site links found in the ``<a href>`` tags of a page.

    Three href forms are recognised:

    * root-relative (``/path``): prefixed with ``base_url``;
    * protocol-relative (``//host/path``): given the scheme of ``base_url``
      and kept only if the result lies under ``base_url``;
    * absolute URLs that lie under ``base_url``.

    "Under ``base_url``" means the URL equals ``base_url`` (ignoring a trailing
    slash) or continues it with ``/`` or ``?``. A plain prefix match is not
    enough, because ``https://site.net.evil.com`` starts with
    ``https://site.net``.

    Fragments (``#section``) are removed so one page is not returned under
    several URLs. Other hrefs (page-relative paths, ``mailto:``, external
    sites, ...) are ignored; page-relative paths cannot be resolved here
    because only ``base_url`` is known, not the URL of the page itself.

    Args:
        html_content: HTML markup of the page.
        base_url: Root URL of the site being crawled.

    Returns:
        The set of absolute, fragment-free URLs under ``base_url``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    anchor_tags = soup.find_all('a', href=True)

    site_root = base_url.rstrip('/')
    scheme = urlsplit(base_url).scheme
    in_site_prefixes = (site_root + '/', site_root + '?')

    urls = set()
    for tag in anchor_tags:
        href = tag['href']
        if href.startswith('//'):
            # Protocol-relative: names its own host, so it must NOT be
            # appended to base_url like a root-relative path.
            candidate = f"{scheme}:{href}" if scheme else href
        elif href.startswith('/'):
            candidate = site_root + href
        else:
            candidate = href

        # The fragment is never sent to the server; drop it to avoid duplicates.
        candidate = candidate.split('#', 1)[0]

        if candidate == site_root or candidate.startswith(in_site_prefixes):
            urls.add(candidate)
    return urls

def crawl_links_bfs(base_url: str) -> Set[str]:
    """Breadth-first crawl of a site, starting at ``base_url``.

    Each page is fetched with ``fetch_page_content_with_retries`` and its
    in-site links are obtained from ``extract_links``. Pages that cannot be
    fetched are reported on stdout and skipped; the crawl continues.

    Args:
        base_url: Start page, also passed to ``extract_links`` as the site root.

    Returns:
        Every link discovered during the crawl. ``base_url`` itself is only
        included if some page links back to it.
    """
    visited = set()
    found_links = set()
    to_visit = deque([base_url])

    while to_visit:
        current_url = to_visit.popleft()
        if current_url in visited:
            continue
        visited.add(current_url)

        try:
            html_content = fetch_page_content_with_retries(current_url)
            if html_content is None:
                # Defensive: a fetch that produced no content has nothing to
                # parse; skip it instead of crashing the whole crawl.
                print(f"Erro: nenhum conteúdo retornado para {current_url}")
                continue
            new_links = extract_links(html_content, base_url)
        except RuntimeError as e:
            print(f"Erro: {e}")
            continue

        for link in new_links:
            if link in found_links:
                # Already discovered, hence already queued or visited.
                # Re-queuing it for every page that references it would only
                # bloat the queue with entries that get skipped on pop.
                continue
            print(f"New link found: {link}")
            found_links.add(link)
            if link not in visited:
                to_visit.append(link)

    return found_links

def fetch_and_save_html(urls: Set[str]) -> Tuple[Dict[str, str], Set[str]]:
    """Download the HTML of every URL and keep it in memory.

    Nothing is written to disk here; "save" refers to storing the content in
    the returned dictionary.

    Args:
        urls: URLs to download.

    Returns:
        A pair ``(url_content_map, failed_urls)``: the first maps each
        successfully downloaded URL to its HTML text, the second holds the
        URLs that could not be downloaded.
    """
    url_content_map = {}
    failed_urls = set()

    for url in urls:
        try:
            html_content = fetch_page_content_with_retries(url)
        except RuntimeError as e:
            print(f"Falha ao acessar {url}: {e}")
            failed_urls.add(url)
            continue

        if html_content is None:
            # A fetch that yielded no content is a failure, not a saved page;
            # storing None would put a null entry in the map (and later JSON).
            print(f"Falha ao acessar {url}: nenhum conteúdo retornado")
            failed_urls.add(url)
            continue

        url_content_map[url] = html_content
        print(f"HTML salvo para: {url}")

    return url_content_map, failed_urls

import json
from typing import Optional

def save_dict_to_json(data: dict, file_path: str) -> None:
    """Write ``data`` to ``file_path`` as UTF-8, indented JSON.

    Missing parent directories are created first, so saving into a folder that
    does not exist yet (e.g. ``data/`` on a fresh checkout) no longer fails.
    Non-ASCII characters are written as-is (``ensure_ascii=False``).

    File-system errors are reported on stdout rather than raised.

    Args:
        data: JSON-serialisable dictionary to store.
        file_path: Destination file; overwritten if it already exists.
    """
    try:
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=4)
        print(f"Conteúdo salvo com sucesso em: {file_path}")
    except OSError as e:  # IOError is an alias of OSError in Python 3
        print(f"Erro ao salvar o arquivo {file_path}: {e}")

def load_json_to_dict(file_path: str) -> Optional[dict]:
    """Read a UTF-8 JSON file and return its parsed content.

    Problems are reported on stdout instead of being raised: a missing file,
    malformed JSON, or any other OS-level read error all result in ``None``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or ``None`` if the file could not be read
        or parsed.
    """
    loaded = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
    except FileNotFoundError:
        print(f"Arquivo não encontrado: {file_path}")
    except json.JSONDecodeError as decode_error:
        print(f"Erro ao decodificar o JSON no arquivo {file_path}: {decode_error}")
    except OSError as read_error:
        print(f"Erro ao ler o arquivo {file_path}: {read_error}")
    return loaded

In [3]:
# Seed URL for the crawl; it is also passed to the link extractor as the site root.
initial_url = "https://www.civilopedia.net"
print(f"Iniciando a busca a partir de: {initial_url}\n")

# Breadth-first crawl of the site. `all_links` is consumed by the download cell below.
all_links = crawl_links_bfs(initial_url)
print(f"\nTotal de links encontrados: {len(all_links)}")


Iniciando a busca a partir de: https://www.civilopedia.net

New link found: https://www.civilopedia.net/gathering-storm/resources
New link found: https://www.civilopedia.net/gathering-storm/technologies
New link found: https://www.civilopedia.net/gathering-storm/moments
New link found: https://www.civilopedia.net/gathering-storm/unitpromotions
New link found: https://www.civilopedia.net/gathering-storm/citystates
New link found: https://www.civilopedia.net/gathering-storm/improvements
New link found: https://www.civilopedia.net/gathering-storm/buildings
New link found: https://www.civilopedia.net/gathering-storm/civilizations
New link found: https://www.civilopedia.net/gathering-storm/civics
New link found: https://www.civilopedia.net/privacy_policy
New link found: https://www.civilopedia.net/gathering-storm/wonders
New link found: https://www.civilopedia.net/gathering-storm/governors
New link found: https://www.civilopedia.net/copyright
New link found: https://www.civilopedia.net/gath

In [5]:
# Download the HTML of every link discovered by the crawl cell (`all_links`).
print(f"\nIniciando download do html de {len(all_links)} links")
# html_contents: URL -> HTML text; failed_links: URLs that could not be downloaded.
html_contents, failed_links = fetch_and_save_html(all_links)

print(f"\nTotal de links baixados: {len(html_contents)}")
print(f"\nTotal de links não puderam ser baixados: {len(failed_links)}")


Iniciando download do html de 1131 links
HTML salvo para: https://www.civilopedia.net/gathering-storm/technologies
HTML salvo para: https://www.civilopedia.net/gathering-storm/improvements/improvement_mountain_road
HTML salvo para: https://www.civilopedia.net/gathering-storm/wonders/building_hanging_gardens
HTML salvo para: https://www.civilopedia.net/gathering-storm/units/unit_at_crew
HTML salvo para: https://www.civilopedia.net/gathering-storm/wonders/project_send_aid
HTML salvo para: https://www.civilopedia.net/gathering-storm/wonders/project_terrestrial_laser
HTML salvo para: https://www.civilopedia.net/gathering-storm/technologies/tech_advanced_power_cells
HTML salvo para: https://www.civilopedia.net/gathering-storm/wonders/project_decommission_nuclear_power_plant
HTML salvo para: https://www.civilopedia.net/gathering-storm/governments/policy_trade_confederation
HTML salvo para: https://www.civilopedia.net/gathering-storm/units/unit_heavy_chariot
HTML salvo para: https://www.civi

In [7]:
# Persist the downloaded pages (URL -> HTML). `failed_links` is not written to disk.
save_dict_to_json(data=html_contents, file_path="data/civilopedia_all_links.json")

Conteúdo salvo com sucesso em: data/civilopedia_all_links.json
