### Util: fetch web pages

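This utility fetches the URLs listed in per-item JSON metadata files and saves each response to the raw-data directory. It uses the `requests` library for HTTP requests, `BeautifulSoup` to extract the visible text of HTML pages, and `PyPDF2` to extract text from PDF responses; JSON and other content types are saved as received. Non-200 status codes and request errors are logged and reported as failures instead of stopping the run.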

In [None]:
import json
import logging
import os
import re
from pathlib import Path
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Optional proxy configuration (disabled)
# proxies = {
#     "http": "http://192.168.22.135:1080",
#     "https": "http://192.168.22.135:1080",
# }

# Using the current working directory instead of __file__
notebook_dir = Path(os.getcwd())
folder_dir = notebook_dir.parent / "data" / "raw" / "france"

# Downloads are written to the same directory the item metadata is read from.
# Reusing the absolute ``folder_dir`` (rather than a second, relative
# "../data/raw/france") keeps both locations identical even if the working
# directory changes after this cell has run.
save_folder = folder_dir
save_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# Browser-like request headers sent with every download request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
}

# Extract the text of a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Pages for which ``extract_text()`` yields nothing contribute an empty
    string. Leading and trailing whitespace of the combined text is removed.

    Any error while opening or reading the PDF is logged as a warning and
    an empty string is returned instead.
    """
    try:
        pages = PdfReader(str(pdf_path)).pages
        combined = "".join(page.extract_text() or "" for page in pages)
    except Exception as e:
        logger.warning(f"提取PDF文本失败: {e}")
        return ""
    return combined.strip()

# Parse the file name out of a Content-Disposition header
def get_filename_from_content_disposition(cd_header):
    """Return the file name announced by a ``Content-Disposition`` header.

    The extended ``filename*=charset'lang'percent-encoded`` parameter is
    preferred when it is present and decodable; otherwise the plain
    ``filename=`` parameter (quoted or unquoted) is used. Parameter names
    are matched case-insensitively.

    Args:
        cd_header: The raw header value, or ``None``/empty when absent.

    Returns:
        The file name as a string, or ``None`` when the header is missing
        or carries no usable file name.
    """
    if not cd_header:
        return None

    # Extended form: filename*=UTF-8''na%C3%AFve.pdf
    extended = re.search(
        r"filename\*\s*=\s*([^';\s]*)'[^';]*'([^;\s]+)",
        cd_header,
        flags=re.IGNORECASE,
    )
    if extended:
        charset = extended.group(1) or "utf-8"
        try:
            decoded = unquote(extended.group(2), encoding=charset, errors="strict")
        except (LookupError, UnicodeDecodeError):
            # Unknown charset or invalid bytes: fall back to plain filename=
            decoded = ""
        if decoded:
            return decoded

    # Plain form. A quoted value may contain ';' or "'"; an unquoted value
    # ends at the next ';' or quote character.
    plain = re.search(
        r'filename\s*=\s*(?:"([^"]*)"|([^\'";]+))',
        cd_header,
        flags=re.IGNORECASE,
    )
    if plain:
        quoted, bare = plain.groups()
        name = (quoted if quoted is not None else bare).strip()
        return name or None
    return None

# Core routine: download one item and persist it according to its content type
def fetch_and_save(item):
    """Download ``item['链接']`` and store it in ``save_folder``, named by ``item['id']``.

    What gets written depends on the response ``Content-Type``:

    * PDF  -> ``<id>.pdf`` plus ``<id>.txt`` when text could be extracted
    * HTML -> ``<id>.html`` plus ``<id>.txt`` (page text without script/style)
    * JSON -> ``<id>.response.json``
    * anything else -> ``<id>.bin``

    Args:
        item: Mapping with an ``id`` entry and a ``链接`` (URL) entry.

    Returns:
        A short status string: ``跳过 ...`` when the item cannot be used,
        ``失败: ...`` for a non-200 response or any error during the
        download/save, otherwise a ``保存...`` message naming what was stored.
    """
    if not hasattr(item, 'get'):
        # Not a mapping (e.g. a JSON list): skip instead of raising in the worker.
        return "跳过 id=None"

    id_ = item.get('id')
    url = item.get('链接')
    # Test for None/"" explicitly so that a legitimate id of 0 is not skipped.
    if id_ is None or id_ == "" or not url:
        return f"跳过 id={id_}"

    try:
        response = requests.get(url, headers=headers, timeout=30)
        if response.status_code != 200:
            logger.warning(f"获取 {url} 失败，状态码: {response.status_code}")
            return f"失败: id={id_}"

        # Dispatch on the declared content type
        content_type = response.headers.get('Content-Type', '').lower()
        filename_base = str(id_)

        if 'application/pdf' in content_type:
            pdf_path = save_folder / f"{filename_base}.pdf"
            pdf_path.write_bytes(response.content)
            logger.info(f"PDF文件已保存至: {pdf_path}")

            text = extract_text_from_pdf(pdf_path)
            if text:
                text_path = save_folder / f"{filename_base}.txt"
                text_path.write_text(text, encoding='utf-8')
                logger.info(f"PDF提取的文本已保存至: {text_path}")

            return f"保存PDF并提取文本: {id_}"

        if 'text/html' in content_type:
            html_path = save_folder / f"{filename_base}.html"
            html_path.write_bytes(response.content)
            logger.info(f"HTML文件已保存至: {html_path}")

            # Drop script/style elements so only page text remains
            soup = BeautifulSoup(response.content, 'html.parser')
            for tag in soup(["script", "style"]):
                tag.extract()
            text = soup.get_text(separator='\n', strip=True)

            text_path = save_folder / f"{filename_base}.txt"
            text_path.write_text(text, encoding='utf-8')
            logger.info(f"HTML提取的文本已保存至: {text_path}")

            return f"保存HTML并提取文本: {id_}"

        if 'application/json' in content_type:
            # Item metadata lives in this same directory as "<id>.json";
            # writing the response under that name would overwrite the
            # metadata (including its URL), so use a distinct suffix.
            json_path = save_folder / f"{filename_base}.response.json"
            json_path.write_text(response.text, encoding='utf-8')
            logger.info(f"JSON已保存至: {json_path}")

            return f"保存JSON: {id_}"

        # Any other content type is stored unchanged
        raw_path = save_folder / f"{filename_base}.bin"
        raw_path.write_bytes(response.content)
        logger.info(f"未知内容已保存至: {raw_path}")

        return f"保存原始文件: {id_}"

    except Exception as e:
        # Worker boundary: one failed item must not stop the rest of the batch.
        logger.warning(f"爬取失败 id={id_}, url={url}, 错误: {e}")
        return f"失败: id={id_}"

In [None]:
# Example usage: fetch every item that has no extracted .txt file yet
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

max_workers = 30

# Items that already have a .txt next to their metadata count as done
txt_files = list(folder_dir.glob("*.txt"))
txt_file_names = {txt_file.stem for txt_file in txt_files}

# Load the metadata of every item that still has to be fetched
json_files = [json_file for json_file in folder_dir.glob("*.json") if json_file.stem not in txt_file_names]
json_contents = [json.loads(json_file.read_text(encoding="utf-8")) for json_file in json_files]

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    future_to_item = {executor.submit(fetch_and_save, item): item for item in json_contents}

    for future in tqdm(as_completed(future_to_item), total=len(future_to_item), desc="下载进度"):
        try:
            result = future.result()
        except Exception as e:
            # future.result() re-raises whatever escaped the worker; report it
            # as a failed item so the remaining downloads are still collected.
            item = future_to_item[future]
            item_id = item.get('id') if isinstance(item, dict) else None
            logger.warning(f"任务异常 id={item_id}, 错误: {e}")
            result = f"失败: id={item_id}"
        print(result)
