# 打造装备的解析

https://tlidb.com/cn/Craft

In [46]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin
import re # Import regular expressions module

## 抓取装备列表

In [58]:
# HTTP request headers handed to the scraper below; the User-Agent imitates
# desktop Chrome on Windows.
# NOTE(review): Accept-Language ranks `en` ahead of `zh-CN` — confirm that is
# intended when the target is a /cn/ page.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/avif,image/webp,image/apng,*/*;q=0.8,"
        "application/signed-exchange;v=b3;q=0.7"
    ),
    "Accept-Language": "en,zh-CN;q=0.9,zh;q=0.8,ja;q=0.7",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

In [72]:
def extract_chinese_chars(text):
    """
    Return only the Chinese characters of ``text``, concatenated in order.

    Everything outside the basic CJK ideograph range U+4E00..U+9FFF (Latin
    letters, digits, whitespace, punctuation, ...) is dropped. A text with no
    such characters yields an empty string.
    """
    # Runs of basic CJK ideographs. Widen the character class (for example
    # with \u3000-\u303f for CJK punctuation) if more should be kept.
    cjk_run = re.compile(r'[\u4e00-\u9fff]+')
    pieces = [match.group(0) for match in cjk_run.finditer(text)]
    return "".join(pieces)

def scrape_items_and_extract_chinese_name(url, headers=None):
    """
    Scrape item links from every ``<div class="card-body">`` on a page.

    Inside each card body, every ``<a class="p-2" href=...>`` whose text is
    non-empty becomes one entry with these keys:

    * ``name_cn``: the Chinese characters of the link text, or the full link
      text when it contains none.
    * ``name_original``: the whitespace-stripped link text.
    * ``link``: the ``href`` resolved against ``url``.

    Progress and failure messages are printed to stdout.

    Args:
        url (str): URL of the page to fetch; also the base for relative links.
        headers (dict | None): HTTP headers forwarded to ``requests.get``.
            Defaults to None, in which case requests uses its own defaults.

    Returns:
        str | None: A JSON array of the entries (4-space indent, non-ASCII
        characters kept verbatim), or None when the request fails, the page
        has no card-body div, or no entry could be extracted.
    """
    try:
        print(f"Fetching URL: {url}")
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()
        print("Successfully fetched page content.")
    except requests.exceptions.RequestException as e:
        print(f"请求页面时发生错误: {e}")
        return None

    # Parse the raw bytes instead of response.text: when the HTTP header names
    # no charset, requests decodes text/* bodies as ISO-8859-1, which would
    # garble Chinese text. Given bytes, BeautifulSoup detects the encoding
    # itself (including a <meta charset> declaration in the document).
    soup = BeautifulSoup(response.content, 'html.parser')

    all_card_bodies = soup.find_all('div', class_='card-body')
    if not all_card_bodies:
        print("未能找到任何 class='card-body' 的 div 元素。")
        return None

    print(f"Found {len(all_card_bodies)} div elements with class 'card-body'. Processing all of them.")

    scraped_data = []
    for card_number, card_body in enumerate(all_card_bodies, start=1):
        print(f"Processing card-body #{card_number}...")
        item_links_in_body = card_body.find_all('a', class_='p-2', href=True)

        if not item_links_in_body:
            print(f"  Card-body #{card_number} contains no <a> tags with class 'p-2' and href.")
            continue

        for a_tag in item_links_in_body:
            original_name = a_tag.get_text(strip=True)
            if not original_name:
                # A link without visible text has no name to record.
                continue

            chinese_name = extract_chinese_chars(original_name)
            scraped_data.append({
                # Fall back to the full text when it has no Chinese characters.
                "name_cn": chinese_name if chinese_name else original_name,
                "name_original": original_name,
                "link": urljoin(url, a_tag['href']),
            })

        print(f"  Potentially extracted items from card-body #{card_number}. Total items so far: {len(scraped_data)}")

    if not scraped_data:
        print("未能从任何 'card-body' 中提取到有效的装备信息。")
        return None

    return json.dumps(scraped_data, ensure_ascii=False, indent=4)

In [74]:
# Fetch the crafting index page and keep the scraper's JSON result in
# `json_data` for the cells that follow.
target_page_url = "https://tlidb.com/cn/Craft"
json_data = scrape_items_and_extract_chinese_name(target_page_url, headers)

# The scraper returns None whenever it fails or finds nothing, so that case
# is reported first.
if not json_data:
    print("\n抓取失败或未找到数据。请检查上面的日志输出。")
else:
    print("\n成功从所有 'card-body' 区域抓取到数据：")

Fetching URL: https://tlidb.com/cn/Craft
Successfully fetched page content.
<!DOCTYPE html>
<html lang="cn" data-bs-theme="dark">
  <head>
    <meta charset="utf-8">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <title>火炬编年史, Torchlight: Infinite Wiki</title>
    <meta name="color-scheme" content="dark">
    <link href="/favicon.ico" rel="shortcut icon" >

    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bootstrap/5.3.1/css/bootstrap.min.css" integrity="sha512-Z/def5z5u2aR89OuzYcxmDJ0Bnd5V1cKqBEbvLOiUNWdg9PQeXVvXLI90SE4QOHGlfLqUnDNVAYyZi8UwUTmWQ==" crossorigin="anonymous" referrerpolicy="no-referrer" />
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/jqueryui/1.13.2/themes/base/jquery-ui.min.css" integrity="sha512-ELV+xyi8IhEApPS/pSj66+Jiw+sOT1Mqkzlh8ExXihe4zfqbWkxPRi8wptXIO9g73FSlhmquFlUOuMSoXz5IRw==" crossorigin="anonymous" referrer

In [70]:
# Echo the scraped JSON string as this cell's output (shown as a Python string repr).
json_data

'[\n    {\n        "name_cn": "STR Helmet",\n        "name_original": "STR Helmet",\n        "link": "https://tlidb.com/cn/STR_Helmet"\n    },\n    {\n        "name_cn": "DEX Helmet",\n        "name_original": "DEX Helmet",\n        "link": "https://tlidb.com/cn/DEX_Helmet"\n    },\n    {\n        "name_cn": "INT Helmet",\n        "name_original": "INT Helmet",\n        "link": "https://tlidb.com/cn/INT_Helmet"\n    },\n    {\n        "name_cn": "STR Chest Armor",\n        "name_original": "STR Chest Armor",\n        "link": "https://tlidb.com/cn/STR_Chest_Armor"\n    },\n    {\n        "name_cn": "DEX Chest Armor",\n        "name_original": "DEX Chest Armor",\n        "link": "https://tlidb.com/cn/DEX_Chest_Armor"\n    },\n    {\n        "name_cn": "INT Chest Armor",\n        "name_original": "INT Chest Armor",\n        "link": "https://tlidb.com/cn/INT_Chest_Armor"\n    },\n    {\n        "name_cn": "STR Gloves",\n        "name_original": "STR Gloves",\n        "link": "https://tlidb

## 抓取具体词条

基础词条、前后缀、解梦词条