In [6]:
import os
import json
import time
import random
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

In [7]:
# Entry-point URL of the site to mirror
base_url = "https://tlidb.com/cn/"

# Root directory for everything that gets saved, plus a "static"
# sub-folder that holds downloaded assets
output_dir = "../data/site"
static_dir = os.path.join(output_dir, "static")

# Make sure both directories exist before anything is written to them
os.makedirs(output_dir, exist_ok=True)
os.makedirs(static_dir, exist_ok=True)

In [14]:
# URLs that have already been visited, so that no page is crawled twice
visited_urls = set()

# Path of the JSON file that stores the site's sitemap
sitemap_file = os.path.join(output_dir, "sitemap.json")

def load_sitemap():
    """Read the sitemap stored at ``sitemap_file``.

    Returns:
        The parsed JSON content of ``sitemap_file``, or an empty dict when
        that file does not exist yet.
    """
    # Nothing saved so far: start from an empty mapping
    if not os.path.exists(sitemap_file):
        return {}

    with open(sitemap_file, 'r', encoding='utf-8') as handle:
        stored = json.load(handle)
    return stored

def save_sitemap(sitemap):
    """Write ``sitemap`` to ``sitemap_file`` as pretty-printed UTF-8 JSON.

    The data is first written to a temporary sibling file and then moved
    over ``sitemap_file`` with ``os.replace``. Writing the target in place
    would leave a truncated, unparsable file behind if the kernel is
    interrupted mid-write, and the next ``load_sitemap`` call would fail.

    Args:
        sitemap: JSON-serialisable mapping to persist.
    """
    tmp_file = sitemap_file + '.tmp'
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(sitemap, f, ensure_ascii=False, indent=4)
    # Swap the finished file into place in a single step
    os.replace(tmp_file, sitemap_file)

def download_page(url, output_path, timeout=30):
    """Download ``url`` and save it to ``output_path`` with localised assets.

    Images (``<img src>``) and stylesheets (``<link rel="stylesheet" href>``)
    are downloaded into ``static_dir`` via ``download_file`` and the tags are
    rewritten to point at the local copies.

    Args:
        url: Address of the page to download.
        output_path: File the resulting HTML is written to. If it already
            exists the function prints a notice and does nothing.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status (so that an error page is never saved and then
            skipped forever by the existence check).
    """
    # Skip pages that were already saved by a previous run
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
        return

    # Fetch the page and refuse to store HTTP error responses
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    response.encoding = 'utf-8'  # force UTF-8 decoding
    soup = BeautifulSoup(response.text, 'html.parser')

    # Directory of the saved page; local asset references are made relative to it
    page_dir = os.path.dirname(output_path) or '.'

    def _localize(tag, attr):
        """Download the asset referenced by ``tag[attr]`` and point the tag at the copy."""
        ref = tag.get(attr)
        # Tags without the attribute and inline data: URIs have nothing to download
        if not ref or ref.startswith('data:'):
            return
        asset_url = urljoin(url, ref)
        filename = os.path.basename(urlparse(asset_url).path)
        if not filename:
            # URL path ends with "/": no file name to store the asset under
            return
        asset_path = os.path.join(static_dir, filename)
        download_file(asset_url, asset_path)
        # Only rewrite the reference when a local copy really exists
        if os.path.exists(asset_path):
            # HTML needs forward slashes and a path relative to the page's
            # own folder (pages in sub-folders need "../static/...")
            tag[attr] = os.path.relpath(asset_path, page_dir).replace(os.sep, '/')

    for img in soup.find_all('img'):
        _localize(img, 'src')

    for link in soup.find_all('link', rel='stylesheet'):
        _localize(link, 'href')

    # Render first, then write via a temporary file, so an interruption can
    # never leave an empty or partial page that later runs would treat as done
    html = str(soup)
    tmp_path = output_path + '.part'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        f.write(html)
    os.replace(tmp_path, output_path)

def download_file(url, output_path, timeout=30):
    """Download the raw bytes at ``url`` into ``output_path``.

    Args:
        url: Address of the file to download.
        output_path: Destination file. If it already exists the function
            prints a notice and does nothing.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    # Skip files that were already saved by a previous run
    if os.path.exists(output_path):
        print(f"File already exists: {output_path}")
        return

    response = requests.get(url, timeout=timeout)
    if not response.ok:
        # Do not store an HTTP error body under the asset's name: the
        # existence check above would then treat it as downloaded forever
        print(f"Skipping {url}: HTTP {response.status_code}")
        return

    # Write to a temporary file and move it into place so an interrupted
    # download never leaves a truncated file behind
    tmp_path = output_path + '.part'
    with open(tmp_path, 'wb') as f:
        f.write(response.content)
    os.replace(tmp_path, output_path)

def _strip_fragment(url):
    """Return ``url`` without its ``#fragment`` part."""
    return urlparse(url)._replace(fragment='').geturl()


def _output_path_for(url):
    """Map a page URL to the ``.html`` file under ``output_dir`` mirroring its path."""
    parts = [part for part in urlparse(url).path.split('/') if part]
    if not parts:
        # A bare "/" path would otherwise become "<output_dir>.html", i.e. a
        # file next to the output directory instead of inside it
        parts = ['index']
    return os.path.join(output_dir, *parts) + '.html'


def _download_with_retry(url, output_path):
    """Run ``download_page``, retrying once after a 10-15 s pause; return True on success."""
    try:
        download_page(url, output_path)
        return True
    except Exception as e:
        print(f"Error downloading {url}: {e}")

    time.sleep(random.randint(10, 15))
    try:
        download_page(url, output_path)
        return True
    except Exception as e:
        print(f"Retry failed for {url}: {e}")
        return False


def _extract_links(url, output_path):
    """Return the not-yet-visited in-site links found in the saved page at ``output_path``."""
    # The page is already on disk at this point, so read it from there
    # instead of requesting it from the server a second time
    with open(output_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')

    links = []
    for anchor in soup.find_all('a', href=True):
        next_url = _strip_fragment(urljoin(url, anchor['href']))
        if next_url.startswith(base_url) and next_url not in visited_urls:
            links.append(next_url)
    return links


def crawl(url, sitemap):
    """Mirror ``url`` and every page under ``base_url`` reachable from it.

    Each page is saved through ``download_page``, recorded in ``sitemap``
    (URL -> saved file path) and persisted with ``save_sitemap`` right away.
    URLs already present in ``visited_urls`` are skipped.

    The traversal is depth-first but uses an explicit stack rather than
    recursion: on a large site recursion exceeds Python's recursion limit,
    and the resulting ``RecursionError`` would be caught by the generic
    exception handling and trigger a sleep-and-retry at every level.

    Args:
        url: Page to start from.
        sitemap: Dict updated in place with every page saved.
    """
    pending = [url]
    while pending:
        # "#fragment" variants refer to the same document
        current = _strip_fragment(pending.pop())
        if current in visited_urls:
            continue
        visited_urls.add(current)

        # Mirror the URL path as a folder hierarchy below output_dir
        output_path = _output_path_for(current)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        if not _download_with_retry(current, output_path):
            continue

        # Record the page immediately so an interrupted run can be resumed
        sitemap[current] = output_path
        save_sitemap(sitemap)

        try:
            links = _extract_links(current, output_path)
        except Exception as e:
            print(f"Error parsing {current}: {e}")
            continue

        # Reversed so the first link on the page is popped (visited) first,
        # matching the order of the earlier recursive implementation
        pending.extend(reversed(links))

In [15]:


# Load the sitemap written by a previous run (empty on the first run)
sitemap = load_sitemap()

if not sitemap:
    # Nothing recorded yet: start crawling from the entry URL
    crawl(base_url, sitemap)
else:
    # Resume: revisit every recorded page so its links are followed again.
    # Iterate over a snapshot of the keys, because crawl() inserts newly
    # discovered URLs into `sitemap`, and adding keys to a dict while
    # iterating over it raises "RuntimeError: dictionary changed size
    # during iteration".
    for url in list(sitemap):
        crawl(url, sitemap)

File already exists: ../data/site\cn.html
File already exists: ../data/site\cn\Hero.html
File already exists: ../data/site\cn\Talent.html
File already exists: ../data/site\cn\Inventory.html
File already exists: ../data/site\cn\Legendary_Gear.html
File already exists: ../data/site\cn\Pactspirit.html
File already exists: ../data/site\cn\Drop_Source.html
File already exists: ../data/site\cn\Active_Skill.html
File already exists: ../data/site\cn\Support_Skill.html
File already exists: ../data/site\cn\Passive_Skill.html
File already exists: ../data/site\cn\Activation_Medium_Skill.html
File already exists: ../data/site\cn\Noble_Support_Skill.html
File already exists: ../data/site\cn\Magnificent_Support_Skill.html
File already exists: ../data/site\cn\Tip.html
File already exists: ../data/site\cn\Hyperlink.html
File already exists: ../data/site\static\UI_S6Gameplay_Tile_bench.webp
File already exists: ../data/site\static\UI_Item_S5Gameplay_DreamTreasure_Almighty_C_128.webp
File already exists:

KeyboardInterrupt: 

In [17]:
# Normalise the stored file paths to forward slashes
sitemap = {
    page_url: saved_path.replace('\\', '/')
    for page_url, saved_path in sitemap.items()
}

In [18]:
# Show the sitemap as the cell's output
sitemap

{'https://tlidb.com/cn/': '../data/site/cn.html',
 'https://tlidb.com/cn/Hero': '../data/site/cn/Hero.html',
 'https://tlidb.com/cn/Talent': '../data/site/cn/Talent.html',
 'https://tlidb.com/cn/Inventory': '../data/site/cn/Inventory.html',
 'https://tlidb.com/cn/Legendary_Gear': '../data/site/cn/Legendary_Gear.html',
 'https://tlidb.com/cn/Pactspirit': '../data/site/cn/Pactspirit.html',
 'https://tlidb.com/cn/Drop_Source': '../data/site/cn/Drop_Source.html',
 'https://tlidb.com/cn/Active_Skill': '../data/site/cn/Active_Skill.html',
 'https://tlidb.com/cn/Support_Skill': '../data/site/cn/Support_Skill.html',
 'https://tlidb.com/cn/Passive_Skill': '../data/site/cn/Passive_Skill.html',
 'https://tlidb.com/cn/Activation_Medium_Skill': '../data/site/cn/Activation_Medium_Skill.html',
 'https://tlidb.com/cn/Noble_Support_Skill': '../data/site/cn/Noble_Support_Skill.html',
 'https://tlidb.com/cn/Magnificent_Support_Skill': '../data/site/cn/Magnificent_Support_Skill.html',
 'https://tlidb.com/