In [11]:
import os
import re
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [12]:
# HTTP headers attached to every page request: a desktop-browser User-Agent
# string plus an X-Requested-With header set to XMLHttpRequest.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}


In [13]:
# Fetch the content of a web page
def get_html(url):
    """Download ``url`` and return its body decoded as UTF-8 text.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``None`` when the request raises a
        ``requests.RequestException`` (including a non-success HTTP status
        reported by ``raise_for_status``). The failure is printed.
    """
    try:
        page = requests.get(url, timeout=10, headers=headers)
        # Force UTF-8 decoding for ``page.text`` instead of relying on detection.
        page.encoding = 'utf-8'
        # Turn HTTP error statuses into an exception handled below.
        page.raise_for_status()
        return page.text
    except requests.RequestException as err:
        print(f"请求失败: {url}, 错误信息: {err}")
        return None


In [14]:
# Save text lines to a file
def save_to_file(save_path, lines):
    """Write ``lines`` to ``save_path`` as UTF-8 text, one entry per line.

    The parent directory of ``save_path`` is created if it does not exist.
    A bare filename (no directory component) is written to the current
    working directory.

    Args:
        save_path: Destination file path; an existing file is overwritten.
        lines: Iterable of strings; a newline is appended to each one.

    Returns:
        None. Success or failure is reported with ``print``; exceptions are
        caught here and not propagated to the caller.
    """
    try:
        parent_dir = os.path.dirname(save_path)
        # os.path.dirname() returns '' for a bare filename and os.makedirs('')
        # raises FileNotFoundError, so only create a directory when there is one.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(save_path, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in lines)
        print(f"成功保存到: {save_path}")
    except Exception as e:
        print(f"文件保存失败: {save_path}, 错误信息: {e}")


In [15]:
def extract_chapter_title(soup):
    """Return the chapter title found in the page, or ``None`` if absent.

    The title is the stripped text of the first ``<h1>`` inside the first
    ``<div class="bookname">`` element.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The title string, or ``None`` when either the ``bookname`` div or its
        ``<h1>`` cannot be found.
    """
    container = soup.find('div', class_='bookname')
    if not container:
        return None

    heading = container.find('h1')
    if not heading:
        return None

    return heading.get_text(strip=True)
        

In [16]:
def get_next_page(soup, base_url):
    """Return the absolute URL of the next chapter, or ``None`` if there is none.

    Looks for the first ``<a>`` element whose text matches "下一章" and
    resolves its ``href`` against ``base_url``.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find`` method.
        base_url: URL of the current page, used to resolve relative links.

    Returns:
        The joined URL string, or ``None`` when no matching link exists or the
        link has no (or an empty) ``href`` attribute.
    """
    next_link_tag = soup.find('a', string=re.compile('下一章'))
    # Without this guard ``href`` would be unbound when no link is found,
    # raising UnboundLocalError instead of signalling "no next page".
    if not next_link_tag:
        return None

    href = next_link_tag.get('href')
    if not href:
        return None

    return urljoin(base_url, href)

In [17]:
def crawler(crawler_url, save_path, chapter_num):
    """Download one chapter page, save its text, and return the next chapter URL.

    Args:
        crawler_url: URL of the chapter page to scrape.
        save_path: File path the chapter text is written to.
        chapter_num: Chapter number, used in progress messages and as the
            placeholder title when the page has no title element.

    Returns:
        The URL of the next chapter as returned by ``get_next_page``, or
        ``None`` when the page could not be fetched, has no
        ``<div class="content">``, has no next-chapter link, or parsing raised
        an exception.
    """
    html_content = get_html(crawler_url)
    if not html_content:
        return None

    try:
        # Parse the HTML fetched above instead of requesting the page a second
        # time (which doubled the traffic and could return None).
        soup = BeautifulSoup(html_content, 'html.parser')

        content_div = soup.find('div', class_='content')
        if not content_div:
            print(f"未找到内容区域: {crawler_url}")
            return None
        # Chapter title; fall back to a placeholder so a missing title does not
        # put None into the lines handed to save_to_file.
        chapter_title = extract_chapter_title(soup) or f"第 {chapter_num} 章"
        print(f"正在爬取第 {chapter_num} 章: {chapter_title}")
        # Body text: one entry per non-blank line, surrounding whitespace removed.
        lines = [chapter_title, " "]
        content_lines = [line.strip() for line in content_div.get_text().split('\n') if line.strip()]
        lines.extend(content_lines)
        # Write the chapter to disk.
        save_to_file(save_path, lines)
        # Hand back the next chapter's URL so the caller can continue.
        return get_next_page(soup, crawler_url)
    except Exception as e:
        print(f"解析失败: {crawler_url}, 错误信息: {e}")
        return None


In [20]:
if __name__ == "__main__":
    # Crawl configuration: first chapter URL, output directory, the chapter
    # number range used for file names, and the pause between requests.
    start_url = "http://www.biqugewx.info/biquge/25300/55819903"
    save_dir = r"E:\论文\代码\数据处理\data"

    current_url = start_url
    chapter_count = 1573
    max_chapters = 1623
    request_delay = 0.5  # seconds to wait between chapter requests

    # Follow "next chapter" links until the range is exhausted or a chapter
    # yields no next URL (crawler returns None).
    while current_url and chapter_count <= max_chapters:
        save_path = os.path.join(save_dir, f"data_{chapter_count}.txt")
        print(f"正在爬取第 {chapter_count} 章, URL: {current_url}")

        next_url = crawler(current_url, save_path, chapter_count)
        current_url = next_url
        chapter_count += 1

        time.sleep(request_delay)

    # Only report completion when the whole range was processed; otherwise the
    # loop stopped because no next URL was available.
    if chapter_count > max_chapters:
        print("爬取完成！")
    else:
        print(f"爬取提前结束: 最后处理的是第 {chapter_count - 1} 章")

正在爬取第 1573 章, URL: http://www.biqugewx.info/biquge/25300/55819903
正在爬取第 1573 章: VIP卷 第一千五百七十三章   本源帝气
成功保存到: E:\论文\代码\数据处理\data\data_1573.txt
正在爬取第 1574 章, URL: http://www.biqugewx.info/biquge/25300/55819904
正在爬取第 1574 章: VIP卷 第一千五百七十四章  天墓之魂
成功保存到: E:\论文\代码\数据处理\data\data_1574.txt
正在爬取第 1575 章, URL: http://www.biqugewx.info/biquge/25300/55819905
正在爬取第 1575 章: VIP卷 第一千五百七十五章    抽取灵魂本源
成功保存到: E:\论文\代码\数据处理\data\data_1575.txt
正在爬取第 1576 章, URL: http://www.biqugewx.info/biquge/25300/55819906
正在爬取第 1576 章: VIP卷 第一千五百七十六章   帝境灵魂
成功保存到: E:\论文\代码\数据处理\data\data_1576.txt
正在爬取第 1577 章, URL: http://www.biqugewx.info/biquge/25300/55819907
正在爬取第 1577 章: VIP卷 第一千五百七十七章   出天墓
成功保存到: E:\论文\代码\数据处理\data\data_1577.txt
正在爬取第 1578 章, URL: http://www.biqugewx.info/biquge/25300/55819908
正在爬取第 1578 章: VIP卷 第一千五百七十八章    大战前夕
成功保存到: E:\论文\代码\数据处理\data\data_1578.txt
正在爬取第 1579 章, URL: http://www.biqugewx.info/biquge/25300/55819909
正在爬取第 1579 章: VIP卷 第一千五百七十九章    大军出动
成功保存到: E:\论文\代码\数据处理\data\data_1579.txt
正在爬