In [3]:
import sqlite3
import pandas as pd
import requests
from lxml import etree
import time

# Request headers sent with every page fetch (desktop Firefox User-Agent)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0'}

# Load the URL list from the CSV file (resolved against the current working
# directory) and keep only the rows whose media_name is rt.com
url_list_df = pd.read_csv('mc-onlinenews.csv')
url_list_df = url_list_df.loc[url_list_df['media_name'] == 'rt.com']

# Path of the SQLite database file
db_path = 'rt_news_data.db'

# Database initialisation
def init_db(db_file=None):
    """Create the ``news`` table if it does not exist yet.

    Args:
        db_file: Path of the SQLite file to initialise. Defaults to the
            module-level ``db_path``.

    The table has an auto-incrementing ``id`` and a UNIQUE ``url`` column,
    plus ``publish_date``, ``title``, ``language`` and ``text``.
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        conn.execute('''
        CREATE TABLE IF NOT EXISTS news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT UNIQUE,
            publish_date TEXT,
            title TEXT,
            language TEXT,
            text TEXT
        )
        ''')
        conn.commit()
    finally:
        # Always release the file handle, even when the DDL fails.
        conn.close()

# Insert one article into the database
def save_to_db(news, db_file=None):
    """Insert a single article row into the ``news`` table.

    Args:
        news: Mapping with the keys ``url``, ``date``, ``title``, ``text``
            and ``language``.
        db_file: Path of the SQLite file to write to. Defaults to the
            module-level ``db_path``.

    A ``sqlite3.IntegrityError`` (raised when the URL is already stored,
    because ``url`` is UNIQUE) is reported on stdout and the row is skipped.
    Any other exception propagates to the caller.
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        conn.execute('''
        INSERT INTO news (url, publish_date, title, text, language)
        VALUES (?, ?, ?, ?, ?)
        ''', (news['url'], news['date'], news['title'], news['text'], news['language']))
        conn.commit()
    except sqlite3.IntegrityError:
        # URL already present: skip the insert instead of failing.
        print(f"Skipping duplicate URL: {news['url']}")
    finally:
        # Close on every path so a failed insert cannot leak the connection.
        conn.close()

# Fetch the most recently stored URL
def get_last_url(db_file=None):
    """Return the URL of the row with the highest ``id``, or ``None``.

    Args:
        db_file: Path of the SQLite file to read. Defaults to the
            module-level ``db_path``.

    Returns:
        The ``url`` of the last inserted row, or ``None`` when the ``news``
        table is empty.
    """
    conn = sqlite3.connect(db_path if db_file is None else db_file)
    try:
        result = conn.execute(
            'SELECT url FROM news ORDER BY id DESC LIMIT 1'
        ).fetchone()
    finally:
        # Close even if the query fails (e.g. the table does not exist).
        conn.close()
    return result[0] if result else None

# Download one article page and extract its body text
def _fetch_article_text(url, timeout):
    """Return the newline-joined text of the article body found at ``url``.

    Raises:
        RuntimeError: If the response status code is not 200.
    """
    resp = requests.get(url=url, headers=headers, timeout=timeout)
    # Explicit check rather than ``assert``, which is stripped under ``-O``.
    if resp.status_code != 200:
        raise RuntimeError(f'Request failed for URL: {url}')
    html_tree = etree.HTML(resp.text)
    # NOTE: exact match on the class attribute, including its trailing space;
    # a page with a different class string yields an empty text.
    text_content = html_tree.xpath("//div[@class='article__text text ']//text()")
    return "\n".join(text_content).strip()


# Main crawl loop
def main(*, timeout=30, delay=1):
    """Fetch every pending URL in ``url_list_df`` and store it in SQLite.

    The crawl resumes right after the most recently saved URL (as reported
    by ``get_last_url``); when that URL is not in the list, it starts from
    the beginning. Consequently, URLs that failed earlier in the list than
    the last saved one are not retried.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
        delay: Seconds to sleep after each URL attempt to limit request rate.

    Errors for an individual URL are printed and the loop continues.
    """
    init_db()  # make sure the table exists
    last_url = get_last_url()  # last URL that was stored
    url_list = url_list_df.url.tolist()

    # Continue with the entry following the last stored URL, if there is one
    if last_url in url_list:
        start_index = url_list.index(last_url) + 1
    else:
        start_index = 0  # nothing stored yet (or URL unknown): start over

    for url in url_list[start_index:]:
        print(f'Trying fetching URL: {url}')
        try:
            text = _fetch_article_text(url, timeout)
            # Filter the metadata once and take the first match for every
            # field, so a URL listed more than once does not raise.
            meta = url_list_df[url_list_df['url'] == url]
            news = {
                'url': url,
                'date': meta['publish_date'].values[0],
                'title': meta['title'].values[0],
                'text': text,
                'language': meta['language'].values[0],
            }
            save_to_db(news)
            print(f"Fetched and saved data for URL: {url}")
        except Exception as e:
            # Best effort: report the failure and move on to the next URL.
            print(f"Error fetching URL {url}: {e}")
        # Throttle after failures too, so repeated errors do not hammer the site
        time.sleep(delay)

# Start the crawl only when this file is run as a script, not when imported
if __name__ == '__main__':
    main()


FileNotFoundError: [Errno 2] No such file or directory: 'mc-onlinenews.csv'