In [None]:
# Crawl fraud-related posts published on Uwants within the past 24 hours.

import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import quote
import time

# Search keywords; the script issues one search request per entry.
keywords = [
    "騙徒手法層出不窮",
]

# Search URL template. `{}` receives the URL-encoded keyword, and
# `srchtime=1d` asks the site for posts from the past 24 hours only.
base_url = (
    'https://www.uwants.com/search.php'
    '?searchsubmit=true'
    '&srchtxt={}'
    '&srchtime=1d'
    '&orderby=most_relevant'
)

# Request headers that imitate a desktop Chrome browser.
# NOTE(review): "br" is advertised in Accept-Encoding -- confirm a Brotli
# decoder is installed in the runtime environment, otherwise a br-compressed
# response may not be decompressed by the HTTP client.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Cache-Control": "max-age=0",
}

# Parse one search-results page and collect the details of every thread it lists.
def extract_posts(url, timeout=30):
    """Fetch a search-results page and return one record per listed thread.

    Every ``<tr>`` that contains all five expected result cells (subject,
    forum, author, reply/view counts, last post) is treated as a thread row.
    For each such row the thread page itself is fetched through
    ``extract_post_details`` to obtain the opening post, the replies and the
    opening post's timestamp.

    Args:
        url: Full URL of the search-results page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        A list of dicts keyed by the CSV column names used by this script.

    Raises:
        requests.RequestException: If the search page cannot be fetched
            (including ``HTTPError`` raised by ``raise_for_status``).
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    # The raw bytes are parsed, so BeautifulSoup determines the encoding itself.
    soup = BeautifulSoup(response.content, 'html.parser')

    post_data = []

    for row in soup.find_all("tr"):
        title_tag = row.find("span", class_="search-result-subject")
        forum_tag = row.find("td", class_="search-result-forum")
        author_tag = row.find("td", class_="search-result-author")
        reply_view_tag = row.find("td", class_="search-result-nums")
        last_post_tag = row.find("td", class_="search-result-lastpost")

        # Rows missing any of the result cells (headers, spacers, ads) are not threads.
        cells = (title_tag, forum_tag, author_tag, reply_view_tag, last_post_tag)
        if any(cell is None for cell in cells):
            continue

        title = title_tag.get_text(strip=True)

        # A subject without a usable link cannot be followed; skip the row
        # instead of raising and losing every other result on the page.
        link = title_tag.find("a")
        href = link.get("href") if link is not None else None
        if not href:
            print(f"Skipping search result without a thread link: {title}")
            continue
        # Resolve relative links against the search page; absolute links pass through unchanged.
        post_url = urljoin(url, href)

        forum = forum_tag.get_text(strip=True)
        # Fall back to the cell text when the author name is not wrapped in a link.
        author_link = author_tag.find("a")
        author_source = author_link if author_link is not None else author_tag
        author = author_source.get_text(strip=True)
        reply_view = reply_view_tag.get_text(strip=True)

        # Fetch the thread page for the post body, replies and original post time.
        post_content, responses, first_post_time = extract_post_details(post_url)

        post_data.append({
            "帖子标题": title,
            "帖子内容": post_content,
            "帖子URL": post_url,
            "版區": forum,
            "作者": author,
            "回覆/查看": reply_view,
            "回复内容": responses,
            "最初發表时间": first_post_time,  # timestamp of the thread starter's post
        })

    return post_data

# Fetch a thread page and extract the opening post, the replies and the opening post's time.
def extract_post_details(post_url, timeout=30):
    """Download a thread page and split it into opening post and replies.

    Each ``<table class="threadpost">`` is treated as one post. The first one
    is the thread starter's post; all following ones are replies. Quoted
    blocks (``<div class="quote">``) are removed before the text is read.

    Args:
        post_url: URL of the thread page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(post_content, responses, first_post_time)`` where the first
        two items are strings joined with ``" | "`` and ``first_post_time`` is
        the text following "發表於" in the first post's info line, or ``None``
        when it was not found. On any error the message is printed and
        whatever was collected so far is returned.
    """
    all_post_content = []
    all_responses = []
    first_post_time = None  # time at which the thread starter posted

    try:
        response = requests.get(post_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        for i, post in enumerate(soup.find_all('table', class_='threadpost')):
            post_info = post.find('div', class_='postinfo')
            post_content_div = post.find('div', class_='postmessage defaultpost')

            # Posting time is whatever follows the "發表於" marker in the info line.
            post_time = None
            if post_info is not None and "發表於" in post_info.get_text():
                post_time = post_info.get_text(strip=True).split("發表於")[1].strip()

            # Reset for every post: a post without a message container must
            # yield an empty string, not the previous post's text.
            content = ""
            if post_content_div is not None:
                span = post_content_div.find('span', id=lambda x: x and x.startswith('postorig_'))
                if span is not None:
                    # Drop quoted text so only the author's own words remain.
                    for quoted_block in span.find_all('div', class_='quote'):
                        quoted_block.decompose()
                    content = span.get_text(strip=True)

            # The first post on the page belongs to the thread starter.
            if i == 0:
                first_post_time = post_time
                all_post_content.append(content)
            else:
                all_responses.append(content)

    except Exception as e:
        # Deliberately best-effort: one broken thread must not abort the crawl.
        print(f"Error fetching post details from {post_url}: {e}")

    return " | ".join(all_post_content), " | ".join(all_responses), first_post_time

# Write all results to a CSV file. The utf-8-sig encoding adds a BOM so that
# Excel recognises the file as UTF-8.
output_file = 'D:/uwants_search_results.csv'  # saved to the root of drive D:
csv_columns = ["搜索关键词", "帖子标题", "帖子内容", "帖子URL", "版區", "作者", "回覆/查看", "回复内容", "最初發表时间"]

with open(output_file, mode='w', newline='', encoding='utf-8-sig') as file:
    writer = csv.DictWriter(file, fieldnames=csv_columns)
    writer.writeheader()

    # Run one search per keyword.
    for index, keyword in enumerate(keywords):
        print(f"正在搜索关键词: {keyword}")

        # Build the URL of the result list restricted to the past 24 hours.
        search_url = base_url.format(quote(keyword))
        print("正在提取过去24小时内的帖子数据...")

        try:
            # Extract the threads listed on the result page.
            posts_data = extract_posts(search_url)

            # One CSV row per thread; the search keyword is added as the first column.
            for post in posts_data:
                writer.writerow({"搜索关键词": keyword, **post})

        except Exception as e:
            # A failure for one keyword must not stop the remaining keywords.
            print(f"提取帖子时发生错误: {e}")

        # Pause between searches to avoid being blocked; no need to wait
        # after the final keyword.
        if index < len(keywords) - 1:
            time.sleep(10)

# Report the path that was actually written.
print(f"所有关键词的搜索结果已提取并保存至 '{output_file}'.")
