In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

class TestScraper:
    """
    测试网站爬虫类
    """

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

    def test_httpbin(self):
        """测试httpbin.org"""
        print("=== 测试 httpbin.org ===")

        tests = {
            'JSON响应': 'https://httpbin.org/json',
            'HTML响应': 'https://httpbin.org/html',
            '请求头信息': 'https://httpbin.org/headers',
            'IP信息': 'https://httpbin.org/ip'
        }

        results = []
        for name, url in tests.items():
            try:
                response = self.session.get(url)
                data = {
                    '测试项目': name,
                    'URL': url,
                    '状态码': response.status_code,
                    '内容类型': response.headers.get('content-type', ''),
                    '数据大小': len(response.text)
                }
                results.append(data)
                print(f"✅ {name}: 成功")

                # 显示部分内容
                if name == 'JSON响应':
                    print(f"   示例数据: {response.json()[:100]}...")

            except Exception as e:
                print(f"❌ {name}: 失败 - {e}")

        return pd.DataFrame(results)

    def test_quotes(self):
        """测试名言网站"""
        print("\n=== 测试 quotes.toscrape.com ===")

        base_url = 'http://quotes.toscrape.com'
        quotes_data = []

        try:
            # 爬取第一页
            response = self.session.get(base_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            quotes = soup.find_all('div', class_='quote')
            for quote in quotes:
                text = quote.find('span', class_='text').get_text()
                author = quote.find('small', class_='author').get_text()
                tags = [tag.get_text() for tag in quote.find_all('a', class_='tag')]

                quotes_data.append({
                    '名言': text,
                    '作者': author,
                    '标签': ', '.join(tags)
                })

            print(f"✅ 成功提取 {len(quotes_data)} 条名言")

            # 尝试翻页
            next_button = soup.find('li', class_='next')
            if next_button:
                next_page = next_button.find('a')['href']
                print(f"   发现下一页: {next_page}")

        except Exception as e:
            print(f"❌ 爬取失败: {e}")

        return pd.DataFrame(quotes_data)

    def test_books(self):
        """测试图书网站（只爬取第一页作为示例）"""
        print("\n=== 测试 books.toscrape.com ===")

        base_url = 'http://books.toscrape.com'
        books_data = []

        try:
            response = self.session.get(base_url)
            soup = BeautifulSoup(response.text, 'html.parser')

            books = soup.find_all('article', class_='product_pod')
            for book in books:
                title = book.find('h3').find('a')['title']
                price = book.find('p', class_='price_color').get_text()
                availability = book.find('p', class_='instock').get_text().strip()

                books_data.append({
                    '书名': title,
                    '价格': price,
                    '库存状态': availability
                })

            print(f"✅ 成功提取 {len(books_data)} 本图书信息")

        except Exception as e:
            print(f"❌ 爬取失败: {e}")

        return pd.DataFrame(books_data)

    def test_api_data(self):
        """测试API数据获取"""
        print("\n=== 测试 API 数据获取 ===")

        try:
            # 测试假商店API
            response = self.session.get('https://fakestoreapi.com/products')
            products = response.json()

            api_data = []
            for product in products[:5]:  # 只取前5个作为示例
                api_data.append({
                    '商品ID': product['id'],
                    '标题': product['title'],
                    '价格': product['price'],
                    '分类': product['category']
                })

            print(f"✅ 成功获取 {len(products)} 个商品数据（显示前5个）")
            return pd.DataFrame(api_data)

        except Exception as e:
            print(f"❌ API测试失败: {e}")
            return pd.DataFrame()

def main():
    """主测试函数"""
    scraper = TestScraper()

    # 执行各项测试
    httpbin_results = scraper.test_httpbin()
    quotes_results = scraper.test_quotes()
    books_results = scraper.test_books()
    api_results = scraper.test_api_data()

    # 显示结果摘要
    print("\n" + "="*50)
    print("测试结果摘要:")
    print(f"HTTPBIN测试: {len(httpbin_results)} 项")
    print(f"名言采集: {len(quotes_results)} 条")
    print(f"图书采集: {len(books_results)} 本")
    print(f"API数据: {len(api_results)} 条")

    # 保存结果（可选）
    if not quotes_results.empty:
        quotes_results.to_csv('quotes_data.csv', index=False, encoding='utf-8-sig')
        print("名言数据已保存为 quotes_data.csv")

    if not books_results.empty:
        books_results.to_csv('books_data.csv', index=False, encoding='utf-8-sig')
        print("图书数据已保存为 books_data.csv")

if __name__ == "__main__":
    main()

=== 测试 httpbin.org ===
✅ JSON响应: 成功
❌ JSON响应: 失败 - unhashable type: 'slice'
✅ HTML响应: 成功
✅ 请求头信息: 成功
✅ IP信息: 成功

=== 测试 quotes.toscrape.com ===
✅ 成功提取 10 条名言
   发现下一页: /page/2/

=== 测试 books.toscrape.com ===
✅ 成功提取 20 本图书信息

=== 测试 API 数据获取 ===
✅ 成功获取 20 个商品数据（显示前5个）

测试结果摘要:
HTTPBIN测试: 4 项
名言采集: 10 条
图书采集: 20 本
API数据: 5 条
名言数据已保存为 quotes_data.csv
图书数据已保存为 books_data.csv
