1. 完整理解基础代码的功能和结果（即对每一行基础代码添加注释）

In [1]:
import pandas as pd
from urllib.request import urlopen  # 导入urlopen用于打开URL
from bs4 import BeautifulSoup       # 导入BeautifulSoup用于解析HTML
import ssl                          # 导入ssl用于处理SSL证书

context = ssl._create_unverified_context()  # 创建未验证的SSL上下文以忽略SSL证书验证

all_quotes = []  # 初始化一个空列表用于存储爬取到的名言

url = 'https://quotes.toscrape.com/page/1/'  # 设置目标网址

# 使用urlopen请求并打开网页内容
page = urlopen(url, context=context)  # 请求网页信息并忽略SSL证书验证

# 使用HTML解析器将网页内容组合为BeautifulSoup对象
soup = BeautifulSoup(page, 'html.parser')  # 将网页信息解析为BeautifulSoup对象

# 查找所有带有quote类的<div>标签，这些标签包含每个名言的信息
quotes = soup.find_all('div', class_='quote')

# 遍历找到的每个名言
for quote in quotes:
    # 提取名言的文本，查找带有text类的<span>标签并提取其文本
    text = quote.find('span', class_='text').text  
    # 提取作者的名字，查找带有author类的<small>标签并提取其文本
    author = quote.find('small', class_='author').text 
    # 查找带有tags类的<div>标签内的所有<a>标签
    tags = quote.find('div', class_='tags').find_all('a')  

    # 初始化一个空列表用于存储当前名言的标签
    tags_list = []
    # 遍历每个标签并提取其文本，每个标签的文本添加到tags_list中形成一个列表
    for tag in tags:
        tags_list.append(tag.text) 
    # 将名言文本、作者和标签组合为一个列表
    single_quote = [text, author, tags_list]
    # 将single_quote列表添加到all_quotes列表中
    all_quotes.append(single_quote)
    # 打印all_quotes列表以查看收集到的数据
    print(all_quotes)

# 简化版本
# for quote in quotes:
#     text = quote.find('span', class_='text').text
#     author = quote.find('small', class_='author').text
#     tags = [tag.text for tag in quote.find('div', class_='tags').find_all('a')] #直接生成列表
#     all_quotes.append([text, author, tags])

# print(all_quotes)

[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', ['change', 'deep-thoughts', 'thinking', 'world']]]
[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', ['change', 'deep-thoughts', 'thinking', 'world']], ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'J.K. Rowling', ['abilities', 'choices']]]
[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', ['change', 'deep-thoughts', 'thinking', 'world']], ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'J.K. Rowling', ['abilities', 'choices']], ['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'Albert Einstein', ['insp

2. 添加代码，爬取相同网站的10个页面内容，并将爬取内容存储在同一个CSV格式文件（添加性能分析，time和%prun）

In [1]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import time

def scrape_quotes():
    # 创建SSL上下文，忽略SSL证书验证
    context = ssl._create_unverified_context()

    all_quotes = []

    # 记录起始时间
    start_time = time.time()

    # 爬取前10个页面的内容
    for page_num in range(1, 11):  # 循环迭代1到10的每个页面编号
        url = f'https://quotes.toscrape.com/page/{page_num}/'  # 构建当前页面的URL
        page = urlopen(url, context=context)
        soup = BeautifulSoup(page, 'html.parser')
        quotes = soup.find_all('div', class_='quote')  # 找到当前页面中所有名言的div标签

        # 遍历当前页面中的每个名言
        for quote in quotes:
            text = quote.find('span', class_='text').text
            author = quote.find('small', class_='author').text
            tags = [tag.text for tag in quote.find('div', class_='tags').find_all('a')]  # 直接生成列表
            all_quotes.append([text, author, tags])

    # 记录结束时间
    end_time = time.time()

    # 计算总耗时
    elapsed_time = end_time - start_time
    print(f"Total time taken to scrape data: {elapsed_time:.2f} seconds")

    # 将爬取到的所有名言数据存储为CSV格式文件
    df = pd.DataFrame(all_quotes, columns=['Quote', 'Author', 'Tags'])  # 创建DataFrame对象，用于保存数据
    df.to_csv('quotes.csv', index=False)

# 使用 %prun 分析 scrape_quotes 函数的性能
%prun scrape_quotes()


Total time taken to scrape data: 11.87 seconds
 

3. 模拟会员登录过程。（同上加性能分析）

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

def automate_login():
    # 启动WebDriver, 打开目标网页
    driver = webdriver.Safari()
    driver.get('http://quotes.toscrape.com/login')

    # 查找用户名输入框并输入用户名
    username_input = driver.find_element(By.ID, 'username')  # 通过ID定位用户名输入框
    username_input.send_keys('username')  # 输入用户名

    # 查找密码输入框并输入密码
    password_input = driver.find_element(By.ID, 'password')  # 通过ID定位密码输入框
    password_input.send_keys('password')  # 输入密码

    # 查找登录按钮并点击
    login_button = driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]')  # 通过CSS选择器定位登录按钮
    login_button.click()  # 点击登录按钮

    # 等待登录完成，最多等待10秒
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.LINK_TEXT, 'Logout'))  # 等待直到"Logout"链接出现
    )

    # 打印登录后的页面标题
    print("登录成功！页面标题是：", driver.title)

    # 关闭浏览器
    driver.quit()  # 退出并关闭浏览器

# 记录起始时间
start_time = time.time()

# 使用 %prun 分析 automate_login 函数的性能
%prun automate_login()

# 记录结束时间
end_time = time.time()

# 计算总耗时
elapsed_time = end_time - start_time
print(f"Total time taken to automate login: {elapsed_time:.2f} seconds")


登录成功！页面标题是： Quotes to Scrape
 Total time taken to automate login: 6.24 seconds


4. 爬取虚拟书店内容和图片，并存储为CSV格式，图片单独命名存储。（包含图片、价格、评价，书名等书籍所有内容。图片可用wget下载即可，无须读写。）#测试虚拟地址技术

In [14]:
import os
import requests  # 用于下载图片，更改wget为使用requests
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import ssl
import time
import tempfile

# 创建临时目录来存储图片
#temp_dir = tempfile.TemporaryDirectory()

driver = webdriver.Safari()
driver.get('http://books.toscrape.com')

context = ssl._create_unverified_context()  

books_information = []  

# 爬取前10个页面的内容
for page_num in range(1, 11):
    url = f'http://books.toscrape.com/catalogue/page-{page_num}.html'
    driver.get(url)
    time.sleep(2)  # 等待页面加载

    # 使用BeautifulSoup解析当前页面的HTML内容
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    books = soup.find_all('article', class_='product_pod')

    for book in books:
        book_relative_url = book.find('h3').find('a')['href']
        book_url = 'http://books.toscrape.com/catalogue/' + book_relative_url
        
        # 打开书籍详情页
        book_page = urlopen(book_url, context=context)
        book_soup = BeautifulSoup(book_page, 'html.parser')
        
        # 提取书籍详情
        title = book_soup.find('h1').text
        price = book_soup.find('p', class_='price_color').text
        rating = book_soup.find('p', class_='star-rating')['class'][1]
        availability = book_soup.find('p', class_='instock availability').text.strip()
        description = book_soup.find('meta', {'name': 'description'})['content'].strip()
        image_url = 'http://books.toscrape.com/' + book_soup.find('img')['src'].replace('../', '')
        
        # 下载图片并保存 #保存到临时目录
        #image_filename = os.path.join(temp_dir.name, f"{title.replace('/', '_')}.jpg")
        #image_data = requests.get(image_url).content
        image_filename = f"images/{title.replace('/', '_')}.jpg"
        if not os.path.exists('images'):
            os.makedirs('images')
        image_data = requests.get(image_url).content
        with open(image_filename, 'wb') as handler:
            handler.write(image_data)
        
        # 存储书籍信息
        book_info = {
            'Title': title,
            'Price': price,
            'Rating': rating, #how many stars
            'Availability': availability,
            'Description': description,
            'Image': image_filename
        }
        books_information.append(book_info)

# 关闭WebDriver
driver.quit()

# 将书籍数据存储为CSV文件
#csv_filename = os.path.join(temp_dir.name, 'books.csv')
df = pd.DataFrame(books_information)
df.to_csv('books.csv', index=False)

print("数据爬取和保存完成。")
#print(f"书籍数据和图片保存在临时目录: {temp_dir.name}")

数据爬取和保存完成。


5.爬取豆瓣影评（网址：https://movie.douban.com ； 爬取：年度排行榜电影的影评；要求：至少一类电影的影评、至少10部电影的影评，比如：评分最高的外国语电影）

In [3]:
#爬取豆瓣top250的前十部电影的前十页影评
import requests # 导入requests模块，用于发送HTTP请求
from bs4 import BeautifulSoup
import random
import time
import csv 

# 设置用户代理池，防止被反爬机制封杀
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3', 
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
    # 加更多的用户代理
]

# 随机选择一个用户代理
def get_random_user_agent():
    return random.choice(user_agents)

# 爬取影评函数
def fetch_reviews(movie_id, movie_name, pages=10):
    headers = {'User-Agent': get_random_user_agent()}  # 设置请求头，使用随机选择的用户代理
    reviews = []  # 初始化影评列表

    for page in range(0, pages):
        url = f'https://movie.douban.com/subject/{movie_id}/comments?start={page*20}&limit=20&status=P&sort=new_score'# 构造请求URL
        response = requests.get(url, headers=headers) 
        
        if response.status_code != 200:  # 如果响应状态码不是200，打印错误信息
            print(f"Failed to fetch page {page+1} for movie {movie_id}")
            continue  # 跳过当前循环，继续下一页
        
        soup = BeautifulSoup(response.text, 'html.parser')
        comments = soup.find_all('div', class_='comment')
        
        for comment in comments:
            text = comment.find('span', class_='short').text.strip()
            reviews.append((movie_name, text))
        
        # 随机暂停1到3秒，模拟真人行为，防止被封
        time.sleep(random.uniform(1, 3))
        
    return reviews

# 主函数，爬取豆瓣电影TOP250的影评
def main():
    top_250_url = 'https://movie.douban.com/top250'
    response = requests.get(top_250_url, headers={'User-Agent': get_random_user_agent()})
    soup = BeautifulSoup(response.text, 'html.parser')
    movie_tags = soup.find_all('div', class_='hd')
    
    movie_info = []
    for tag in movie_tags[:10]:     # 这里限制为前10部电影
        movie_link = tag.a['href']  # 获取电影链接
        movie_id = movie_link.split('/')[-2]  # 提取电影ID
        movie_name = tag.a.span.text.strip()
        movie_info.append((movie_id, movie_name))
    
    all_reviews = []
    for movie_id, movie_name in movie_info:
        print(f"Fetching reviews for movie: {movie_name} (ID: {movie_id})")  # 打印当前处理的电影信息
        reviews = fetch_reviews(movie_id, movie_name, pages=10) 
        all_reviews.extend(reviews)
        print(f"Fetched {len(reviews)} reviews for movie: {movie_name} (ID: {movie_id})")  # 打印已获取影评数量
    
    # 保存结果到CSV文件
    with open('douban_reviews.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)  # 创建CSV写入对象
        writer.writerow(['电影名', '影评'])  # 写入CSV文件的表头
        writer.writerows(all_reviews)  #写入所有影评
    
    print("All reviews fetched and saved.")

if __name__ == '__main__':
    main()


Fetching reviews for movie: 肖申克的救赎 (ID: 1292052)
Fetched 200 reviews for movie: 肖申克的救赎 (ID: 1292052)
Fetching reviews for movie: 霸王别姬 (ID: 1291546)
Fetched 200 reviews for movie: 霸王别姬 (ID: 1291546)
Fetching reviews for movie: 阿甘正传 (ID: 1292720)
Fetched 200 reviews for movie: 阿甘正传 (ID: 1292720)
Fetching reviews for movie: 泰坦尼克号 (ID: 1292722)
Fetched 200 reviews for movie: 泰坦尼克号 (ID: 1292722)
Fetching reviews for movie: 千与千寻 (ID: 1291561)
Fetched 200 reviews for movie: 千与千寻 (ID: 1291561)
Fetching reviews for movie: 这个杀手不太冷 (ID: 1295644)
Fetched 200 reviews for movie: 这个杀手不太冷 (ID: 1295644)
Fetching reviews for movie: 美丽人生 (ID: 1292063)
Fetched 200 reviews for movie: 美丽人生 (ID: 1292063)
Fetching reviews for movie: 星际穿越 (ID: 1889243)
Fetched 200 reviews for movie: 星际穿越 (ID: 1889243)
Fetching reviews for movie: 盗梦空间 (ID: 3541415)
Fetched 200 reviews for movie: 盗梦空间 (ID: 3541415)
Fetching reviews for movie: 楚门的世界 (ID: 1292064)
Fetched 200 reviews for movie: 楚门的世界 (ID: 1292064)
All reviews fetc

添加一下技术：（1）多进程多线程技术（2）数据库交互（redis, mongodb）（3）虚拟地址 （在Spyder上测试成功）

In [3]:
import requests
from bs4 import BeautifulSoup
import random
import time
from multiprocessing import Pool, cpu_count
from pymongo import MongoClient

# 设置用户代理池，防止被反爬机制封杀
user_agents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0',
]

# 随机选择一个用户代理
def get_random_user_agent():
    return random.choice(user_agents)

# 爬取影评函数
def fetch_reviews(movie):
    movie_id, movie_name = movie
    headers = {'User-Agent': get_random_user_agent()}
    reviews = []

    for page in range(0, 2):  # 爬取前10页
        url = f'https://movie.douban.com/subject/{movie_id}/comments?start={page*20}&limit=20&status=P&sort=new_score'
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            print(f"Failed to fetch page {page+1} for movie {movie_id}")
            continue
        
        soup = BeautifulSoup(response.text, 'html.parser')
        comments = soup.find_all('div', class_='comment')
        
        for comment in comments:
            text = comment.find('span', class_='short').text.strip()
            reviews.append({
                'movie_name': movie_name,
                'review': text
            })
        
        time.sleep(random.uniform(1, 3))
        
    return reviews

# 主函数，爬取豆瓣电影TOP250的影评
def main():
    top_250_url = 'https://movie.douban.com/top250'
    response = requests.get(top_250_url, headers={'User-Agent': get_random_user_agent()})
    soup = BeautifulSoup(response.text, 'html.parser')
    movie_tags = soup.find_all('div', class_='hd')
    
    movie_info = []
    for tag in movie_tags[:2]:  # 限制为前10部电影
        movie_link = tag.a['href']
        movie_id = movie_link.split('/')[-2]
        movie_name = tag.a.span.text.strip()
        movie_info.append((movie_id, movie_name))

    # 使用多进程爬取影评
    with Pool(cpu_count()) as pool:
        results = pool.map(fetch_reviews, movie_info)
    
    # 将所有影评展平为一个列表
    all_reviews = [review for sublist in results for review in sublist]

    # 连接MongoDB并保存数据
    client = MongoClient('localhost', 27017)
    db = client['douban']
    collection = db['reviews']
    collection.insert_many(all_reviews)
    
    print("All reviews fetched and saved to MongoDB.")

if __name__ == '__main__':
    main()


Process SpawnPoolWorker-5:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Process SpawnPoolWorker-7:
Traceback (most recent call last):
Traceback (most recent call last):
Process SpawnPoolWorker-6:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.11/multiprocessing/pool.py", line 114, in worker
    task = get()
           ^^^^^
  File "/opt/anaconda3/lib/python3.11/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'fetch_reviews' o

KeyboardInterrupt: 