In [1]:
from urllib.request import urlopen  # 导入urlopen函数，用于发送HTTP请求并获取网页内容
from bs4 import BeautifulSoup    # 导入BeautifulSoup库，用于解析HTML页面
import ssl    # 导入ssl库，用于创建一个不验证SSL证书的上下文
import csv

### 任务一

In [2]:
# 创建一个不验证SSL证书的上下文
context = ssl._create_unverified_context()

# 初始化一个空列表，用于存储所有名言的信息
all_quotes = []

# 指定要抓取的网址
url = f'https://quotes.toscrape.com/page/1/' # 目标网址

# 打开指定的网页并获取其内容，同时使用上面创建的SSL上下文
page = urlopen(url, context=context) # 请求网页信息

# 使用BeautifulSoup库将网页内容解析为HTML结构
soup = BeautifulSoup(page, 'html.parser') 

# 查找网页中所有具有class属性为'quote'的div元素，这些元素包含了名言的信息
quotes = soup.find_all('div', class_='quote')

# 遍历所有找到的名言元素，并从中提取文本、作者和标签信息
for quote in quotes:
    # 提取名言文本
    text = quote.find('span', class_='text').text
    # 提取作者名
    author = quote.find('small', class_='author').text
    # 提取标签元素
    tags = quote.find('div', class_='tags').find_all('a')
    
    # 创建一个空列表tags_list，用于存储名言的标签
    tags_list = []
    
    # 遍历名言的标签元素，提取每个标签的文本，并将其添加到tags_list列表中
    for tag in tags:
        tags_list.append(tag.text)
    
    # 创建一个包含名言文本、作者和标签列表的列表，并将其添加到all_quotes列表中
    single_quote = [text, author, tags_list]
    all_quotes.append(single_quote)
    
    # 打印当前已抓取的所有名言信息
    print(all_quotes)


[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', ['change', 'deep-thoughts', 'thinking', 'world']]]
[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', ['change', 'deep-thoughts', 'thinking', 'world']], ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'J.K. Rowling', ['abilities', 'choices']]]
[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', ['change', 'deep-thoughts', 'thinking', 'world']], ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'J.K. Rowling', ['abilities', 'choices']], ['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'Albert Einstein', ['insp

### 任务二

In [3]:
# 创建一个不验证SSL证书的上下文
context = ssl._create_unverified_context()

# 初始化一个空列表，用于存储所有名言的信息
all_quotes = []

for page_num in range(1, 11):
    # 指定要抓取的网址
    url = f'https://quotes.toscrape.com/page/1/' # 目标网址

    # 打开指定的网页并获取其内容，同时使用上面创建的SSL上下文
    page = urlopen(url, context=context) # 请求网页信息

    # 使用BeautifulSoup库将网页内容解析为HTML结构
    soup = BeautifulSoup(page, 'html.parser') 

    # 查找网页中所有具有class属性为'quote'的div元素，这些元素包含了名言的信息
    quotes = soup.find_all('div', class_='quote')
    # 遍历所有找到的名言元素，并从中提取文本、作者和标签信息
    for quote in quotes:
        # 提取名言文本
        text = quote.find('span', class_='text').text
        # 提取作者名
        author = quote.find('small', class_='author').text
        # 提取标签元素
        tags = quote.find('div', class_='tags').find_all('a')
        
        # 创建一个空列表tags_list，用于存储名言的标签
        tags_list = []
        
        # 遍历名言的标签元素，提取每个标签的文本，并将其添加到tags_list列表中
        for tag in tags:
            tags_list.append(tag.text)
        
        # 创建一个包含名言文本、作者和标签列表的列表，并将其添加到all_quotes列表中
        single_quote = [text, author, ', '.join(tags_list)]  # 将标签列表转换为逗号分隔的字符串
        all_quotes.append(single_quote)
        
        # 打印当前已抓取的所有名言信息
        print(all_quotes)

[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', 'change, deep-thoughts, thinking, world']]
[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', 'change, deep-thoughts, thinking, world'], ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'J.K. Rowling', 'abilities, choices']]
[['“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”', 'Albert Einstein', 'change, deep-thoughts, thinking, world'], ['“It is our choices, Harry, that show what we truly are, far more than our abilities.”', 'J.K. Rowling', 'abilities, choices'], ['“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”', 'Albert Einstein', 'inspirational, life, live, miracle, m

In [4]:
# 将抓取到的名言信息写入CSV文件
with open('quotes.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # 写入CSV文件的列标题
    csv_writer.writerow(['Quote', 'Author', 'Tags'])
    # 写入所有名言信息
    csv_writer.writerows(all_quotes)

print("爬取完成，并已将数据保存在quotes.csv文件中。")

爬取完成，并已将数据保存在quotes.csv文件中。


### 任务三

In [6]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# 创建一个WebDriver对象，这里使用Chrome
driver = webdriver.Chrome(".\chromedriver.exe")  # 请替换为你的Chrome WebDriver的路径

# 打开目标网页
driver.get("http://quotes.toscrape.com/login")

# 找到用户名和密码输入框，并输入你的用户名和密码
username = "username"
password = "password"

username_field = driver.find_element_by_name("username")
password_field = driver.find_element_by_name("password")

username_field.send_keys(username)
password_field.send_keys(password)

# 提交登录表单
password_field.send_keys(Keys.RETURN)

# 最后，关闭浏览器
driver.quit()


SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 104
Current browser version is 117.0.5938.149 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe


### 任务四

In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re



In [7]:
# 创建一个文件夹用于存储图片
if not os.path.exists("book_images"):
    os.makedirs("book_images")

# 存储书籍信息的列表
book_info_list = []

# 定义目标网址
url = "http://books.toscrape.com"

# 循环遍历不同页面，直到没有下一页为止
while True:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    # 找到当前页面的所有书籍信息
    books = soup.find_all("article", class_="product_pod")

    # 提取每本书的信息
    for book in books:
        title = book.h3.a["title"]
        price = book.select(".price_color")[0].get_text()
        rating = book.select("p.star-rating")[0]["class"][1]
        image_url = url + book.img["src"].replace("../..", "")
        
        # 使用正则表达式去除文件名中的非法字符并截断为较短的长度
        image_name = re.sub(r'[\/:*?"<>|]', '_', title)[:100] + ".jpg"

        # 下载并保存书籍图片
        image_data = requests.get(image_url).content
        with open(os.path.join("book_images", image_name), "wb") as image_file:
            image_file.write(image_data)

        # 将书籍信息添加到列表中
        book_info_list.append([title, price, rating, image_name])

    # 找到下一页的URL，如果没有下一页，则退出循环
    next_page = soup.select("li.next a")
    if not next_page:
        break
    url = url + "/" + next_page[0]["href"]

# 将书籍信息存储为CSV文件
df = pd.DataFrame(book_info_list, columns=["Title", "Price", "Rating", "Image"])
df.to_csv("book_info.csv", index=False)

print("爬取完成，并已将数据保存在book_info.csv文件中。")


爬取完成，并已将数据保存在book_info.csv文件中。


### 任务五

In [1]:
import requests
from bs4 import BeautifulSoup
import csv

In [2]:
def download_pages(url):
    headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.31',
            'Connection': 'keep-alive',
            'cookie':'bid=RzhhFAeVZ6A; _pk_id.100001.4cf6=f9bb9d590013cfa2.1694940965.; ll="118282"; _vwo_uuid_v2=D15FD51E4BB2EFAFEB43D5EB4D5D4EC15|52154092e05dc6a91bdffba591b0a0ee; __yadk_uid=G0ZgnZtQGsNonWzdCjxTlQqLrXjeWFWZ; dbcl2="180151675:GQ2mYirAZlE"; ck=o8iH; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1694953211%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=1; __utma=30149280.104823509.1694940966.1694943126.1694953212.3; __utmc=30149280; __utmz=30149280.1694953212.3.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=223695111.334883944.1694940969.1694943126.1694953212.3; __utmb=223695111.0.10.1694953212; __utmc=223695111; __utmz=223695111.1694953212.3.3.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18015; __utmb=30149280.12.9.1694955032615; ct=y',
            'Referer': 'https://www.douban.com/misc/sorry?original-url=https%3A%2F%2Fmovie.douban.com%2Fsubject%2F35593344%2Fcomments%3Fstart%3D0%26limit%3D20%26status%3DP%26sort%3Dnew_score'
            }
    r = requests.get(url, headers=headers).content
    return r

In [4]:
def parse_html(html, page_num ):
    URL = 'https://movie.douban.com/subject/35593344/comments?status=P'
    soup = BeautifulSoup(html, features='lxml')
    # print(soup)
    page = soup.find('div', attrs={'class':'mod-bd', 'id':'comments'})
    comment_list = []
    # page.find_all('div', attrs={'class':'comment'})
    # print(page)
    for i in page.find_all('div', attrs={'class':'comment'}):
        info = i.find('span', attrs={'class':'comment-info'})
        name = info.find('a').getText()
        text = i.find('span', attrs={'class':'short'}).getText()
        time = i.find('span', attrs={'class':'comment-time'}).getText()
        x = info.find('span')
        if x.getText() == '看过':
            star = x.find_next_sibling('span')['title']
        comment_list.append({'ID': name, 
                                'Time': time,
                                'star': star,
                                'comments':text})

    navi = page.find('div', attrs={'id':'paginator', 'class':'center'})
    # print(navi)
    next_page = navi.find('a', attrs={'class':'next'})
    # print(next_page)
    if (next_page) and page_num<30 :
        next_url = 'https://movie.douban.com/subject/35593344/comments?start='+str(page_num*20)+'&limit=20&status=P&sort=new_score'
        print(next_url)
        print(page_num)
        page_num=page_num+1
        return comment_list, next_url, page_num
    else:
        return comment_list, None, page_num

In [8]:
url = 'https://movie.douban.com/subject/35593344/comments?start=0&limit=20&status=P&sort=new_score'

with open('comments.csv','wt', newline='', encoding='utf_8_sig') as comments:
    cw = csv.DictWriter(comments, fieldnames=['ID', 'Time', 'star','comments']) #fieldnames必须与dict里的keys相同
    cw.writeheader() #先写header避免header被多次写入。
    page_num = 1
    while url:
        html = download_pages(url)
        # print(html)
        comment_list, url, page_num= parse_html(html, page_num=page_num)
        cw.writerows(comment_list)

https://movie.douban.com/subject/35593344/comments?start=20&limit=20&status=P&sort=new_score
1
https://movie.douban.com/subject/35593344/comments?start=40&limit=20&status=P&sort=new_score
2
https://movie.douban.com/subject/35593344/comments?start=60&limit=20&status=P&sort=new_score
3
https://movie.douban.com/subject/35593344/comments?start=80&limit=20&status=P&sort=new_score
4
https://movie.douban.com/subject/35593344/comments?start=100&limit=20&status=P&sort=new_score
5
https://movie.douban.com/subject/35593344/comments?start=120&limit=20&status=P&sort=new_score
6
https://movie.douban.com/subject/35593344/comments?start=140&limit=20&status=P&sort=new_score
7
https://movie.douban.com/subject/35593344/comments?start=160&limit=20&status=P&sort=new_score
8
https://movie.douban.com/subject/35593344/comments?start=180&limit=20&status=P&sort=new_score
9
https://movie.douban.com/subject/35593344/comments?start=200&limit=20&status=P&sort=new_score
10
https://movie.douban.com/subject/35593344/c

AttributeError: 'NoneType' object has no attribute 'find_all'