In [1]:
import csv
import time

import requests

from bs4 import BeautifulSoup as bs
import pandas as pd

In [2]:
# Base URL of the Douban Top 250 listing; the crawlers below append a
# `?start=<offset>` query string to it to select a page.
targetURL = 'https://movie.douban.com/top250'
# Browser-like User-Agent string sent as the `user-agent` header of every request.
agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'

Single-threaded version

In [3]:
def getContentFromURL(targetURL, agent, page_num):
    """Crawl the first ``page_num`` listing pages one after another.

    For each page the listing is downloaded and parsed, then the comments of
    every movie linked from that page are fetched as well.

    Args:
        targetURL: Listing URL; ``?start=<offset>`` is appended, with the
            offset advancing by 25 per page.
        agent: Value sent as the ``user-agent`` request header.
        page_num: Number of listing pages to crawl, starting at offset 0.

    Returns:
        A list with one entry per page. Each entry is
        ``[titles, ratings, comment_counts, comments]``, four lists produced
        by the parse helpers for that page.
    """
    pages = []
    for page_index in range(page_num):
        page_url = f'{targetURL}?start={page_index * 25}'
        page_html = requests.get(page_url, headers={'user-agent': agent}).text
        soup = bs(page_html, 'html.parser')

        titles, detail_urls = parse_movie_title(soup)
        ratings = parse_movie_ratings(soup)
        comment_counts = parse_movie_comment_num(soup)
        # Slow step: one extra request per movie on the page.
        comments = parse_movie_comments_for_urls(detail_urls, headers={'user-agent': agent})

        pages.append([titles, ratings, comment_counts, comments])
    return pages

In [4]:
def parse_movie_title(bs_info):
    """Extract movie titles and detail-page links from a parsed listing page.

    Every ``<a>`` found inside a ``<div class="hd">`` contributes exactly one
    title and one URL, so the two returned lists always have the same length
    and matching positions. A ``div.hd`` without any ``<a>`` contributes
    nothing.

    Args:
        bs_info: Parsed page exposing BeautifulSoup's ``find_all`` API.

    Returns:
        A tuple ``(all_titles, movie_urls)``. Each title is the concatenated
        text of all ``<span>`` elements inside the anchor; each URL is the
        anchor's ``href`` attribute.
    """
    all_titles = []
    movie_urls = []
    for hd_div in bs_info.find_all('div', attrs={'class': 'hd'}):
        for anchor in hd_div.find_all('a'):
            movie_urls.append(anchor.get('href'))
            # Record the title together with its URL. Appending it once per
            # div instead would reuse a stale title (or fail on an undefined
            # one) whenever a div has no anchor, and would drop titles when a
            # div has several anchors.
            all_titles.append(''.join(span.get_text() for span in anchor.find_all('span')))
    return all_titles, movie_urls

In [5]:
def parse_movie_ratings(bs_info):
    """Collect the ratings shown on a parsed listing page.

    Args:
        bs_info: Parsed page exposing BeautifulSoup's ``find_all`` API.

    Returns:
        A list of floats, one per ``<span class="rating_num">`` element, in
        the order ``find_all`` returns them.
    """
    rating_spans = bs_info.find_all('span', attrs={'class': 'rating_num'})
    return [float(span.get_text()) for span in rating_spans]

In [6]:
def parse_movie_comment_num(bs_info):
    """Collect the per-movie counts shown in the ``div.star`` blocks of a page.

    Args:
        bs_info: Parsed page exposing BeautifulSoup's ``find_all`` API.

    Returns:
        A list of ints, one per ``<div class="star">`` element, in the order
        ``find_all`` returns them.
    """
    def _count_in(star_div):
        # The count lives in the span that carries neither a `class` nor a
        # `property` attribute.
        label = star_div.find('span', attrs={'class': None, 'property': None}).get_text()
        # Drop the last three characters before converting; presumably a
        # fixed-length text suffix after the digits -- TODO confirm against
        # the page markup.
        return int(label[:-3])

    return [_count_in(star_div) for star_div in bs_info.find_all('div', attrs={'class': 'star'})]

In [7]:
def parse_movie_comments(movie_url, headers):
    """Download one movie page and return its short comments as one string.

    Args:
        movie_url: URL of the movie's detail page.
        headers: Request headers passed through to ``requests.get``.

    Returns:
        The text of every ``<span class="short">`` that has exactly one child
        node, each prefixed with ``'\\nComment: '`` and concatenated. The
        result is an empty string when no span qualifies.
    """
    movie_response = requests.get(movie_url, headers=headers)
    print(f'crawled movie comments {movie_response}')
    soup = bs(movie_response.text, 'html.parser')

    comment_texts = []
    for span in soup.find_all('span', attrs={'class': 'short'}):
        # Only spans with a single child node are kept.
        if len(span.contents) == 1:
            comment_texts.append(span.get_text())

    short_comment = ''.join('\nComment: ' + text for text in comment_texts)
    # Pause for a second after each movie page before returning.
    time.sleep(1)
    return short_comment

In [8]:
def parse_movie_comments_for_urls(movie_urls, headers):
    """Fetch the comments string for each movie URL, sequentially.

    Args:
        movie_urls: Iterable of movie detail-page URLs.
        headers: Request headers forwarded to ``parse_movie_comments``.

    Returns:
        A list with one ``parse_movie_comments`` result per URL, in input order.
    """
    return [parse_movie_comments(url, headers=headers) for url in movie_urls]

In [9]:
# res = getContentFromURL(targetURL, agent, 10)

In [10]:
# rows = []
# for page in range(len(res)):
#     for i in zip(res[page][0], res[page][1], res[page][2], res[page][3]):
#         rows.append(list(i))

In [11]:
# with open('C:\\Users\\hrajzl\\Desktop\\results.csv','w',encoding='utf-8-sig', newline='') as f:
#     w = csv.writer(f)
#     w.writerows(rows)

Multi-threaded version

In [9]:
import queue
import threading
from dataclasses import dataclass

In [10]:
# Crawl results: maps a movie title to its Movie record; filled in by the
# comment-crawler threads.
movies = {}
movies_lock = threading.Lock() # Held by worker threads while they update `movies`

# Both queues are bounded, so a blocking put() waits once the limit is reached.
page_q = queue.Queue(25) # listing-page URLs waiting to be crawled (at most 25 queued)
movie_q = queue.Queue(100) # Movie records waiting for their comments (at most 100 queued)

# Title, rating, number of short comments, and the top 5 popular short comments
@dataclass
class Movie:
    """Record for one movie, filled in step by step by the crawler threads.

    Every field defaults to ``None``. The page crawler sets ``title``,
    ``rating``, ``num_of_comments`` and ``link``; the comment crawler sets
    ``comments`` afterwards.
    """
    title: str = None  # title text as returned by parse_movie_title
    rating: float = None  # rating as returned by parse_movie_ratings
    num_of_comments: int = None  # count as returned by parse_movie_comment_num
    comments: str = None  # comments string as returned by parse_movie_comments
    link: str = None  # URL of the movie's detail page

In [11]:
def page_crawl():
    """Worker loop: crawl listing pages from ``page_q`` and enqueue their movies.

    Takes URLs from ``page_q`` until none arrives within one second. Each
    page is downloaded and parsed, and one ``Movie`` per title (with title,
    rating, comment count and link set; comments left unset) is put on
    ``movie_q``. ``movie_q.put`` blocks while that queue is full.
    """
    while True:
        try:
            page_url = page_q.get(block=True, timeout=1)
        except queue.Empty:
            # No URL showed up for a full second: this worker is done.
            break

        response = requests.get(page_url, headers={'user-agent': agent})
        print(f'crawled page {page_url}')
        soup = bs(response.text, 'html.parser')

        titles, links = parse_movie_title(soup)
        ratings = parse_movie_ratings(soup)
        comment_counts = parse_movie_comment_num(soup)

        # The three parse results are indexed in parallel, driven by the titles.
        for idx, title in enumerate(titles):
            movie_q.put(Movie(
                title=title,
                rating=ratings[idx],
                num_of_comments=comment_counts[idx],
                link=links[idx],
            ))

    print("Page all crawled")

In [12]:
def comments_crawl():
    """Worker loop: fetch comments for queued movies and store the results.

    Takes ``Movie`` records from ``movie_q`` until none arrives within one
    second. For each record the comments are fetched from ``movie.link`` and
    stored on the record, which is then saved in the shared ``movies`` dict
    under its title while ``movies_lock`` is held.
    """
    while True:
        try:
            movie = movie_q.get(block=True, timeout=1)
        except queue.Empty:
            # Nothing arrived for a full second: treat the queue as drained.
            break

        movie.comments = parse_movie_comments(movie.link, headers={'user-agent': agent})

        # The `with` block releases the lock even if the insert raises, so a
        # failure in one worker cannot leave the lock held and block the
        # others. No `global` is needed: the dict is mutated, not rebound.
        with movies_lock:
            movies[movie.title] = movie
    print("Movie comments all crawled")

In [13]:
def page_q_set_up(page_num):
    """Put the URLs of the first ``page_num`` listing pages on ``page_q``.

    Page ``i`` (counting from 0) gets the offset ``start=i * 25``. ``put``
    blocks if ``page_q`` is already full.
    """
    for page in range(page_num):
        page_q.put(f'https://movie.douban.com/top250?start={page * 25}')

In [14]:
def crawler_run(page_threads_size, comments_threads_size):
    """Run the page crawlers and the comment crawlers until all work is done.

    Starts ``page_threads_size`` threads running ``page_crawl``. While they
    run, batches of ``comments_threads_size`` threads running
    ``comments_crawl`` are started and joined, repeating for as long as a
    page thread is still alive or ``movie_q`` still holds items.

    The consumers have to run alongside the producers: ``movie_q`` is bounded
    and ``page_crawl`` enqueues with a blocking ``put``. Joining the page
    threads before any comment thread has started therefore hangs as soon as
    the pages yield more movies than the queue can hold. Batches are repeated
    because a ``comments_crawl`` worker returns once the queue has been empty
    for a second, which can happen while pages are still being downloaded.

    Args:
        page_threads_size: Number of ``page_crawl`` threads to start.
        comments_threads_size: Number of ``comments_crawl`` threads per batch.
            With a value of 0 or less no comment thread is started and the
            call only waits for the page threads.
    """
    page_threads = [threading.Thread(target=page_crawl) for _ in range(page_threads_size)]
    for t in page_threads:
        t.start()

    if comments_threads_size > 0:
        while True:
            batch = [threading.Thread(target=comments_crawl) for _ in range(comments_threads_size)]
            for t in batch:
                t.start()
            for t in batch:
                t.join()
            # Check liveness first: once every producer has exited, all of
            # its puts are done and the emptiness check is reliable.
            producers_alive = any(t.is_alive() for t in page_threads)
            if not producers_alive and movie_q.empty():
                break

    for t in page_threads:
        t.join()

In [31]:
page_q_set_up(2) # Queue the first 2 listing pages for the page-crawler threads

In [32]:
# Start from an empty result dict so results of an earlier run are not mixed in,
# then crawl with 2 page threads and 5 comment threads.
movies = {}
crawler_run(2, 5)

crawled page https://movie.douban.com/top250?start=25
crawled page https://movie.douban.com/top250?start=0
Page all crawled
Page all crawled
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comments <Response [200]>
crawled movie comme

In [33]:
# Spot-check one crawled record: look it up once by its full title key and
# print title, rating, comment count and comments, one value per line.
sample_movie = movies['霸王别姬\xa0/\xa0再见，我的妾  /  Farewell My Concubine']
print(
    sample_movie.title,
    sample_movie.rating,
    sample_movie.num_of_comments,
    sample_movie.comments,
    sep='\n',
)

霸王别姬 / 再见，我的妾  /  Farewell My Concubine
9.6
1401605

Comment: 不解,拍出过霸王别姬的人怎能拍出无极来
Comment: 就凭这个，我愿意原谅陈凯歌一切的烂片 你只要伟大过一次就可以了 就凭这个 哥哥你是我心中永远不朽的传奇 你是全世界最大的角儿
Comment: 城头变幻大王旗，一个《霸王别姬》，一个《活着》，道尽中国现当代史，百年内无可超越。
Comment: 那么好的国粹，连日本人都知道要护着，你们说烧就烧……大多数开始于民国间的故事，最难捱的都是那段时间。
Comment: 往事不要再提，人生已多风雨。他是霸王，你是虞姬，“我本是男儿郎，又不是女娇娥”，万丈红尘蹉跌走过半世纪。寥落繁华不由己，十万春花如梦里。剑还给你，命也还给你。“君王意气尽，贱妾何聊生？”陪你唱罢这出、我便离去...


In [43]:
# Export every crawled movie as one CSV row: title, rating, comment count and
# the comments text with surrounding whitespace stripped.
# NOTE(review): the output path is absolute and user-specific -- adjust it
# before running on another machine.
with open('C:\\Users\\hrajzl\\Desktop\\results.csv', 'w', encoding='utf-8-sig', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows(
        [movie.title, movie.rating, movie.num_of_comments, movie.comments.strip()]
        for movie in movies.values()
    )