In [1]:
import time
import json
import requests
from functools import wraps
from concurrent.futures import as_completed
from concurrent.futures import ThreadPoolExecutor
from bs4 import BeautifulSoup
from tqdm import tqdm


class HttpCodeException(Exception):
    pass


def retry(retry_count=5, sleep_time=1):
    def wrapper(func):
        @wraps(func)
        def inner(*args, **kwargs):
            for i in range(retry_count):
                try:
                    res = func(*args, **kwargs)
                    return res
                except:
                    time.sleep(sleep_time)
                    continue
            return None
        return inner
    return wrapper


def get_proxy():
    return requests.get("http://127.0.0.1:5010/get/").text


@retry()
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.87 Mobile Safari/537.36',
        'Host': 'movie.douban.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Referer': 'https://movie.douban.com/top250?start=25&filter='
    }

    # 使用代理,尽量减少IP被封的可能
    # res = requests.get(url, headers=headers, proxies={"http": "http://{}".format(get_proxy())})
    # 不适用代理的方法
    res = requests.get(url, headers=headers)
    time.sleep(3)   # 这里一定要sleep3秒,不然,频繁的抓取会导致IP被封
#     print(res.status_code)
    if res.status_code != 200:
        raise HttpCodeException

    return res.text


def extract_info(html):
    """
    获取电影的详细信息
    :param html:
    :return:
    """

    soup = BeautifulSoup(html, 'html.parser')

    area = ""
    info_div = soup.find('div', attrs={'id': 'info'})
    for child in info_div.children:
        if child.string and child.string.startswith('制片国家/地区'):
            area = child.next_sibling.string.strip()

    info_script = soup.find('script', attrs={'type': 'application/ld+json'})
    info_text = info_script.text.replace('\r', '').replace('\n', '')
    json_data = json.loads(info_text)

    info = {}
    info['name'] = json_data['name']
    info['director'] = json_data['director']    # 导演
    info['actor'] = json_data['actor']   # 主演
    info['datePublished'] = json_data['datePublished']  # 发型日期
    info['genre'] = json_data['genre']   # 电影类型
    info['ratingCount'] = json_data['aggregateRating']['ratingCount'] # 评价人数
    info['ratingValue'] = json_data['aggregateRating']['ratingValue'] # 评分
    info['area'] = area   # 制作国家地区
    desc = list(soup.find('div', attrs={'class': 'indent', 'id': 'link-report'}).children)[3].text.replace('\n', '').replace('\u3000', '').strip()
    info['description'] = desc
    return info


def get_info_by_url(url):
    try:
        html = get_html(url)
        info = extract_info(html)
    except:
        return url
    return info


def produce_url():
    """
    每个页面有25个电影，共10个页面，这10个页面的url可以自己生成
    :return:
    """
    url_style = "https://movie.douban.com/top250?start={index}&filter="
    url_lst = []
    for i in range(0, 250, 25):
        url = url_style.format(index=i)
        print(url)
        url_lst.append(url)

    return url_lst


def get_info_url(page_url):
    """
    获取每个页面25个电影的详细信息的url
    :param page_url:
    :return:
    """
    html = get_html(page_url)
    soup = BeautifulSoup(html, 'html.parser')

    url_lst = []
    ol_node = soup.find('ol', class_='grid_view')
    pic_nodes = ol_node.find_all('div', class_='pic')
    for pic_node in pic_nodes:
        a = pic_node.find('a')
        href = a['href']
        url_lst.append(href)

    return url_lst


def run_multi_thread():
    """
    多线程爬取
    :return:
    """
    res_file = open('movie_data', 'w')
    t1 = time.time()
    url_lst = []
    page_url_lst = produce_url()

    for page_url in page_url_lst:
        page_url_lst = get_info_url(page_url)
        url_lst.extend(page_url_lst)

    # 10个线程进行爬取
    tpool = ThreadPoolExecutor(max_workers=10)
    pbar = tqdm(total=len(url_lst), ascii=True)
    def thread_func(url):
        info = get_info_by_url(url)
        pbar.update(1)
        return info
    obj = []
    for url in url_lst:
        t = tpool.submit(thread_func, url)
        obj.append(t)
    tpool.shutdown()
    for t in obj:
        data = t.result()
        if isinstance(data, str):
            print(data)
        else:
            res_file.write(json.dumps(data, ensure_ascii=False) + "\n")

    res_file.close()
    t2 = time.time()
    print("耗时" + str(t2-t1))

In [1]:
if __name__ == '__main__':
    run_multi_thread()

https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
https://movie.douban.com/top250?start=75&filter=
https://movie.douban.com/top250?start=100&filter=
https://movie.douban.com/top250?start=125&filter=
https://movie.douban.com/top250?start=150&filter=
https://movie.douban.com/top250?start=175&filter=
https://movie.douban.com/top250?start=200&filter=
https://movie.douban.com/top250?start=225&filter=
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200200

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
2

In [2]:
toc_url_lst = produce_url()

https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
https://movie.douban.com/top250?start=75&filter=
https://movie.douban.com/top250?start=100&filter=
https://movie.douban.com/top250?start=125&filter=
https://movie.douban.com/top250?start=150&filter=
https://movie.douban.com/top250?start=175&filter=
https://movie.douban.com/top250?start=200&filter=
https://movie.douban.com/top250?start=225&filter=


In [3]:
toc_html = get_html(toc_url_lst[0])

In [4]:
toc_html

In [77]:
url_lst = []


for page_url in page_url_lst:
    page_url_lst = get_info_url(page_url)
    url_lst.extend(url_lst)

https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
https://movie.douban.com/top250?start=75&filter=
https://movie.douban.com/top250?start=100&filter=
https://movie.douban.com/top250?start=125&filter=
https://movie.douban.com/top250?start=150&filter=
https://movie.douban.com/top250?start=175&filter=
https://movie.douban.com/top250?start=200&filter=
https://movie.douban.com/top250?start=225&filter=


AttributeError: 'NoneType' object has no attribute 'find_all'

In [4]:
moive_url_lst[:10]

['https://movie.douban.com/subject/1292052/',
 'https://movie.douban.com/subject/1291546/',
 'https://movie.douban.com/subject/1292720/',
 'https://movie.douban.com/subject/1295644/',
 'https://movie.douban.com/subject/1292063/',
 'https://movie.douban.com/subject/1292722/',
 'https://movie.douban.com/subject/1291561/',
 'https://movie.douban.com/subject/1295124/',
 'https://movie.douban.com/subject/3541415/',
 'https://movie.douban.com/subject/3011091/']

In [5]:
html = get_html(moive_url_lst[0])

200


In [7]:
soup = BeautifulSoup(html, 'html.parser')

In [26]:
for i  in soup.find_all('div', attrs={'id':'info'})[0]:
#     print(i.string)
    if isinstance(i.string, str) and i.string.startswith('制片国家/地区:'):
        area = i.next_sibling.strip()
        break
area

'美国'

In [29]:
info_script = soup.find('script', attrs={'type': 'application/ld+json'}).text

In [74]:
info_dict = json.loads(info_script)
info_dict

{'@context': 'http://schema.org',
 'name': '肖申克的救赎 The Shawshank Redemption',
 'url': '/subject/1292052/',
 'image': 'https://img3.doubanio.com/view/photo/s_ratio_poster/public/p480747492.webp',
 'director': [{'@type': 'Person',
   'url': '/celebrity/1047973/',
   'name': '弗兰克·德拉邦特 Frank Darabont'}],
 'author': [{'@type': 'Person',
   'url': '/celebrity/1047973/',
   'name': '弗兰克·德拉邦特 Frank Darabont'},
  {'@type': 'Person',
   'url': '/celebrity/1049547/',
   'name': '斯蒂芬·金 Stephen King'}],
 'actor': [{'@type': 'Person',
   'url': '/celebrity/1054521/',
   'name': '蒂姆·罗宾斯 Tim Robbins'},
  {'@type': 'Person',
   'url': '/celebrity/1054534/',
   'name': '摩根·弗里曼 Morgan Freeman'},
  {'@type': 'Person',
   'url': '/celebrity/1041179/',
   'name': '鲍勃·冈顿 Bob Gunton'},
  {'@type': 'Person',
   'url': '/celebrity/1000095/',
   'name': '威廉姆·赛德勒 William Sadler'},
  {'@type': 'Person',
   'url': '/celebrity/1013817/',
   'name': '克兰西·布朗 Clancy Brown'},
  {'@type': 'Person',
   'url': '/celebrity/

In [42]:
json.dumps(info_dict['actor'], ensure_ascii=False)

'[{"@type": "Person", "url": "/celebrity/1054521/", "name": "蒂姆·罗宾斯 Tim Robbins"}, {"@type": "Person", "url": "/celebrity/1054534/", "name": "摩根·弗里曼 Morgan Freeman"}, {"@type": "Person", "url": "/celebrity/1041179/", "name": "鲍勃·冈顿 Bob Gunton"}, {"@type": "Person", "url": "/celebrity/1000095/", "name": "威廉姆·赛德勒 William Sadler"}, {"@type": "Person", "url": "/celebrity/1013817/", "name": "克兰西·布朗 Clancy Brown"}, {"@type": "Person", "url": "/celebrity/1010612/", "name": "吉尔·贝罗斯 Gil Bellows"}, {"@type": "Person", "url": "/celebrity/1054892/", "name": "马克·罗斯顿 Mark Rolston"}, {"@type": "Person", "url": "/celebrity/1027897/", "name": "詹姆斯·惠特摩 James Whitmore"}, {"@type": "Person", "url": "/celebrity/1087302/", "name": "杰弗里·德曼 Jeffrey DeMunn"}, {"@type": "Person", "url": "/celebrity/1074035/", "name": "拉里·布兰登伯格 Larry Brandenburg"}, {"@type": "Person", "url": "/celebrity/1099030/", "name": "尼尔·吉恩托利 Neil Giuntoli"}, {"@type": "Person", "url": "/celebrity/1343305/", "name": "布赖恩·利比 Brian Libby"}, {

In [73]:
desc = list(soup.find('div', attrs={'class': 'indent', 'id': 'link-report'}).children)[3].text.replace('\n', '').replace('\u3000', '').strip()
desc

'20世纪40年代末，小有成就的青年银行家安迪（蒂姆·罗宾斯 Tim Robbins 饰）因涉嫌杀害妻子及她的情人而锒铛入狱。在这座名为肖申克的监狱内，希望似乎虚无缥缈，终身监禁的惩罚无疑注定了安迪接下来灰暗绝望的人生。未过多久，安迪尝试接近囚犯中颇有声望的瑞德（摩根·弗里曼 Morgan Freeman 饰），请求对方帮自己搞来小锤子。以此为契机，二人逐渐熟稔，安迪也仿佛在鱼龙混杂、罪恶横生、黑白混淆的牢狱中找到属于自己的求生之道。他利用自身的专业知识，帮助监狱管理层逃税、洗黑钱，同时凭借与瑞德的交往在犯人中间也渐渐受到礼遇。表面看来，他已如瑞德那样对那堵高墙从憎恨转变为处之泰然，但是对自由的渴望仍促使他朝着心中的希望和目标前进。而关于其罪行的真相，似乎更使这一切朝前推进了一步……                                                                    本片根据著名作家斯蒂芬·金（Stephen Edwin King）的原著改编。'

## CSS Selector

In [6]:
with open('top250-start0.html', 'rb') as f:
    html = f.read()

In [7]:
html_decoded = html.decode("utf-8", "ignore")

In [9]:
soup = BeautifulSoup(html_decoded, 'html.parser')

In [81]:
[''.join([_.string.replace('\xa0', ' ') \
          for _ in hd.select('.title, .other')]).strip() \
    for hd in soup.select('.grid_view li .info .hd')]

['肖申克的救赎 / The Shawshank Redemption / 月黑高飞(港)  /  刺激1995(台)',
 '霸王别姬 / 再见，我的妾  /  Farewell My Concubine',
 '阿甘正传 / Forrest Gump / 福雷斯特·冈普',
 '这个杀手不太冷 / Léon / 杀手莱昂  /  终极追杀令(台)',
 '美丽人生 / La vita è bella / 一个快乐的传说(港)  /  Life Is Beautiful',
 '泰坦尼克号 / Titanic / 铁达尼号(港 / 台)',
 '千与千寻 / 千と千尋の神隠し / 神隐少女(台)  /  千与千寻的神隐',
 "辛德勒的名单 / Schindler's List / 舒特拉的名单(港)  /  辛德勒名单",
 '盗梦空间 / Inception / 潜行凶间(港)  /  全面启动(台)',
 "忠犬八公的故事 / Hachi: A Dog's Tale / 忠犬小八(台)  /  秋田犬八千(港)",
 "海上钢琴师 / La leggenda del pianista sull'oceano / 声光伴我飞(港)  /  一九零零的传奇",
 '机器人总动员 / WALL·E / 瓦力(台)  /  太空奇兵·威E(港)',
 '三傻大闹宝莱坞 / 3 Idiots / 三个傻瓜(台)  /  作死不离3兄弟(港)',
 '楚门的世界 / The Truman Show / 真人Show(港)  /  真人戏',
 '放牛班的春天 / Les choristes / 歌声伴我心(港)  /  唱诗班男孩',
 '星际穿越 / Interstellar / 星际启示录(港)  /  星际效应(台)',
 '大话西游之大圣娶亲 / 西遊記大結局之仙履奇緣 / 西游记完结篇仙履奇缘  /  齐天大圣西游记',
 '熔炉 / 도가니 / 无声呐喊(港)  /  漩涡',
 '疯狂动物城 / Zootopia / 优兽大都会(港)  /  动物方城市(台)',
 '无间道 / 無間道 / Infernal Affairs  /  Mou gaan dou',
 '龙猫 / となりのトトロ / 邻居托托罗  /  邻家的豆豆龙',
 "教父 / The 

In [82]:
[_['href'] for _ in soup.select('.grid_view li .pic a')]

['https://movie.douban.com/subject/1292052/',
 'https://movie.douban.com/subject/1291546/',
 'https://movie.douban.com/subject/1292720/',
 'https://movie.douban.com/subject/1295644/',
 'https://movie.douban.com/subject/1292063/',
 'https://movie.douban.com/subject/1292722/',
 'https://movie.douban.com/subject/1291561/',
 'https://movie.douban.com/subject/1295124/',
 'https://movie.douban.com/subject/3541415/',
 'https://movie.douban.com/subject/3011091/',
 'https://movie.douban.com/subject/1292001/',
 'https://movie.douban.com/subject/2131459/',
 'https://movie.douban.com/subject/3793023/',
 'https://movie.douban.com/subject/1292064/',
 'https://movie.douban.com/subject/1291549/',
 'https://movie.douban.com/subject/1889243/',
 'https://movie.douban.com/subject/1292213/',
 'https://movie.douban.com/subject/5912992/',
 'https://movie.douban.com/subject/25662329/',
 'https://movie.douban.com/subject/1307914/',
 'https://movie.douban.com/subject/1291560/',
 'https://movie.douban.com/subjec