In [11]:
from bs4 import BeautifulSoup as bs
import requests
import csv
import time
from expanddouban import getHtml

In [12]:
from imp import reload
import expanddouban
reload(expanddouban)
getHtml = expanddouban.getHtml
newBrowser = expanddouban.newBrowser

In [13]:
cats = [
    '剧情', '爱情', '喜剧', '科幻',
    '动作', '悬疑', '犯罪', '恐怖',
    '青春', '励志', '战争', '文艺',
    '黑色', '幽默', '传记', '情色',
    '暴力','音乐','家庭'
]

locs = [
    '大陆', '美国', '香港', '台湾',
    '日本', '韩国', '英国', '法国',
    '德国', '意大利', '西班牙',
    '印度', '泰国', '俄罗斯', '伊朗',
    '加拿大', '澳大利亚', '爱尔兰',
    '瑞典', '巴西', '丹麦',
]

In [14]:
"""
url example:
  https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,剧情,美国
"""

"""
return a string corresponding to the URL of douban movie lists given category and location.
@param category: str
@param location: str
"""
def getMovieUrl(category, location):
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    return ','.join([url, category, location])


In [15]:
class Movie:
    def __init__(self, name, rate, location, category, info_link, cover_link):
        self.name = name
        self.rate = rate
        self.location = location
        self.category = category
        self.info_link = info_link
        self.cover_link = cover_link


In [16]:
def extractOnePage (page):
    soup = bs(page, 'lxml')
    listWpDiv = soup.find(name='div', attrs={'class': 'list-wp'})
    if listWpDiv:
        listWp = listWpDiv.find_all(name='a', attrs={'class': 'item'})
        return [extractOneItem(item) for item in listWp]
    else:
        return []

def extractOneItem (item):
    '''
    find title, rate, img
    '''
    title = item.find(attrs={'class', 'title'}).text
    rate = item.find(attrs={'class', 'rate'}).text
    img = item.find('img').attrs['src']
    return title, rate, img

def fetchOneCat (cat):
    urls = [getMovieUrl(cat, loc) for loc in locs]
    pages = newBatch(urls)
    return pages

def getMovies (cat, loc):
    url = getMovieUrl(cat, loc)
    moviesRawData = extractOnePage(url)
    return [(mv[0], mv[1], loc, cat, mv[2]) for mv in moviesRawData if mv]


In [22]:
def newBatchForOneCat (cat):
    # urls = [getMovieUrl(cat, loc) for loc in locs]
    urls = [getMovieUrl(cat, loc) for loc in locs]
    
    browser = newBrowser()
    baseScript = '''
    function spawnNew (url) {        
        setTimeout(() => { window.location = url }, 0)
        setTimeout(() => { window.stop() }, 4000)
        window.open()
    }
    '''
    showMoreScript = '''
    window.sleep = async function (time) {
        return new Promise((res, rej) => {
            setTimeout(res, time)
        })
    }

    window.autoNext = async function () {
        nextBtn = document.querySelector('.more')
        for(let i=0; i<50; i++) {
            console.log('show more')
            nextBtn.click()
            await sleep(2000)
            nextBtn = document.querySelector('.more')
        }
    }
    window.autoNext()
    '''
    for url in urls:
        script = baseScript + f'\nspawnNew("{url}")'
        browser.execute_script(script)
        browser.switch_to.window(browser.window_handles[-1])
    
    time.sleep(1.5)
    for i in range(len(browser.window_handles) - 1):
        browser.switch_to.window(browser.window_handles[i])
        time.sleep(0.2)
        browser.execute_script(showMoreScript)
    
    time.sleep(1.5)
    result = []
    for i in range(len(browser.window_handles) - 1): # len - 1 for last is 'about:blank'
        browser.switch_to.window(browser.window_handles[i])
        page = browser.page_source
        loc = locs[i]
        rawMoviesData = extractOnePage(page)
        refinedCatData = [(mv[0], mv[1], loc, cat, mv[2]) for mv in rawMoviesData if mv]
        result.append(refinedCatData)
    browser.quit()
    return result

In [23]:
def top3ForOneCat (cat):
    catData = newBatchForOneCat(cat)
    stats = [(d[0][2], len(d)) for d in catData if d] # [(loc, count), ...]
    stats = sorted(stats, key=lambda item: item[1], reverse=True)
    stats += [None, None, None] # 保证有3个
    return {cat: [stats[0], stats[1], stats[2]]}

In [24]:
# top3ForOneCat('剧情')

In [25]:
from gevent import monkey, spawn, joinall
monkey.patch_socket()
def getAllTop3s ():
    gs = [spawn(top3ForOneCat, cat) for cat in cats]
    joinall(gs)
    return [g.value for g in gs]

In [26]:
rankTop3Data = getAllTop3s()

In [27]:
rankTop3Data

[{'剧情': [('英国', 85), ('日本', 81), ('美国', 39)]},
 {'爱情': [('美国', 45), ('英国', 27), ('日本', 20)]},
 {'喜剧': [('美国', 161), ('英国', 79), ('日本', 24)]},
 {'科幻': [('英国', 38), ('日本', 28), ('美国', 19)]},
 {'动作': [('美国', 25), ('日本', 19), ('英国', 7)]},
 {'悬疑': [('英国', 36), ('美国', 17), ('日本', 7)]},
 {'犯罪': [('英国', 40), ('美国', 21), ('日本', 5)]},
 {'恐怖': [('美国', 10), ('日本', 3), ('英国', 3)]},
 {'青春': [('美国', 23), ('日本', 15), ('英国', 13)]},
 {'励志': [('美国', 49), ('日本', 19), ('英国', 11)]},
 {'战争': [('美国', 19), ('日本', 13), ('英国', 13)]},
 {'文艺': [('美国', 22), ('法国', 17), ('英国', 12)]},
 {'黑色': [('美国', 9), ('英国', 6), ('日本', 1)]},
 {'幽默': [('美国', 2), ('英国', 1), ('法国', 1)]},
 {'传记': [('美国', 67), ('英国', 59), ('日本', 11)]},
 {'情色': [('美国', 2), ('日本', 1), None]},
 {'暴力': [('美国', 15), ('英国', 3), ('俄罗斯', 1)]},
 {'音乐': [('美国', 306), ('英国', 197), ('德国', 48)]},
 {'家庭': [('美国', 66), ('日本', 17), ('英国', 10)]}]

In [83]:
import pandas as pd

rowData = []
index = []
[rowData.append(list(rec.values())[0]) or index.append(list(rec.keys())[0]) for rec in rankTop3Data]

df = pd.DataFrame(data=rowData, index=index, columns=['Gold', 'Silver', 'Bronze'])

In [86]:
df.to_csv('output.csv')