In [156]:
from bs4 import BeautifulSoup as bs
import requests
import csv
import time
from expanddouban import getHtml

In [157]:
# Re-import expanddouban so that edits made to the module on disk are picked up
# without restarting the kernel, then rebind the two helpers this notebook uses.
# importlib.reload is used instead of imp.reload: the imp module is deprecated
# and no longer exists in Python 3.12+.
from importlib import reload
import expanddouban
reload(expanddouban)
getHtml = expanddouban.getHtml
newBrowser = expanddouban.newBrowser

In [158]:
# Category (genre) tags; each entry is inserted verbatim into the `tags=` part of
# the listing URL. '黑色幽默' (black humour) is a single tag and must not be
# split into '黑色' and '幽默'.
cats = [
    '剧情', '爱情', '喜剧', '科幻',
    '动作', '悬疑', '犯罪', '恐怖',
    '青春', '励志', '战争', '文艺',
    '黑色幽默', '传记', '情色',
    '暴力', '音乐', '家庭'
]

# Location (country/region) tags; each entry is inserted verbatim into the
# `tags=` part of the listing URL. Order matters: the batch scraper pairs
# browser tabs with this list by position.
locs = [
    '大陆', '美国', '香港', '台湾', '日本', '韩国', '英国',
    '法国', '德国', '意大利', '西班牙', '印度', '泰国', '俄罗斯',
    '伊朗', '加拿大', '澳大利亚', '爱尔兰', '瑞典', '巴西', '丹麦',
]

In [159]:
def getMovieUrl(category, location):
    """
    Return the URL of the douban movie list for the given category and location.

    The category and location are appended, comma-separated, to a fixed base
    URL whose `tags=` parameter already starts with '电影'.

    url example:
      https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,剧情,美国

    @param category: str
    @param location: str
    @return: str
    """
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    return ','.join([url, category, location])


In [160]:
class Movie:
    """
    Plain record describing one movie.

    Every constructor argument is stored unchanged on the attribute of the
    same name; no validation or conversion is performed.
    """

    def __init__(self, name, rate, location, category, info_link, cover_link):
        """
        @param name: movie title
        @param rate: rating value
        @param location: location tag of the movie
        @param category: category tag of the movie
        @param info_link: link to the movie's detail page
        @param cover_link: link to the movie's cover image
        """
        self.name, self.rate = name, rate
        self.location, self.category = location, category
        self.info_link, self.cover_link = info_link, cover_link


In [161]:
def extractOnePage(page):
    """
    Parse the markup of one listing page and extract every movie entry on it.

    @param page: markup of a listing page, parsed with the 'lxml' parser
    @return: list holding the extractOneItem result for each `a.item` element
             inside the first `div.list-wp`; an empty list when the page has
             no such div
    """
    container = bs(page, 'lxml').find(name='div', attrs={'class': 'list-wp'})
    if not container:
        return []
    entries = container.find_all(name='a', attrs={'class': 'item'})
    return list(map(extractOneItem, entries))

def extractOneItem(item):
    '''
    Find the title, rate and cover image URL of one movie entry.

    @param item: element of a single movie entry; it must offer a
                 BeautifulSoup-style find() (extractOnePage passes `a.item` tags)
    @return: (title, rate, img) tuple, or None when the entry lacks the title
             element, the rate element, the <img> element or its `src`
             attribute, so callers can drop it with a plain truthiness filter
    '''
    # attrs has to be a dict mapping attribute name to value. A set such as
    # {'class', 'title'} is interpreted as "class is any of these values" and
    # would also match an element whose class is literally "class".
    titleTag = item.find(attrs={'class': 'title'})
    rateTag = item.find(attrs={'class': 'rate'})
    imgTag = item.find('img')
    if titleTag is None or rateTag is None or imgTag is None:
        return None

    img = imgTag.attrs.get('src')
    if img is None:
        return None
    return titleTag.text, rateTag.text, img

def fetchOneCat(cat):
    """
    Build the listing URL of every location for one category and hand the
    whole batch to newBatch.

    @param cat: str, category tag
    @return: whatever newBatch returns for the list of URLs
    """
    # NOTE(review): newBatch is not defined in the visible part of this
    # notebook; confirm where it comes from (newBatchForOneCat takes a
    # category, not a URL list, so it is not a drop-in substitute).
    return newBatch([getMovieUrl(cat, loc) for loc in locs])

def getMovies(cat, loc):
    """
    Download the listing page of one category/location pair and return its movies.

    @param cat: str, category tag
    @param loc: str, location tag
    @return: list of (title, rate, loc, cat, img) tuples, one per movie entry
             found on the page
    """
    url = getMovieUrl(cat, loc)
    # extractOnePage parses markup, so the page has to be downloaded first;
    # handing it the URL string itself can never yield any entries.
    # Assumes getHtml(url) returns the page markup as a string — TODO confirm
    # against expanddouban.
    page = getHtml(url)
    moviesRawData = extractOnePage(page)
    return [(mv[0], mv[1], loc, cat, mv[2]) for mv in moviesRawData if mv]


In [162]:
def newBatchForOneCat(cat):
    """
    Scrape the listing page of every location for one category in one browser.

    For each location URL a script is injected into the current window. The
    script schedules a navigation of that window to the URL, schedules
    `autoNext` (which invokes the click listener of the `.more` element up to
    25 times, 2 s apart) after 5 s and `window.stop()` after 15 s, and opens a
    new blank window that becomes the target of the next URL. After a fixed
    10 s wait the source of every window except the trailing blank one is
    parsed with extractOnePage.

    @param cat: str, category tag
    @return: list with one inner list per location; each inner list holds
             (title, rate, loc, cat, img) tuples
    """
    urls = [getMovieUrl(cat, loc) for loc in locs]

    browser = newBrowser()
    # Translation of the note embedded in the script below: nextBtn.click()
    # does not work, because a background tab suspends the click until the tab
    # is brought to the foreground, so the listener is dug out and called
    # directly.
    # NOTE(review): getEventListeners is a Chrome DevTools console utility;
    # confirm it is available to scripts injected through execute_script.
    baseScript = '''
    function spawnNew (url) {
        window.sleep = async function (time) {
            return new Promise((res, rej) => {
                setTimeout(res, time)
            })
        }

        window.autoNext = async function () {
            nextBtn = document.querySelector('.more')
            for(let i=0; i<25; i++) {
                console.log('show more')
                // nextBtn.click() 亲测不起作用，因为在后台的标签会挂起click，
                // 等到切换此tab到前台才真正执行click的listener函数，
                // 所以要直接把listenser函数暴力挖出，直接执行
                getEventListeners(nextBtn).click[0].listener()
                await sleep(2000)
                nextBtn = document.querySelector('.more')
                if (!nextBtn) break
            }
        }
        
        setTimeout(() => window.location = url, 0)
        setTimeout(() => window.stop(), 15000)
        setTimeout(() => window.autoNext(), 5000)
        window.open()
    }
    '''
    try:
        for url in urls:
            script = baseScript + f'\nspawnNew("{url}")'
            browser.execute_script(script)
            # Move to the blank window the script just opened so that the next
            # URL is loaded there.
            browser.switch_to.window(browser.window_handles[-1])

        # Give the tabs time to load before their sources are read.
        time.sleep(10)

        result = []
        # The last handle is the blank window opened by the final spawnNew call.
        # Assumes window_handles lists the tabs in the order they were opened,
        # so that tab i belongs to locs[i] — TODO confirm for the driver in use.
        pageHandles = browser.window_handles[:-1]
        for handle, loc in zip(pageHandles, locs):
            browser.switch_to.window(handle)
            rawMoviesData = extractOnePage(browser.page_source)
            refinedCatData = [(mv[0], mv[1], loc, cat, mv[2]) for mv in rawMoviesData if mv]
            result.append(refinedCatData)
        return result
    finally:
        # Always release the browser, even when scraping fails half-way, and do
        # so as soon as the data has been collected.
        browser.quit()

In [163]:
def top3ForOneCat(cat):
    """
    Rank the locations of one category by how many movies were scraped for them.

    @param cat: str, category tag
    @return: dict {cat: [first, second, third]} where each slot is a
             (loc, count) tuple, or None when fewer than three locations
             yielded any movie
    """
    perLocation = newBatchForOneCat(cat)
    # One (loc, count) pair per location that produced at least one movie; the
    # location is read from the third field of the first movie tuple.
    counts = [(movies[0][2], len(movies)) for movies in perLocation if movies]
    counts.sort(key=lambda pair: pair[1], reverse=True)
    # Pad with None so that three slots are always available.
    topThree = (counts + [None, None, None])[:3]
    return {cat: topThree}

In [None]:
# Example run for a single category; the returned dict is shown as the cell output.
top3ForOneCat('科幻')

In [131]:
# gevent provides the greenlet helpers (spawn, joinall) used by getAllTop3s.
from gevent import monkey, spawn, joinall
# Module-level side effect: patches the socket module only.
# NOTE(review): other blocking calls such as time.sleep are presumably left
# unpatched, so greenlets that rely on them will not yield — confirm whether
# broader patching is intended.
monkey.patch_socket()
def getAllTop3s():
    """
    Compute the top-3 ranking of every category, one greenlet per category.

    @return: list with the value of each greenlet, in the order of `cats`
    """
    greenlets = []
    for cat in cats:
        greenlets.append(spawn(top3ForOneCat, cat))
    # Block until every greenlet has finished.
    joinall(greenlets)
    return [greenlet.value for greenlet in greenlets]