In [1]:
from bs4 import BeautifulSoup as bs
import requests
import csv
from expanddouban import getHtml

In [2]:
"""
url example:
  https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影,剧情,美国
"""

def getMovieUrl(category, location):
    """
    Return the URL of the douban movie list for a category and location.

    The category and location are appended, comma-separated, to the
    base ``tags=电影`` query, e.g.
    ``...&tags=电影,剧情,美国``.

    @param category: str, category tag (e.g. '剧情')
    @param location: str, location tag (e.g. '美国')
    @return: str, the full listing URL
    """
    url = 'https://movie.douban.com/tag/#/?sort=S&range=9,10&tags=电影'
    return ','.join([url, category, location])


In [3]:
class Movie:
    """Plain record holding the attributes of a single movie."""

    def __init__(self, name, rate, location, category, info_link, cover_link):
        """Store every constructor argument on the instance under the same name."""
        # Identity and score
        self.name, self.rate = name, rate
        # Tags the movie was listed under
        self.location, self.category = location, category
        # Links to the detail page and the cover image
        self.info_link, self.cover_link = info_link, cover_link


In [4]:
# Build a sample listing URL for the '科幻' category in '美国'.
# NOTE(review): this module-level `url` does not appear to be read by the later cells in view.
url = getMovieUrl('科幻', '美国')

In [5]:
def extractOnePage(url):
    """
    Fetch one listing page and extract every movie item found on it.

    @param url: str, page URL handed to getHtml
    @return: list with one extractOneItem result per ``a.item`` element
             inside the ``div.list-wp`` container; an empty list when
             that container is not present in the page.
    """
    page = bs(getHtml(url), 'lxml')
    container = page.find(name='div', attrs={'class': 'list-wp'})
    # Guard clause: no list wrapper means there is nothing to extract.
    if not container:
        return []
    anchors = container.find_all(name='a', attrs={'class': 'item'})
    return list(map(extractOneItem, anchors))

def extractOneItem(item):
    '''
    Extract the title, rating and cover image URL from one movie item.

    @param item: parsed element for one movie entry (as yielded by
                 extractOnePage); must support ``find``.
    @return: tuple ``(title, rate, img)`` where title and rate are the
             text of the elements with class "title" / "rate" and img
             is the ``src`` attribute of the first ``img`` element.
    @raise AttributeError: if the title, rate or img element is missing
                           (``find`` returned None).
    '''
    # attrs must be a dict mapping attribute name -> value. The previous
    # {'class', 'title'} was a *set* literal, which only matched by accident
    # and would also match an element whose class is literally "class".
    title = item.find(attrs={'class': 'title'}).text
    rate = item.find(attrs={'class': 'rate'}).text
    img = item.find('img').attrs['src']
    return title, rate, img

def getMovies(cat, loc):
    """
    Collect the movies listed on the page for one category/location pair.

    @param cat: str, category tag
    @param loc: str, location tag
    @return: list of tuples ``(title, rate, loc, cat, img)``; falsy
             entries returned by extractOnePage are dropped.
    """
    movies = []
    for record in extractOnePage(getMovieUrl(cat, loc)):
        if not record:
            continue
        title, rate, cover = record[0], record[1], record[2]
        movies.append((title, rate, loc, cat, cover))
    return movies

In [6]:
# Fetch the movie tuples for one category/location pair ('剧情' in '大陆').
data = getMovies('剧情', '大陆')

In [7]:
def saveMoviesToCSV(cat, loc, path='movies.csv', mode='w'):
    """
    Fetch the movies for one category/location pair and write them to a CSV file.

    Each row is one tuple returned by getMovies, written comma-delimited.

    @param cat: str, category tag
    @param loc: str, location tag
    @param path: str, destination file; defaults to 'movies.csv' in the
                 current working directory
    @param mode: str, file mode passed to open(); use 'a' to append the
                 rows of several category/location pairs to one file
    @return: None
    """
    # The original referenced the undefined names `lox` and `f`; use the
    # actual `loc` argument and open the output file explicitly.
    data = getMovies(cat, loc)
    # newline='' lets the csv module control line endings itself;
    # utf-8 keeps the Chinese titles and tags intact.
    with open(path, mode, newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter=',').writerows(data)

In [8]:
# Category tags iterated by getAllTop3 (one statistics entry per category).
# NOTE(review): '黑色' and '幽默' are listed as two separate tags; if the site
# treats '黑色幽默' as a single tag these should be merged — confirm.
cats = [
    '剧情', '爱情', '喜剧', '科幻',
    '动作', '悬疑', '犯罪', '恐怖',
    '青春', '励志', '战争', '文艺',
    '黑色', '幽默', '传记', '情色',
    '暴力','音乐','家庭'
]

# Location tags; getCatStat spawns one getMovies call per entry.
locs = [
    '大陆', '美国', '香港', '台湾',
    '日本', '韩国', '英国', '法国',
    '德国', '意大利', '西班牙',
    '印度', '泰国', '俄罗斯', '伊朗',
    '加拿大', '澳大利亚', '爱尔兰',
    '瑞典', '巴西', '丹麦',
]

# gevent supplies the greenlet helpers (spawn / joinall) used by getCatStat.
from gevent import monkey, spawn, joinall
# Patch the standard socket module so it cooperates with gevent.
# NOTE(review): `requests` is imported in the first cell, i.e. before this
# patch runs; monkey-patching is normally done as early as possible —
# confirm the late patch still has the intended effect.
monkey.patch_socket()
def getCatStat(cat):
    """
    Count the movies found per location for one category.

    One getMovies call is spawned per entry of the module-level `locs`
    list, and all of them are joined before counting.

    @param cat: str, category tag
    @return: dict mapping location (taken from the first row of each
             result) to the number of rows returned for it; empty or
             falsy results are skipped.
    """
    jobs = []
    for loc in locs:
        jobs.append(spawn(getMovies, cat, loc))

    counts = {}
    for job in joinall(jobs):
        rows = job.value
        if rows:
            # Index 2 of a getMovies row is the location tag.
            counts[rows[0][2]] = len(rows)
    return counts

def getTop3Loc(cat):
    """
    Report the (up to) three locations with the most movies in a category.

    @param cat: str, category tag
    @return: dict ``{'type': cat, 'stat': {...}}`` where ``stat`` maps
             'top1', 'top2', 'top3' to ``{'loc', 'ct', 'pct'}``:
             the location, its movie count, and its share of the
             category total as a percentage rounded to 2 decimals.
             When fewer than three locations have data, only the
             available ranks are present (``stat`` may be empty).
    """
    stat = sorted(getCatStat(cat).items(), key=lambda item: item[1])
    total = sum(ct for _, ct in stat)
    statDict = {}
    # Reverse the ascending order so the largest count comes first, then
    # slice: unlike indexing stat[-1-i], slicing does not raise IndexError
    # when fewer than three locations returned data.
    for rank, (loc, ct) in enumerate(stat[::-1][:3], start=1):
        # Guard against a zero total so an all-zero stat cannot divide by zero.
        pct = round(ct / total * 100, 2) if total else 0.0
        statDict['top' + str(rank)] = {'loc': loc, 'ct': ct, 'pct': pct}
    return {'type': cat, 'stat': statDict}


In [9]:
def getAllTop3():
    """
    Compute the top-3 location statistics for every category.

    @return: list with one getTop3Loc result per entry of the
             module-level `cats` list, in the same order.
    """
    return list(map(getTop3Loc, cats))

In [44]:
%%time
# Time the top-3 computation for a single category (cats[5], i.e. '悬疑').
# NOTE(review): the recorded output below shows pct as a fraction (0.36...),
# while getTop3Loc multiplies by 100 and rounds to 2 decimals — the output
# appears to predate that change; re-run the cell to refresh it.
print(getTop3Loc(cats[5]))

{'type': '悬疑', 'stat': {'top1': {'loc': '英国', 'ct': 20, 'pct': 0.36363636363636365}, 'top2': {'loc': '美国', 'ct': 17, 'pct': 0.3090909090909091}, 'top3': {'loc': '日本', 'ct': 7, 'pct': 0.12727272727272726}}}
CPU times: user 646 ms, sys: 161 ms, total: 808 ms
Wall time: 1min


In [11]:
%%time
# NOTE(review): the full run over every category is left disabled —
# presumably because a single category already took about a minute of wall
# time in the recorded run above; confirm before re-enabling.
# allStats = getAllTop3()

CPU times: user 4 µs, sys: 1e+03 ns, total: 5 µs
Wall time: 7.87 µs
