# Initialize Code

In [1]:
# Notation used in the comments of this notebook:
# <_1 1st, _2 2nd, ... > : tuple
# [ ... ] : list
# { keytype, valtype } : dict

In [2]:
# import request
import requests

# import pattern
from pattern import web
from pattern.web import Element
from pattern.web import plaintext

# import regular expression
import re

# import json
import json

# import time (for sleep)
import time

In [3]:
def get_page(url):
    """Fetch `url` with an HTTP GET and return the response body as text.

    Returns:
        The response text, or None when the status code is anything other
        than 200.

    Exceptions raised by `requests.get` itself are not caught here and
    propagate to the caller.
    """
    r = requests.get(url)

    # Compare by value. The previous `is not 200` tested object identity,
    # which only works by accident for small ints on CPython.
    if r.status_code != 200:
        return None

    return r.text

# Get Movie Counts per Year

Crawl 'http://movie.naver.com/movie/sdb/browsing/bmovie_open.nhn' to get the number of movies listed for each year.




In [4]:
def get_movie_count_on_year():
    """Crawl the per-year browsing index and return the movie count of each year.

    Every <td> inside div#old_content is converted to plain text and split on
    runs of non-word characters; the first token becomes the key (year) and
    the second token is converted to int (count).

    Returns:
        { str year, int count }, or None when the page cannot be fetched,
        div#old_content is not found exactly once, or it contains no <td>.

    Raises:
        IndexError / ValueError if a cell does not provide two tokens or the
        second token is not an integer (unchanged from the original code).
    """
    # crawl $url_dir
    url_dir = 'http://movie.naver.com/movie/sdb/browsing/bmovie_open.nhn'
    page = get_page(url_dir)
    if page is None:
        # get_page() reports a non-200 answer as None: nothing to parse
        return None

    # find data
    elem_root = Element(page)

    elem_old_content = elem_root('div[id="old_content"]')
    if len(elem_old_content) != 1:
        return None

    elem_td = elem_old_content[0]('td')
    if len(elem_td) == 0:
        return None

    # parse and save at result { str year, int count }
    result = dict()
    for td in elem_td:
        year_count = re.split(r'\W+', plaintext(td.content))[0:2]
        result[year_count[0]] = int(year_count[1])

    return result

In [5]:
# Crawl the per-year movie counts once ({ str year, int count }).
# get_movie_count_on_year() returns None when the crawl fails, and the cells
# below index into this value, so check it before continuing.
movie_counts = get_movie_count_on_year()

# Get MetaData For Each Movie

Crawl http://movie.naver.com/movie/sdb/browsing/bmovie.nhn?open=XXXX&page=XXXX to get pairs of (movie name, movie code).

The movie code is used to look up the movie's info at http://movie.naver.com/movie/bi/mi/detail.nhn?code=XXXX


In [6]:
# return [ <str name, str code> ]
def get_movie_list_of_year_page(year, page):
    """Crawl one page of the per-year movie directory.

    Args:
        year: year as a string; concatenated into the `open=` query parameter.
        page: page number as a string; concatenated into the `page=` parameter.

    Returns:
        [ <str name, str code> ], one tuple per <li> of ul.directory_list that
        contains an anchor. `name` is the raw content of the first anchor (not
        converted to plain text). `code` is the second piece obtained by
        splitting the anchor's href on non-digit runs when that split yields
        exactly two pieces, otherwise None.
        An empty list is returned when the page cannot be fetched or does not
        have the expected layout.
    """
    url_base = 'http://movie.naver.com/movie/sdb/browsing/bmovie.nhn?'
    url_append = 'open=' + year + '&page=' + page

    # find data (kept in its own name so the `page` argument is not shadowed)
    html = get_page(url_base + url_append)
    if html is None:
        # get_page() reports a non-200 answer as None: nothing to parse
        return []

    elem_root = Element(html)

    elem_old_content = elem_root('div[id="old_content"]')
    if len(elem_old_content) != 1:
        return []

    elem_li = elem_old_content[0]('ul[class="directory_list"] > li')

    # parse and save to result
    result = []
    for li in elem_li:
        anchors = li('a')
        if len(anchors) == 0:
            # an entry without a link has neither a name nor a code to report
            continue
        anchor = anchors[0]

        pieces = re.split(r'\D+', anchor.attrs['href'])
        code = pieces[1] if len(pieces) == 2 else None

        result.append((anchor.content, code))

    return result

def get_movie_list_of_year(year, page_size=20):
    """Collect the (name, code) pairs of every directory page of `year`.

    The number of pages is the movie count of the year (read from the
    module-level `movie_counts`) divided by `page_size`, rounded up.

    Args:
        year: year key into `movie_counts` (a string).
        page_size: number of movies assumed per directory page; defaults to
            20, the value that was previously hard-coded.

    Returns:
        One flat list holding the results of get_movie_list_of_year_page()
        for pages 1..page_count, in page order.

    Raises:
        KeyError if `year` is not a key of `movie_counts`.
    """
    movie_count = movie_counts[year]

    # Ceiling division with `//`, so the result stays an int regardless of
    # how `/` behaves on the running interpreter.
    page_count = (movie_count + page_size - 1) // page_size

    list_per_year = []
    for page_no in range(1, page_count + 1):
        # extend() grows the list in place instead of rebuilding it for every page
        list_per_year.extend(get_movie_list_of_year_page(year, str(page_no)))

    return list_per_year

In [7]:
# filter: a count of 0 makes the save loop below skip the year
# (why these particular keys are excluded is not recorded -- TODO confirm)
for skipped_year in ('1940', '1950', '1960', '1970', '1980', '2017'):
    movie_counts[skipped_year] = 0

# save to json: one 'movie_list_<year>.txt' file per year with a non-zero count
for year in movie_counts:
    if movie_counts[year] != 0:
        movie_list = get_movie_list_of_year(year)

        # `with` closes the file even when json.dump raises
        with open('movie_list_' + year + '.txt', 'w') as data_file:
            json.dump(movie_list, data_file, ensure_ascii=True)


# Load MetaData

In [8]:
def load_list_of_year(year):
    """Load the movie list that was dumped to 'movie_list_<year>.txt'.

    Args:
        year: year as a string; it is concatenated into the file name, which
            is resolved relative to the current working directory.

    Returns:
        The decoded JSON content of the file.

    Raises:
        IOError / OSError if the file cannot be opened, ValueError if it does
        not contain valid JSON.
    """
    # `with` closes the file even when json.load raises
    with open('movie_list_' + year + '.txt', 'r') as data_file:
        return json.load(data_file)

In [41]:
# Years whose movie lists are loaded from the 'movie_list_<year>.txt' JSON dumps.
# The earlier list (kept commented out below) also contained '2015' and '2014';
# they are absent from the active list -- reason not recorded, TODO confirm.
#years = ['2015', '2014', '2016', '2011', '2010', '2013', '2012', '1991', '1990', 
#         '1993', '1992', '1995', '1994', '1997', '1996', '1999', '1998', '2002', 
#         '2003', '2000', '2001', '2006', '2007', '2004', '2005', '2008', '2009']
years = ['2016', '2011', '2010', '2013', '2012', '1991', '1990', 
         '1993', '1992', '1995', '1994', '1997', '1996', '1999', '1998', '2002', 
         '2003', '2000', '2001', '2006', '2007', '2004', '2005', '2008', '2009']

In [42]:
# { str year, loaded movie list } -- one load_list_of_year() call per entry of `years`
movie_list_of_year = dict(
    (year, load_list_of_year(year))
    for year in years
)


# Get MovieData
Movie data is available at http://movie.naver.com/movie/bi/mi/detail.nhn?code=XXXX and http://movie.naver.com/movie/bi/mi/point.nhn?code=XXXX
 




In [43]:

def _joined_plaintext(elements):
    """Concatenate the plain text of every element into a single string."""
    return ''.join([plaintext(elem.content) for elem in elements])


def _plaintext_list(elements):
    """Return the plain text of every element as a list."""
    return [plaintext(elem.content) for elem in elements]


# { str param, object obj }
# param :
#    name            : plain text of the title anchor
#    watcher_rating  : concatenated rating text, or None
#    netizen_rate    : concatenated rating text, or None
#    expert_rate     : concatenated rating text, or None
#    step1           : [ raw <span> source ] (genre, nation, runtime, open_date)
#    director        : [ str ]
#    actor           : [ str ]
# NOTE(review): the keys really are 'netizen_rate' / 'expert_rate' (not
# '..._rating' like 'watcher_rating'); they are kept as they are so that
# already dumped JSON files stay readable.
def get_movie_info(code):
    """Crawl the detail page of the movie identified by `code`.

    Args:
        code: movie code; converted with str() and appended to the detail URL.

    Returns:
        A dict with the keys listed in the comment above, or None when the
        page cannot be fetched (get_page() returned None), when
        div.mv_info_area is not found exactly once, or when the title anchor
        is missing.
        The three rating entries are None when div.main_score is not found
        exactly once; 'step1', 'actor' and 'director' fall back to an empty
        list when their section is missing.
    """
    # result { str param, object obj }
    result = dict()

    url_base = 'http://movie.naver.com/movie/bi/mi/detail.nhn?code='
    # go through get_page() so a non-200 answer is rejected like everywhere else
    page = get_page(url_base + str(code))
    if page is None:
        return None

    elem_root = Element(page)

    elem_info_area = elem_root('div[class="mv_info_area"]')
    if len(elem_info_area) != 1:
        return None
    info_area = elem_info_area[0]

    # name
    elem_movie_name = info_area('h3[class="h_movie"] > a')
    if len(elem_movie_name) == 0:
        return None

    result['name'] = plaintext(elem_movie_name[0].content)

    # rate: each rating is spread over several <em> elements whose texts are
    # glued together in document order
    elem_rating_group = info_area('div[class="main_score"]')
    if len(elem_rating_group) == 1:
        rating_group = elem_rating_group[0]

        # watcher rate
        result['watcher_rating'] = _joined_plaintext(
            rating_group('a[id="actualPointPersentBasic"] > div > em'))

        # netizen rate
        result['netizen_rate'] = _joined_plaintext(
            rating_group('a[id="pointNetizenPersentBasic"] > em'))

        # expert rate
        result['expert_rate'] = _joined_plaintext(
            rating_group('div[class="spc_score_area"] > a > div > em'))
    else:
        result['watcher_rating'] = None
        result['netizen_rate'] = None
        result['expert_rate'] = None

    # TODO : Fix Code to Parse Easily
    # step1 : genre, nation, runtime, open_date -- stored as raw <span> sources
    # taken from the first <dd> of the info area
    elem_steps = info_area('dd')
    if len(elem_steps) != 0:
        result['step1'] = [elem_span.source for elem_span in elem_steps[0]('p > span')]
    else:
        result['step1'] = list()

    # actor
    elem_actor_area = elem_root('div[class="made_people"]')
    if len(elem_actor_area) != 1:
        result['actor'] = list()
    else:
        result['actor'] = _plaintext_list(
            elem_actor_area[0]('ul[class="lst_people"] > li a[class="k_name"]'))

    # director
    elem_director_area = elem_root('div[class="director"]')
    if len(elem_director_area) != 1:
        result['director'] = list()
    else:
        result['director'] = _plaintext_list(
            elem_director_area[0]('div[class="dir_obj"]  a[class="k_name"]'))

    return result

def parse_movie_info(info):
    """Placeholder for turning a crawled movie-info dict into typed fields.

    Not implemented yet: `info` is ignored and None is returned.

    Planned result according to the design note -- { str param, object obj }:
        name : str
        watcher_rating : float
        netizen_rating : float
        expert_rating : float
        genre : [ int genre_num ]
        nation : [ str nation_code ]
        runtime : int minute
        open_date : datetime
    """
    return None

In [44]:
# number of years whose movie list has been loaded
print(len(movie_list_of_year))

25


In [45]:


# Crawl the detail page of every listed movie, year by year, and write two
# JSON files per year:
#   movie_info_<year>.txt : { str code, movie info dict }
#   error_list_<year>.txt : [ movie_meta ] of the movies that could not be crawled
for year in movie_list_of_year:
    error_list = list()
    movie_info = dict()

    for cnt, movie_meta in enumerate(movie_list_of_year[year]):
        try:
            info = get_movie_info(movie_meta[1])
        except Exception:
            # Record the failure exactly once and store nothing for this movie.
            # (The result of the previous iteration used to leak into this
            # branch; catching Exception instead of everything also keeps
            # Ctrl-C working during the long crawl.)
            error_list.append(movie_meta)
            print('unknow error occured on crawling ' + str(movie_meta))
        else:
            if info is None:
                error_list.append(movie_meta)
                print('error occured on crawling ' + str(movie_meta))
            else:
                movie_info[movie_meta[1]] = info

        # progress report every 100 movies
        if cnt % 100 == 0:
            print("year : " + str(year) + " count : " + str(cnt) + " done")
        #time.sleep(1)

    # `with` closes the files even when json.dump raises
    with open('movie_info_' + year + '.txt', 'w') as data_file:
        json.dump(movie_info, data_file, ensure_ascii=True)

    with open('error_list_' + year + '.txt', 'w') as error_file:
        json.dump(error_list, error_file, ensure_ascii=True)
    


year : 2016 count : 0 done
error occured on crawling [u'\ub098\ub294 \uc4f0\ub808\uae30\ub2e4 (I am Trash)', u'147301']
year : 2016 count : 100 done
error occured on crawling [u'\ub728\uac70\uc6b4 \uc815\uc0ac - \ubbf8\ub098 \uc774\uc57c\uae30 (Anything For You)', u'149620']
year : 2016 count : 200 done
error occured on crawling [u'\ubbf8\ub9dd\uc778\uc758 \ucc38\uc744 \uc218 \uc5c6\ub294 \uc695\uc815 (Unrefusable New Life)', u'149619']
year : 2016 count : 300 done
year : 2016 count : 400 done
error occured on crawling [u'\uc5ec\uc790\uc758 \ubb34\uae30 - \uc57c\ucfe0\uc790\uc758 \uc5ec\uc790 (Ironic destiny)', u'145306']
error occured on crawling [u'\uc695\ub9dd\uc758 \uc80a\uc740 \ub3c4\uc6b0\ubbf8 (Desire &amp; A Knife)', u'145141']
error occured on crawling [u"\uc704\ud5d8\ud55c \uc139\uc2a4: \uc544\uc57c\ub178\uc758 \ubd88\ub95c (Ayano's Adventure)", u'149625']
year : 2016 count : 500 done
error occured on crawling [u'\ucc29\ud55c \uc5ec\ube44\uc11c\uc758 \ubaa9\uc801', u'149555']

In [None]:
# Scratch cell: parse a sample genre snippet, print the href of every anchor
# in it and show the element source as the cell result.
tmpstr = '<div><a href=\"/movie/sdb/browsing/bmovie.nhn?genre=15\">\uc560\ub2c8\uba54\uc774\uc158</a><!-- N=a:ifo.genre -->, <a href=\"/movie/sdb/browsing/bmovie.nhn?genre=2\">\ud310\ud0c0\uc9c0</a><!-- N=a:ifo.genre -->, <a href=\"/movie/sdb/browsing/bmovie.nhn?genre=1\">\ub4dc\ub77c\ub9c8</a><!-- N=a:ifo.genre --></div>'
e = Element(tmpstr)
for anchor in e('a'):
    print(anchor.attr['href'])
e.source