# Initialize Code

In [1]:
# Notation used in the comments of this notebook:
# <a, b, ...> : tuple with elements a, b, ...
# [ ... ] : list
# { keytype, valtype } : dict

In [4]:
# import request
import requests

# import pattern
from pattern import web
from pattern.web import Element
from pattern.web import plaintext

# import regular expression
import re

# import json
import json


In [None]:
def get_page(url):
    """Fetch ``url`` with an HTTP GET and return the response body.

    Args:
        url: address to request.

    Returns:
        The response text when the server answers with status 200,
        otherwise None.
    """
    response = requests.get(url)

    # Compare by value: "is not 200" tests object identity and only happens
    # to work because CPython caches small integers.
    if response.status_code != 200:
        return None

    return response.text

# Get Movie Counts per Year

Crawl 'http://movie.naver.com/movie/sdb/browsing/bmovie_open.nhn' to get the number of movies listed for each year.




In [None]:
def get_movie_count_on_year():
    """Crawl the per-year index page and read the movie count of each year.

    Returns:
        dict { str year, int count }, or None when the page cannot be
        fetched or does not contain exactly one ``div#old_content`` with at
        least one ``td`` cell.

    Raises:
        IndexError / ValueError: when a cell's text does not consist of a
            year token followed by an integer token.
    """
    url_dir = 'http://movie.naver.com/movie/sdb/browsing/bmovie_open.nhn'
    page = get_page(url_dir)
    if page is None:
        # get_page() reports a non-200 answer as None; there is nothing to parse.
        return None

    elem_root = Element(page)

    elem_old_content = elem_root('div[id="old_content"]')
    if len(elem_old_content) != 1:
        return None

    elem_td = elem_old_content[0]('td')
    if len(elem_td) == 0:
        return None

    # result { str year, int count }
    result = dict()
    for td in elem_td:
        # The first two word tokens of the cell text are taken as year and count.
        fields = re.split(r'\W+', plaintext(td.content))[0:2]
        result[fields[0]] = int(fields[1])

    return result

In [None]:
# { str year, int count } mapping read by get_movie_list_of_year() and the
# filter/save cell. get_movie_count_on_year() returns None when the crawl
# fails, in which case indexing movie_counts in those cells raises.
movie_counts = get_movie_count_on_year()

# Get MetaData For Each Movie

Crawl http://movie.naver.com/movie/sdb/browsing/bmovie.nhn?open=XXXX&page=XXXX to get a list of (movie name, movie code) pairs.

The movie code is used to look up the movie's info at http://movie.naver.com/movie/bi/mi/detail.nhn?code=XXXX


In [None]:
def get_movie_list_of_year_page(year, page):
    """Crawl one directory page of the movies listed for ``year``.

    Args:
        year: year as a string; concatenated into the query string.
        page: page number as a string; concatenated into the query string.

    Returns:
        [ <str name, str code> ] -- one tuple per list entry that contains a
        link. ``code`` is None unless splitting the link's href on non-digit
        runs yields exactly two parts. An empty list is returned when the
        page cannot be fetched or lacks the expected ``div#old_content``.
    """
    url_base = 'http://movie.naver.com/movie/sdb/browsing/bmovie.nhn?'
    url_append = 'open=' + year + '&page=' + page

    # Keep the HTML in its own name instead of overwriting the ``page`` argument.
    html = get_page(url_base + url_append)
    if html is None:
        return []

    elem_root = Element(html)

    elem_old_content = elem_root('div[id="old_content"]')
    if len(elem_old_content) != 1:
        return []

    elem_li = elem_old_content[0]('ul[class="directory_list"] > li')

    result = []
    for li in elem_li:
        anchors = li('a')
        if len(anchors) == 0:
            # A list item without a link has neither name nor code to read.
            continue
        anchor = anchors[0]

        # Split once and reuse the parts for both the length check and the code.
        parts = re.split(r'\D+', anchor.attrs['href'])
        code = parts[1] if len(parts) == 2 else None

        result.append((anchor.content, code))

    return result

def get_movie_list_of_year(year):
    """Collect the entries of every directory page for ``year``.

    The number of pages is derived from the module-level ``movie_counts``
    mapping, at 20 movies per page.

    Args:
        year: year as a string; must be a key of ``movie_counts``.

    Returns:
        The concatenation, in page order, of the lists returned by
        get_movie_list_of_year_page() for pages 1..page_count.
    """
    movies_per_page = 20

    # Integer ceiling division: does not depend on "/" truncating, so the
    # result stays an int usable by range() under true division as well.
    page_count = (movie_counts[year] + movies_per_page - 1) // movies_per_page

    list_per_year = []
    for page_number in range(1, page_count + 1):
        # extend() avoids re-copying the accumulated list for every page.
        list_per_year.extend(get_movie_list_of_year_page(year, str(page_number)))

    return list_per_year

In [None]:
# filter: a count of 0 makes the save loop below skip the year
for skipped_year in ('1940', '1950', '1960', '1970', '1980', '2017'):
    movie_counts[skipped_year] = 0

# save to json: one 'movie_list_<year>.txt' file per remaining year
for year in movie_counts:
    if movie_counts[year] != 0:
        movie_list = get_movie_list_of_year(year)

        # "with" closes the file even when json.dump raises.
        with open('movie_list_' + year + '.txt', 'w') as data_file:
            json.dump(movie_list, data_file, ensure_ascii=True)


# Load MetaData

In [1]:
def load_list_of_year(year):
    """Load the JSON content of 'movie_list_<year>.txt'.

    Args:
        year: year as a string; concatenated into the file name.

    Returns:
        The object decoded by json.load from that file.
    """
    # "with" closes the file even when json.load raises on malformed content.
    with open('movie_list_' + year + '.txt', 'r') as data_file:
        return json.load(data_file)

In [2]:
# Years whose saved movie lists are loaded from json: FIRST_YEAR through
# LAST_YEAR inclusive, as strings. This is the same set of 27 years as the
# former hand-written list, now in ascending order.
FIRST_YEAR = 1990
LAST_YEAR = 2016
years = list(map(str, range(FIRST_YEAR, LAST_YEAR + 1)))

In [5]:
# { str year, loaded movie list } for every entry of `years`
movie_list_of_year = {}

for year in years:
    movie_list_of_year.update({year: load_list_of_year(year)})



# Get MovieData
Movie data is available at http://movie.naver.com/movie/bi/mi/detail.nhn?code=XXXX and http://movie.naver.com/movie/bi/mi/point.nhn?code=XXXX, where XXXX is the movie code.
 




In [25]:

def _join_plaintext(elements):
    """Concatenate the plain text of every element's content into one string."""
    return ''.join([plaintext(element.content) for element in elements])


def _plaintext_list(elements):
    """Return the plain text of every element's content as a list."""
    return [plaintext(element.content) for element in elements]


def get_movie_info(code):
    """Crawl the detail page of the movie identified by ``code``.

    Args:
        code: movie code; converted with str() and appended to the detail URL.

    Returns:
        { str param, object obj } with the keys below, or None when the page
        cannot be fetched, does not contain exactly one ``div.mv_info_area``,
        or has no title link.

        name           : plain text of the title link
        watcher_rating : concatenated rating text, None without a score block
        netizen_rate   : concatenated rating text, None without a score block
        expert_rate    : concatenated rating text, None without a score block
        step1          : raw source of each ``p > span`` in the first ``dd``
                         (per the original note: genre, nation, runtime,
                         open_date); empty list when there is no ``dd``
        director       : list of director names
        actor          : list of actor names

    NOTE(review): the rating keys are inconsistent ('watcher_rating' versus
    'netizen_rate' / 'expert_rate'). They are kept exactly as the code always
    emitted them so existing consumers of the output keep working.
    """
    # result { str param, object obj }
    result = dict()

    url_base = 'http://movie.naver.com/movie/bi/mi/detail.nhn?code='
    # Go through get_page() like the other crawlers so a non-200 answer is
    # reported as a failure instead of being parsed.
    page = get_page(url_base + str(code))
    if page is None:
        return None

    elem_root = Element(page)

    elem_info_area = elem_root('div[class="mv_info_area"]')
    if len(elem_info_area) != 1:
        return None
    info_area = elem_info_area[0]

    # name
    elem_movie_name = info_area('h3[class="h_movie"] > a')
    if len(elem_movie_name) == 0:
        return None

    result['name'] = plaintext(elem_movie_name[0].content)

    # rate: each rating is spread over several <em> elements, so join their text
    elem_rating_group = info_area('div[class="main_score"]')
    if len(elem_rating_group) == 1:
        rating_group = elem_rating_group[0]

        # watcher rate
        result['watcher_rating'] = _join_plaintext(
            rating_group('a[id="actualPointPersentBasic"] > div > em'))

        # netizen rate
        result['netizen_rate'] = _join_plaintext(
            rating_group('a[id="pointNetizenPersentBasic"] > em'))

        # expert rate
        result['expert_rate'] = _join_plaintext(
            rating_group('div[class="spc_score_area"] > a > div > em'))
    else:
        result['watcher_rating'] = None
        result['netizen_rate'] = None
        result['expert_rate'] = None

    # TODO : Fix Code to Parse Easily
    # step1 : genre, nation, runtime, open_date (stored unparsed, as raw source)
    elem_steps = info_area('dd')
    if len(elem_steps) != 0:
        result['step1'] = [elem_span.source for elem_span in elem_steps[0]('p > span')]
    else:
        result['step1'] = list()

    # actor
    elem_actor_area = elem_root('div[class="made_people"]')
    if len(elem_actor_area) != 1:
        result['actor'] = list()
    else:
        result['actor'] = _plaintext_list(
            elem_actor_area[0]('ul[class="lst_people"] > li a[class="k_name"]'))

    # director
    elem_director_area = elem_root('div[class="director"]')
    if len(elem_director_area) != 1:
        result['director'] = list()
    else:
        result['director'] = _plaintext_list(
            elem_director_area[0]('div[class="dir_obj"]  a[class="k_name"]'))

    return result

def parse_movie_info(info):
    """Placeholder for converting crawled movie info into typed values.

    Not implemented: the body returns None for any ``info``.

    Planned result, { str param, object obj }:
        name           : str
        watcher_rating : float
        netizen_rating : float
        expert_rating  : float
        genre          : [ int genre_num ]
        nation         : [ str nation_code ]
        runtime        : int minute
        open_date      : datetime
    """
    return None

In [26]:
# number of entries loaded for 2015
print(len(movie_list_of_year['2015']))

1419


In [27]:
# Crawl the detail page of every loaded movie and write one
# 'movie_info_<year>.txt' json file per year. Entries for which
# get_movie_info() returns None are collected in error_list.
error_list = list()

for key in movie_list_of_year:
    # { str code, dict info } for the current year
    movie_info = dict()

    # Each movie_meta is indexed as [name, code]; counting starts at 1 so the
    # progress line reports the number of movies processed so far.
    for cnt, movie_meta in enumerate(movie_list_of_year[key], 1):
        info = get_movie_info(movie_meta[1])
        if info is None:
            error_list.append(movie_meta)
            print('error occured on crawling ' + str(movie_meta))
        else:
            movie_info[movie_meta[1]] = info

        # Value comparison: "is 0" tests identity, not equality.
        if cnt % 100 == 0:
            print("year : " + str(key) + " count : " + str(cnt) + " done")

    # "with" closes the file even when json.dump raises.
    with open('movie_info_' + key + '.txt', 'w') as data_file:
        json.dump(movie_info, data_file, ensure_ascii=True)

# The file has to be opened for writing: open() defaults to read mode, in
# which json.dump cannot write (and a missing file cannot even be opened).
with open('error_list.txt', 'w') as error_file:
    json.dump(error_list, error_file, ensure_ascii=True)


error occured on crawling [u'\uac15\ud55c\uac74 \ub2f9\uc2e0\uc758 \ud0a4\uc2a4 - \uccb4\ucde8 (Bride of the farmhouse I want to meet you)', u'143320']
year : 2015 count : 100 done
error occured on crawling [u'\uad74\uc695 \uc870\uad50 - \ubd88\ud0c0\ub294 \uc720\ubd80\ub140 (Rape training)', u'143271']
error occured on crawling [u'\uadf8 \uc785\uc220\uc5d0 \uc0ac\ub791\uc744 (Love a heart)', u'143243']
error occured on crawling [u'\uadf8\ub140\uc758 \uc74c\ub780\ud55c \uaf43 (When a married woman invites a man)', u'143349']
year : 2015 count : 200 done
error occured on crawling [u'\ub2e4\ub77d\ubc29\uc758\ube44\ubc00\ubb34\uc0ad\uc81c\ud310 (SECRETS IN THE ATTIC)', u'135960']
error occured on crawling [u'\ub2ec\ube5b\uc139\uc2a4\uc815\uc0ac-\uae30\ubaa8\ub178 (The beauty in the moonlight)', u'144437']
year : 2015 count : 300 done
error occured on crawling [u'\ub450 \uc787 (Do It!)', u'37920']
error occured on crawling [u'\ub5a1\uad6d\uc5f4\ucc28\uc758\ube44\ubc00\uc560 (ABDUCTION TRAI

In [41]:
# Scratch check: parse a genre snippet, print the href of each link in it,
# then display the parsed element's source.
tmpstr = (
    '<div>'
    '<a href=\"/movie/sdb/browsing/bmovie.nhn?genre=15\">\uc560\ub2c8\uba54\uc774\uc158</a><!-- N=a:ifo.genre -->, '
    '<a href=\"/movie/sdb/browsing/bmovie.nhn?genre=2\">\ud310\ud0c0\uc9c0</a><!-- N=a:ifo.genre -->, '
    '<a href=\"/movie/sdb/browsing/bmovie.nhn?genre=1\">\ub4dc\ub77c\ub9c8</a><!-- N=a:ifo.genre -->'
    '</div>'
)
e = Element(tmpstr)
for anchor in e('a'):
    print(anchor.attr['href'])
e.source

/movie/sdb/browsing/bmovie.nhn?genre=15
/movie/sdb/browsing/bmovie.nhn?genre=2
/movie/sdb/browsing/bmovie.nhn?genre=1


u'<div><a href="/movie/sdb/browsing/bmovie.nhn?genre=15">\\uc560\\ub2c8\\uba54\\uc774\\uc158</a><!-- N=a:ifo.genre -->, <a href="/movie/sdb/browsing/bmovie.nhn?genre=2">\\ud310\\ud0c0\\uc9c0</a><!-- N=a:ifo.genre -->, <a href="/movie/sdb/browsing/bmovie.nhn?genre=1">\\ub4dc\\ub77c\\ub9c8</a><!-- N=a:ifo.genre --></div>'