# LearningAgent

## Module imports and data loader

In [223]:
# module imports

import json
import re
import pandas
import numpy
import scipy
import sklearn
from sklearn.feature_extraction.text import *
from sklearn.linear_model import SGDClassifier

In [224]:
# load list from jsons
years = ['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999',
         '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
         '2010', '2011', '2012', '2013', '2014', '2015', '2016']

all_data = dict()

def load_info_of_year(year) :
    data_file = open('movie_info_w_' + year + '.txt', 'r')
    data = json.load(data_file)
    data_file.close()
    return data

for year in years:
    year_data = load_info_of_year(year)
    all_data.update(year_data)

In [225]:
filtered_data = dict()

#print len(all_data)

for key in all_data:
    if all_data[key]['netizen_rate'] is not None:
        if all_data[key]['expert_rate'] is None or \
                all_data[key]['expert_rate'] is not None and \
                not (abs(all_data[key]['netizen_rate'] - all_data[key]['expert_rate']) > 4):
            item = all_data[key].copy()
            item.pop('expert_rate')
            item.pop('watcher_rating')
            item.pop('open_year')
            item['staff'] = item['actor'] + item['director']
            item.pop('actor')
            item.pop('director')
            if ' ' in item['nation']:
                raise
            item['staff'] = map(lambda x: '__STAFF__' + x.replace(' ', '_'), item['staff'])
            filtered_data[key] = item
            
#print len(filtered_data)

In [226]:
df = pandas.DataFrame(filtered_data)
df.transpose().head().transpose()

Unnamed: 0,10001,100015,100021,100022,100023
genre,"[__GENRE__드라마, __GENRE__멜로/로맨스]","[__GENRE__애니메이션, __GENRE__가족, __GENRE__모험]","[__GENRE__애니메이션, __GENRE__판타지]","[__GENRE__애니메이션, __GENRE__판타지, __GENRE__드라마]",[__GENRE__드라마]
name,시네마 천국,천재강아지 미스터 피바디,신비의 법,극장판 마법소녀 마도카☆마기카 [전편] 시작의 이야기,부곡 하와이
nation,"[__NATION__프랑스, __NATION__이탈리아]",[__NATION__미국],[__NATION__일본],[__NATION__일본],[__NATION__한국]
netizen_rate,9.3,8.42,3.36,9.04,6.11
runtime,124,92,119,130,84
staff,"[__STAFF__자끄_페렝, __STAFF__브리지트_포시, __STAFF__필립...","[__STAFF__타이_버렐, __STAFF__맥스_찰스, __STAFF__아리엘_...","[__STAFF__코야스_타케히토, __STAFF__후지무라_아유미, __STAFF...","[__STAFF__유우키_아오이, __STAFF__사이토_치와, __STAFF__미...","[__STAFF__박명신, __STAFF__류혜린, __STAFF__오성태, __S..."


In [227]:
dataset = list()
stars = list()

for movie_id in filtered_data :
    staffs = reduce(lambda a, b : a + ' ' + b, filtered_data[movie_id]['staff'], '')
    nations = reduce(lambda a, b : a + ' ' + b, filtered_data[movie_id]['nation'], '')
    genres = reduce(lambda a, b : a + ' ' + b, filtered_data[movie_id]['genre'], '')
    runtime = '__RUNTIME__' + str(filtered_data[movie_id]['runtime'] / 60) if filtered_data[movie_id]['runtime'] is not None else '__RUNTIME__X'
    rate = filtered_data[movie_id]['netizen_rate']
    
    dataset.append(staffs + ' ' + nations + ' ' + genres + ' ' + runtime)
    stars.append(rate)


v = CountVectorizer(min_df=0)
v.fit(dataset)
X = v.transform(dataset).toarray()
Y = stars
Y = map(lambda x : (int)(x * 100), Y)

## Model & Learning Validation

In [230]:
# Useful model?
# incremental cross-validator

print len(X)
div = 3 # set divider
mul = 3000 # set size multiplier
for i in range(div):
    train_begin = ((i + 0) * mul)
    train_end = ((i + 1) * mul)
    cross_begin = ((i + 1) * mul)
    cross_end = ((i + 2) * mul)
    
    print "Train: [" + str(train_begin) + ", " + str(train_end) + ")"
    print "Cross: [" + str(cross_begin) + ", " + str(cross_end) + ")"
    
    train_X = X[train_begin:train_end]
    train_Y = Y[train_begin:train_end]
    cross_X = X[cross_begin:cross_end]
    cross_Y = Y[cross_begin:cross_end]
    
    sgd = SGDClassifier(n_jobs=-1)
    sgd.fit(train_X, train_Y)
    
    print sgd.score(cross_X, cross_Y)

14213
Train: [0, 3000)
Cross: [3000, 6000)
0.0846666666667
Train: [3000, 6000)
Cross: [6000, 9000)
0.097
Train: [6000, 9000)
Cross: [9000, 12000)
0.0946666666667


In [231]:
# For production
# total set train
setlen = (int)(len(X) * 0.8)

sgd = SGDClassifier(n_jobs=-1)
sgd.fit(X[0:setlen], Y[0:setlen])

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=-1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [232]:
print sgd.score(X[setlen:], Y[setlen:])

0.0809004572635


In [233]:
def vectorize(genres, staffs, nations, runtime) :
    nations = map(lambda x : '__NATION__' + x, nations)
    staffs = map(lambda x : '__STAFF__' + x.replace(' ', '_'), staffs)
    genres = map(lambda x : '__GENRE__' + x, genres)
    
    staffs = reduce(lambda a, b : a + ' ' + b, staffs, '')
    nations = reduce(lambda a, b : a + ' ' + b, nations, '')
    genres = reduce(lambda a, b : a + ' ' + b, genres, '')
    runtime = '__RUNTIME__' + str(runtime / 60) if runtime is not None else '__RUNTIME__X'
    
    tmp = staffs + ' ' + nations + ' ' + genres + ' ' + runtime
    return v.transform([tmp]).toarray()

## Prediction Test  

In [297]:
""" Input order: genre,
    directors & actors,
    nation,
    running time"""
predict_x = vectorize(['액션', 'SF'], 
                      ['안소니 루소',
                       '크리스 에반스',
                       '로버트 다우니 주니어',
                       '세바스찬 스탠'], 
                      ['미국'], 
                      147)

# print scipy.sparse.csr.csr_matrix(predict_x)

# Predicted rating
print sgd.predict(predict_x)[0] / 100.0

9.33
