In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings; warnings.filterwarnings('ignore');

import os
import numpy as np
import pandas as pd
from ast import literal_eval

import parallel

N_JOBS = 10
SOURCE = os.path.expanduser('~/Classification_RecSys/')

ORI_USER_ID = 'userId'
SEQ_USER_ID = 'user_id'

ORI_ITEM_ID = 'movieId'
SEQ_ITEM_ID = 'item_id'
#Download in https://www.kaggle.com/rounakbanik/the-movies-dataset/data
ratings = pd.read_csv(os.path.join(SOURCE, 'data/ratings.csv'))

ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,2015-03-09 22:52:09
1,1,147,4.5,2015-03-09 23:07:15
2,1,858,5.0,2015-03-09 22:52:03
3,1,1221,5.0,2015-03-09 22:52:26
4,1,1246,5.0,2015-03-09 22:52:36


In [2]:
item_features = pd.read_csv(os.path.join(
    SOURCE, 
    'data/movies_metadata.csv'))
credits = pd.read_csv(os.path.join(
    SOURCE, 
    'data/credits.csv'))
keywords = pd.read_csv(os.path.join(
    SOURCE, 
    'data/keywords.csv'))

#Few cleanning
item_features = item_features.drop([19730, 29503, 35587], axis=0)
item_features.drop_duplicates('id', inplace=True)
keywords.drop_duplicates('id', inplace=True)
credits.drop_duplicates('id', inplace=True)

item_features['id'] = pd.to_numeric(item_features['id'], errors='coerce')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

#Merge meta data
item_features.drop_duplicates('id', inplace=True)
item_features.index = item_features['id']

item_features = item_features.merge(keywords)
item_features = item_features.merge(credits)

#Basic transform strings to objects (arrays, jsons)
for f in ['cast', 'crew', 'keywords', 'genres']:
    item_features[f] = parallel.apply(
        literal_eval,
        item_features[f],
        n_jobs=N_JOBS)

In [3]:
item_features['cast_size'] = item_features['cast'].apply(len)
item_features['crew_size'] = item_features['crew'].apply(len)

In [4]:
item_features['genres'] = item_features['genres'].apply(
    lambda x: [y['name'] for y in x])

item_features_num = item_features[[
    'id',
    'imdb_id',
    'budget',
    'popularity',
    'revenue',
    'runtime',
    'vote_average',
    'vote_count',
    'cast_size',
    'crew_size']]

In [5]:
all_genres = set()
for x in item_features['genres'].values:
    all_genres.update(x)

all_genres = list(all_genres)
print(" | ".join(all_genres))

all_genres = pd.Series(all_genres)
all_genres.head()

Drama | Adventure | Horror | Science Fiction | Mystery | Music | Foreign | Documentary | Animation | History | Thriller | Crime | Family | War | Western | Comedy | TV Movie | Fantasy | Action | Romance


0              Drama
1          Adventure
2             Horror
3    Science Fiction
4            Mystery
dtype: object

In [6]:
def get_target(x):
    x = all_genres.isin(x)
    return x / x.sum()

target = pd.DataFrame(parallel.apply(
    get_target,
    item_features['genres'].tolist(),
    n_jobs=N_JOBS))

target.columns = all_genres
target.index = item_features['id'].values

In [11]:
item_features['id']

0           862
1          8844
2         15602
3         31357
4         11862
5           949
6         11860
7         45325
8          9091
9           710
10         9087
11        12110
12        21032
13        10858
14         1408
15          524
16         4584
17            5
18         9273
19        11517
20         8012
21         1710
22         9691
23        12665
24          451
25        16420
26         9263
27        17015
28          902
29        37557
          ...  
45402     45527
45403    455661
45404    327237
45405     84710
45406     39562
45407     14008
45408     44330
45409     49279
45410     44333
45411     49277
45412     49271
45413     44324
45414    122036
45415     14885
45416     49280
45417    106807
45418    276895
45419    404604
45420    420346
45421     67179
45422     84419
45423    390959
45424    289923
45425    222848
45426     30840
45427    439050
45428    111109
45429     67758
45430    227506
45431    461257
Name: id, Length: 45432,

# Split Train/Test

In [8]:
split_date = pd.datetime(
    year=2015,
    month=1,
    day=1)

test = ratings[ratings['timestamp'] > split_date]
train = ratings[ratings['timestamp'] <= split_date]

In [83]:
train_coo = scipy.sparse.coo_matrix(
        (
            np.ones(train.shape[0]),
            (train[SEQ_ID_USER], train[SEQ_ID_ITEM])
        ), 
        shape=(num_users, num_items))
train_coo

In [84]:
credits

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
5,6,113277,949.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0
9,10,113189,710.0
