In [178]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import warnings; warnings.filterwarnings('ignore');

import os
import scipy
import pandas as pd
import numpy as np
from ast import literal_eval

import parallel

N_JOBS = 10
SOURCE = os.path.expanduser("~/Classification_RecSys/")

#Read Data
ratings_path = os.path.join(
    SOURCE,
    'data/ratings.csv')
ratings = pd.read_csv(ratings_path)

item_f_path = os.path.join(
    SOURCE,
    'data/movies_metadata.csv')
item_features = pd.read_csv(item_f_path)

credits_path = os.path.join(
    SOURCE,
    'data/credits.csv')
credits = pd.read_csv(credits_path)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [179]:
#Merge Data
item_features['id'] = pd.to_numeric(
    item_features['id'], 
    errors='coerce')
credits['id'] = pd.to_numeric(
    credits['id'], 
    errors='coerce')

credits = credits[~credits['id'].isnull()]
item_features = item_features[~item_features['id'].isnull()]

item_features.drop_duplicates('id', inplace=True)
credits.drop_duplicates('id', inplace=True)

item_features['id'] = item_features['id'].astype(int)
credits['id'] = credits['id'].astype(int)

item_features.index = item_features['id']
credits.index = credits['id']

In [182]:
idx = set(
    item_features.index.tolist()
).intersection(
    credits.index.tolist()
)
idx = list(idx)
len(idx)

item_features = item_features[item_features.index.isin(idx)]
credits = credits[credits.index.isin(idx)]

item_features = pd.concat([
    item_features,
    credits.loc[item_features.index]],
    axis=1)
item_features = item_features.drop('id', axis=1)

ratings = ratings[ratings['movieId'].isin(item_features.index)]

In [187]:
def literal_eval_(x):
    try:
        return literal_eval(x)
    except:
        return []
    
#Getting target
item_features['genres'] = item_features['genres'].apply(
    lambda x: [y['name'] for y in literal_eval_(x) ])

genres = item_features['genres'].tolist() 
all_genres = list(set([y for x in genres for y in x]))
print("|".join(all_genres))

all_genres = pd.Series(all_genres)
all_genres.head()

target = item_features['genres'].apply(
    lambda x: all_genres.isin(x))

target.columns = all_genres
target = target.apply(
    lambda x: x / x.sum(), axis=1)

target.head()

Horror|Western|Fantasy|Thriller|Comedy|Family|Action|War|Music|Documentary|Adventure|Drama|Crime|Romance|Science Fiction|Mystery|Foreign|History|TV Movie|Animation


Unnamed: 0_level_0,Horror,Western,Fantasy,Thriller,Comedy,Family,Action,War,Music,Documentary,Adventure,Drama,Crime,Romance,Science Fiction,Mystery,Foreign,History,TV Movie,Animation
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
862,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333
8844,0.0,0.0,0.333333,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15602,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
31357,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0
11862,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Item Features

Read data an basic data transformation (genres as array & target)

In [188]:
#Year
item_features['year'] = pd.to_datetime(
    item_features['release_date'], 
    errors='coerce').dt.year

def count_json(x):
    try:
        return len(literal_eval(x))
    except:
        return []

#Crew and Cast Sizes
for f in ['cast', 'crew']:
    new_feature = "%s_size" % f
    print(new_feature)
    item_features[new_feature] = parallel.apply(
        count_json,
        item_features[f],
        n_jobs=N_JOBS)

#Ensure Numeric
for f in ['vote_count', 'vote_average', 'budget', 'popularity', 'revenue']:
    item_features[f] = pd.to_numeric(
        item_features[f], 
        errors='coerce')

cast_size
crew_size


In [189]:
#Language
def parse_spok_lan(x):
    return [y['name'] for y in literal_eval_(x)]

item_features['spoken_languages'] = parallel.apply(
    parse_spok_lan,
    item_features['spoken_languages'])

languages = item_features['spoken_languages']
all_languages = list(set([y for x in languages for y in x]))
print("|".join(all_languages))

all_languages = pd.Series(all_languages)

spoken_languages = item_features['spoken_languages'].apply(
    lambda x: all_languages.isin(x))

spoken_languages.columns = all_languages
spoken_languages.index = item_features.index

spoken_languages = spoken_languages.add_prefix("language_")
spoken_languages.head()

|Norsk|No Language|svenska|עִבְרִית|Kinyarwanda|فارسی|Wolof|Slovenščina|Italiano|Latin|日本語|Galego|Esperanto|Bahasa indonesia|Azərbaycan|普通话|Magyar|Français|Cymraeg|Malti|shqip|Tiếng Việt|Dansk|Srpski|हिन्दी|ქართული|اردو|Español|Hrvatski|Český|ελληνικά|한국어/조선말|қазақ|Lietuvikai|Fulfulde|euskera|العربية|English|Bahasa melayu|Português|Íslenska|Bamanankan|Slovenčina|ਪੰਜਾਬੀ|தமிழ்|广州话 / 廣州話|Pусский|ozbek|Hausa|Somali|isiZulu|Polski|Український|తెలుగు|Deutsch|Català|Gaeilge|ภาษาไทย|Bokmål|??????|?????|Eesti|Afrikaans|Kiswahili|български език|беларуская мова|Nederlands|پښتو|suomi|Latviešu|Română|বাংলা|Bosanski|Türkçe


Unnamed: 0_level_0,language_,language_Norsk,language_No Language,language_svenska,language_עִבְרִית,language_Kinyarwanda,language_فارسی,language_Wolof,language_Slovenščina,language_Italiano,...,language_български език,language_беларуская мова,language_Nederlands,language_پښتو,language_suomi,language_Latviešu,language_Română,language_বাংলা,language_Bosanski,language_Türkçe
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8844,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
15602,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
31357,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
11862,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [190]:
numeric_features = item_features[[
    'id',
    'budget',
    'popularity',
    'revenue',
    'runtime',
    'vote_average',
    'vote_count',
    'year',
    'cast_size',
    'crew_size']]

KeyError: "['id'] not in index"

In [None]:
i_features = pd.concat([
    numeric_features,
    spoken_languages.astype(int)],
    axis=1)

In [None]:
i_features

# Concat. title and genre in transactions

In [None]:
item_id_to_name = item_features['title'].to_dict()

def get_title(x):
    if x in item_id_to_name: 
        return item_id_to_name[x]
    else:
        np.nan

ratings['title'] = parallel.apply(
    get_title,
    ratings['movieId'],
    n_jobs=N_JOBS)

In [None]:
ratings

# Train/Test candidates

In [None]:
ORI_ID_USER = 'userId'
ORI_ID_ITEM = 'movieId'

SEQ_ID_USER = 'user_id'
SEQ_ID_ITEM = 'item_id'

In [None]:
#6 month time-window
ratings['id_transaction'] = range(ratings.shape[0])
ratings.rename(columns={'timestamp': 'date'}, inplace=True)
ratings['date'] = pd.to_datetime(ratings['date'])

max_date = ratings['date'].max()
year_max = max_date.year
month_max = max_date.month - 7

split_date = pd.datetime(
    year=year_max,
    month=month_max,
    day=1)

train_filter = ratings['date'] < split_date
test_filter = ratings['date'] >= split_date

train_ids = ratings[train_filter]['id_transaction'].tolist()
test_ids = ratings[test_filter]['id_transaction'].tolist()

train_candidates = ratings[ratings['id_transaction'].isin(train_ids)]
test_candidates = ratings[ratings['id_transaction'].isin(test_ids)]

#Users with train ratings
id_in_train_and_test = set(
    test_candidates[ORI_ID_USER].unique()
).intersection(
    set(train_candidates[ORI_ID_USER].unique())
)

test_candidates = test_candidates[
    test_candidates[ORI_ID_USER].isin(id_in_train_and_test)]

#Users with only one item in test
cnt_films_watched = test_candidates[ORI_ID_USER].value_counts()

valid_users = cnt_films_watched[cnt_films_watched == 1].index
test_candidates = test_candidates[
    test_candidates[ORI_ID_USER].isin(valid_users)]

######################
train = train_candidates
test = test_candidates

ratings = pd.concat([train, test], axis=0)

## Plot Train data for Test users

In [None]:
train_candidates[train_candidates[ORI_ID_USER].isin(
    id_in_train_and_test)][ORI_ID_USER].value_counts().plot(
    kind='hist', figsize=(18, 5), title='test users hist size', bins=100);

## Summary

In [None]:
print("Train data:\t%s (#users: %s #items:%s)" % (
    train.shape[0],
    len(train[ORI_ID_USER].unique()),
    len(train[ORI_ID_ITEM].unique())))

print("Test data:\t%s (#users: %s #items:%s)" % (
    test.shape[0],
    test[ORI_ID_USER].nunique(),
    test[ORI_ID_ITEM].nunique()))

# Data Processing (after having fixed train/test data)

## Assign new sequencial user/item IDs

In [61]:
IDorigin_IDuser = {}
IDuser_IDorigin = {}
for id_user, id_origin in enumerate(ratings[ORI_ID_USER].unique()):
    IDorigin_IDuser[id_origin] = id_user
    IDuser_IDorigin[id_user] = id_origin

IDorigin_IDitem = {}
IDitem_IDorigin = {}
for id_item, id_origin in enumerate(ratings[ORI_ID_ITEM].unique()):
    IDorigin_IDitem[id_origin] = id_item
    IDitem_IDorigin[id_item] = id_origin

num_users = len(IDorigin_IDuser)
num_items = len(IDorigin_IDitem)

ratings[SEQ_ID_USER] = ratings[ORI_ID_USER].apply(lambda x: IDorigin_IDuser[x])
ratings[SEQ_ID_ITEM] = ratings[ORI_ID_ITEM].apply(lambda x: IDorigin_IDitem[x])

train = ratings[ratings['id_transaction'].isin(train['id_transaction'])]
test = ratings[ratings['id_transaction'].isin(test['id_transaction'])]

print("#Users:%s #Items:%s sparsity: %s" % (
    num_users,
    num_items,
    round(1 - (ratings.shape[0] / (num_users * num_items)), 5)
))

NameError: name 'ORI_ID_USER' is not defined

In [None]:
train.to_hdf(
    os.path.join(SOURCE, "data/train.hdf" ), key='train')

test.to_hdf(
    os.path.join(SOURCE, "data/test.hdf" ), key='test')

## Build item catalog

In [None]:
item_features = item_features[item_features['id'].isin( ratings[ORI_ID_ITEM].unique() )]
item_features[SEQ_ID_ITEM] = pd.Series(item_features['id']).apply(
    lambda x: IDorigin_IDitem[x]).values

item_cat_f = [
    'title', 
    'popularity', 
    'genres', 
    'year', 
    SEQ_ID_ITEM, 
    'id']

item_catalog = item_features[item_cat_f].copy()
item_catalog.drop_duplicates(
    list(set(item_cat_f)-set(['genres'])),
    inplace=True)

item_catalog.index = item_catalog[SEQ_ID_ITEM]

item_catalog.sort_values('item_id', inplace=True)
print(item_catalog.shape)
item_catalog.head()

In [None]:
item_catalog.to_hdf(
    os.path.join(SOURCE, "data/item_catalog.hdf" ), 
    key='item_catalog')