In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

import warnings; warnings.simplefilter('ignore')

import os
import time
import scipy
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

import config
import parallel
import evalutils
import datautils

# Number of worker processes for parallel steps
N_JOBS = 3

# Column names used throughout the notebook
TARGET = 'rating'
DATE = 'timestamp'
ID_USER = 'userId'
ID_ITEM = 'movieId'

# Load the training transactions from the HDF store under config.SOURCE
train_path = os.path.join(config.SOURCE, 'data/train.hdf')
ui_transact = pd.read_hdf(train_path)

In [2]:
#Users with more than one item in history
user_ids = ui_transact[ID_USER].value_counts()
user_ids = user_ids[user_ids > 1]

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the loaded data (chained-assignment warning).
ui_transact = ui_transact[
    ui_transact[ID_USER].isin(user_ids.index)
].copy()

#Only consider one movie per day
# Epoch seconds -> midnight of that calendar day, so that all transactions of
# a user on the same day collapse onto the same (user, date) key.
ui_transact[DATE] = pd.to_datetime(
    ui_transact[DATE], unit='s'
).dt.normalize()

# Keep the first transaction of each (user, day) pair.
ui_transact = ui_transact.drop_duplicates(subset=[ID_USER, DATE])

#Normalize dates
ui_transact['year'] = ui_transact[DATE].dt.year
ui_transact['month'] = ui_transact[DATE].dt.month

# Express each date as (fractional) months elapsed since the earliest
# transaction. The month length is spelled out explicitly as the mean
# Gregorian month (365.2425 / 12 days = 2629746 s): np.timedelta64(1, 'M')
# is an ambiguous unit that recent pandas versions refuse to convert.
date_norm = pd.Timedelta(seconds=2629746)
date_min = ui_transact[DATE].min()

ui_transact[DATE] = (ui_transact[DATE] - date_min) / date_norm

#Normalize ratings
# Min-max scale the target column to the [0, 1] range.
min_rating = ui_transact[TARGET].min()
max_rating = ui_transact[TARGET].max()

ui_transact[TARGET] = (
    ui_transact[TARGET] - min_rating
) / (
    max_rating - min_rating
)

In [3]:
# Preview the first rows of the preprocessed transactions
ui_transact.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,id_transaction,user_id,item_id,year,month
1,1,147,0.888889,229.294236,The 400 Blows,0,0,0,2015,3
31,2,64,0.777778,16.788846,Talk to Her,2,1,2,1997,6
60,4,415,0.777778,83.54997,Batman & Robin,4,2,4,2003,1
232,8,1404,0.111111,72.445019,Ae Fond Kiss...,9,3,9,2002,2
453,11,3513,0.666667,155.436457,Legionnaire,27,4,27,2009,1


In [4]:
# Number of distinct users left in the transactions frame
ui_transact[ID_USER].nunique()

101576

In [5]:
#Candidate Selection
# Every user present in the transactions is a candidate (random sub-sampling
# of candidates by history size is currently disabled).
candidates = ui_transact[ID_USER].unique()

#Split data by user
# sort=False yields the groups in order of first appearance, which is the
# same order as `candidates`, so `user_hist` keeps its original ordering.
ui_grp = ui_transact.groupby(ID_USER, sort=False)

user_hist = []

# Iterate over the groups directly instead of calling get_group() once per
# user, which is needlessly slow for ~100k users.
for user_id, data in ui_grp:
    # NOTE(review): the original condition was
    #     (2016 in years) and (len(years))
    # The bare len() is always truthy for a non-empty group, so the check
    # reduced to "user has activity in 2016", and only users for which it was
    # False were kept. That effective behaviour is preserved here. If the
    # intent was "keep users active in 2016 with more than one year of
    # history", both the comparison and the negation need to change - confirm.
    has_2016_activity = 2016 in data['year'].unique()
    if has_2016_activity:
        continue

    # The full per-user frame is handed to the helper together with the
    # column names it should use.
    user_hist.append({
        'TARGET': TARGET,
        'DATE': DATE,
        'ID_ITEM': ID_ITEM,
        'ID_USER': ID_USER,
        'data': data
    })

#Compute their Hist. Transactions
# One call of datautils.get_hist_user per user payload, spread over
# config.N_JOBS workers.
user_hist = parallel.apply(
    datautils.get_hist_user,
    user_hist,
    n_jobs=config.N_JOBS)

KeyboardInterrupt: 

In [None]:
# Per-user hold-out: the last entry of every user's history goes to the test
# set, everything before it goes to the training set.
X_train, X_test = [], []
y_train, y_test = [], []

for user_transact in user_hist:
    # Element 0 feeds the model inputs, element 2 the targets.
    features = user_transact[0]
    targets = user_transact[2]

    X_train.extend(features[:-1])
    X_test.append(features[-1])

    y_train.extend(targets[:-1])
    y_test.append(targets[-1])

# Learn the feature vocabulary on the training rows only, then encode both
# splits with it.
v = DictVectorizer()
X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)

y_train = pd.Series(y_train)
y_test = pd.Series(y_test)

In [None]:
# Display the vectorized training matrix
X_train

In [None]:
# Display the vectorized test matrix
X_test

# Train The RecSys Model

In [None]:
from sklearn.metrics import mean_squared_error
from fastFM import als

fm = als.FMRegression(
    rank=30,
    n_iter=1)

# Cold start: initialise the coefficients with a single ALS iteration.
fm.fit(X_train, y_train)
total_iter = 1

# Continue training in growing chunks (0, 10, 20, ..., 90 extra iterations)
# and report the train/test error after each chunk.
for step in range(10):
    n_more_iter = step * 10

    # fastFM only warm-starts when n_more_iter > 0. Calling fit() with
    # n_more_iter=0 would simply repeat the cold fit above, so the first
    # pass just evaluates the initial model.
    if n_more_iter > 0:
        fm.fit(
            X_train,
            y_train,
            n_more_iter=n_more_iter)
        total_iter += n_more_iter

    y_pred_train = fm.predict(X_train)
    y_pred_test = fm.predict(X_test)

    MSE_train = mean_squared_error(y_true=y_train, y_pred=y_pred_train)
    MSE_test = mean_squared_error(y_true=y_test, y_pred=y_pred_test)

    # Label each line with the cumulative number of ALS iterations run so
    # far rather than with the loop index.
    print(total_iter, "Train: %s Test %s" % (MSE_train, MSE_test))
    

In [None]:
# Min-max scale the test predictions to [0, 1] so their distribution can be
# compared with the held-out targets on a common axis.
y_pred = pd.Series(fm.predict(X_test))
y_pred = (y_pred - y_pred.min()) / (y_pred.max() - y_pred.min())

# Overlay both histograms on the same axes; label them so the two
# semi-transparent distributions can be told apart.
ax = y_pred.plot(
    kind='hist', bins=10, alpha=.5,
    label='predicted (min-max scaled)', legend=True)
y_test.plot(
    kind='hist', bins=10, alpha=.5,
    ax=ax, label='held-out target', legend=True)

ax.set_xlabel('value')
ax.set_title('Predicted vs. held-out target distribution');

In [None]:
# First test row, kept as a one-row sparse matrix
x = X_test[0]

# Manual evaluation of fm.w0_ plus the product of fm.w_ with this row
# (presumably the bias and linear part of the FM prediction - TODO confirm)
linear_term = (fm.w_ * x.transpose())[0]
fm.w0_ + linear_term

In [None]:
# Shape of the fitted V_ attribute
# (presumably (rank, n_features) for the pairwise factors - TODO confirm)
fm.V_.shape

In [None]:
# Number of columns of the one-row feature matrix
n_features = x.shape[1]

# Exploratory: densify the row, repeat every entry n_features times along the
# column axis, transpose, and multiply with the sparse row.
dense_row = x.toarray()
np.repeat(dense_row, n_features, axis=1).T * x

In [None]:
#S = sparse.lil_matrix([[0,1,2],[0,0,0],[1,0,0]])
#x.A.repeat(n_features, 1).T
# print S.A.repeat([1,2,3], axis=0)
# print S.A[(0,1,1,2,2,2),:]
# print lil_repeat(S,[1,2,3]).A
# print S[(0,1,1,2,2,2),:].A

In [None]:
# Predict for the first test row only; the 0:1 slice keeps the input 2-D
fm.predict(X_test[0:1, :])