In [1]:
%matplotlib inline
import warnings; warnings.simplefilter('ignore')

import os
import time
import scipy
import pandas as pd
import numpy as np

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, auc_score

import parallel

N_JOBS = 10
ORI_ID_USER = 'userId'
ORI_ID_ITEM = 'movieId'

SEQ_ID_USER = 'user_id'
SEQ_ID_ITEM = 'item_id'

SOURCE = os.path.expanduser("~/Classification_RecSys")

#Read Data
train = pd.read_hdf(
    os.path.join(SOURCE, 'data/train.hdf'))

test = pd.read_hdf(
    os.path.join(SOURCE, 'data/test.hdf'))

item_catalogue = pd.read_hdf(
    os.path.join(SOURCE, 'data/item_catalog.hdf'))

user_id_translator = pd.concat(
    [train, test], axis=0
).drop_duplicates([
    SEQ_ID_USER])[[ORI_ID_USER, SEQ_ID_USER]]
user_id_translator.sort_values(SEQ_ID_USER, inplace=True)

item_id_translator = pd.concat(
    [train, test], axis=0
).drop_duplicates([
    SEQ_ID_ITEM])[[ORI_ID_ITEM, SEQ_ID_ITEM]]
item_id_translator.sort_values(SEQ_ID_ITEM, inplace=True)

# Summary

In [2]:
userids = set(
    train[SEQ_ID_USER]
).union(
    test[SEQ_ID_USER])

itemids = set(
    train[SEQ_ID_ITEM]
).union(
    test[SEQ_ID_ITEM])

num_users = len(userids)
num_items = len(itemids)

print("num users: %s num items: %s (Transactions)" % (
    num_users, 
    num_items))

num users: 257179 num items: 7306 (Transactions)


## RecSys Train/Test Matrix and User/Item Feature Matrix

In [3]:
train_coo = scipy.sparse.coo_matrix(
        (
            np.ones(train.shape[0]),
            (train[SEQ_ID_USER], train[SEQ_ID_ITEM])
        ), 
        shape=(num_users, num_items))
train_coo

<257179x7306 sparse matrix of type '<class 'numpy.float64'>'
	with 11055959 stored elements in COOrdinate format>

In [4]:
test_coo = scipy.sparse.coo_matrix(
        (
            np.ones(test.shape[0]),
            (test[SEQ_ID_USER], test[SEQ_ID_ITEM])
        ),
        shape=(num_users, num_items))
test_coo

<257179x7306 sparse matrix of type '<class 'numpy.float64'>'
	with 672 stored elements in COOrdinate format>

## Precision

In [5]:
top_k = 10
def get_precison_test(id_user):
    relevants = set(
        test[test[SEQ_ID_USER] == id_user][SEQ_ID_ITEM].tolist()
    )

    scores = model.predict(
        id_user,
        np.arange(num_items)
    )
    
    top_k_rec = np.argsort(-scores)[:top_k].astype(int).tolist()

    precision_local = len(relevants.intersection(top_k_rec)) / top_k

    return [id_user, precision_local]

def get_precison_train(id_user):
    relevants = set(
        train[train[SEQ_ID_USER] == id_user][SEQ_ID_ITEM].tolist()
    )

    scores = model.predict(
        id_user, 
        np.arange(num_items)
    )
    
    top_k_rec = np.argsort(-scores)[:top_k].astype(int).tolist()

    precision_local = len(relevants.intersection(top_k_rec)) / top_k

    return [id_user, precision_local]

In [10]:
start_time = time.time()

epochs_incr = 1
epochs = 10

model = LightFM(learning_rate=0.05, loss='warp')

precision_train = []
precision_test = []
train_auc = []
test_auc = []

train_sample_ids = train.sample(1000)[SEQ_ID_USER].unique()

iterations = range(epochs_incr, epochs+epochs_incr, epochs_incr)

for epoch in iterations:
    print("[epoch: %s]" % epoch)
    start_time = time.time()

    model.fit_partial(
        train_coo, 
        epochs=epochs_incr,
        num_threads=N_JOBS)
    
    train_auc_local = auc_score(
        model,
        train_coo,
        num_threads=N_JOBS).mean()
    train_auc.append(train_auc_local)
    
    test_auc_local = auc_score(
        model,
        test_coo,
        num_threads=N_JOBS).mean()
    test_auc.append(test_auc_local)
    
    print('\tTrain AUC: %s.5f' % train_auc_local)
    print('\tTest AUC: %s.5f' % test_auc_local)
    
    precision_train_local = np.array(parallel.apply(
        get_precison_train,
        train_sample_ids,
        n_jobs=N_JOBS))
    precision_train.append(precision_train_local)
    
    precision_test_local = np.array(parallel.apply(
        get_precison_test,
        test[SEQ_ID_USER].unique(),
        n_jobs=N_JOBS))
    precision_test.append(precision_test_local)
    
    print("\tTrain Precision @10: %.5f" % (
        precision_train_local[:,1].mean()))
    print("\tTest Precision @10: %.5f" % (
        precision_test_local[:,1].mean()))

    print("Elapsed Time: %smin" % round(
        (time.time() - start_time)/60, 2))

[epoch: 1]
	Train AUC: 0.9695274.5f
	Test AUC: 0.852294.5f
	Train Precision @10: 0.67911
	Test Precision @10: 0.00521
Elapsed Time: 0.72min
[epoch: 2]
	Train AUC: 0.97382015.5f
	Test AUC: 0.84858096.5f
	Train Precision @10: 0.70010
	Test Precision @10: 0.00476
Elapsed Time: 0.69min
[epoch: 3]
	Train AUC: 0.9763002.5f
	Test AUC: 0.8522096.5f
	Train Precision @10: 0.71241
	Test Precision @10: 0.00446
Elapsed Time: 0.7min
[epoch: 4]
	Train AUC: 0.9779674.5f
	Test AUC: 0.85497737.5f
	Train Precision @10: 0.72250
	Test Precision @10: 0.00476
Elapsed Time: 0.7min
[epoch: 5]
	Train AUC: 0.9791786.5f
	Test AUC: 0.8550285.5f
	Train Precision @10: 0.73007
	Test Precision @10: 0.00580
Elapsed Time: 0.7min
[epoch: 6]
	Train AUC: 0.98012245.5f
	Test AUC: 0.8566438.5f
	Train Precision @10: 0.73370
	Test Precision @10: 0.00580
Elapsed Time: 0.69min
[epoch: 7]
	Train AUC: 0.9808799.5f
	Test AUC: 0.8574718.5f
	Train Precision @10: 0.73794
	Test Precision @10: 0.00595
Elapsed Time: 0.69min
[epoch: 8]
	T

# Train Precision / Test Precision

In [8]:
def sample_recommendation(model, data_coo, id_user):
    
    local_train = train[train[SEQ_ID_USER] == id_user]
    
    genres = {}
    for x in item_catalogue.loc[local_train[SEQ_ID_ITEM]]['genres']:
        for y in x:
            if y in genres:
                genres[y] += 1
            else:
                genres[y] = 1
        
    genres = pd.Series(genres).sort_values(ascending=False)
    genres = ((genres / local_train.shape[0]).round(2) * 100).astype(int)
    genres = genres.to_dict()
    genres = ["%s:%s" %(genre, genre_cnt) for genre, genre_cnt in genres.items()]
    genres = "|".join(genres)
    
    #genres = local_train['genres'].value_counts() / local_train.shape[0]
    print("Items in train", local_train.shape, genres)
    
    known_positives = data_coo.tocsr()[id_user].indices
    known_positives = item_catalogue.loc[known_positives]
    
    scores = model.predict(
        id_user, 
        np.arange(num_items))
    
    ranking = np.argsort(-scores)
    
    top_items = item_catalogue.loc[ranking[:top_k]]

    print("User %s" % id_user)

    known_positives.index = ['Known Positives'] * known_positives.shape[0]
    top_items.index = ['Recommended'] * top_items.shape[0]
    
    print(pd.concat([known_positives, top_items])[[
        'title', 'genres', 'year'
    ]])

for it in range(2):
    print("------\nExample %s\n--------" % it)
    sample_recommendation(
        model=model,
        data_coo=test_coo,
        id_user=test.sample(1)[SEQ_ID_USER].values[0])

------
Example 0
--------
Items in train (34, 9) Drama:68|Thriller:28|Comedy:26|Romance:15|Crime:15|Action:15|History:9|Documentary:9|Adventure:9|War:6|Science Fiction:6|Music:6|Foreign:6|Family:6|Mystery:3|Horror:3
User 86748
                                                title  \
Known Positives                          Big Bad Mama   
Recommended      Confession of a Child of the Century   
Recommended                               5 Card Stud   
Recommended                            License to Wed   
Recommended                  The Million Dollar Hotel   
Recommended                The Passion of Joan of Arc   
Recommended                              Loose Screws   
Recommended        Terminator 3: Rise of the Machines   
Recommended                                The Tunnel   
Recommended                           Monsoon Wedding   
Recommended                      Beauty and the Beast   

                                              genres    year  
Known Positives       [Ac

# GB

In [9]:
item_emb = pd.DataFrame(model.item_embeddings)
item_emb = item_emb.add_prefix("i_latent_")

user_emb = pd.DataFrame(model.user_embeddings)
user_emb = user_emb.add_prefix("u_latent_")

In [10]:
print("User emb. shape", user_emb.shape)
print("Item emb. shape", item_emb.shape)

User emb. shape (257175, 10)
Item emb. shape (5843, 10)


In [11]:
ui_trans = pd.concat([train, test], axis=0)

In [12]:
user_ids = ui_trans[[SEQ_ID_USER, ORI_ID_USER]].drop_duplicates()
item_ids = ui_trans[[SEQ_ID_ITEM, ORI_ID_ITEM]].drop_duplicates()

In [13]:
user_emb[ORI_ID_USER] = user_ids[ORI_ID_USER]
item_emb[ORI_ID_ITEM] = item_ids[ORI_ID_ITEM]

In [None]:
user_emb.set_index(ORI_ID_USER, inplace=True)
item_emb.set_index(ORI_ID_ITEM, inplace=True)

In [None]:
train_trans_emb_user = user_emb.loc[train[ORI_ID_USER]]
train_trans_emb_item = item_emb.loc[train[ORI_ID_ITEM]]

In [None]:
train[]

In [None]:
import XGB as xgb