## Plain old collaborative filtering

In [31]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.datasets import fetch_movielens

In [32]:
# Fetch the MovieLens data through LightFM's dataset helper and keep the
# two interaction splits that the following cells train and evaluate on.
movielens = fetch_movielens()
train, test = movielens['train'], movielens['test']

In [33]:
# Fit a WARP-loss model on the training interactions.
model = LightFM(loss='warp', learning_rate=0.05)
model.fit(train, epochs=20)

# Score both splits with each metric (train first, then test).
# The precision@k values are expected to be low; see
# https://stackoverflow.com/questions/45451161/evaluating-the-lightfm-recommendation-model/45466481#45466481
train_precision, test_precision = (
    precision_at_k(model, split, k=10).mean() for split in (train, test)
)
train_auc, test_auc = (
    auc_score(model, split).mean() for split in (train, test)
)

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Precision: train 0.63, test 0.11.
AUC: train 0.94, test 0.91.


# Hybrid recommender example

### 1. Pure collaborative filtering approach

In [34]:
import numpy as np
from lightfm import LightFM
from lightfm.evaluation import auc_score
from lightfm.datasets import fetch_stackexchange


# Shared settings for the StackExchange experiments in this section.
NUM_THREADS = 4  # passed as num_threads to fit() and auc_score()
NUM_COMPONENTS = 30  # passed to LightFM as no_components
NUM_EPOCHS = 3  # passed to fit() as epochs
ITEM_ALPHA = 1e-6  # passed to LightFM as item_alpha

In [35]:
# Load the 'crossvalidated' StackExchange dataset with a test fraction of
# 0.1; tag features are requested and indicator features are switched off.
data = fetch_stackexchange(
    'crossvalidated',
    test_set_fraction=0.1,
    indicator_features=False,
    tag_features=True,
)

train, test = data['train'], data['test']

In [36]:
model = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS)

# Run 3 epochs and time it.
%time model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS)

CPU times: user 368 ms, sys: 884 µs, total: 369 ms
Wall time: 111 ms


In [37]:
# AUC on the interactions the model was trained on.
train_auc = auc_score(model,
                      train,
                      num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

# AUC on the held-out interactions; the training interactions are handed
# over as train_interactions.
test_auc = auc_score(model,
                     test,
                     train_interactions=train,
                     num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

# Zero the item biases in place, then score the test set again.
model.item_biases *= 0.0
test_auc = auc_score(model,
                     test,
                     train_interactions=train,
                     num_threads=NUM_THREADS,
                     check_intersections=False).mean()
print('Collaborative filtering test AUC (item_biases=0): %s' % test_auc)

Collaborative filtering train AUC: 0.88878465
Collaborative filtering test AUC: 0.41050318
Collaborative filtering test AUC (item_biases=0): 0.5342386


### 2. A hybrid model

In [43]:
# Pull the item-feature matrix and the matching feature labels out of the
# dataset dictionary.
item_features = data['item_features']
tag_labels = data['item_feature_labels']

print('There are %s distinct tags, with values like %s.'
      % (item_features.shape[1], tag_labels[:3].tolist()))

KeyboardInterrupt: 

In [39]:
# Start from a fresh model built with the same hyperparameters as before.
model = LightFM(no_components=NUM_COMPONENTS,
                item_alpha=ITEM_ALPHA,
                loss='warp')

# Train the hybrid model: this time the item features matrix is
# supplied alongside the interactions.
model = model.fit(
    train,
    item_features=item_features,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
)

In [40]:
# The item features have to be passed in again when evaluating.
train_auc = auc_score(
    model,
    train,
    item_features=item_features,
    num_threads=NUM_THREADS,
).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 0.8551806


In [41]:
# Held-out AUC for the hybrid model; the item features and the training
# interactions are both supplied to the scorer.
test_auc = auc_score(
    model,
    test,
    train_interactions=train,
    item_features=item_features,
    num_threads=NUM_THREADS,
    check_intersections=False,
).mean()
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.73257315


In [42]:
def get_similar_tags(model, tag_id, num_similar=3):
    """Return the indices of the tags most similar to ``tag_id``.

    Similarity is the cosine of the angle between rows of
    ``model.item_embeddings``.

    Parameters
    ----------
    model : object
        Anything exposing ``item_embeddings`` as a 2-D array with one
        row per tag.
    tag_id : int
        Row index of the query tag. Negative indices count from the end.
    num_similar : int, optional
        How many neighbours to return. Defaults to 3, which matches the
        previous hard-coded behaviour.

    Returns
    -------
    numpy.ndarray
        Up to ``num_similar`` row indices, ordered by decreasing cosine
        similarity. The query tag itself is never included.

    Raises
    ------
    IndexError
        If ``tag_id`` is outside the range of available rows.
    """
    embeddings = np.asarray(model.item_embeddings)

    # Resolve negative indices (and reject out-of-range ones) up front so
    # the self-exclusion below compares against the real row number.
    query_index = range(embeddings.shape[0])[tag_id]

    # Normalize the vectors to unit length. An all-zero row has no
    # direction; dividing it by 1 keeps it at zero, so its similarity is 0
    # rather than NaN.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    tag_embeddings = embeddings / norms

    similarity = np.dot(tag_embeddings, tag_embeddings[query_index])

    # Drop the query by index instead of assuming it is ranked first:
    # duplicate or tied embeddings can push it out of position 0.
    ranked = np.argsort(-similarity, kind='stable')
    return ranked[ranked != query_index][:num_similar]


# Print the nearest neighbours of a few example tags.
for tag in (u'bayesian', u'regression', u'survival'):
    tag_id = tag_labels.tolist().index(tag)
    neighbours = tag_labels[get_similar_tags(model, tag_id)]
    print('Most similar tags for %s: %s' % (tag_labels[tag_id], neighbours))

Most similar tags for bayesian: ['mcmc' 'conjugate-prior' 'prior']
Most similar tags for regression: ['nadaraya-watson' 'multiple-regression' 'duan-smearing']
Most similar tags for survival: ['parametric' 'cox-model' 'non-response']
