## Import packages

In [1]:
from lightfm import LightFM
from lightfm.data import Dataset

In [2]:
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm.cross_validation import random_train_test_split

In [3]:
import numpy as np
import pandas as pd

In [4]:
from pandasql import sqldf
pysqldf = lambda q: sqldf(q, globals())

In [5]:
from scipy.stats import describe

In [38]:
from sklearn.preprocessing import MinMaxScaler

## Agregated dataset

In [39]:
data = pd.read_csv('dataset_aggr.csv')

In [40]:
music_category = pd.read_csv('music_type.csv')
item_features = music_category

### Profiling users

In [41]:
# This will work as a user profile

user_features = pysqldf('''select UserID, avg(number_of_unique_songs) as X1, avg(number_of_unique_genres) as X2, 
                           avg(main_genre_dominance) as X3, avg(genre_ratio) as X4, 
                           sum(no_stimulus_points) as X5, sum(stimulus_points) as X6,
                           sum(driving_style_relaxed_driving) as X7, sum(driving_style_sport_driving) as X8,
                           sum(landscape_coast_line) as X9, sum(landscape_country_side) as X10, 
                           sum(landscape_mountains) as X11,
                           sum(landscape_urban) as X12, sum(mood_active) as X13, sum(mood_happy) as X14, 
                           sum(mood_lazy) as X15, sum(mood_sad) as X16,
                           sum(natural_phenomena_afternoon) as X17, sum(natural_phenomena_day_time) as X18,
                           sum(natural_phenomena_morning) as X19, sum(natural_phenomena_night) as X20,
                           sum(road_type_city) as X21, sum(road_type_highway) as X22, 
                           sum(road_type_serpentine) as X23,
                           sum(sleepiness_awake) as X24, sum(sleepiness_sleepy) as X25, 
                           sum(traffic_conditions_free_road) as X26,
                           sum(traffic_conditions_lots_of_cars) as X27, sum(traffic_conditions_traffic_jam) as X28,
                           sum(weather_cloudy) as X29, sum(weather_rainy) as X30, 
                           sum(weather_snowing) as X31, sum(weather_sunny) as X32
                           from data
                           group by UserID''')

In [42]:
scale = MinMaxScaler()
_user_ids = user_features.UserID.copy()
user_features = pd.DataFrame(scale.fit_transform(user_features.values), 
                             columns=user_features.columns, index=user_features.index)
user_features.UserID = _user_ids
user_features.head()

Unnamed: 0,UserID,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X23,X24,X25,X26,X27,X28,X29,X30,X31,X32
0,1001,0.065217,0.555556,0.25,0.05,0.056701,0.038095,0.051282,0.0,0.045455,...,0.0,0.214286,0.117647,0.0,0.032258,0.04,0.08,0.058824,0.047619,0.142857
1,1002,0.210145,0.777778,0.5,0.233333,0.123711,0.171429,0.051282,0.294118,0.181818,...,0.064516,0.0,0.0,0.172414,0.096774,0.32,0.08,0.294118,0.285714,0.142857
2,1003,0.07971,0.444444,0.375,0.149306,0.061856,0.052381,0.076923,0.058824,0.045455,...,0.032258,0.071429,0.058824,0.068966,0.129032,0.0,0.04,0.058824,0.047619,0.047619
3,1004,0.028986,0.222222,0.5,0.3,0.020619,0.02381,0.051282,0.0,0.0,...,0.0,0.142857,0.0,0.0,0.0,0.04,0.04,0.058824,0.0,0.047619
4,1005,0.144928,0.777778,0.404762,0.143991,0.128866,0.066667,0.153846,0.058824,0.090909,...,0.129032,0.071429,0.176471,0.172414,0.096774,0.08,0.16,0.058824,0.095238,0.047619


In [43]:
dataset = Dataset()
dataset.fit(data.UserID.unique(),
            data.ItemID.unique())

In [44]:
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 42, num_items 139.


In [45]:
dataset.fit_partial(users=[x['UserID'] for idx, x in user_features.iterrows()],
                    items=[x['ItemID'] for idx, x in item_features.iterrows()],
                    # user_features=[[f'X{i}_{x[f"X{i}"]}' for i in range(1,32)] for idx, x in user_features.iterrows()],
                    user_features=[f'X{i}' for i in range(1,32)],
                    item_features=[x['category_name'] for idx, x in item_features.iterrows()])
                    # item_features=['category_name', ])

In [46]:
user_features = dataset.build_user_features([( x['UserID'], {f'X{i}': x[f'X{i}'] for i in range(1,32)} ) 
                                             for idx, x in user_features.iterrows()])

print(repr(user_features))

<42x73 sparse matrix of type '<class 'numpy.float32'>'
	with 1344 stored elements in Compressed Sparse Row format>


In [47]:
item_features = dataset.build_item_features([( x['ItemID'], [x['category_name'], ] ) 
                                             for idx, x in item_features.iterrows()])

print(repr(item_features))

<139x149 sparse matrix of type '<class 'numpy.float32'>'
	with 278 stored elements in Compressed Sparse Row format>


In [48]:
(interactions, weights) = dataset.build_interactions(((x['UserID'], x['ItemID'], x['avg_rating'])
                                                       for idx, x in data.iterrows()))

print(repr(interactions))

<42x139 sparse matrix of type '<class 'numpy.int32'>'
	with 930 stored elements in COOrdinate format>


### Split data

In [49]:
train, test = random_train_test_split(interactions, test_percentage=0.2)

In [50]:
train.shape, test.shape

((42, 139), (42, 139))

In [64]:
NUM_THREADS = 4
NUM_COMPONENTS = 2000
NUM_EPOCHS = 500
ITEM_ALPHA = 2e-6
USER_ALPHA = 1e-6

In [65]:
# Define a new model instance
model = LightFM(loss='warp',
                learning_schedule='adadelta',
                learning_rate=0.05,
                rho=0.75,
                epsilon=1e-5,
                user_alpha=USER_ALPHA,
                max_sampled=20,
                item_alpha=ITEM_ALPHA,
                no_components=NUM_COMPONENTS)

# Fit the hybrid model. Note that this time, we pass
# in the item features matrix.
model = model.fit(train,
                  item_features=item_features,
                  user_features=user_features,
                  # sample_weight=weights,
                  # there is no way to get weights after randomsplit
                  epochs=NUM_EPOCHS,
                  num_threads=NUM_THREADS,
                  verbose=False)

In [66]:
train_auc = auc_score(model,
                      train,
                      item_features=item_features,
                      user_features=user_features,
                      num_threads=NUM_THREADS).mean()
print('Hybrid training set AUC: %s' % train_auc)

Hybrid training set AUC: 1.0


In [67]:
test_auc = np.nanmean(auc_score(model,
                                test,
                                train_interactions=train,
                                item_features=item_features,
                                user_features=user_features,
                                num_threads=NUM_THREADS))
print('Hybrid test set AUC: %s' % test_auc)

Hybrid test set AUC: 0.4985979


In [68]:
reciprocal = reciprocal_rank(model,
                             test,
                             train_interactions=train,
                             item_features=item_features,
                             user_features=user_features,
                             num_threads=NUM_THREADS).mean()
print('Reciprocal rank for test set: %s' % reciprocal)

Reciprocal rank for test set: 0.21552294


In [69]:
found_precision = precision_at_k(model,
                                 test,
                                 train_interactions=train,
                                 item_features=item_features,
                                 user_features=user_features,
                                 num_threads=NUM_THREADS)

print(f'Exact precision (per user): {found_precision}\n')
test_precision = np.nanmean(found_precision)
print(f'Hybrid test set Precision at K: {describe(found_precision)}\n')
print('Hybrid test set mean Precision at K: %s' % test_precision)

Exact precision (per user): [0.  0.  0.  0.  0.2 0.4 0.  0.2 0.1 0.3 0.  0.  1.  0.1 0.1 0.1 0.  0.1
 0.2 0.  0.1 0.  0.  0.  0.  0.5 0.1 0.  0.  0.  0.  0. ]

Hybrid test set Precision at K: DescribeResult(nobs=32, minmax=(0.0, 1.0), mean=0.109375, variance=0.04216734, skewness=2.9379661083221436, kurtosis=9.429328101949638)

Hybrid test set mean Precision at K: 0.109375


In [70]:
test_recall = np.nanmean(recall_at_k(model,
                                     test,
                                     train_interactions=train,
                                     item_features=item_features,
                                     user_features=user_features,
                                     num_threads=NUM_THREADS))
print('Hybrid test set Recall at K: %s' % test_recall)

Hybrid test set Recall at K: 0.13543779656511434


### Unaggregated dataset

At the moment, lightfm doesn't support multiple (duplicated) pairs of user-item (holding different values for user features). 

## Conclusions

LightFM algorithm was able to recommend songs with mediocre results, with a performance similar to collaborative filtering done by the deep recommender 'spotlight' framework.