In [8]:
import pandas as pd
import numpy as np
from ast import literal_eval
import scipy.sparse

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k, reciprocal_rank, auc_score

from tools.lightfm_tools import prepare_interactions, evaluate_model, evaluate_diversity_and_coverage

## Data loading

In [2]:
# Ratings table (per the original note: least active users and least popular games already removed).
df = pd.read_csv('data/ratings.csv.gz')

# Names of the most popular features.
features_names = pd.read_csv('data/LightFM_item_features_names.csv.gz')

# Per-game features. The 'features' column is read as text, so each cell is
# parsed back into a Python object with literal_eval.
game_features = pd.read_csv('data/LightFM_item_features.csv.gz')
game_features['features'] = game_features['features'].map(literal_eval)

In [3]:
# Keep only the feature rows of games that appear in the ratings table;
# features of games without any interaction are never used.
features_list = list(features_names['0'])
games_list = list(df['bgg_id'])

without_interactions = ~game_features['bgg_id'].isin(games_list)
rows_to_drop = game_features.loc[without_interactions].index
game_features = game_features.drop(rows_to_drop)

In [4]:
# Positive interaction: the game is rated above 6, OR it is owned but has no rating.
owned_without_rating = (df['bgg_user_owned'] == 1.0) & df['bgg_user_rating'].isna()
rated_above_six = df['bgg_user_rating'] > 6
df_positive = df.loc[owned_without_rating | rated_above_six].assign(value=1)

# Negative interaction: the game is rated below 4.
# Ratings from 4 to 6 fall into neither group.
rated_below_four = df['bgg_user_rating'] < 4
df_negative = df.loc[rated_below_four].assign(value=-1)

# The logistic loss is trained on both positive (+1) and negative (-1) rows.
df_logistic = pd.concat([df_positive, df_negative])

## Preparation of dataset, interactions and item features matrices

In [5]:
# Register every user name, game id and feature name with the LightFM Dataset
# so that it can map them to internal indices.
dataset = Dataset()
dataset.fit(
    iter(df['bgg_user_name']),
    iter(df['bgg_id']),
    item_features=iter(features_list),
)

In [10]:
# Interactions for all functions except logistic loss.
# The matrices are cached on disk. They are rebuilt (and the cache rewritten)
# only when a cache file is missing. Any other failure -- a corrupt file, an
# interrupted load, a typo -- is raised instead of silently triggering the
# expensive rebuild and overwriting the cache.
try:
    interactions = scipy.sparse.load_npz('data/interactions.npz')
    test_interactions = scipy.sparse.load_npz('data/test_interactions.npz')
    known = pd.read_csv('data/known.csv.gz')
except FileNotFoundError:
    # Cache miss: build from the positive interactions and persist the result.
    interactions, test_interactions, known = prepare_interactions(df_positive, dataset)
    scipy.sparse.save_npz('data/interactions.npz', interactions)
    scipy.sparse.save_npz('data/test_interactions.npz', test_interactions)
    known.to_csv('data/known.csv.gz', compression='gzip', index=False)
else:
    print("Loaded interactions")

Loaded interactions


In [12]:
# Interactions for logistic loss.
# The matrices are cached on disk. They are rebuilt (and the cache rewritten)
# only when a cache file is missing. Any other failure -- a corrupt file, an
# interrupted load, a typo -- is raised instead of silently triggering the
# expensive rebuild and overwriting the cache.
try:
    interactions_logistic = scipy.sparse.load_npz('data/interactions_logistic.npz')
    test_interactions_logistic = scipy.sparse.load_npz('data/test_interactions_logistic.npz')
    known_logistic = pd.read_csv('data/known_logistic.csv.gz')
except FileNotFoundError:
    # Cache miss: build from the positive + negative interactions and persist the result.
    interactions_logistic, test_interactions_logistic, known_logistic = prepare_interactions(df_logistic, dataset)
    scipy.sparse.save_npz('data/interactions_logistic.npz', interactions_logistic)
    scipy.sparse.save_npz('data/test_interactions_logistic.npz', test_interactions_logistic)
    known_logistic.to_csv('data/known_logistic.csv.gz', compression='gzip', index=False)
else:
    print("Loaded interactions")

Loaded interactions


In [13]:
# Item-feature matrix: for every game keep only the features listed in
# features_list (the names the dataset was fitted with).
# A set gives constant-time membership tests instead of scanning the list for
# every feature of every game, and zipping the two columns avoids building a
# Series per row as iterrows() does.
allowed_features = set(features_list)
item_features = dataset.build_item_features(
    (game_id, [w for w in feats if w in allowed_features])
    for game_id, feats in zip(game_features['bgg_id'], game_features['features'])
)

In [14]:
# Item-feature matrix in which every game also carries its own id as a feature,
# in addition to the features listed in features_list.
# A set gives constant-time membership tests, and zipping the two columns
# avoids building a Series per row as iterrows() does.
# NOTE(review): the Dataset is constructed with default arguments; LightFM's
# Dataset appears to add a per-item identity feature by default
# (item_identity_features=True). If so, the plain feature matrix already
# contains identity features and this one only weights them more heavily --
# confirm this is the intended comparison.
allowed_features = set(features_list)
id_item_features = dataset.build_item_features(
    (game_id, [w for w in feats if w in allowed_features] + [game_id])
    for game_id, feats in zip(game_features['bgg_id'], game_features['features'])
)

In [15]:
# Control experiment: every game receives the feature list of a randomly chosen
# other game, so the set of feature lists is unchanged but no longer matches
# the games.
# The shuffle is seeded so the control matrix is reproducible between runs, and
# the column is addressed by name ('features') because that is the column the
# builder below reads -- a positional index would silently shuffle the wrong
# column if the CSV layout changed.
SHUFFLE_SEED = 42
random_game_features = game_features.copy()
random_game_features['features'] = (
    game_features['features'].sample(frac=1, random_state=SHUFFLE_SEED).to_numpy()
)

# A set gives constant-time membership tests instead of scanning the list.
allowed_features = set(features_list)
random_item_features = dataset.build_item_features(
    (game_id, [w for w in feats if w in allowed_features])
    for game_id, feats in zip(random_game_features['bgg_id'], random_game_features['features'])
)

## Comparison of results for different loss functions and item-feature matrices

### WARP model

In [16]:
# Baseline: WARP loss, trained on the interactions only (no item features).
model = LightFM(loss='warp')
model.fit(
    interactions,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [01:59<00:00,  5.98s/it]


<lightfm.lightfm.LightFM at 0x7fa19c2e65e0>

In [17]:
# Evaluate the feature-less WARP model on the train and test interactions (k=5).
evaluate_model(
    model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
)

Precision: train 0.50
Precision: test 0.20
AUC: train 0.94
AUC: test 0.93


In [18]:
# WARP loss, trained with the item-feature matrix.
features_model = LightFM(loss='warp')
features_model.fit(
    interactions,
    item_features=item_features,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [10:27<00:00, 31.35s/it]


<lightfm.lightfm.LightFM at 0x7fa19c2e66d0>

In [19]:
# Evaluate the WARP model with item features; the same feature matrix used for
# training is passed to the evaluation.
evaluate_model(
    features_model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
    item_features=item_features,
)

Precision: train 0.49
Precision: test 0.19
AUC: train 0.93
AUC: test 0.92


In [20]:
# WARP loss, trained with the shuffled (random) item-feature matrix as a control.
random_features_model = LightFM(loss='warp')
random_features_model.fit(
    interactions,
    item_features=random_item_features,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [09:29<00:00, 28.45s/it]


<lightfm.lightfm.LightFM at 0x7fa19c2e68b0>

In [21]:
# Evaluate the control model trained on shuffled item features.
evaluate_model(
    random_features_model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
    item_features=random_item_features,
)

Precision: train 0.48
Precision: test 0.18
AUC: train 0.93
AUC: test 0.92


In [22]:
# WARP loss, trained with the item features that also include each game's own id.
id_features_model = LightFM(loss='warp')
id_features_model.fit(
    interactions,
    item_features=id_item_features,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [10:10<00:00, 30.52s/it]


<lightfm.lightfm.LightFM at 0x7fa19c7383d0>

In [23]:
# Evaluate the WARP model trained with features + game-id features.
evaluate_model(
    id_features_model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
    item_features=id_item_features,
)

Precision: train 0.50
Precision: test 0.19
AUC: train 0.94
AUC: test 0.92


### k-OS WARP model

In [24]:
# k-OS WARP loss, trained on the interactions only (no item features).
# Rebinds `model`, replacing the model from the previous section.
model = LightFM(loss='warp-kos')
model.fit(
    interactions,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [03:19<00:00,  9.95s/it]


<lightfm.lightfm.LightFM at 0x7fa19c7c5ee0>

In [25]:
# Evaluate the feature-less k-OS WARP model on the train and test interactions (k=5).
evaluate_model(
    model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
)

Precision: train 0.46
Precision: test 0.17
AUC: train 0.92
AUC: test 0.91


In [26]:
# k-OS WARP loss, trained with the item features that also include each game's own id.
# Rebinds `id_features_model`, replacing the model from the previous section.
id_features_model = LightFM(loss='warp-kos')
id_features_model.fit(
    interactions,
    item_features=id_item_features,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [11:35<00:00, 34.78s/it]


<lightfm.lightfm.LightFM at 0x7fa19c7c53d0>

In [27]:
# Evaluate the k-OS WARP model trained with features + game-id features.
evaluate_model(
    id_features_model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
    item_features=id_item_features,
)

Precision: train 0.46
Precision: test 0.17
AUC: train 0.91
AUC: test 0.90


### BPR model

In [28]:
# BPR loss, trained on the interactions only (no item features).
# Rebinds `model`, replacing the model from the previous section.
model = LightFM(loss='bpr')
model.fit(
    interactions,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [01:40<00:00,  5.01s/it]


<lightfm.lightfm.LightFM at 0x7fa19c88c220>

In [29]:
# Evaluate the feature-less BPR model on the train and test interactions (k=5).
evaluate_model(
    model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
)

Precision: train 0.45
Precision: test 0.13
AUC: train 0.90
AUC: test 0.87


In [30]:
# BPR loss, trained with the item features that also include each game's own id.
# Rebinds `id_features_model`, replacing the model from the previous section.
id_features_model = LightFM(loss='bpr')
id_features_model.fit(
    interactions,
    item_features=id_item_features,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [14:18<00:00, 42.94s/it]


<lightfm.lightfm.LightFM at 0x7fa19c88c4f0>

In [31]:
# Evaluate the BPR model trained with features + game-id features.
evaluate_model(
    id_features_model,
    interactions,
    test_interactions,
    k=5,
    num_threads=8,
    item_features=id_item_features,
)

Precision: train 0.41
Precision: test 0.12
AUC: train 0.90
AUC: test 0.86


### Logistic model

In [32]:
# Logistic loss, trained on the logistic interactions (positive and negative
# rows), without item features.
# Rebinds `model`, replacing the model from the previous section.
model = LightFM(loss='logistic')
model.fit(
    interactions_logistic,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [01:08<00:00,  3.43s/it]


<lightfm.lightfm.LightFM at 0x7fa19c738b50>

In [33]:
# Evaluate the feature-less logistic model on its own train/test split (k=5).
evaluate_model(
    model,
    interactions_logistic,
    test_interactions_logistic,
    k=5,
    num_threads=8,
)

Precision: train 0.24
Precision: test 0.08
AUC: train 0.80
AUC: test 0.79


In [34]:
# Logistic loss, trained on the logistic interactions with the item features
# that also include each game's own id.
# Rebinds `id_features_model`, replacing the model from the previous section.
id_features_model = LightFM(loss='logistic')
id_features_model.fit(
    interactions_logistic,
    item_features=id_item_features,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [06:49<00:00, 20.47s/it]


<lightfm.lightfm.LightFM at 0x7fa19c7c50a0>

In [35]:
# Evaluate the logistic model trained with features + game-id features.
evaluate_model(
    id_features_model,
    interactions_logistic,
    test_interactions_logistic,
    k=5,
    num_threads=8,
    item_features=id_item_features,
)

Precision: train 0.06
Precision: test 0.01
AUC: train 0.67
AUC: test 0.67


## Diversity & coverage testing

In [None]:
# TODO: diversity & coverage evaluation is not implemented yet.
# NOTE(review): `users_sample_size`, `games_df` and `path` are not assigned in
# any cell visible in this notebook, so running this cell as-is would raise
# NameError. The call below looks like a pasted signature
# (`item_features = None, N = 10`) rather than a finished invocation -- supply
# real values before executing.
# NOTE(review): at this point `model` is whichever model was assigned last
# (the logistic one in the cells above) -- confirm that is the model to evaluate.
evaluate_diversity_and_coverage(model, users_sample_size, dataset, games_df, known, path, item_features = None, N = 10)