In [124]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k, reciprocal_rank, auc_score

### Data loading and cleaning

In [125]:
# Ratings table; usecols=[1,2] keeps only the CSV columns at positions 1 and 2
# (later cells read 'bgg_user_name' and 'bgg_id' from this frame).
df = pd.read_csv('../LightFM_ratings.csv.gz',  usecols=[1,2])
# Names of the most popular categories
features_names = pd.read_csv('../LightFM_item_features_names.csv.gz')
# Categories of the individual games
game_features = pd.read_csv('../LightFM_item_features.csv.gz')

In [126]:
# Keep only the users that have at least MIN_RATINGS_PER_USER rows in `df`.
MIN_RATINGS_PER_USER = 10
grouped = df.groupby(by='bgg_user_name')
# Look up each row's group size in one vectorized pass instead of looping over
# every user in Python. Rows whose user name is missing get NaN here and are
# therefore filtered out by the comparison below.
ratings_per_user = df['bgg_user_name'].map(grouped.size())
# The stable sort keeps the rows grouped by user name (in sorted order) with
# their original relative order inside each user, which is the layout that
# concatenating the groups one by one used to produce.
cleaned_df = (
    df[ratings_per_user >= MIN_RATINGS_PER_USER]
    .sort_values('bgg_user_name', kind='mergesort')
)

In [127]:
# Number of distinct games rated by each user kept after filtering.
(
    cleaned_df
    .groupby(by='bgg_user_name')['bgg_id']
    .nunique()
)

bgg_user_name
 mycroft           62
- v -              12
-=yod@=-          307
-grizzly-          29
-johnny-          125
                 ... 
zzzxxxyyy          20
zzzzz              37
zzzzzane          169
zzzzzyy            11
æleksandr þræð     11
Name: bgg_id, Length: 247376, dtype: int64

In [128]:
# Feature names from column '0' of the feature-names table, in file order.
features_list = features_names['0'].tolist()
# Distinct ids of the games that still have at least one interaction.
# Deduplicated on purpose: the visible cells only use this list for membership
# tests, and keeping one entry per rating made it as long as the ratings table.
games_list = cleaned_df['bgg_id'].unique().tolist()

In [130]:
# We drop the features of games for which we don't have any interaction.
# Selecting the rows to keep with a boolean mask avoids the round trip through
# index labels, so it also stays correct if the index is not unique.
game_features = game_features[game_features['bgg_id'].isin(games_list)]

### Preparing the interaction and item-feature matrices

In [131]:
# Feed the user names, the game ids and the item-feature names to Dataset.fit.
dataset = Dataset()
dataset.fit(
    iter(cleaned_df['bgg_user_name']),
    iter(cleaned_df['bgg_id']),
    item_features=iter(features_names['0']),
)

In [132]:
def _parse_feature_names(raw_features, known_features):
    """Return the known feature names found in a stringified list like "['a', 'b']".

    The outer brackets are stripped, the text is split on ", ", and each token
    loses its first and last character (the quotes) and has \\' turned back
    into '. Only names present in ``known_features`` are kept, in input order.
    """
    names = []
    for token in raw_features[1:-1].split(", "):
        # Clean the token once and reuse it for both the test and the result.
        name = token[1:-1].replace("\\'", "'")
        if name in known_features:
            names.append(name)
    return names


# Won't work with weird designers' names! (like "..., Jr."): a name that itself
# contains ", " is split into several tokens by the parser above.
known_features = set(features_list)  # set lookup instead of scanning the list per token
item_features = dataset.build_item_features(
    (bgg_id, _parse_feature_names(raw_features, known_features))
    # Iterating the two columns directly avoids building a Series per row.
    for bgg_id, raw_features in zip(game_features['bgg_id'], game_features['features'])
)

In [133]:
def prepare_interactions(df, dataset, seed=42, train_frac=0.7, known_frac=0.8):
    """Split the ratings by user and build train/test interactions.

    The distinct users are shuffled and the first ``train_frac`` share of them
    become training users. For every remaining (test) user, a ``known_frac``
    share of their rows is added to the training interactions and the rest is
    held out for testing.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with ``'bgg_user_name'`` and ``'bgg_id'`` columns.
    dataset : object
        Must provide ``build_interactions(pairs)`` taking an iterable of
        ``(user, item)`` tuples; element ``[0]`` of its result is returned.
    seed : int or None, default 42
        Seed for the user shuffle and the per-user split. ``None`` uses
        NumPy's global random state instead of a private generator.
    train_frac : float, default 0.7
        Share of users whose rows all go to training.
    known_frac : float, default 0.8
        Share of each test user's rows that is added to training.

    Returns
    -------
    tuple
        ``(interactions, test_interactions)``: element ``[0]`` of the
        ``build_interactions`` results for the training and held-out rows.

    Raises
    ------
    ValueError
        If the split leaves no rows for test users.
    """
    # One generator drives every random draw, so a given seed fixes the whole split.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def split_test(test_df):
        """Split each test user's rows into a known part and a held-out part."""
        known_parts = []
        held_out_parts = []
        for _, user_df in test_df.groupby(by='bgg_user_name'):
            n_rows = user_df.shape[0]
            n_known = int(round(known_frac * n_rows))
            # Mark the sampled positions; the unmarked rows are the held-out ones.
            known_mask = np.zeros(n_rows, dtype=bool)
            known_mask[rng.choice(n_rows, n_known, replace=False)] = True
            known_parts.append(user_df[known_mask])
            held_out_parts.append(user_df[~known_mask])
        return pd.concat(known_parts), pd.concat(held_out_parts)

    def as_pairs(frame):
        """Yield (user, item) tuples without materialising a Series per row."""
        return zip(frame['bgg_user_name'], frame['bgg_id'])

    users = rng.permutation(df['bgg_user_name'].unique())
    train_size = int(train_frac * users.shape[0])
    train_df = df[df['bgg_user_name'].isin(users[:train_size])]
    test_df = df[df['bgg_user_name'].isin(users[train_size:])]
    if test_df.empty:
        raise ValueError('No test users left after the split; check df and train_frac')
    print('Splitting test set')
    test_known, test_unknown = split_test(test_df)
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    interactions_df = pd.concat([train_df, test_known])
    print('Preparing training interactions')
    interactions = dataset.build_interactions(as_pairs(interactions_df))
    print('Preparing testing interactions')
    test_interactions = dataset.build_interactions(as_pairs(test_unknown))
    return interactions[0], test_interactions[0]

In [None]:
# Build the training and held-out interactions from the filtered ratings,
# using the dataset fitted in the earlier cell.
interactions, test_interactions = prepare_interactions(cleaned_df, dataset)

Splitting test set


### WARP loss

In [None]:
# Model trained with the WARP loss on the interactions only (no item features).
# random_state seeds the model's random number generator so a re-run starts
# from the same state.
# NOTE(review): with num_threads > 1 results may still differ slightly between
# runs — confirm against the LightFM documentation.
model = LightFM(loss='warp', random_state=42)
model.fit(interactions, verbose=True, epochs=20, num_threads=8)

In [None]:
# Mean precision@5 of the WARP model on the interactions it was fitted on.
train_precision = precision_at_k(
    model,
    interactions,
    k=5,
    num_threads=8,
).mean()
print('Precision: train %.2f' % train_precision)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_precision = precision_at_k(
    model,
    test_interactions,
    train_interactions=interactions,
    k=5,
    num_threads=8,
).mean()
print('Precision: test %.2f' % test_precision)

In [None]:
# Mean AUC of the WARP model on the interactions it was fitted on.
train_auc = auc_score(
    model,
    interactions,
    num_threads=8,
).mean()
print('AUC: train %.2f' % train_auc)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_auc = auc_score(
    model,
    test_interactions,
    train_interactions=interactions,
    num_threads=8,
).mean()
print('AUC: test %.2f' % test_auc)

### BPR loss

In [None]:
# Model trained with the BPR loss on the interactions only (no item features).
# random_state seeds the model's random number generator so a re-run starts
# from the same state.
# NOTE(review): with num_threads > 1 results may still differ slightly between
# runs — confirm against the LightFM documentation.
bpr_model = LightFM(loss='bpr', random_state=42)
bpr_model.fit(interactions, verbose=True, epochs=20, num_threads=8)

In [None]:
# Mean precision@5 of the BPR model on the interactions it was fitted on.
train_precision = precision_at_k(
    bpr_model,
    interactions,
    k=5,
    num_threads=8,
).mean()
print('Precision: train %.2f' % train_precision)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_precision = precision_at_k(
    bpr_model,
    test_interactions,
    train_interactions=interactions,
    k=5,
    num_threads=8,
).mean()
print('Precision: test %.2f' % test_precision)

In [None]:
# Mean AUC of the BPR model on the interactions it was fitted on.
train_auc = auc_score(
    bpr_model,
    interactions,
    num_threads=8,
).mean()
print('AUC: train %.2f' % train_auc)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_auc = auc_score(
    bpr_model,
    test_interactions,
    train_interactions=interactions,
    num_threads=8,
).mean()
print('AUC: test %.2f' % test_auc)

### WARP loss with item features

In [None]:
# WARP model that also receives the item-feature matrix, with 64 components
# and an item_alpha of 1e-6.
# random_state seeds the model's random number generator so a re-run starts
# from the same state.
# NOTE(review): with num_threads > 1 results may still differ slightly between
# runs — confirm against the LightFM documentation.
features_model = LightFM(loss='warp', no_components=64, item_alpha=1e-6, random_state=42)
features_model.fit(interactions, verbose=True, item_features=item_features, epochs=20, num_threads=8)

In [None]:
# Mean precision@5 of the item-feature model on the interactions it was fitted on.
train_precision = precision_at_k(
    features_model,
    interactions,
    item_features=item_features,
    k=5,
    num_threads=8,
).mean()
print('Precision: train %.2f' % train_precision)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_precision = precision_at_k(
    features_model,
    test_interactions,
    train_interactions=interactions,
    item_features=item_features,
    k=5,
    num_threads=8,
).mean()
print('Precision: test %.2f' % test_precision)

In [None]:
# Mean AUC of the item-feature model on the interactions it was fitted on.
train_auc = auc_score(
    features_model,
    interactions,
    item_features=item_features,
    num_threads=8,
).mean()
print('AUC: train %.2f' % train_auc)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_auc = auc_score(
    features_model,
    test_interactions,
    train_interactions=interactions,
    item_features=item_features,
    num_threads=8,
).mean()
print('AUC: test %.2f' % test_auc)

### Additional evaluation functions

In [None]:
# Mean recall@5 of the WARP model on the interactions it was fitted on.
train_recall = recall_at_k(
    model,
    interactions,
    k=5,
    num_threads=8,
).mean()
print('Recall: train %.2f' % train_recall)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_recall = recall_at_k(
    model,
    test_interactions,
    train_interactions=interactions,
    k=5,
    num_threads=8,
).mean()
print('Recall: test %.2f' % test_recall)

In [None]:
# Mean AUC of the WARP model; this repeats the computation already done in the
# AUC cell of the "WARP loss" section.
train_auc = auc_score(
    model,
    interactions,
    num_threads=8,
).mean()
print('AUC: train %.2f' % train_auc)

# Held-out interactions, with the fitted interactions passed as train_interactions.
test_auc = auc_score(
    model,
    test_interactions,
    train_interactions=interactions,
    num_threads=8,
).mean()
print('AUC: test %.2f' % test_auc)

In [None]:
# Mean reciprocal rank of the WARP model on the interactions it was fitted on.
train_recip = reciprocal_rank(
    model,
    interactions,
    num_threads=8,
).mean()
print('Reciprocal rank: train %.2f' % train_recip)

# Same metric on the held-out interactions; the fitted interactions are passed
# as train_interactions.
test_recip = reciprocal_rank(
    model,
    test_interactions,
    train_interactions=interactions,
    num_threads=8,
).mean()
print('Reciprocal rank: test %.2f' % test_recip)