In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k, reciprocal_rank, auc_score

### Data loading and cleaning

In [2]:
df = pd.read_csv('../LightFM_ratings.csv.gz',  usecols=[1,2])
# Nazwy najpopularniejszych kategorii
features_names = pd.read_csv('../LightFM_item_features_names.csv.gz')
# Kategorie konkretnych gier 
game_features = pd.read_csv('../LightFM_item_features.csv.gz')

In [3]:
grouped = df.groupby(by='bgg_user_name')
cleaned_df = []
for user, user_df in grouped:
    if user_df.shape[0] < 10:
        continue
    cleaned_df.append(user_df[:])
cleaned_df = pd.concat(cleaned_df)

In [4]:
len(cleaned_df)

23116352

In [5]:
deleted = 0
grouped = cleaned_df.groupby(by='bgg_id')
supercleaned_df = []
for game, game_df in grouped:
    if game_df.shape[0] < 10:
        deleted += 1
        continue
    supercleaned_df.append(game_df[:])
supercleaned_df = pd.concat(supercleaned_df)

In [6]:
cleaned_df = supercleaned_df

In [66]:
deleted

41526

In [33]:
cleaned_df.groupby('bgg_user_name')['bgg_id'].nunique()

bgg_user_name
 mycroft          62 
- v -             12 
-=yod@=-          307
-grizzly-         29 
-johnny-          125
                 ... 
zzzxxxyyy         20 
zzzzz             37 
zzzzzane          169
zzzzzyy           11 
æleksandr þræð    11 
Name: bgg_id, Length: 247376, dtype: int64

In [7]:
features_list = [x for x in features_names['0']]
games_list = [x for x in cleaned_df['bgg_id']]

In [8]:
# We drop the features of games for which we don't have any interaction
game_features = game_features.drop((game_features['bgg_id'])[~game_features['bgg_id'].isin(games_list)].index)

In [9]:
features_names
game_features

Unnamed: 0.1,Unnamed: 0,bgg_id,features
0,0,1,"['Economic:1021', 'Negotiation:1026', 'Politic..."
1,1,2,"['Card Game:1002', 'Fantasy:1010', 'Trick-taki..."
2,2,3,"['Abstract Strategy:1009', 'Medieval:1035', 'A..."
3,3,4,"['Ancient:1050', 'Action Points:2001', 'Area M..."
4,4,5,"['Economic:1021', 'Territory Building:1086', '..."
...,...,...,...
99222,99222,329086,"['Animals:1089', 'Transportation:1011', 'Pick-..."
99345,99345,329509,"['Bluffing:1023', 'Deduction:1039', 'Real-time..."
99378,99378,329591,"['Industry / Manufacturing:1088', 'Trains:1034..."
99444,99444,329841,"['Trains:1034', 'Travel:1097', 'Card Drafting:..."


### Preparation of interactions and item features matrices 

In [10]:
dataset = Dataset()
dataset.fit((x for x in cleaned_df['bgg_user_name']), (x for x in cleaned_df['bgg_id']), item_features=(x for x in features_names['0']))

In [11]:
def prepare_interactions(df, dataset):
    
    def split_test(test_df, seed=42, frac=0.8):
        grouped = test_df.groupby(by='bgg_user_name')
        test_known = []
        test_unknown = []
        for user, df in grouped:
            df_size = df.shape[0]
            known_size = int(round(frac*df_size))
            known_indices = np.random.choice(df_size, known_size, replace=False)
            known_data = df.iloc[known_indices]
            test_known.append(known_data)
            unknown_indices = np.setdiff1d(np.arange(df_size), known_indices)
            unknown_data = df.iloc[unknown_indices]
            test_unknown.append(unknown_data)
        return pd.concat(test_known), pd.concat(test_unknown)

    users = df['bgg_user_name'].unique()
    np.random.shuffle(users)
    train_size = int(0.7*users.shape[0])
    train_df = df[df['bgg_user_name'].isin(users[:train_size])]
    test_df = df[df['bgg_user_name'].isin(users[train_size:])]
    print('Splitting test set')
    test_known, test_unknown = split_test(test_df)
    interactions_df = train_df.append(test_known)
    print('Preparing training interactions')
    interactions = dataset.build_interactions(((val['bgg_user_name'], val['bgg_id']) for idx, val in interactions_df.iterrows()))
    print('Preparing testing interactions')
    test_interactions = dataset.build_interactions(((val['bgg_user_name'], val['bgg_id']) for idx, val in test_unknown.iterrows()))
    return interactions[0], test_interactions[0], interactions_df

In [12]:
interactions, test_interactions, known = prepare_interactions(cleaned_df, dataset)

Splitting test set
Preparing training interactions
Preparing testing interactions


### WARP loss

In [135]:
model = LightFM(loss='warp')
model.fit(interactions, verbose=True, epochs=20, num_threads=8)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x7fcf38e31e50>

In [136]:
train_precision = precision_at_k(model, interactions, k=5, num_threads=8).mean()
print('Precision: train %.2f' % (train_precision))

test_precision = precision_at_k(model, test_interactions, train_interactions = interactions, k=5, num_threads=8).mean()
print('Precision: test %.2f' % (test_precision))

Precision: train 0.45
Precision: test 0.17


In [137]:
train_auc = auc_score(model, interactions, num_threads=8).mean()
print('AUC: train %.2f' % (train_auc))

test_auc = auc_score(model, test_interactions, train_interactions = interactions, num_threads=8).mean()
print('AUC: test %.2f' % (test_auc))

AUC: train 0.99
AUC: test 0.99


### BPR loss

In [138]:
bpr_model = LightFM(loss='bpr')
bpr_model.fit(interactions, verbose=True, epochs=20, num_threads=8)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x7fcf0a48c0d0>

In [139]:
train_precision = precision_at_k(bpr_model, interactions, k=5, num_threads=8).mean()
print('Precision: train %.2f' % (train_precision))

test_precision = precision_at_k(bpr_model, test_interactions, train_interactions = interactions, k=5, num_threads=8).mean()
print('Precision: test %.2f' % (test_precision))

Precision: train 0.43
Precision: test 0.13


In [140]:
train_auc = auc_score(bpr_model, interactions, num_threads=8).mean()
print('AUC: train %.2f' % (train_auc))

test_auc = auc_score(bpr_model, test_interactions, train_interactions = interactions, num_threads=8).mean()
print('AUC: test %.2f' % (test_auc))

AUC: train 0.95
AUC: test 0.93


### WARP loss with item features

In [132]:
# Won't work with weird designers' names! (like "..., Jr.")
item_features = dataset.build_item_features((val['bgg_id'], [w[1:-1].replace("\\'", "'") for w in val['features'][1:-1].split(", ") if w[1:-1].replace("\\'", "'") in features_list]) for idx, val in game_features.iterrows())

In [141]:
# https://github.com/lyst/lightfm/issues/551
# https://www.ethanrosenthal.com/2016/11/07/implicit-mf-part-2/
features_model = LightFM(loss='warp', no_components = 64, item_alpha = 1e-6)
features_model.fit(interactions, verbose=True, item_features = item_features, epochs=20, num_threads=8)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x7fcf0a48ce20>

In [142]:
train_precision = precision_at_k(features_model, interactions, item_features = item_features, k=5, num_threads=8).mean()
print('Precision: train %.2f' % (train_precision))

test_precision = precision_at_k(features_model, test_interactions, train_interactions = interactions, item_features = item_features, k=5, num_threads=8).mean()
print('Precision: test %.2f' % (test_precision))

Precision: train 0.46
Precision: test 0.17


In [143]:
train_auc = auc_score(features_model, interactions, item_features = item_features, num_threads=8).mean()
print('AUC: train %.2f' % (train_auc))

test_auc = auc_score(features_model, test_interactions, train_interactions = interactions, item_features = item_features, num_threads=8).mean()
print('AUC: test %.2f' % (test_auc))

AUC: train 0.99
AUC: test 0.99


## With shuffled item features

In [55]:
random_game_features = game_features.copy()
random_game_features.iloc[:, 1] = np.random.permutation(game_features.iloc[:,1].values)
random_game_features

In [60]:
random_item_features = dataset.build_item_features((val['bgg_id'], [w[1:-1].replace("\\'", "'") for w in val['features'][1:-1].split(", ") if w[1:-1].replace("\\'", "'") in features_list]) for idx, val in random_game_features.iterrows())

In [66]:
features_model = LightFM(loss='warp', no_components = 64, item_alpha = 1e-6)
features_model.fit(interactions, verbose=True, item_features = random_item_features, epochs=20, num_threads=8)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19


<lightfm.lightfm.LightFM at 0x7fe3c8c63160>

In [67]:
train_precision = precision_at_k(features_model, interactions, item_features = random_item_features, k=5, num_threads=8).mean()
print('Precision: train %.2f' % (train_precision))

test_precision = precision_at_k(features_model, test_interactions, train_interactions = interactions, item_features = random_item_features, k=5, num_threads=8).mean()
print('Precision: test %.2f' % (test_precision))

Precision: train 0.44
Precision: test 0.16


## With identity matrix
See https://github.com/jemmott/lightfm-goodbooks-debug/blob/master/LightFM%20Item%20Features%20with%20GoodReads.ipynb, https://github.com/lyst/lightfm/issues/551

In [13]:
item_features_id = dataset.build_item_features((val['bgg_id'], [w[1:-1].replace("\\'", "'") for w in val['features'][1:-1].split(", ") if w[1:-1].replace("\\'", "'") in features_list] + [val['bgg_id']]) for idx, val in game_features.iterrows())

In [15]:
features_model = LightFM(loss='warp', no_components = 64, item_alpha = 1e-6)

In [16]:
features_model.fit(interactions, verbose=True, item_features = item_features_id, epochs=20, num_threads=8)

Epoch: 100%|██████████| 20/20 [13:15<00:00, 39.77s/it]


<lightfm.lightfm.LightFM at 0x7f93b83c2df0>

In [70]:
train_precision = precision_at_k(features_model, interactions, item_features = item_features_id, k=5, num_threads=8).mean()
print('Precision: train %.2f' % (train_precision))

test_precision = precision_at_k(features_model, test_interactions, train_interactions = interactions, item_features = item_features_id, k=5, num_threads=8).mean()
print('Precision: test %.2f' % (test_precision))

Precision: train 0.48
Precision: test 0.18


## Prepare dataset for diversity&coverage evaluation

In [56]:
known.groupby('bgg_user_name')['bgg_id'].apply(list)[" mycroft"]

[5,
 13,
 15,
 36,
 98,
 116,
 181,
 199,
 221,
 278,
 478,
 483,
 520,
 548,
 823,
 987,
 1143,
 1258,
 1262,
 1297,
 1299,
 1426,
 1917,
 1927,
 2456,
 2471,
 3341,
 3419,
 4095,
 4324,
 4610,
 4845,
 5029,
 5220,
 5336,
 6472,
 7240,
 7336,
 7467,
 7479,
 7600,
 8041,
 8204,
 8569,
 9609,
 9829,
 10167,
 10383,
 10547,
 11229,
 11416,
 12477,
 12995,
 13805,
 15045,
 15047,
 15062,
 15548,
 15987,
 17223,
 17226,
 17449]

In [54]:
known_df[0]

Unnamed: 0,bgg_user_name,bgg_id
9,mycroft,5
10,mycroft,13
11,mycroft,15
12,mycroft,36
13,mycroft,98
...,...,...
66,mycroft,15548
67,mycroft,15987
68,mycroft,17223
69,mycroft,17226


In [17]:
def return_top_N(N, user_name, model, user_mapping, games_mapping, item_features, training):
    user_id = user_mapping[user_name]
    no_games = len(games_mapping)
    
    known_ids = training[user_name]
#     known_ids = training[np.in1d(training['bgg_user_name'],user_name)]['bgg_id'].values
    #to jest super wolne
    #known_ids = training[training['bgg_user_name'] == user_name]['bgg_id'].values
    known_ids = [games_mapping[i] for i in known_ids]
    unknown_ids = np.array([i for i in range(no_games) if i not in known_ids])
#     print(unknown_ids)
    ratings = model.predict(user_id, unknown_ids, item_features = item_features, num_threads = 8)
    games_ids = np.argsort(ratings)[::-1][:N]
#     print(games_ids)
    top_N = []
    for idx in games_ids:
        idx = unknown_ids[idx]
#         print(idx)
        top_N.append(list(games_mapping.keys())[list(games_mapping.values()).index(idx)])
    return top_N
    
return_top_N(5, "- v -", features_model, dataset.mapping()[0], dataset.mapping()[2], item_features_id, known.groupby('bgg_user_name')['bgg_id'].apply(list))

[13, 36218, 1927, 30549, 40692]

In [181]:
cleaned_df[cleaned_df['bgg_user_name'] == "- v -"]['bgg_id'].values

array([  181,   320,   478,   822,  1917,  2281,  2338,  3076,  8207,
        9446, 11159, 11161])

In [134]:
dataset.mapping()[0]

{' mycroft': 0,
 '- v -': 1,
 '-=yod@=-': 2,
 '-grizzly-': 3,
 '-johnny-': 4,
 '-loren-': 5,
 '-lucas-': 6,
 '-mide-': 7,
 '-mik-': 8,
 '-morphling-': 9,
 '-pj-': 10,
 '-snarf-': 11,
 '-toni-': 12,
 '-xxx-': 13,
 '...hammer': 14,
 '.::clarté::.': 15,
 '.jck.': 16,
 '0 1 1 2 3 5 8': 17,
 '000ryuk000': 18,
 '000undo000': 19,
 '0010': 20,
 '007_ulf': 21,
 '007alex': 22,
 '007design': 23,
 '007king_kong': 24,
 '007mrbond': 25,
 '007poptart': 26,
 '00arak00': 27,
 '00bogey': 28,
 '00daniel00': 29,
 '00matej00': 30,
 '00mccracken': 31,
 '00schneider': 32,
 '00skip': 33,
 '00vito': 34,
 '00zaphod': 35,
 '00zero': 36,
 '010dennis': 37,
 '01151125': 38,
 '015599m': 39,
 '01davie': 40,
 '01hejazi': 41,
 '01josepe': 42,
 '01lwilliams': 43,
 '01neo01': 44,
 '01schafi': 45,
 '0201strong': 46,
 '020907': 47,
 '030caylus': 48,
 '0321': 49,
 '041744': 50,
 '0444287': 51,
 '0447603': 52,
 '0492372665': 53,
 '051276': 54,
 '07734': 55,
 '07770': 56,
 '07cbdj': 57,
 '0815spieler': 58,
 '08nienhl': 59,
 '

In [None]:
from tqdm import tqdm
N = 10
recommendations_df = pd.DataFrame()
known_grouped = known.groupby('bgg_user_name')['bgg_id'].apply(list)
for user in tqdm(dataset.mapping()[0].keys()):
    rec = return_top_N(N, user, features_model, dataset.mapping()[0], dataset.mapping()[2], item_features_id, known_grouped)
    d = {'bgg_user_name': N*[user], 'bgg_id': rec}
    d = pd.DataFrame(data=d)
    recommendations_df = recommendations_df.append(d)

recommendations_df

  0%|          | 870/247375 [01:46<8:11:48,  8.35it/s] 

In [None]:
recommendations_df.to_csv('../lightfm_top5_recommendations.csv.gz', compression='gzip')

In [None]:
recommendations_df

In [101]:
return_top_N(N, "- v -", features_model, dataset.mapping()[0], dataset.mapping()[2])

[1406, 10630, 103885, 25417, 15987]

## Additional potentially usefull stuff

In [182]:
# https://github.com/lyst/lightfm/issues/394
def get_similar_tags(model, tag_id):
    # Define similarity as the cosine of the angle
    # between the tag latent vectors

    # Normalize the vectors to unit length
    tag_embeddings = (model.item_embeddings.T
                      / np.linalg.norm(model.item_embeddings, axis=1)).T
    query_embedding = tag_embeddings[tag_id]
    similarity = np.dot(tag_embeddings, query_embedding)
    most_similar = np.argsort(-similarity)[0:10]
    print(most_similar)
    return most_similar


for tag in ['Animals:1089']:
    tag_id = features_list.index(tag)
    get_similar_tags(features_model, tag_id)

[   31   111   876   531  1037    20    19   186  8790 17631]


In [None]:
train_recall = recall_at_k(model, interactions, k=5, num_threads=8).mean()
print('Recall: train %.2f' % (train_recall))

test_recall = recall_at_k(model, test_interactions, train_interactions = interactions, k=5, num_threads=8).mean()
print('Recall: test %.2f' % (test_recall))

In [None]:
train_auc = auc_score(model, interactions, num_threads=8).mean()
print('AUC: train %.2f' % (train_auc))

test_auc = auc_score(model, test_interactions, train_interactions = interactions, num_threads=8).mean()
print('AUC: test %.2f' % (test_auc))

In [None]:
train_recip = reciprocal_rank(model, interactions, num_threads=8).mean()
print('Reciprocal rank: train %.2f' % (train_recip))

test_recip = reciprocal_rank(model, test_interactions, train_interactions = interactions, num_threads=8).mean()
print('Reciprocal rank: test %.2f' % (test_recip))