In [1]:
import pandas as pd
import numpy as np
from gmf import GMFEngine
from mlp import MLPEngine
from neumf import NeuMFEngine
from data import SampleGenerator
import utils
import torch

In [2]:
gmf_config = {'alias': 'gmf_factor8neg4-implict',
              'num_epoch': 200,
              'batch_size': 1024,
              # 'optimizer': 'sgd',
              # 'sgd_lr': 1e-3,
              # 'sgd_momentum': 0.9,
              # 'optimizer': 'rmsprop',
              # 'rmsprop_lr': 1e-3,
              # 'rmsprop_alpha': 0.99,
              # 'rmsprop_momentum': 0,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': 6040,
              'num_items': 3706,
              'latent_dim': 8,
              'num_negative': 4,
              'l2_regularization': 0.001, # 0.01
              'use_cuda': False,
              'device_id': 0,
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}


In [3]:
mlp_config = {'alias': 'mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
              'num_epoch': 200,
              'batch_size': 256,  # 1024,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': 6040,
              'num_items': 3706,
              'latent_dim': 8,
              'num_negative': 4,
              'layers': [16,64,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
              'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
              'use_cuda': False,
              'device_id': 7,
              'pretrain': True,
              'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}


In [4]:
neumf_config = {'alias': 'pretrain_neumf_factor8neg4',
                'num_epoch': 200,
                'batch_size': 1024,
                'optimizer': 'adam',
                'adam_lr': 1e-3,
                'num_users': 6040,
                'num_items': 3706,
                'latent_dim_mf': 8,
                'latent_dim_mlp': 8,
                'num_negative': 4,
                'layers': [16,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
                'l2_regularization': 0.01,
                'use_cuda': True,
                'device_id': 7,
                'pretrain': False,
                'pretrain_mf': 'checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
                'pretrain_mlp': 'checkpoints/{}'.format('mlp_factor8neg4_Epoch100_HR0.5606_NDCG0.2463.model'),
                'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'
                }


In [5]:
ml1m_dir = 'data/ml-1m/ratingTraining.dat'
ml1m_rating = pd.read_csv(ml1m_dir, sep='::', header=None, names=['uid', 'mid', 'rating', 'timestamp'],  engine='python')
# Reindex
ml1m_uwf = 'data/ml-1m/unwatchedFilms.dat'
uwflist = []
with open(ml1m_uwf) as f: uwflist =[int(i[:-1]) for i in f.readlines()]        
user_id = ml1m_rating[['uid']].drop_duplicates().reindex()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))
# DataLoader for training
sample_generator = SampleGenerator(ratings=ml1m_rating,uwf_list=uwflist)
evaluate_data = sample_generator.evaluate_data
# Specify the exact model
config = gmf_config
engine = GMFEngine(config)
state_dict = torch.load( 'checkpoints/gmf_factor8neg4-implict_Epoch199_HR0.6363_NDCG0.3678.model')
engine.model.load_state_dict(state_dict)

#utils.resume_checkpoint(engine.model, 'checkpoints/gmf_factor8neg4-implict_Epoch199_HR0.6363_NDCG0.3678.model', config['device_id'])
#engine.model = torch.load('checkpoints/gmf_factor8neg4-implict_Epoch199_HR0.6363_NDCG0.3678.model')
hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=0)
engine.save(config['alias'], 0, hit_ratio, ndcg)

Range of userId is [0, 5420]
Range of itemId is [0, 3350]
    userId  itemId  rating  timestamp  rank_latest
0        0       0       1  978300019         27.0
1        0       1       1  978300055         24.0
2        0       2       1  978300055         25.0
3        0       3       1  978300055         26.0
4        0       4       1  978300103         23.0
5        0       5       1  978300172         22.0
6        0       6       1  978300275         21.0
7        0       7       1  978300719         19.0
8        0       8       1  978300719         20.0
9        0       9       1  978300760         16.0
10       0      10       1  978300760         17.0
11       0      11       1  978300760         18.0
12       0      12       1  978301368         15.0
13       0      13       1  978301398         14.0
14       0      14       1  978301570         13.0
15       0      15       1  978301590         12.0
16       0      16       1  978301619         11.0
17       0      17      

In [7]:
2710500/5421

500.0

In [7]:
from metrics import MetronAtK
_metron = MetronAtK(top_k=10)
engine.model.eval()
with torch.no_grad():
    test_users, test_items = evaluate_data[0], evaluate_data[1]
    negative_users, negative_items = evaluate_data[2], evaluate_data[3]
    test_scores = engine.model(test_users, test_items)
    negative_scores = engine.model(negative_users, negative_items)
    _metron.subjects = [test_users.data.view(-1).tolist(),
                                 test_items.data.view(-1).tolist(),
                                 test_scores.data.view(-1).tolist(),
                                 negative_users.data.view(-1).tolist(),
                                 negative_items.data.view(-1).tolist(),
                                 negative_scores.data.view(-1).tolist()]


(2710500, 3)
(2710500, 3)
5421


In [9]:
import math 

full,top_k = _metron._subjects,_metron._top_k
top_k = full[full['rank']<=top_k]
score = 0.0
print(top_k.iloc[:30])
print(top_k.shape)
_test_items = { d['user'].iloc[0]:d['item'].to_list() for i,d in top_k.groupby('user')}
score = sum([sum(d[d['item'].isin(_test_items[d['user'].iloc[0]])]['rank'].apply(lambda x: math.log(2) / math.log(1 + x)).to_list()) for i,d in top_k.groupby('user')])

         user  item     score  rank
2683399     0    26  0.955286   1.0
2683396     0    22  0.954951   2.0
51          0    84  0.784121   3.0
77          0   400  0.749840   4.0
2683398     0    25  0.731213   5.0
87          0   538  0.718056   6.0
16          0  1181  0.543746   7.0
2683395     0    21  0.456222   8.0
19          0   415  0.440439   9.0
15          0   597  0.402159  10.0
572         1   136  0.899110   1.0
584         1   115  0.825122   2.0
548         1    93  0.756404   3.0
531         1  1154  0.714274   4.0
575         1   799  0.685619   5.0
2683401     1    47  0.629828   6.0
547         1  1022  0.568471   7.0
507         1  1092  0.567496   8.0
504         1   162  0.541935   9.0
588         1   534  0.527291  10.0
1024        2   264  0.881943   1.0
1058        2   214  0.873100   2.0
1005        2   277  0.828644   3.0
2683406     2    68  0.694055   4.0
1013        2   843  0.685498   5.0
1036        2   743  0.645211   6.0
991         2   120  0.64103

In [19]:
score

24630.635171778773

In [8]:
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data
# Specify the exact model
config = gmf_config
engine = GMFEngine(config)
# config = mlp_config
# engine = MLPEngine(config)
# config = neumf_config
# engine = NeuMFEngine(config)

TypeError: __init__() missing 1 required positional argument: 'uwf_list'

In [None]:
score_1 = {d['user']:len(d[(d['item'].isin(self._test_items[d['user'].iloc[0]]))& (d['ratings']==1.0)]) for i,d in top_k.groupby('user')]
score_2 = {d['user']:len(d[(d['item'].isin(self._test_items[d['user'].iloc[0]]))& (d['ratings']==0.0)]) for i,d in top_k.groupby('user')]           