In [11]:
import sys
import os
import shutil
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
import cornac
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets.python_splitters import python_random_split
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.utils.constants import SEED as DEFAULT_SEED


# Report the interpreter and key library versions used for this run.
print(f"System version: {sys.version}")
print(f"Pandas version: {pd.__version__}")
print(f"Tensorflow version: {tf.__version__}")

System version: 3.8.10 (default, Sep 28 2021, 16:10:42) 
[GCC 9.3.0]
Pandas version: 1.5.1
Tensorflow version: 2.7.4


In [12]:
# top k items to recommend
TOP_K = 10

# NOTE(review): the MovieLens size selector (100k, 1m, 10m, or 20m) mentioned
# here has no accompanying setting in this cell; the data-loading cell reads
# files under ../data/ instead — confirm whether this option is still needed.

# Model parameters
EPOCHS = 100  # passed to NCF as n_epochs
BATCH_SIZE = 256  # passed to NCF as batch_size

SEED = DEFAULT_SEED  # Set None for non-deterministic results

In [13]:
def _read_interactions(path):
    """Read a tab-separated ``user<TAB>item<TAB>rating`` file into a DataFrame.

    Args:
        path: Location of the text file. Every non-blank line must hold
            exactly three tab-separated fields.

    Returns:
        pd.DataFrame with columns ``userID`` (int), ``itemID`` (int) and
        ``rating`` (the raw text read from the file).

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            fields, or its user/item field is not an integer.
    """
    rows = []
    with open(path, 'r') as infile:
        # Iterate the file lazily rather than loading it whole via readlines().
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            user, item, rating = line.split('\t')
            # NOTE(review): rating is intentionally left as text so the frame
            # dtypes seen by the following cells do not change; consider
            # parsing it with float() if numeric ratings are needed.
            rows.append([int(user), int(item), rating])
    return pd.DataFrame(data=rows, columns=["userID", "itemID", "rating"])


# Full interaction set plus the 80% train split and its matching test split.
df = _read_interactions('../data/um.dat')
train = _read_interactions('../data/um_0.8.train')
test = _read_interactions('../data/um_0.8.test')

# Cornac dataset built from the (user, item, rating) triples of the train split.
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=SEED)

In [19]:
# Order both frames by user. A stable sort keeps each user's rows in their
# original file order; the default algorithm (quicksort) gives no such
# guarantee, which would make any later "last row per user" selection
# (e.g. groupby("userID").last()) depend on an arbitrary ordering.
train = train.sort_values("userID", kind="stable")
test = test.sort_values("userID", kind="stable")

In [20]:
# Keep only the test rows whose user AND item both occur in the training data.
known_users = train["userID"].unique()
known_items = train["itemID"].unique()
test = test[test["userID"].isin(known_users) & test["itemID"].isin(known_items)]

In [21]:
# For every user, take the last value of each column within that user's group
# of test rows, then turn the userID group key back into a regular column.
last_per_user = test.groupby("userID").last()
leave_one_out_test = last_per_user.reset_index()

In [22]:
# Destination paths for the CSV copies of the three frames.
train_file = "../data/train.csv"
test_file = "../data/test.csv"
leave_one_out_test_file = "../data/leave_one_out_test.csv"

# Write each frame to its path, omitting the DataFrame index column.
for frame, csv_path in (
    (train, train_file),
    (test, test_file),
    (leave_one_out_test, leave_one_out_test_file),
):
    frame.to_csv(csv_path, index=False)

In [23]:
# Build the NCF dataset from the train CSV and the leave-one-out test CSV.
# overwrite_test_file_full=True asks the dataset to (re)create its "full"
# leave-one-out test file, as the log output of this cell reports.
data = NCFDataset(
    train_file=train_file,
    test_file=leave_one_out_test_file,
    seed=SEED,
    overwrite_test_file_full=True,
)

INFO:recommenders.models.ncf.dataset:Indexing ../data/train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ../data/leave_one_out_test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file ../data/leave_one_out_test_full.csv ...
100%|██████████████████████████████████████| 2818/2818 [00:07<00:00, 402.05it/s]
INFO:recommenders.models.ncf.dataset:Indexing ../data/leave_one_out_test_full.csv ...


In [24]:
# Hyperparameters for the NCF model; user/item counts come from the dataset,
# epochs/batch size/seed from the configuration constants.
ncf_params = dict(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED,
)
model = NCF(**ncf_params)



In [25]:
# Fit the model on the NCF dataset while measuring the elapsed time.
with Timer() as train_time:
    model.fit(data)

print(f"Took {train_time.interval} seconds for training.")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [22.58s]: train_loss = 0.256993 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [25.39s]: train_loss = 0.243091 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [22.44s]: train_loss = 0.236316 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [20.71s]: train_loss = 0.232144 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [23.42s]: train_loss = 0.229129 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 60 [21.77s]: train_loss = 0.227550 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 70 [20.50s]: train_loss = 0.225747 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 80 [21.17s]: train_loss = 0.224581 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 90 [24.87s]: train_loss = 0.223091 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 100 [22.55s]: train_loss = 0.223013 


Took 2241.513431094587 seconds for training.


In [26]:
# Score every (user, item) pair of the test frame individually.
# itertuples() is used instead of iterrows(): it does not build a Series per
# row (much faster) and it keeps each column's own dtype, so the integer IDs
# can never be coerced to floats by a numeric neighbour column.
predictions = [
    [row.userID, row.itemID, model.predict(row.userID, row.itemID)]
    for row in test.itertuples(index=False)
]

predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,1,2111,0.488377
1,1,7241,0.926073
2,1,386,0.635497
3,2,4045,0.81274
4,2,9055,0.932129


In [27]:
with Timer() as test_time:

    # Score every training user against every training item, one user at a time.
    candidate_items = list(train.itemID.unique())
    users, items, preds = [], [], []
    for uid in train.userID.unique():
        repeated_user = [uid] * len(candidate_items)
        users += repeated_user
        items += candidate_items
        preds += list(model.predict(repeated_user, candidate_items, is_list=True))

    scored_pairs = pd.DataFrame(data={"userID": users, "itemID": items, "prediction": preds})

    # Outer-join with train: pairs that carry no rating after the join were not
    # part of the training data, and only those are kept as candidates.
    merged = pd.merge(train, scored_pairs, on=["userID", "itemID"], how="outer")
    unseen_mask = merged["rating"].isna()
    all_predictions = merged[unseen_mask].drop(columns='rating')

print(f"Took {test_time.interval} seconds for prediction.")

Took 103.4089816249907 seconds for prediction.


In [28]:
TOP_K = 10
# Keyword arguments shared by all four ranking metrics at this cut-off.
ranking_kwargs = dict(col_prediction='prediction', k=TOP_K)
eval_map = map_at_k(test, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions, **ranking_kwargs)

# One metric per line, each with six decimal places.
print(
    f"MAP:\t{eval_map:f}",
    f"NDCG:\t{eval_ndcg:f}",
    f"Precision@K:\t{eval_precision:f}",
    f"Recall@K:\t{eval_recall:f}",
    sep='\n',
)

MAP:	0.034902
NDCG:	0.140789
Precision@K:	0.115295
Recall@K:	0.076867


In [29]:
TOP_K = 20
# Keyword arguments shared by all four ranking metrics at this cut-off.
ranking_kwargs = dict(col_prediction='prediction', k=TOP_K)
eval_map = map_at_k(test, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test, all_predictions, **ranking_kwargs)
eval_precision = precision_at_k(test, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test, all_predictions, **ranking_kwargs)

# One metric per line, each with six decimal places.
print(
    f"MAP:\t{eval_map:f}",
    f"NDCG:\t{eval_ndcg:f}",
    f"Precision@K:\t{eval_precision:f}",
    f"Recall@K:\t{eval_recall:f}",
    sep='\n',
)

MAP:	0.043182
NDCG:	0.143988
Precision@K:	0.098776
Recall@K:	0.129419
