# Neural Collaborative Filtering (NCF)
References:
* https://github.com/microsoft/recommenders 
* https://towardsdatascience.com/neural-collaborative-filtering-96cef1009401

In [63]:
import sys
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import random
import time
from tqdm.notebook import tqdm


import tensorflow as tf
from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp


In [70]:
## Load the Goodreads interactions.
## TESTING MODE: only the first MAX_ROWS rows of the file are read, so the whole CSV
## never has to be loaded into memory. Set MAX_ROWS to None to read every interaction.
## (To work on a random sample instead, load the full file and use data.sample(n=...).)
MAX_ROWS = 500000
MIN_RATINGS_PER_USER = 10   # users with fewer rated reviews than this are dropped
MIN_RATING = 3              # ratings below this are not kept as interactions
data = pd.read_csv("source/supporting/goodreads_interactions.csv", nrows=MAX_ROWS)

## Focus on read and reviewed only i.e., remove is_reviewed == 0 and rating == 0
data = data[(data['is_reviewed'] == 1) & (data['rating'] != 0)]

## Focus on users that have at least MIN_RATINGS_PER_USER book ratings.
## `data` is already restricted to reviewed, rated rows, so it can be counted directly.
ratings_per_user = data.groupby('user_id').size().reset_index(name='number_of_ratings_per_user')
active_users = ratings_per_user.loc[
    ratings_per_user['number_of_ratings_per_user'] >= MIN_RATINGS_PER_USER, 'user_id'
]
data = data[data['user_id'].isin(active_users)]

## Focus on ratings >= MIN_RATING
## NOTE: the per-user count above is taken before this filter, so a kept user may end up
## with fewer than MIN_RATINGS_PER_USER rows here.
## NOTE: in the future could use no rating - rating 2 as negative data?
data = data[data['rating'] >= MIN_RATING]

## Keep only the columns needed downstream (drops the is_read / is_reviewed flags)
data = data[['user_id', 'book_id', 'rating']]

# Basic statistics about the dataset
print(f"Total number of entries = {len(data)}")
print(f"Total number of unique users = {len(set(data['user_id']))}")
print(f"Total number of unique books = {len(set(data['book_id']))}")


# Perform an 80/20 train-test split on the interactions in the dataset.
# The split runs on the raw value array, so both frames are rebuilt with a fresh 0..n-1 index.
train, test = train_test_split(data.values, test_size=0.2, random_state=17)
train_df = pd.DataFrame(train, columns=data.columns)
test_df = pd.DataFrame(test, columns=data.columns)
print("Train Size  : ", len(train_df))
print("Test Size : ", len(test_df))

## Relabel IDs of the train set for both user and book ids (encoders are fitted on train only)
le_user = pp.LabelEncoder()
le_item = pp.LabelEncoder()
train_df['user_id_idx'] = le_user.fit_transform(train_df['user_id'].values)
train_df['book_id_idx'] = le_item.fit_transform(train_df['book_id'].values)

## Keep only test rows whose user AND book both appear in the train set; the fitted
## encoders cannot transform ids they have never seen.
## .copy() makes the filtered frame independent, so the column assignments below write
## to a real frame instead of a slice (avoids pandas' SettingWithCopyWarning).
in_train = (
    test_df['user_id'].isin(train_df['user_id'].unique())
    & test_df['book_id'].isin(train_df['book_id'].unique())
)
test_df = test_df[in_train].copy()

## Relabel IDs for test set for both user and book ids
test_df['user_id_idx'] = le_user.transform(test_df['user_id'].values)
test_df['book_id_idx'] = le_item.transform(test_df['book_id'].values)

## Preparing for Microsoft's input format -- a timestamp column is added (constant 0)
## even though the timestamp isn't used here.
## rename/assign return new frames, so nothing is mutated in place.
ncf_column_names = {"user_id_idx": "userID", "book_id_idx": "itemID"}
train_df = (
    train_df[['user_id_idx', 'book_id_idx', 'rating']]
    .rename(columns=ncf_column_names)
    .assign(timestamp=0)
)
test_df = (
    test_df[['user_id_idx', 'book_id_idx', 'rating']]
    .rename(columns=ncf_column_names)
    .assign(timestamp=0)
)

## Print number of unique users and books in the train set after filtering
n_users = train_df['userID'].nunique()
n_items = train_df['itemID'].nunique()
print("Number of Unique Users : ", n_users)
print("Number of unique Books : ", n_items)

Total number of entries = 31519
Total number of unique users = 398
Total number of unique books = 24542
Train Size  :  25215
Test Size :  6304
Number of Unique Users :  398
Number of unique Books :  20177


In [71]:
## SETUP
## Hyper-parameters and evaluation settings shared by the cells below.
args = dict(
    n_layers=16,       # passed to the model as layer_sizes=[n_layers], i.e. a single entry of 16
    n_factors=4,       # passed to the model as n_factors
    epochs=50,         # passed to the model as n_epochs
    lr=0.005,          # passed to the model as learning_rate
    batch_size=1024,   # passed to the model as batch_size
    K=20,              # cut-off used for the @K ranking metrics and the top-K listing
    seed=17,           # seed handed to both the dataset and the model
)

In [72]:
def recommendTopKAll():
    """Score every user/item combination from the train set and keep the unseen pairs.

    Reads the notebook-level ``model`` and ``train_df``. For each unique
    ``userID`` in ``train_df`` the model is asked for a prediction against
    every unique ``itemID`` in ``train_df``. The scores are outer-merged with
    ``train_df`` and only rows whose ``rating`` is null after the merge are
    kept, i.e. pairs that have no rated row in ``train_df``.

    Prints the elapsed time measured by ``Timer``.

    Returns:
        pandas.DataFrame with columns ``userID``, ``itemID`` and ``prediction``.
    """
    with Timer() as test_time:
        candidate_items = list(train_df.itemID.unique())
        n_candidates = len(candidate_items)
        scored = {"userID": [], "itemID": [], "prediction": []}
        for uid in train_df.userID.unique():
            # The model is queried with parallel lists, so repeat the user once per item.
            repeated_uid = [uid] * n_candidates
            scored["userID"].extend(repeated_uid)
            scored["itemID"].extend(candidate_items)
            scored["prediction"].extend(
                list(model.predict(repeated_uid, candidate_items, is_list=True))
            )
        all_predictions = pd.DataFrame(data=scored)
        merged = pd.merge(train_df, all_predictions, on=["userID", "itemID"], how="outer")
        # Rows that matched a train interaction carry its rating; the rest have a null rating.
        unseen = merged[merged.rating.isnull()]
        all_predictions = unseen.drop(['rating','timestamp'], axis=1)
    print("Took {} seconds for prediction.".format(test_time))
    return all_predictions

def recommendTopK(predictions, train_df, test_df, userID=0, K=20):
    """Return the K highest-scored predictions for one user, flagged against train/test.

    Args:
        predictions: DataFrame with at least ``userID``, ``itemID`` and
            ``prediction`` columns.
        train_df: DataFrame with ``userID`` and ``itemID`` columns (train interactions).
        test_df: DataFrame with ``userID`` and ``itemID`` columns (test interactions).
        userID: the user whose recommendations are returned.
        K: maximum number of rows to return.

    Returns:
        A new DataFrame holding the user's rows of ``predictions`` sorted by
        ``prediction`` (descending) and cut to the first ``K``, with a fresh
        0..n-1 index and two extra boolean columns:
        ``inTrain`` - the item appears among the user's rows in ``train_df``;
        ``inTest``  - the item appears among the user's rows in ``test_df``.
        A user with no rows in ``train_df`` / ``test_df`` simply gets ``False``
        in the corresponding column, and a user with no predictions yields an
        empty frame.
    """
    top_k = (
        predictions[predictions['userID'] == userID]
        .sort_values('prediction', ascending=False)
        .iloc[:K]
        .reset_index(drop=True)
    )
    # Select the user's items directly: an empty selection is fine here, whereas the
    # former groupby(...).apply(list).iloc[0] raised IndexError for users without rows.
    items_in_train = train_df.loc[train_df['userID'] == userID, 'itemID']
    items_in_test = test_df.loc[test_df['userID'] == userID, 'itemID']
    # Vectorised membership test instead of a per-row lambda over a Python list.
    top_k['inTrain'] = top_k['itemID'].isin(items_in_train)
    top_k['inTest'] = top_k['itemID'].isin(items_in_test)
    return top_k

In [73]:
## Write both splits to CSV, each sorted by userID, and build the NCF dataset from the files.
## NOTE(review): presumably NCFDataset expects user-sorted input files -- confirm against
## the recommenders documentation.
train_file = "./source/supporting/NGCFtrain.csv"
test_file = "./source/supporting/NGCFtest.csv"
for split_df, split_path in ((train_df, train_file), (test_df, test_file)):
    split_df.sort_values("userID").to_csv(split_path, index=False)
data_ = NCFDataset(
    train_file=train_file,
    test_file=test_file,
    seed=args['seed'],
)

INFO:recommenders.models.ncf.dataset:Indexing ./source/supporting/NGCFtrain.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ./source/supporting/NGCFtest.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ./source/supporting/NGCFtest_full.csv ...


In [74]:
## Build the NeuMF variant of NCF; user/item counts come from the indexed dataset and
## the remaining hyper-parameters from `args`.
ncf_params = dict(
    n_users=data_.n_users,
    n_items=data_.n_items,
    model_type="NeuMF",
    n_factors=args['n_factors'],
    layer_sizes=[args['n_layers']],   # a one-element list built from n_layers
    n_epochs=args['epochs'],
    batch_size=args['batch_size'],
    learning_rate=args['lr'],
    verbose=10,
    seed=args['seed'],
)
model = NCF(**ncf_params)



In [75]:
## Fit the model on the NCF dataset and report how long training took.
with Timer() as train_time:
    model.fit(data_)
training_report = "Took {} seconds for training."
print(training_report.format(train_time))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [2.78s]: train_loss = 0.213595 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 20 [2.79s]: train_loss = 0.155225 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 30 [2.97s]: train_loss = 0.125712 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 40 [3.01s]: train_loss = 0.112959 
INFO:recommenders.models.ncf.ncf_singlenode:Epoch 50 [3.49s]: train_loss = 0.103548 


Took 161.0640 seconds for training.


In [76]:
## Score all unseen user/item pairs, then evaluate the ranking against the test set.
all_predictions = recommendTopKAll()

## All four ranking metrics share the same prediction column and cut-off K.
ranking_kwargs = dict(col_prediction='prediction', k=args['K'])
eval_map = map_at_k(test_df, all_predictions, **ranking_kwargs)
eval_ndcg = ndcg_at_k(test_df, all_predictions, **ranking_kwargs)
eval_precision = precision_at_k(test_df, all_predictions, **ranking_kwargs)
eval_recall = recall_at_k(test_df, all_predictions, **ranking_kwargs)

print(
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep='\n',
)

Took 17.0902 seconds for prediction.
MAP:	0.010273
NDCG:	0.019462
Precision@K:	0.005714
Recall@K:	0.024344


In [85]:
## Show the top-K recommendations for userID 3, flagged by whether each item is in train/test.
recommendTopK(
    all_predictions,
    train_df,
    test_df,
    userID=3,
    K=args['K'],
)

Unnamed: 0,userID,itemID,prediction,inTrain,inTest
0,3,1275,0.999999,False,True
1,3,3071,0.999989,False,False
2,3,3323,0.999989,False,False
3,3,2844,0.999985,False,False
4,3,2858,0.999976,False,False
5,3,6439,0.999887,False,False
6,3,1343,0.999835,False,False
7,3,1549,0.999686,False,False
8,3,2530,0.999213,False,False
9,3,15130,0.998712,False,False
