# Neural Collaborative Filtering (NCF)
Neural Matrix Factorization (NeuMF) combines Generalized Matrix Factorization (GMF) with deep neural networks to model user-item interactions. The model is trained on implicit feedback data and predicts the likelihood of a user interacting with an item.

Based on https://github.com/recommenders-team/recommenders/blob/main/examples/02_model_collaborative_filtering/ncf_deep_dive.ipynb

In [1]:
# General imports
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from utils.timer import Timer
from model.ncf_singlenode import NCF
from model.dataset import Dataset as NCFDataset
from dataset.splitters import python_chrono_split
from evaluation.evaluation import (
    map, ndcg_at_k, precision_at_k, recall_at_k
)
from utils.constants import SEED
from utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

System version: 3.10.16 | packaged by conda-forge | (main, Dec  5 2024, 14:07:43) [MSC v.1942 64 bit (AMD64)]
Pandas version: 2.2.3
Tensorflow version: 2.19.0


In [2]:
items_df = pd.read_csv('../data/items.csv')
ratings_df = pd.read_csv('../data/ratings.csv')
df = pd.merge(ratings_df, items_df, on='movie_id')
# keep only required columns
df = df[['user_id', 'movie_id', 'rating', 'unix_timestamp', 'title']]
print("number of unique users: ", df['user_id'].nunique())
print("number of unique movies: ", df['movie_id'].nunique())
print("number of ratings: ", len(df))
df.head()

number of unique users:  943
number of unique movies:  1682
number of ratings:  102295


Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [3]:
# In some cases a user might have rated the same movie multiple times. We will keep the latest rating and remove the rest.
df = df.sort_values(by='unix_timestamp', ascending=False).drop_duplicates(subset=['user_id', 'movie_id'], keep='first')
print("number of ratings after removing duplicates: ", len(df))
df.head()

number of ratings after removing duplicates:  100000


Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
65803,729,748,4,893286638,"Saint, The (1997)"
81032,729,272,4,893286638,Good Will Hunting (1997)
47830,729,689,4,893286638,"Jackal, The (1997)"
74694,729,313,3,893286638,Titanic (1997)
53320,729,333,4,893286638,"Game, The (1997)"


In [4]:
# list of all users
unique_users = df['user_id'].unique() 
# creating a list of all movie names in it
unique_movies = df['movie_id'].unique()

In [5]:
# model parameters
EPOCHS = 100
BATCH_SIZE = 256

In [6]:
train, test = python_chrono_split(df, 0.75)

In [7]:
train.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,title
94620,1,172,5,874965478,"Empire Strikes Back, The (1980)"
61344,1,168,5,874965478,Monty Python and the Holy Grail (1974)
76298,1,165,5,874965518,Jean de Florette (1986)
49312,1,156,4,874965556,Reservoir Dogs (1992)
16120,1,196,5,874965677,Dead Poets Society (1989)


In [8]:
# Filter out any users or items in the test set that do not appear in the training set
train_users = train['user_id'].unique()
train_movies = train['movie_id'].unique()
test = test[(test['user_id'].isin(train_users)) & (test['movie_id'].isin(train_movies))]
print("train shape: ", train.shape)
print("test shape: ", test.shape)

train shape:  (74992, 5)
test shape:  (24891, 5)


In [9]:
# Create a test set containing the last interaction for each user as for the leave-one-out evaluation.
leave_one_out_test = test.groupby('user_id').last().reset_index()

In [10]:
# Write datasets to csv files
train_file = "./data/train.csv"
test_file = "./data/test.csv"
leave_one_out_test_file = "./data/leave_one_out_test.csv"
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)
leave_one_out_test.to_csv(leave_one_out_test_file, index=False)

In [11]:
data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True)

INFO:model.dataset:Indexing ./data/train.csv ...
INFO:model.dataset:Indexing ./data/leave_one_out_test.csv ...
INFO:model.dataset:Creating full leave-one-out test file ./data/leave_one_out_test_full.csv ...
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return bound(*args, **kwds)
  return boun

The NCF has a lot of parameters. The most important ones are:
- n_factors, which controls the dimension of the latent space. Usually, the quality of the training set predictions grows with as n_factors gets higher.
- layer_sizes, sizes of input layer (and hidden layers) of MLP, input type is list.
- n_epochs, which defines the number of iteration of the SGD procedure. Note that both parameter also affect the training time.
- model_type, we can train single "MLP", "GMF" or combined model "NCF" by changing the type of model.

We will here set n_factors to 4, layer_sizes to [16,8,4], n_epochs to 100, batch_size to 256. To train the model, we simply need to call the fit() method.

In [12]:
# Train NCF based on Tensorflow
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)



In [13]:
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

INFO:model.ncf_singlenode:Epoch 10 [1.84s]: train_loss = 0.255855 
INFO:model.ncf_singlenode:Epoch 20 [1.88s]: train_loss = 0.244323 
INFO:model.ncf_singlenode:Epoch 30 [1.91s]: train_loss = 0.238583 
INFO:model.ncf_singlenode:Epoch 40 [1.80s]: train_loss = 0.233727 
INFO:model.ncf_singlenode:Epoch 50 [1.81s]: train_loss = 0.229628 
INFO:model.ncf_singlenode:Epoch 60 [1.81s]: train_loss = 0.227037 
INFO:model.ncf_singlenode:Epoch 70 [1.80s]: train_loss = 0.225190 
INFO:model.ncf_singlenode:Epoch 80 [1.89s]: train_loss = 0.224051 
INFO:model.ncf_singlenode:Epoch 90 [1.88s]: train_loss = 0.223083 
INFO:model.ncf_singlenode:Epoch 100 [1.81s]: train_loss = 0.222241 


Took 184.11060639994685 seconds for training.


In [14]:
# Prediction
predictions = [[row.user_id, row.movie_id, model.predict(row.user_id, row.movie_id)]
               for (_, row) in test.iterrows()]


predictions = pd.DataFrame(predictions, columns=['user_id', 'movie_id', 'prediction'])
predictions.head()

Unnamed: 0,user_id,movie_id,prediction
0,1,88,0.570681
1,1,149,0.01789
2,1,239,0.787535
3,1,101,0.373325
4,1,110,0.006269


In [15]:
# Generic Evaluation
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.movie_id.unique())
    for user in train.user_id.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"user_id": users, "movie_id":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["user_id", "movie_id"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 1.6817080000182614 seconds for prediction.


In [16]:
TOP_K = 10
eval_map = map(test, all_predictions, col_prediction='prediction', k=TOP_K) # Mean Average Precision
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K) # Normalized Discounted Cumulative Gain
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K) # Precision at K
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K) # Recall at K

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.048337
NDCG:	0.198426
Precision@K:	0.180170
Recall@K:	0.101736


In [17]:
# Record results for tests - ignore this cell
store_metadata("map", eval_map)
store_metadata("ndcg", eval_ndcg)
store_metadata("precision", eval_precision)
store_metadata("recall", eval_recall)

In [18]:
# Recommend for user
rand_user = np.random.choice(train.user_id.unique(), 1)[0]
user_predictions = all_predictions[all_predictions.user_id == rand_user]
top_movies_for_user = user_predictions.sort_values(by='prediction', ascending=False).head(10)
print("Top 10 movies for user id {}: ".format(rand_user))
print(top_movies_for_user[['movie_id', 'prediction']].to_string(index=False)) 

Top 10 movies for user id 41: 
 movie_id  prediction
      127    0.936399
      483    0.930977
      172    0.911197
       79    0.909881
       89    0.901455
       22    0.890720
      183    0.870665
      134    0.848151
      234    0.844848
      176    0.838546


In [19]:
# Leave one out evaluation with Hit ratio & NDCG
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    output = np.squeeze(output)
    rank = sum(output >= output[0])
    if rank <= k:
        ndcgs.append(1 / np.log(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)

HR:	0.520679
NDCG:	0.400674


In [20]:
# Save Model
model.save("ncf_model")

In [32]:
# Save parameters with pickle
import pickle

with open("ncf_params.pkl", "wb") as f:
    pickle.dump({
        "n_users": data.n_users,
        "n_items": data.n_items,
        "model_type": model.model_type,
        "n_factors": model.n_factors,
        "layer_sizes": model.layer_sizes,
        "n_epochs": model.n_epochs,
        "batch_size": model.batch_size,
        "learning_rate": model.learning_rate,
        "seed": SEED
    }, f)
# Save predictions
predictions.to_csv("ncf_predictions.csv", index=False)


In [33]:
# Load parameters from pickle
with open("ncf_params.pkl", "rb") as f:
    params = pickle.load(f)
print("Loaded parameters: ", params)

Loaded parameters:  {'n_users': 943, 'n_items': 1597, 'model_type': 'neumf', 'n_factors': 4, 'layer_sizes': [16, 8, 4], 'n_epochs': 100, 'batch_size': 256, 'learning_rate': 0.001, 'seed': 42}


In [35]:
# Load model and data
data = NCFDataset(train_file=train_file)
loaded_model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type=params['model_type'],
    n_factors=params['n_factors'],
    layer_sizes=params['layer_sizes'],
    n_epochs=params['n_epochs'],
    batch_size=params['batch_size'],
    learning_rate=params['learning_rate'],
    verbose=10,
    seed=params['seed']
)
loaded_model.set_dict(data)
loaded_model.load(neumf_dir="ncf_model")

# Recommend for user
rand_user = np.random.choice(train.user_id.unique(), 1)[0]
items = list(train.movie_id.unique())

# Get predictions of all movies for the user
all_predictions = pd.DataFrame(data={"movie_id": [], "prediction": []})
for movie in items:
    prediction = loaded_model.predict(rand_user, movie)
    all_predictions.loc[-1] = [movie, prediction]

# Sort predictions and get top 10 products for the user
top_products_for_user = user_predictions.sort_values(by='prediction', ascending=False).head(10)
print("Top 10 products for user id {}: ".format(rand_user))
print(top_products_for_user[['movie_id', 'prediction']].to_string(index=False)) 

INFO:model.dataset:Indexing ./data/train.csv ...


Top 10 products for user id 103: 
 movie_id  prediction
      127    0.936399
      483    0.930977
      172    0.911197
       79    0.909881
       89    0.901455
       22    0.890720
      183    0.870665
      134    0.848151
      234    0.844848
      176    0.838546
