In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

%load_ext autoreload
%autoreload 2

In [3]:
print(f'TF version: {tf.__version__}')

TF version: 2.2.0


# Read Data

In [4]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'

## Amazon Fashion

In [5]:
data_path = 'data/Amazon/'
# file_name = 'Amazon_full' # file_name = 'Amazon_05_users' 
file_name = 'Amazon_01_users'

## MovieLens

In [6]:
# data_path = 'data/ML/'
# file_name = 'ML_full' # file_name = 'ML_05_users'
# file_name = 'ML_01_users'

In [7]:
df = pd.read_pickle(path + data_path + file_name)
df.user_id = df.user_id.astype('category').cat.codes
df.item_id = df.item_id.astype('category').cat.codes
df.head()

Unnamed: 0,user,item,datetime,rating,item_id,user_id
4983863,A39ZLL8ILVT2J8,B00FXSELCM,2014-03-24,3.0,104506,73226
7294092,A39ZLL8ILVT2J8,B00VDPQ884,2016-06-29,5.0,175639,73226
4809981,A39ZLL8ILVT2J8,B00EWC0W3W,2016-08-14,5.0,99224,73226
9337932,A39ZLL8ILVT2J8,B01EZKMD64,2016-10-03,5.0,238824,73226
8832820,A39ZLL8ILVT2J8,B01ABS4646,2016-12-22,5.0,222085,73226


---
# Data Prep

## Dataset Params

In [8]:
val_perc = test_perc = 0.1
n_last_items_val = n_last_items_test = 1

total_items = len(df.item_id.unique())
total_users = len(df.user_id.unique())

## Create Split

In [9]:
from Data_prep import train_val_test_split
datasets = train_val_test_split(df, val_perc, test_perc, n_last_items_val, n_last_items_test)
train_set, val_set, test_set = datasets

---
# Neural Matrix Factorisation
- paper: http://papers.www2017.com.au.s3-website-ap-southeast-2.amazonaws.com/proceedings/p173.pdf
- blog: https://medium.com/@victorkohler/collaborative-filtering-using-deep-neural-networks-in-tensorflow-96e5d41a39a1
- code: https://github.com/Leavingseason/NeuralCF/blob/master/GMF.py

# GMF

## Params

In [47]:
GMF_params = {
    'learning_rate': 0.0001,
    'batch_size': 64,
    'nolf': 16,
    'regs': [0,0],
    'epochs': 1,#20,
    'sample_size': int(0.5*len(train_set.user_id.unique())),
    'num_neg': 5,
    'ckpt_dir': '../NeuMF_storage/GMF_ckpts/ckpts'
}

## Build GMF Model

In [11]:
from Models import build_GMF_model
GMF_model = build_GMF_model(total_items=total_items,
                            total_users=total_users,
                            nolf=GMF_params['nolf'],
                            regs=GMF_params['regs'])

## Summary

In [None]:
GMF_model.summary()

## Compile

In [12]:
optimizer = tf.keras.optimizers.Adam(lr=GMF_params['learning_rate'])
GMF_model.compile(optimizer=optimizer, loss='binary_crossentropy')

---
# Train GMF Model

## Configure Checkpoints

In [13]:
GMF_ckpts_prefix = os.path.join(GMF_params['ckpt_dir'], "ckpt")
GMF_ckpts_callback = tf.keras.callbacks.ModelCheckpoint(filepath=GMF_ckpts_prefix,    
                                                         monitor='loss',    
                                                         mode='min',    
                                                         save_best_only=True,
                                                         save_weights_only=True)

## Create Samples

In [14]:
from Models import create_NeuMF_samples
print('\nCreating Samples')
GMF_u_i_l = create_NeuMF_samples(train_set, GMF_params['epochs'], GMF_params['sample_size'], GMF_params['num_neg'])
GMF_all_user_inputs, GMF_all_item_inputs, GMF_all_labels = GMF_u_i_l


Creating Samples


100% |########################################################################|


## Fit

In [15]:
from Models import neumf_train_loop

In [17]:
GMF_model, GMF_hist = neumf_train_loop(GMF_model, GMF_u_i_l, GMF_params, [GMF_ckpts_callback])


Fitting GMF with parameters:
Parameters:                                               0
learning_rate                            0.0001
batch_size                                   64
nolf                                         16
epochs                                        1
sample_size                               60686
num_neg                                       5
ckpt_dir       ../NeuMF_storage/GMF_ckpts/ckpts
Epoch: 0


## Store Weights

In [36]:
GMF_weights_path = '../NeuMF_storage/GMF_weights/GMF_weights'
GMF_model.save_weights(GMF_weights_path, overwrite=False)


# Evaluation

## Load model_weights

In [87]:
GMF_model = build_GMF_model(total_items=total_items, total_users=total_users, nolf=GMF_params['nolf'])
GMF_model.load_weights(tf.train.latest_checkpoint(GMF_params['ckpt_dir']))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x15aab5610>

## Predict within 100 sample
- sample 99 negative items
- add 1 true item to sample
- predict for sample of 100
- rank and evaluate

In [92]:
import progressbar
def predict_neumf(model, train_set, test_set, sample_len=100, rank_at=20):
    user_items = train_set.groupby('user_id')['item_id'].apply(list)
    test_user_items = test_set.groupby('user_id')['item_id'].apply(list)
    train_items = train_set.item_id.unique()

    preds_ranked = []
    true_items = []
    pbar = progressbar.ProgressBar()
    for u in pbar(test_user_items.index):
        true_item = test_user_items[u]
        pos_items = user_items[u]
        neg_items = set(train_items) - set(pos_items)
        neg_sample = np.random.choice(list(neg_items), sample_len-1)
        total_sample = np.append(neg_sample, true_item)
        user_array = np.full(len(total_sample), u, dtype='int32')

        preds = np.hstack(model.predict([user_array, np.array(total_sample)], batch_size=sample_len, verbose=0))
        ids = np.argpartition(preds, -rank_at)[-rank_at:]
        best_ids = np.argsort(preds[ids])[::-1]
        best = total_sample[ids[best_ids]]

        preds_ranked.append(best)
        true_items.append(true_item)

    ranked_df = pd.DataFrame(list(zip(test_user_items.index, preds_ranked, true_items)),
                             columns=['users', 'pred_items_ranked', 'true_id'])

    return ranked_df

## Metrics from 100 sample

In [103]:
from Evaluation import get_metrics
get_metrics(ranked_df, 5, 20, False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1333,0.109829,0.109829
1,5,2109,0.173766,0.034753
2,10,2729,0.22485,0.022485
3,15,3294,0.271401,0.018093
4,20,3799,0.31301,0.01565


## Predictions on all items
- Preferred for final comparison

In [242]:
%%time
preds_ranked = []
true_items = []
rank_at = 20
# model = MLP_model
model = GMF_model
pbar = progressbar.ProgressBar()


for u in pbar(test_user_items.index):
    true_item = test_user_items[u]
    
    user_array = np.full(total_items, u, dtype='int32')
    preds = np.hstack(model.predict([user_array, np.arange(total_items)], batch_size=total_items, verbose=0))
    ids = np.argpartition(preds, -rank_at)[-rank_at:]
    best_ids = np.argsort(preds[ids])[::-1]
    best = np.arange(total_items)[ids[best_ids]]
    
    preds_ranked.append(best)
    true_items.append(true_item)
    
ranked_df_all = pd.DataFrame(list(zip(test_user_items.index, preds_ranked, true_items)),
                         columns=['users', 'pred_items_ranked', 'true_id'])

100% |########################################################################|

CPU times: user 1h 44min 6s, sys: 8min 7s, total: 1h 52min 13s
Wall time: 1h 46min 41s





## Metrics from all items ranked

In [243]:
from Evaluation import get_metrics
get_metrics(ranked_df_all, 5, 20, stats=False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,82,0.006756,0.006756
1,5,251,0.020681,0.004136
2,10,347,0.02859,0.002859
3,15,434,0.035758,0.002384
4,20,484,0.039878,0.001994


---
# MLP

## Params

In [23]:
MLP_params = {
    'learning_rate': 0.01,
    'batch_size': 256,
    'layers': [32,16,8],
    'reg_layers': [0,0,0],
    'epochs': 1,#20,
    'sample_size': int(0.5*len(train_set.user_id.unique())),
    'num_neg': 4,
    'ckpt_dir': '../NeuMF_storage/MLP_ckpts/ckpts'
}

In [24]:
from Models import build_MLP_model
MLP_model = build_MLP_model(total_items, total_users, layers=MLP_params['layers'], reg_layers=MLP_params['reg_layers'])

In [None]:
MLP_model.summary()

## Compile

In [25]:
optimizer = tf.keras.optimizers.Adam(lr=MLP_params['learning_rate'])
MLP_model.compile(optimizer=optimizer, loss='binary_crossentropy')


# Train MLP Model

## Configure Checkpoints

In [26]:
MLP_ckpts_prefix = os.path.join(MLP_params['ckpt_dir'], "ckpt")
MLP_ckpts_callback = tf.keras.callbacks.ModelCheckpoint(filepath=MLP_ckpts_prefix,    
                                                         monitor='loss',    
                                                         mode='min',    
                                                         save_best_only=True,
                                                         save_weights_only=True)

## Create Samples

In [27]:
%%time
from Models import create_NeuMF_samples
print('\nCreating Samples')
MLP_u_i_l = create_NeuMF_samples(train_set, MLP_params['epochs'], MLP_params['sample_size'], MLP_params['num_neg'])
MLP_all_user_inputs, MLP_all_item_inputs, MLP_all_labels = MLP_u_i_l


Creating Samples


100% |########################################################################|

CPU times: user 26.5 s, sys: 784 ms, total: 27.3 s
Wall time: 28.2 s





## Fit

In [28]:
MLP_model, MLP_hist = neumf_train_loop(MLP_model, MLP_u_i_l, MLP_params, [MLP_ckpts_callback])


Fitting model_2 with parameters:
Parameters:                                               0
learning_rate                              0.01
batch_size                                  256
layers                              [32, 16, 8]
reg_layers                            [0, 0, 0]
epochs                                        1
sample_size                               60686
num_neg                                       4
ckpt_dir       ../NeuMF_storage/MLP_ckpts/ckpts
Epoch: 0


## Store Weights

In [37]:
MLP_weights_path = '../NeuMF_storage/MLP_weights/MLP_weights'
MLP_model.save_weights(MLP_weights_path, overwrite=False)



[TIP] Next time specify overwrite=True!


In [118]:
%%time
ranked_df_MLP = predict_neumf(MLP_model, train_set, test_set)

100% |########################################################################|

CPU times: user 41min 28s, sys: 44.3 s, total: 42min 13s
Wall time: 42min 35s





In [120]:
from Evaluation import get_metrics
get_metrics(ranked_df_MLP, 5, 20, False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1963,0.161737,0.161737
1,5,4660,0.38395,0.07679
2,10,6607,0.544368,0.054437
3,15,7674,0.632281,0.042152
4,20,8296,0.68353,0.034176


---
# NeuMF

## Params

In [30]:
NeuMF_params = {
    'learning_rate': 0.001,
    'batch_size': 256,
    'layers': [32,16,8],
    'reg_layers': [0,0,0],
    'nolf': 16,
    'epochs': 1,#20,
    'sample_size': int(0.5*len(train_set.user_id.unique())),
    'num_neg': 4,
    'ckpt_dir': '../MLP_ckpts/ckpts'
}

In [56]:
[MLP_params['layers'][i] for i in range(1, len(MLP_params['layers']))]

[16, 8]

In [73]:
from Models import build_NeuMF_model
NeuMF_model = build_NeuMF_model(total_users, total_items, mf_nolf=GMF_params['nolf'], reg_mf=GMF_params['regs'], layers=MLP_params['layers'], reg_layers=MLP_params['reg_layers'])

In [74]:
NeuMF_model.summary()

Model: "NeuMF"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlp_embedding_user (Embedding)  (None, 1, 16)        1941952     user_input[0][0]                 
__________________________________________________________________________________________________
mlp_embedding_item (Embedding)  (None, 1, 16)        3959440     item_input[0][0]                 
______________________________________________________________________________________________

In [32]:
optimizer = tf.keras.optimizers.Adam(lr=NeuMF_params['learning_rate'])
NeuMF_model.compile(optimizer=optimizer, loss='binary_crossentropy')

## Configure Checkpoints

In [33]:
NeuMF_ckpts_prefix = os.path.join(NeuMF_params['ckpt_dir'], "ckpt")
NeuMF_ckpts_callback = tf.keras.callbacks.ModelCheckpoint(filepath=NeuMF_ckpts_prefix,    
                                                         monitor='loss',    
                                                         mode='min',    
                                                         save_best_only=True,
                                                         save_weights_only=True)

## Create Samples

In [34]:
from Models import create_NeuMF_samples
print('\nCreating Samples')
# NeuMF_u_i_l = create_NeuMF_samples(train_set, NeuMF_params['epochs'], NeuMF_params['sample_size'], NeuMF_params['num_neg'])

# Re-use of samples
NeuMF_u_i_l = MLP_u_i_l
NeuMF_all_user_inputs, NeuMF_all_item_inputs, NeuMF_all_labels = NeuMF_u_i_l


Creating Samples


## Fit

In [234]:
NeuMF_model, NeuMF_hist = neumf_train_loop(NeuMF_model, NeuMF_u_i_l, NeuMF_params, [NeuMF_ckpts_callback])


Fitting NeuMF with parameters:
Parameters:                                 0
learning_rate               0.001
batch_size                    256
layers                [32, 16, 8]
reg_layers              [0, 0, 0]
nolf                           16
epochs                         20
sample_size                 60686
num_neg                         4
ckpt_dir       ../MLP_ckpts/ckpts
Epoch: 0
Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9
Epoch: 10
Epoch: 11
Epoch: 12
Epoch: 13
Epoch: 14
Epoch: 15
Epoch: 16
Epoch: 17
Epoch: 18
Epoch: 19


In [237]:
ranked_df_neumf = predict_neumf(NeuMF_model, train_set, test_set)

100% |########################################################################|


In [238]:
from Evaluation import get_metrics
get_metrics(ranked_df_neumf, 5, 20, stats=False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1821,0.150037,0.150037
1,5,4044,0.333196,0.066639
2,10,5803,0.478125,0.047812
3,15,6919,0.570075,0.038005
4,20,7598,0.62602,0.031301


## Load NeuMF from weights
- Combining weights of GMF and MLP

In [75]:
from Models import build_GMF_model, build_MLP_model
GMF_model = build_GMF_model(total_items=total_items, total_users=total_users, nolf=GMF_params['nolf'])
GMF_model.load_weights(GMF_weights_path).expect_partial()

MLP_model = build_MLP_model(total_items, total_users, layers=MLP_params['layers'], reg_layers=MLP_params['reg_layers'])
MLP_model.load_weights(MLP_weights_path).expect_partial()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1778f6290>

In [78]:
pretrained_NeuMF = load_pretrain_model(NeuMF_model, GMF_model, MLP_model, len(NeuMF_params['layers']))

In [79]:
pretrained_NeuMF.summary()

Model: "NeuMF"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
item_input (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
mlp_embedding_user (Embedding)  (None, 1, 16)        1941952     user_input[0][0]                 
__________________________________________________________________________________________________
mlp_embedding_item (Embedding)  (None, 1, 16)        3959440     item_input[0][0]                 
______________________________________________________________________________________________