In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [64]:
# Sanity check: report the TensorFlow version and the devices TensorFlow can see.
print(f'TF version: {tf.__version__}')
print(tf.config.list_physical_devices())

TF version: 2.1.0
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU'), PhysicalDevice(name='/physical_device:XLA_GPU:0', device_type='XLA_GPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


# Read Data

In [65]:
# Root directory of the pickled datasets, relative to the notebook's working directory.
# The machine-specific absolute paths that used to be assigned here were dead code:
# they were overwritten by this value before ever being read.
path = '../datasets/'

## Amazon Fashion

In [66]:
# Amazon Fashion dataset variants (pickle file names that are appended to `path`).
# Keep exactly one `file_name` assignment active.
# data_path = 'data/Amazon/'
# file_name = 'Amazon_full'
# file_name = 'Amazon_05_users'
# file_name = 'Amazon_01_users'
file_name = 'am_80k_users'

## MovieLens

In [67]:
# data_path = 'data/ML/'
# file_name = 'ml_1m'
# file_name = 'ML_full' # file_name = 'ML_05_users'
# file_name = 'ML_01_users'

In [68]:
# Load the pickled interactions and replace the user/item ids by dense integer codes.
df = pd.read_pickle(path + file_name)
for id_col in ('user_id', 'item_id'):
    df[id_col] = df[id_col].astype('category').cat.codes
df.head()

Unnamed: 0,user,item,datetime,rating,user_id,item_id
6904244,A2EQZT4NOBKME3,B00SFLJZ52,2015-06-11,3.0,29949,137482
4472551,A2EQZT4NOBKME3,B00D4TJRWG,2015-06-11,5.0,29949,73746
10630561,A2EQZT4NOBKME3,B00OBT081W,2016-01-15,5.0,29949,126216
5801430,A2EQZT4NOBKME3,B00KD9AGAC,2016-08-24,5.0,29949,108075
5505899,A1QKA075BTCNIH,B00ISY7VNO,2015-01-13,5.0,15581,100488


In [69]:
# Split the interactions into two frames; leave_users_out lives in Data_prep.py (not shown).
# NOTE(review): presumably 40000 is the number of users that end up in `subset` — confirm in Data_prep.
from Data_prep import leave_users_out
remaining, subset = leave_users_out(df, 40000)

In [70]:
# Number of rows (interactions) in the selected subset.
len(subset)

351771

In [71]:
# Continue with the user subset only. Work on a copy so that re-encoding the ids
# neither mutates `subset` through an alias nor triggers pandas'
# SettingWithCopyWarning in case `subset` is a slice of the original frame.
df = subset.copy()
# Re-encode the ids as dense integer codes for the reduced set of users and items.
df['user_id'] = df['user_id'].astype('category').cat.codes
df['item_id'] = df['item_id'].astype('category').cat.codes

---
# Data Prep

## Dataset Params

In [72]:
# Hold-out configuration handed to train_val_test_split.
val_perc = 0.1
test_perc = 0.1
n_last_items_val = 1
n_last_items_test = 1

# Number of distinct item and user ids in the working frame.
total_items = len(df['item_id'].unique())
total_users = len(df['user_id'].unique())

## Create Split

In [73]:
# Split the interactions into train / validation / test frames.
# train_val_test_split is defined in Data_prep.py (not shown).
# NOTE(review): presumably val_perc/test_perc are the fractions of users held out and
# n_last_items_* the number of most recent items taken per held-out user — confirm in Data_prep.
from Data_prep import train_val_test_split
datasets = train_val_test_split(df, val_perc, test_perc, n_last_items_val, n_last_items_test)
train_set, val_set, test_set = datasets

---
# Neural Collaborative Filtering (NCF)
Using the NCF framework we build Generalized Matrix Factorisation (GMF) and a Multi-Layer Perceptron (MLP), and combine the two in Neural Matrix Factorisation (NeuMF).
- paper: http://papers.www2017.com.au.s3-website-ap-southeast-2.amazonaws.com/proceedings/p173.pdf
- blog: https://medium.com/@victorkohler/collaborative-filtering-using-deep-neural-networks-in-tensorflow-96e5d41a39a1
- code: https://github.com/Leavingseason/NeuralCF/blob/master

## Set Parameters

### CKPTS Store Paths

In [74]:
# Tag of this run; it is interpolated into the checkpoint and weight paths used below.
run_num = 'am_40k_nolf_20'

In [84]:
# Hyper-parameters of the GMF model; the dict is handed to the NCF constructor.
GMF_params = {
    'learning_rate': 0.001,
    'batch_size': 256,  # batch size passed to model.fit
    'nolf': 20,  # presumably the number of latent factors — confirm in NCF.py
    'regs': [0.00001,0.00001],  # presumably regularisation strengths — confirm in NCF.py
    'epochs': 20,  # number of sample-then-fit rounds
    # Number of positive samples drawn per epoch (alternative kept for reference).
    'sample_size': len(train_set),#int(0.5*len(train_set.user_id.unique())),
    'num_neg': 4,  # negatives sampled for every positive sample
    'ckpt_dir': f'../NeuMF_storage/GMF_ckpts_{run_num}/ckpts',  # directory of the ModelCheckpoint files
    'optimizer':'Adam'
}

In [85]:
# Hyper-parameters of the MLP model; the dict is handed to the NCF constructor.
MLP_params = {
    'learning_rate': 0.001,
    'batch_size': 256,  # batch size passed to model.fit
    'layers': [64,32,16,8],  # presumably the layer sizes of the perceptron — confirm in NCF.py
    'reg_layers': [0,0,0,0],  # presumably one regularisation strength per layer — confirm in NCF.py
    'epochs': 20,  # number of sample-then-fit rounds
    # Number of positive samples drawn per epoch (alternative kept for reference).
    'sample_size': len(train_set),#int(0.5*len(train_set.user_id.unique())),
    'num_neg': 4,  # negatives sampled for every positive sample
    'ckpt_dir': f'../NeuMF_storage/MLP_ckpts_{run_num}/ckpts',  # directory of the ModelCheckpoint files
    'optimizer':'Adam'
}

In [86]:
# Hyper-parameters of the combined NeuMF model; the dict is handed to the NCF constructor.
NeuMF_params = {
    'learning_rate': 0.001,
    'batch_size': 256,  # batch size passed to model.fit
    'layers': [64,32,16,8],  # presumably the layer sizes of the MLP part — confirm in NCF.py
    'reg_layers': [0,0,0,0],  # presumably one regularisation strength per layer — confirm in NCF.py
    'reg_mf': [0.00001, 0.00001],  # presumably regularisation of the MF part — confirm in NCF.py
    'nolf': 20,  # presumably the number of latent factors — confirm in NCF.py
    'epochs': 20,  # number of sample-then-fit rounds
    # Number of positive samples drawn per epoch (alternative kept for reference).
    'sample_size': len(train_set),#int(0.5*len(train_set.user_id.unique())),
    'num_neg': 4,  # negatives sampled for every positive sample
    'ckpt_dir': f'../NeuMF_storage/NeuMF_ckpts_{run_num}/ckpts',  # directory of the ModelCheckpoint files
    'optimizer':'Adam'
}

## Init

In [87]:
# Create the NCF object and build the three models. NCF is the project class from NCF.py (not shown).
# NOTE(review): the assignment below rebinds the name `NCF` from the class to the instance.
# Every later cell calls `NCF.<attr>` on that instance; the class itself is only reachable
# again after the import line has been re-executed.
from NCF import NCF
NCF = NCF(total_users, total_items, GMF_params, MLP_params, NeuMF_params)

NCF.build_GMF_model()
NCF.build_MLP_model()
NCF.build_NeuMF_model()

## Create Samples

## MP sampler

In [78]:
def subsampler(worker_id, epochs, user_items, train_users, train_items, params, seed=None):
    """Draw positive and negative training samples for a number of epochs.

    For every epoch, ``params['sample_size']`` times: pick a random user, pick one
    of that user's items as positive sample (label 1) and then draw
    ``params['num_neg']`` items the user has not interacted with as negative
    samples (label 0).

    Args:
        worker_id: Identifier of the worker, only used in the start-up message.
        epochs: Number of epochs to create samples for.
        user_items: Mapping from user id to the list of that user's items
            (anything indexable by user id, e.g. a dict or a pandas Series).
        train_users: Sequence of user ids to sample users from.
        train_items: Sequence of item ids to sample negative items from.
        params: Dict providing the keys ``'sample_size'`` and ``'num_neg'``.
        seed: Optional seed for the private random generator of this call.
            With the default ``None`` the generator is seeded from OS entropy,
            so workers forked from the same parent do not replay the identical
            random stream inherited through NumPy's global state.

    Returns:
        Dict with the keys ``'u'``, ``'i'`` and ``'l'``. Each value is a list
        holding one inner list per epoch with, respectively, the user ids, the
        item ids and the labels; every inner list has
        ``sample_size * (1 + num_neg)`` entries.

    Note:
        The negative-sampling loop does not terminate for a user who has
        interacted with every item in ``train_items``.
    """
    print(f'worker {worker_id} started')
    rng = np.random.RandomState(seed)
    sample_size = params['sample_size']
    num_neg = params['num_neg']

    all_user_inputs, all_item_inputs, all_labels = [], [], []
    for _epoch in range(epochs):
        user_inputs, item_inputs, labels = [], [], []
        for _sample in range(sample_size):
            # Positive sample: a random user with one of its own items.
            u = rng.choice(train_users)
            u_items = user_items[u]
            user_inputs.append(u)
            item_inputs.append(rng.choice(u_items))
            labels.append(1)

            # Negative samples: redraw until the item is not one of the user's items.
            for _neg in range(num_neg):
                j = rng.choice(train_items)
                while j in u_items:
                    j = rng.choice(train_items)

                user_inputs.append(u)
                item_inputs.append(j)
                labels.append(0)

        all_user_inputs.append(user_inputs)
        all_item_inputs.append(item_inputs)
        all_labels.append(labels)

    return {'u': all_user_inputs, 'i': all_item_inputs, 'l': all_labels}

In [168]:
%%time
import multiprocessing as mp
from progressbar import progressbar
if __name__ == '__main__':
    processors = mp.cpu_count()
    params = NCF.GMF_params
#     params = NCF.MLP_params
#     params = NCF.NeuMF_params
    data = train_set

    all_user_inputs, all_item_inputs, all_labels = [], [], []
    user_items = data.groupby('user_id')['item_id'].apply(list)
    train_users = data.user_id.unique()
    train_items = data.item_id.unique()
    
    epoch_splits = np.array_split(np.array(range(params['epochs'])), processors)
    args = []
    for worker_id, epoch_split in enumerate(epoch_splits):
        args.append((worker_id, len(epoch_split), user_items, train_users, train_items, params))
        
    with mp.Pool(processes=processors) as pool:
        results = pool.starmap(subsampler, args)
    
    all_user_inputs, all_item_inputs, all_labels = [], [], []
    samples = [[], [], []]
    for res_epochs in results:
        all_user_inputs.extend(res_epochs['u'])
        all_item_inputs.extend(res_epochs['i'])
        all_labels.extend(res_epochs['l'])
    
    GMF_samples_mp = [all_user_inputs, all_item_inputs, all_labels]
#     MLP_samples_mp = [all_user_inputs, all_item_inputs, all_labels]
#     NeuMF_samples_mp = [all_user_inputs, all_item_inputs, all_labels]


CPU times: user 48.5 s, sys: 13.1 s, total: 1min 1s
Wall time: 1h 18min 47s


In [None]:
# Persist the sampled epochs (users, items, labels) as a pickled DataFrame.
GMF_sample_dict = dict(zip(('u', 'i', 'l'), GMF_samples_mp))
GMF_samples_df = pd.DataFrame(GMF_sample_dict)
GMF_samples_df.to_pickle('../NeuMF_storage/GMF_samples')

In [None]:
# GMF_samples = NCF.create_samples(name='GMF', data=train_set)
# MLP_samples = NCF.create_samples(name='MLP', data=train_set)
# NeuMF_samples = NCF.create_samples(name='NeuMF', data=train_set)

## Training

In [81]:
import multiprocessing as mp
def train_model(name='', train_set=[], verbose=1, store_path=''):
    """Train one of the NCF models and optionally store its final weights.

    Args:
        name: Model name handed to ``NCF.get_model`` to obtain the model and
            its parameter dict.
        train_set: Training interactions; must be non-empty. ``fit`` reads its
            ``user_id`` and ``item_id`` columns.
        verbose: Verbosity level forwarded to ``model.fit``.
        store_path: When non-empty, the weights are saved to this path after
            training.

    Raises:
        Exception: If ``train_set`` is empty.
    """
    model, params = NCF.get_model(name)

    # Checkpoint the weights whenever the monitored training loss improves.
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(params['ckpt_dir'], "ckpt"),
        monitor='loss',
        mode='min',
        save_best_only=True,
        save_weights_only=True,
    )

    if not len(train_set):
        raise Exception('No samples available, create samples first using: create_samples')

    fit(model, params, train_set, [checkpoint_callback], verbose)

    if len(store_path) != 0:
        model.save_weights(store_path)


def fit(model, params, train_set, callbacks, verbose):
    """Train ``model`` for ``params['epochs']`` rounds on freshly drawn samples.

    Every round draws a new set of positive/negative samples with
    ``create_sample``, fits the model on it for a single Keras epoch and
    appends the rounded loss to ``NCF.history[model._name]['loss']``.

    Args:
        model: Keras model to train; its ``_name`` is used for logging and as
            key into ``NCF.history``.
        params: Parameter dict; ``'epochs'`` and ``'batch_size'`` are read
            here, the dict is also forwarded to ``create_sample``.
        train_set: DataFrame with ``user_id`` and ``item_id`` columns.
        callbacks: Keras callbacks forwarded to ``model.fit``.
        verbose: Verbosity level forwarded to ``model.fit``.
    """
    print(f'\nFitting {model._name} with parameters:')
    print(pd.DataFrame.from_dict(params, orient='index'))

    items_per_user = train_set.groupby('user_id')['item_id'].apply(list)
    users = train_set.user_id.unique()
    items = train_set.item_id.unique()
    n_workers = mp.cpu_count()

    for epoch in range(params['epochs']):
        print(f'Epoch: {epoch}')
        sampled_users, sampled_items, sampled_labels = create_sample(
            items_per_user, users, items, params, n_workers)

        model_inputs = [np.array(sampled_users), np.array(sampled_items)]
        targets = np.array(sampled_labels)
        hist = model.fit(model_inputs,
                         targets,
                         batch_size=params['batch_size'],
                         verbose=verbose,
                         epochs=1,
                         shuffle=True,
                         callbacks=callbacks)

        epoch_loss = round(hist.history['loss'][0], 5)
        NCF.history[model._name]['loss'].append(epoch_loss)


def create_sample(user_items, train_users, train_items, params, num_processes):
    """Create the training samples of one epoch with a pool of worker processes.

    ``params['sample_size']`` positive samples (each followed by
    ``params['num_neg']`` negatives) are divided as evenly as possible over
    ``num_processes`` chunks; every chunk is produced by
    ``create_sample_worker`` in a separate process and the results are
    concatenated in chunk order.

    Args:
        user_items: Mapping from user id to the list of that user's items.
        train_users: Sequence of user ids to sample users from.
        train_items: Sequence of item ids to sample negative items from.
        params: Dict providing the keys ``'sample_size'`` and ``'num_neg'``.
        num_processes: Number of worker processes and of chunks the work is
            split into (previously the chunk count was hard-coded to 8).

    Returns:
        Tuple ``(user_inputs, item_inputs, labels)`` of three equally long lists.
    """
    # Chunk sizes without materialising an index array: the first `extra`
    # chunks get one sample more, the same distribution np.array_split produces.
    base, extra = divmod(params['sample_size'], num_processes)
    chunk_sizes = [base + 1 if k < extra else base for k in range(num_processes)]
    args = [(user_items, train_users, train_items, chunk_size, params['num_neg'])
            for chunk_size in chunk_sizes if chunk_size > 0]

    with mp.Pool(processes=num_processes) as pool:
        results = pool.starmap(create_sample_worker, args)

    user_inputs, item_inputs, labels = [], [], []
    for chunk in results:
        user_inputs.extend(chunk['u'])
        item_inputs.extend(chunk['i'])
        labels.extend(chunk['l'])

    return user_inputs, item_inputs, labels


def create_sample_worker(user_items, train_users, train_items, sample_size, num_neg, seed=None):
    """Draw ``sample_size`` positive samples, each followed by ``num_neg`` negatives.

    A positive sample is a random user together with one of that user's items
    (label 1); a negative sample pairs the same user with a random item the
    user has not interacted with (label 0).

    Args:
        user_items: Mapping from user id to the list of that user's items
            (anything indexable by user id, e.g. a dict or a pandas Series).
        train_users: Sequence of user ids to sample users from.
        train_items: Sequence of item ids to sample negative items from.
        sample_size: Number of positive samples to draw.
        num_neg: Number of negative samples drawn per positive sample.
        seed: Optional seed for the private random generator of this call.
            With the default ``None`` the generator is seeded from OS entropy,
            so pool workers forked from the same parent do not replay the
            identical random stream inherited through NumPy's global state.

    Returns:
        Dict with the keys ``'u'`` (user ids), ``'i'`` (item ids) and ``'l'``
        (labels); each value is a list of ``sample_size * (1 + num_neg)`` entries.

    Note:
        The negative-sampling loop does not terminate for a user who has
        interacted with every item in ``train_items``.
    """
    rng = np.random.RandomState(seed)
    user_inputs, item_inputs, labels = [], [], []
    for _sample in range(sample_size):
        # Positive sample: a random user with one of its own items.
        u = rng.choice(train_users)
        u_items = user_items[u]
        user_inputs.append(u)
        item_inputs.append(rng.choice(u_items))
        labels.append(1)

        # Negative samples: redraw until the item is not one of the user's items.
        for _neg in range(num_neg):
            j = rng.choice(train_items)
            while j in u_items:
                j = rng.choice(train_items)

            user_inputs.append(u)
            item_inputs.append(j)
            labels.append(0)

    return {'u': user_inputs, 'i': item_inputs, 'l': labels}

In [48]:
# Train the GMF model on freshly sampled data and store its final weights under ../weights/.
if __name__ == '__main__':
    train_model('GMF', train_set, store_path=f'../weights/GMF_weights_{run_num}/GMF_weights') 


Fitting GMF with parameters:
                                                             0
learning_rate                                            0.001
batch_size                                                 256
nolf                                                        20
regs                                            [1e-05, 1e-05]
epochs                                                      20
sample_size                                              86388
num_neg                                                      4
ckpt_dir       ../NeuMF_storage/GMF_ckpts_am_10k_nolf_20/ckpts
optimizer                                                 Adam
Epoch: 0
Train on 431940 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch: 1
Train on 431940 samples
Epoch: 2
Train on 431940 samples
Epoch: 3
Train on 431940 samples
Epoch: 4
Train on 431940 samples
 16128/431940 [>.............................] - ETA: 20s - loss: 0.1282

KeyboardInterrupt: 

In [21]:
# Train the MLP model on freshly sampled data and store its final weights under ../weights/.
if __name__ == '__main__':
    train_model('MLP', train_set, store_path=f'../weights/MLP_weights_{run_num}/MLP_weights') 


Fitting MLP with parameters:
                                                     0
learning_rate                                    0.001
batch_size                                         256
layers                                 [64, 32, 16, 8]
reg_layers                                [0, 0, 0, 0]
epochs                                              20
sample_size                                      86388
num_neg                                              4
ckpt_dir       ../NeuMF_storage/MLP_ckpts_am_10k/ckpts
optimizer                                         Adam
Epoch: 0
Train on 431940 samples
Epoch: 1
Train on 431940 samples
Epoch: 2
Train on 431940 samples
Epoch: 3
Train on 431940 samples
Epoch: 4
Train on 431940 samples
Epoch: 5
Train on 431940 samples
Epoch: 6
Train on 431940 samples
Epoch: 7
Train on 431940 samples
Epoch: 8
Train on 431940 samples
Epoch: 9
Train on 431940 samples
Epoch: 10
Train on 431940 samples
Epoch: 11
Train on 431940 samples
Epoch: 12
Train on 431

In [22]:
# NCF.train_model('NeuMF', NeuMF_samples) 

#### Load weights for NeuMF

In [23]:
# Hand the stored GMF and MLP weight files of this run to NCF.use_pretrain_model
# (defined in NCF.py, not shown) before NeuMF is trained.
NCF.use_pretrain_model(GMF_weights_path=f'../weights/GMF_weights_{run_num}/GMF_weights',
                       MLP_weights_path=f'../weights/MLP_weights_{run_num}/MLP_weights')

In [None]:
# Train the NeuMF model on freshly sampled data and store its final weights under ../weights/.
if __name__ == '__main__':
    train_model('NeuMF', train_set, store_path=f'../weights/NeuMF_weights_{run_num}/NeuMF_weights') 


Fitting NeuMF with parameters:
                                                               0
learning_rate                                              0.001
batch_size                                                   256
layers                                           [64, 32, 16, 8]
reg_layers                                          [0, 0, 0, 0]
reg_mf                                            [1e-05, 1e-05]
nolf                                                          20
epochs                                                        20
sample_size                                               343771
num_neg                                                        4
ckpt_dir       ../NeuMF_storage/NeuMF_ckpts_am_40k_nolf_20/ckpts
optimizer                                                   Adam
Epoch: 0
Train on 1718855 samples


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch: 1
Train on 1718855 samples
Epoch: 2
Train on 1718855 samples
Epoch: 3
Train on 1718855 samples
Epoch: 4
Train on 1718855 samples
Epoch: 5
Train on 1718855 samples
Epoch: 6
Train on 1718855 samples
Epoch: 7
Train on 1718855 samples
Epoch: 8
Train on 1718855 samples
Epoch: 9
Train on 1718855 samples
Epoch: 10
Train on 1718855 samples
Epoch: 11
Train on 1718855 samples
Epoch: 12
Train on 1718855 samples
Epoch: 13
Train on 1718855 samples
Epoch: 14
Train on 1718855 samples
Epoch: 15
Train on 1718855 samples
 304640/1718855 [====>.........................] - ETA: 3:24 - loss: 7.0040e-10

---
## Evaluation

In [None]:
# Evaluation settings.
steps = 5  # passed to get_metrics; the metric tables list rank cut-offs 1, 5, 10, 15, 20
rank_at = 20  # top-k size of the ranking cells and the rank argument of get_metrics
sample_len = 100  # not referenced in any visible cell — presumably used for sampled evaluation; confirm

## Full set scores

In [101]:
# Score every item for one test user with the MLP model and keep the `rank_at` best.
u = test_set.user_id.unique()[1]
print(u)
user_array = np.full(total_items, u, dtype='int32')
item_array = np.arange(total_items)
preds = np.hstack(NCF.MLP.predict([user_array, item_array], batch_size=total_items, verbose=0))

# argpartition isolates the rank_at highest scores (unordered);
# argsort + reversal then orders those candidates best-first.
ids = np.argpartition(preds, -rank_at)[-rank_at:]
best_ids = np.argsort(preds[ids])[::-1]
best = item_array[ids[best_ids]]

4451


In [102]:
# Display the item ids currently stored in `best` (highest score first).
best

array([ 3188,  3256,  3191, 10610, 46194, 33450, 49521, 19985,  3169,
       33415, 49512,  3281, 20056, 33351, 20085,  3244, 20097, 10549,
       20105, 33444])

In [65]:
# Top `rank_at` items for the same user according to NeuMF (uses the shared
# `rank_at` setting instead of a hard-coded 20). argsort is ascending, so the
# best-scoring item is the LAST element of the displayed array.
neumf_preds = np.hstack(NCF.NeuMF.predict([user_array, np.arange(total_items)], batch_size=total_items, verbose=0))
neumf_preds.argsort()[-rank_at:]

array([27709, 38199,  2727, 20903,  8565, 46599, 27759, 27768, 38133,
       27793,  8625, 25749, 38100,  8650, 20832, 38034, 43046, 20801,
       10840, 17531])

In [89]:
# Display `best` again. NOTE(review): this cell's execution count (89) is lower than that
# of the cell computing `best` (101), so the shown output stems from an earlier run.
best

array([53539, 25795, 25817, 25844, 25837, 25824, 25851, 25821, 25793,
       25813, 25812, 25811, 25809, 25808, 25804, 25803, 25801, 25799,
       25796, 25842])

In [97]:
# Load stored weights into all three models; expect_partial() silences the warnings
# about checkpoint values that are not used by the model.
# NOTE(review): these paths point at the 'ml_1m' weights, whereas the training cells
# store weights under a path built from `run_num` — confirm that loading the MovieLens
# weights here is intended.
NCF.NeuMF.load_weights('../weights/NeuMF_weights_ml_1m/NeuMF_weights').expect_partial()
NCF.GMF.load_weights('../weights/GMF_weights_ml_1m/GMF_weights').expect_partial()
NCF.MLP.load_weights('../weights/MLP_weights_ml_1m/MLP_weights').expect_partial()

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fc9abe40198>

In [98]:
# Number of distinct items that occur in the training split.
len(train_set.item_id.unique())

52597

In [26]:
%%time
# Full-set evaluation with GMF; NCF.get_predictions is defined in NCF.py (not shown).
ranked_df_full = NCF.get_predictions('GMF', train_set, test_set)

100% |########################################################################|

CPU times: user 1min 33s, sys: 347 ms, total: 1min 34s
Wall time: 1min 30s





In [27]:
# Metrics of whatever ranking `ranked_df_full` currently holds (GMF when run in notebook order).
from Evaluation import get_metrics
get_metrics(ranked_df_full, steps, rank_at, stats=False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,0,0.0,0.0
1,5,5,0.005,0.001
2,10,7,0.007,0.0007
3,15,7,0.007,0.000467
4,20,7,0.007,0.00035


In [28]:
%%time
# Full-set evaluation with MLP; overwrites `ranked_df_full` from the GMF run.
ranked_df_full = NCF.get_predictions('MLP', train_set, test_set)

100% |########################################################################|

CPU times: user 1min 41s, sys: 577 ms, total: 1min 41s
Wall time: 1min 33s





In [29]:
# Metrics of whatever ranking `ranked_df_full` currently holds (MLP when run in notebook order).
from Evaluation import get_metrics
get_metrics(ranked_df_full, steps, rank_at, stats=False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,0,0.0,0.0
1,5,0,0.0,0.0
2,10,0,0.0,0.0
3,15,0,0.0,0.0
4,20,0,0.0,0.0


In [None]:
%%time
# Full-set evaluation with NeuMF; overwrites `ranked_df_full` from the MLP run.
ranked_df_full = NCF.get_predictions('NeuMF', train_set, test_set)

In [None]:
# Metrics of whatever ranking `ranked_df_full` currently holds (NeuMF when run in notebook order).
from Evaluation import get_metrics
get_metrics(ranked_df_full, steps, rank_at, stats=False)

## Sample scores

In [32]:
%%time
# Sampled evaluation with GMF; NCF.sample_prediction is defined in NCF.py (not shown).
ranked_df = NCF.sample_prediction('GMF', train_set, test_set)

100% |########################################################################|

CPU times: user 52.4 s, sys: 648 ms, total: 53 s
Wall time: 51.8 s





In [33]:
# Metrics of whatever ranking `ranked_df` currently holds (GMF when run in notebook order).
from Evaluation import get_metrics
get_metrics(ranked_df, steps, rank_at, stats=False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,351,0.351,0.351
1,5,377,0.377,0.0754
2,10,403,0.403,0.0403
3,15,413,0.413,0.027533
4,20,429,0.429,0.02145


In [34]:
%%time
# Sampled evaluation with MLP; overwrites `ranked_df` from the GMF run.
ranked_df = NCF.sample_prediction('MLP', train_set, test_set)

100% |########################################################################|

CPU times: user 52.7 s, sys: 575 ms, total: 53.2 s
Wall time: 51.9 s





In [35]:
# Metrics of whatever ranking `ranked_df` currently holds (MLP when run in notebook order).
from Evaluation import get_metrics
get_metrics(ranked_df, steps, rank_at, stats=False)

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,128,0.128,0.128
1,5,149,0.149,0.0298
2,10,187,0.187,0.0187
3,15,250,0.25,0.016667
4,20,301,0.301,0.01505


In [None]:
%%time
# Sampled evaluation with NeuMF; overwrites `ranked_df` from the MLP run.
ranked_df = NCF.sample_prediction('NeuMF', train_set, test_set)

In [None]:
# Metrics of whatever ranking `ranked_df` currently holds (NeuMF when run in notebook order).
from Evaluation import get_metrics
get_metrics(ranked_df, steps, rank_at, stats=False)

---
### Multiprocessing for creating multiple samples

In [None]:
# from progressbar import progressbar
# def create_samples(params, data, name=''):
#         print(f'Creating Samples for {name}')
# #         _, params = self.get_model(name)
#         all_user_inputs, all_item_inputs, all_labels = [], [], []
#         user_items = data.groupby('user_id')['item_id'].apply(list)
#         train_users = data.user_id.unique()
#         train_items = data.item_id.unique()

#         pbar = progressbar.ProgressBar()
#         for n in pbar(range(params['epochs'])):
#             user_inputs, item_inputs, labels = [], [], []
#             for s in range(int(params['sample_size'])):
#                 # Add positive item
#                 u = np.random.choice(train_users)
#                 u_items = user_items[u]
#                 i = np.random.choice(u_items)

#                 user_inputs.append(u)
#                 item_inputs.append(i)
#                 labels.append(1)

#                 # Add negative item
#                 for i in range(params['num_neg']):
#                     j = np.random.choice(train_items)
#                     while j in u_items:  # neg item j cannot be in the set of pos items of user u
#                         j = np.random.choice(train_items)

#                     user_inputs.append(u)
#                     item_inputs.append(j)
#                     labels.append(0)

#             all_user_inputs.append(user_inputs)
#             all_item_inputs.append(item_inputs)
#             all_labels.append(labels)

#         return [all_user_inputs, all_item_inputs, all_labels]

In [None]:
# import multiprocessing as mp
# if __name__ == '__main__':
    
#     with mp.Pool(processes=2) as pool:
#         results = pool.starmap(create_samples, [(GMF_params, train_set, 'GMF'), (MLP_params, train_set, 'MLP')])
        