In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import progressbar as progressbar

%load_ext autoreload
%autoreload 2

In [2]:
# Record the TensorFlow version and the devices TensorFlow can see, so the
# environment behind the stored outputs is visible in the notebook itself.
print(f'TF version: {tf.__version__}')
tf.config.list_physical_devices()

TF version: 2.2.0


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:XLA_CPU:0', device_type='XLA_CPU')]

# Read Data

In [92]:
# Root folder of the pickled datasets; the loading cells read `path + <name>`.
# Machine-specific locations used earlier, kept for reference only:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '../datasets/'

## Amazon Fashion

In [93]:
# Amazon Fashion dataset selection: exactly one `file_name` line is left active.
# NOTE: the MovieLens cell below assigns `file_name` again, so when both cells
# are executed in order the MovieLens choice is the one that gets loaded.
# data_path = 'data/Amazon/'
# file_name = 'Amazon_full' # file_name = 'Amazon_05_users'
# file_name = 'Amazon_01_users'
# file_name = 'am_80k_users'
file_name = 'am_40k_users'

## MovieLens

In [94]:
# MovieLens dataset selection: exactly one `file_name` line is left active.
# This assignment replaces any `file_name` chosen in the Amazon cell above.
# data_path = 'data/ML/'
file_name = 'ml_1m'
# file_name = 'ML_full' # file_name = 'ML_05_users'
# file_name = 'ML_01_users'

In [95]:
# Load the selected interaction table and replace the raw user/item identifiers
# with the integer codes of their categorical encoding.
df = pd.read_pickle(path + file_name)
for id_col in ('user_id', 'item_id'):
    df[id_col] = df[id_col].astype('category').cat.codes
df.head()

Unnamed: 0,rating,datetime,user_id,item_id
0,4,2000-12-31 22:00:19,0,2969
1,4,2000-12-31 22:00:55,0,1574
2,5,2000-12-31 22:00:55,0,957
3,5,2000-12-31 22:00:55,0,1178
4,3,2000-12-31 22:01:43,0,2147


### Create ML split (paper)

In [55]:
# Leave-one-out split: the last row of every user goes to `test_set`, all of
# that user's other rows go to `train_set`.
# "Last" means last in the current row order of `df`, exactly as in the previous
# per-user loop -- this assumes `df` is ordered chronologically within each
# user (TODO confirm for datasets other than the one previewed above).
#
# One boolean mask replaces the loop over users. The loop filtered the whole
# frame twice per user and re-copied the growing result frames on every
# iteration (quadratic in the number of users), and it relied on
# DataFrame.append, which no longer exists in pandas >= 2.0.
# Rows keep the order they have in `df`; that equals the loop's output order
# whenever each user's rows are contiguous in `df`.
is_last_of_user = ~df.duplicated(subset='user_id', keep='last')
test_set = df[is_last_of_user]
train_set = df[~is_last_of_user]

100% |########################################################################|


In [96]:
# Load a previously stored train/test split from the dataset folder.
# This rebinds `test_set` and `train_set`, replacing any split built in the
# cell above. The file names are fixed to the ml_1m dataset and do not follow
# `file_name`.
test_set = pd.read_pickle(path + 'ml_1m_test')
train_set = pd.read_pickle(path + 'ml_1m_train')

---
# Data Prep

## Dataset Params

In [50]:
# Split settings forwarded to `train_val_test_split`; validation and test use
# the same values.
test_perc = 0.1
val_perc = test_perc
n_last_items_test = 1
n_last_items_val = n_last_items_test

# Number of distinct users / items in the loaded table (missing values, if any,
# are counted as a value, matching len(<column>.unique())).
total_users = df['user_id'].nunique(dropna=False)
total_items = df['item_id'].nunique(dropna=False)

In [56]:
# Sanity check: number of distinct users present in the training set.
len(train_set.user_id.unique())

6040

## Create Split

In [9]:
# Build the train/validation/test split with the project helper, using the
# dataset parameters defined above.
# This rebinds `train_set` and `test_set`, so it replaces a split loaded or
# created earlier in the notebook.
from Data_prep import train_val_test_split
datasets = train_val_test_split(df, val_perc, test_perc, n_last_items_val, n_last_items_test)
train_set, val_set, test_set = datasets

---
# Neural Collaborative Filtering (NCF)
Using the NCF framework, we build Generalized Matrix Factorisation (GMF) and a Multi-Layer Perceptron (MLP) model, and combine the two in Neural Matrix Factorisation (NeuMF).
- paper: http://papers.www2017.com.au.s3-website-ap-southeast-2.amazonaws.com/proceedings/p173.pdf
- blog: https://medium.com/@victorkohler/collaborative-filtering-using-deep-neural-networks-in-tensorflow-96e5d41a39a1
- code: https://github.com/Leavingseason/NeuralCF/blob/master

## Set Parameters

In [23]:
# Run label; it is interpolated into the checkpoint directories of the
# parameter dictionaries below (`..._ckpts_{run_num}/ckpts`).
# run_num = 'am_40k_GS_'
run_num = 'ml_1m_nolf_8'

In [143]:
# Hyper-parameters handed to the GMF model (`NCF(..., GMF_params=GMF_params)`).
# The meaning of the individual keys is inferred from their names only --
# TODO confirm against NCF.py.
GMF_params = dict(
    learning_rate=0.001,
    batch_size=256,
    nolf=8,  # presumably the number of latent factors -- confirm
    regs=[0.000001, 0.000001],  # ML: [0.000001,0.000001]
    epochs=20,
    sample_size=len(train_set),  # alternative tried: int(0.5*len(train_set.user_id.unique()))
    num_neg=4,
    ckpt_dir=f'../NeuMF_storage/GMF_ckpts_{run_num}/ckpts',
    optimizer='Adam',
)

In [12]:
# MLP_params = {
#     'learning_rate': 0.001,
#     'batch_size': 256,
#     'layers': [16,32,16,8],
#     'reg_layers': [0.0001,0,0,0],   #ML: [+-0.0005,0,0,0],
#     'epochs': 20,
#     'sample_size': len(train_set),#int(0.5*len(train_set.user_id.unique())),
#     'num_neg': 4,
#     'ckpt_dir': f'../NeuMF_storage/MLP_ckpts_{run_num}/ckpts',
#     'optimizer':'Adam'
# }

In [15]:
# Hyper-parameters handed to the NeuMF model (`NCF(..., NeuMF_params=NeuMF_params)`).
# The meaning of the individual keys is inferred from their names only --
# TODO confirm against NCF.py.
NeuMF_params = dict(
    learning_rate=0.001,
    batch_size=256,
    layers=[16, 32, 16, 8],
    reg_layers=[0.0001, 0, 0, 0],
    reg_mf=[0.000001, 0.000001],
    nolf=8,  # presumably the number of latent factors -- confirm
    epochs=20,
    sample_size=len(train_set),  # alternative tried: int(0.5*len(train_set.user_id.unique()))
    num_neg=4,
    ckpt_dir=f'../NeuMF_storage/NeuMF_ckpts_{run_num}/ckpts',
    optimizer='Adam',
)

## Load samples

In [37]:
# Location and file-name prefix of the stored training samples; the loading cell
# below reads `{sample_path}{sample_name}{sample_num}.csv`.
sample_path = '../NeuMF_storage/samples_ml_1m/'
sample_name = 'ml_1m_sample_'

# Amazon (am_40k) alternative:
# sample_path = '../NeuMF_storage/samples_am_40k/'
# sample_name = 'am_40k_sample_'

In [38]:
%%time
import csv

# Read the 20 stored sample files. In each file the first three CSV rows hold,
# in this order, the user inputs, the item inputs and the labels; every file
# becomes one `[users, items, labels]` entry of integer lists in `samples`.
samples = []
pbar = progressbar.ProgressBar()
for sample_num in pbar(range(20)):
    with open(f'{sample_path}{sample_name}{sample_num}.csv', 'r') as read_obj:
        sample_str = list(csv.reader(read_obj))

    user_row, item_row, label_row = sample_str[0], sample_str[1], sample_str[2]
    # Only positions present in all three rows are used, so the resulting
    # lists always have the same length.
    usable = min(len(user_row), len(item_row), len(label_row))
    sample = [[int(value) for value in row[:usable]]
              for row in (user_row, item_row, label_row)]
    samples.append(sample)

100% |########################################################################|

CPU times: user 1min 41s, sys: 11.6 s, total: 1min 52s
Wall time: 1min 53s





---
# Grid Search: NeuMF

## Init

In [21]:
# Create the model wrapper with the NeuMF hyper-parameters and build the NeuMF model.
# NOTE: the name `NCF` is rebound from the imported class to the instance created
# here; the cells below call methods on that instance through the same name.
from NCF import NCF
NCF = NCF(total_users, total_items, NeuMF_params=NeuMF_params)
NCF.build_NeuMF_model()

## Parameter Space

In [27]:
# Candidate values for the NeuMF grid search.
learning_rates = [0.005, 0.001, 0.0005, 0.0001]
batch_sizes = [128, 256, 512, 1024]

# The same strength on both entries of `reg_mf`.
gmf_regs = [[strength, strength] for strength in (0.000001, 0.00001, 0.0001)]

# `reg_layers` candidates: the same strength on all four entries, then a
# non-zero strength on the first entry only, then on the last entry only.
mlp_regs = (
    [[strength] * 4 for strength in (0.000001, 0.00001, 0.0001, 0.001)]
    + [[strength, 0, 0, 0] for strength in (0.000001, 0.00001, 0.0001)]
    + [[0, 0, 0, strength] for strength in (0.0001, 0.00001, 0.000001)]
)

# Size of the full grid: 4 * 4 * 3 * 10 = 480 combinations.
total_runs = len(learning_rates) * len(batch_sizes) * len(gmf_regs) * len(mlp_regs)
total_runs

270

### Init eval params

In [None]:
# Evaluation settings. `steps` and `rank_at` are the arguments passed to
# `get_metrics` in the scoring cells of this notebook.
from Evaluation import get_metrics
steps = 5
rank_at = 20
# NOTE(review): `sample_len` is not referenced by any cell shown in this
# notebook -- confirm whether it is still needed.
sample_len = 100

## Init store results

In [25]:
# Create an empty results table for the NeuMF grid search (two metric columns
# followed by one column per NeuMF hyper-parameter) and write it to disk.
#
# The target directory is kept in `store_path`, the name the GMF grid search
# uses as well, instead of `path`. `path` is the dataset root read by the
# data-loading cells; overwriting it here meant that re-running those cells
# afterwards looked for the datasets inside the grid-search folder.
# NOTE(review): the GMF section writes to the same directory and file name
# ('GS_test'), so running both sections overwrites one result file with the
# other -- confirm whether separate file names are wanted.
store_path = '../NeuMF_storage/GS/'
gs_result_name = 'GS_test'
storage_columns = ['tot_val_rec', 'val_metrics'] + list(NeuMF_params.keys())
gs_results = pd.DataFrame(columns=storage_columns)
gs_results.to_pickle(store_path + gs_result_name)

NameError: name 'NeuMF_params' is not defined

## Run Grid Search

In [None]:
# Grid search over every NeuMF hyper-parameter combination defined above.
# Each loop level writes one hyper-parameter into `NCF.NeuMF_params`; the model
# is then trained and the validation metrics of the run are collected.
#
# This cell could not run before: it started with a stray `all_` token and the
# innermost loop had no body. The training call mirrors the one used by the
# GMF grid search in this notebook.
# NOTE(review): assumes `train_model` accepts 'NeuMF' as model name and reads
# the updated `NCF.NeuMF_params` on every call -- confirm against NCF.py.
# NOTE(review): the first 500 test rows are used for validation here, as in
# the GMF grid search -- confirm this is intended.
# Writing the results to the grid-search result file is still TODO.
all_val_metrics = []
for learning_rate in learning_rates:
    NCF.NeuMF_params['learning_rate'] = learning_rate

    for batch_size in batch_sizes:
        NCF.NeuMF_params['batch_size'] = batch_size

        for gmf_reg in gmf_regs:
            NCF.NeuMF_params['reg_mf'] = gmf_reg

            for mlp_reg in mlp_regs:
                NCF.NeuMF_params['reg_layers'] = mlp_reg

                val_metrics = NCF.train_model('NeuMF', samples, train_set, test_set.iloc[:500], verbose=1)
                all_val_metrics.append(val_metrics)
                
                
                

---
## NeuMF

### Full set scores

In [None]:
%%time
# Get the NeuMF predictions for the test set via `get_predictions` (the "full set"
# variant; the "sample" variant further down uses `sample_prediction`).
NeuMF_ranked_df_full = NCF.get_predictions('NeuMF', test_set)

In [None]:
# Compute the evaluation metrics for the full-set predictions, using the `steps`
# and `rank_at` values from the evaluation-parameter cell.
from Evaluation import get_metrics
get_metrics(NeuMF_ranked_df_full, steps, rank_at, stats=False)

In [40]:
# Write a metrics table to the results folder.
# NOTE(review): `metrics_all_sample` is not assigned in any cell shown in this
# notebook, and the target file name refers to the am_40k dataset -- confirm
# both before re-running this cell.
metrics_all_sample.to_pickle('../results/NCF/NCF_metrics_all_sample_am_40k')

### Sample scores

In [None]:
%%time
# Get the NeuMF predictions via `sample_prediction`, which receives both the
# training and the test set (the "sample scores" variant).
NeuMF_ranked_df = NCF.sample_prediction('NeuMF', train_set, test_set)

In [None]:
# Compute the evaluation metrics for the sample-based predictions, using the
# `steps` and `rank_at` values from the evaluation-parameter cell.
from Evaluation import get_metrics
get_metrics(NeuMF_ranked_df, steps, rank_at, stats=False)

---
# Grid Search: GMF

## Init

In [98]:
# Create the model wrapper with the GMF hyper-parameters and build the GMF model.
# The import restores `NCF` to the class before it is rebound to the new
# instance; the grid-search cells below use that instance through the same name.
from NCF import NCF
NCF = NCF(total_users, total_items, GMF_params=GMF_params)
NCF.build_GMF_model()

## Parameter Space

In [99]:
# Candidate values for the GMF grid search.
learning_rates = [0.005, 0.001, 0.0005, 0.0001]
batch_sizes = [1024, 128, 256, 512]
# Either no regularisation or the same small strength on both entries of `regs`.
regs = [[strength, strength] for strength in (0, 0.000001)]

# Size of the full grid: 4 * 4 * 2 = 32 combinations.
total_runs = 1
for grid_axis in (learning_rates, batch_sizes, regs):
    total_runs *= len(grid_axis)
total_runs

32

### Init eval params

In [100]:
# Evaluation settings. `steps` and `rank_at` are the arguments passed to
# `get_metrics` in the GMF grid search below.
from Evaluation import get_metrics
steps = 5
rank_at = 20
# NOTE(review): `sample_len` is not referenced by any cell shown in this
# notebook -- confirm whether it is still needed.
sample_len = 100

## Init store results

In [142]:
# Start the GMF grid search from an empty results table on disk: three metric
# columns followed by one column per GMF hyper-parameter.
store_path = '../NeuMF_storage/GS/'
gs_result_name = 'GS_test'
storage_columns = ['test_rec@10', 'val_metrics', 'test_metrics', *GMF_params]
gs_results = pd.DataFrame(columns=storage_columns)
gs_results.to_pickle(store_path + gs_result_name)

## Run Grid Search

In [115]:
from Evaluation import get_metrics

# Grid search over every GMF hyper-parameter combination defined above.
# For each combination the model is trained, scored on the test set, and one
# result row (hyper-parameters + metrics) is added to the result file on disk.
# The file is rewritten after every run so finished runs survive an interruption.
#
# Changes compared to the previous version of this cell:
# * the result row is added with pd.concat; DataFrame.append no longer exists
#   in pandas >= 2.0;
# * recall@10 is looked up through the `rank_at` column instead of the
#   positional `.iloc[2]`, which only pointed at rank 10 for steps=5 / rank_at=20.
#
# NOTE(review): the first 500 rows of the test set are passed to `train_model`
# as validation data -- confirm this is intended rather than a separate
# validation set.
all_val_metrics = []
for learning_rate in learning_rates:
    NCF.GMF_params['learning_rate'] = learning_rate

    for batch_size in batch_sizes:
        NCF.GMF_params['batch_size'] = batch_size

        for reg in regs:
            NCF.GMF_params['regs'] = reg

            val_metrics = NCF.train_model('GMF', samples, train_set, test_set.iloc[:500], verbose=1)
            ranked_df = NCF.sample_prediction('GMF', train_set, test_set)
            test_metrics = get_metrics(ranked_df, steps, rank_at, stats=False)

            # Fails with an IndexError if rank 10 is not among the evaluated
            # ranks, instead of silently storing the recall of another rank.
            test_rec_at_10 = test_metrics.loc[test_metrics['rank_at'] == 10, 'recall'].iloc[0]

            # Copy of the current hyper-parameters extended with the metrics.
            results = dict(NCF.GMF_params,
                **{'test_rec@10': test_rec_at_10,
                'val_metrics': val_metrics,
                'test_metrics': test_metrics})

            stored_results = pd.read_pickle(store_path + gs_result_name)
            updated_results = pd.concat([stored_results, pd.DataFrame([results])], ignore_index=True)
            updated_results.to_pickle(store_path + gs_result_name)

            all_val_metrics.append(val_metrics)
                


Fitting GMF with parameters:
learning_rate                                            0.005
batch_size                                                1024
nolf                                                         8
regs                                                    [0, 0]
epochs                                                       2
sample_size                                             994169
num_neg                                                      4
ckpt_dir         ../NeuMF_storage/GMF_ckpts_ml_1m_nolf_8/ckpts
optimizer                                                 Adam
Name: 0, dtype: object
Epoch: 0
Train on 4970845 samples


100% |########################################################################|


[   rank_at  hitcounts  recall  precision      ndcg
0        1         34   0.068   0.068000  0.068000
1        5        127   0.254   0.050800  0.161353
2       10        191   0.382   0.038200  0.202466
3       15        257   0.514   0.034267  0.237340
4       20        291   0.582   0.029100  0.253399]
Epoch: 1
Train on 4970845 samples


100% |########################################################################|


[   rank_at  hitcounts  recall  precision      ndcg
0        1         42   0.084     0.0840  0.084000
1        5        120   0.240     0.0480  0.159831
2       10        201   0.402     0.0402  0.211907
3       15        246   0.492     0.0328  0.235828
4       20        300   0.600     0.0300  0.261302]


100% |########################################################################|


TypeError: unhashable type: 'dict'

In [141]:
# Display the grid-search results stored on disk so far.
pd.read_pickle(store_path + gs_result_name)

Unnamed: 0,test_rec@10,val_metrics,learning_rate,batch_size,nolf,regs,epochs,sample_size,num_neg,ckpt_dir,optimizer,test_metrics
0,0.395033,"[[rank_at, hitcounts, recall, precision, ndcg]...",0.005,1024,8,"[0, 0]",2,994169,4,../NeuMF_storage/GMF_ckpts_ml_1m_nolf_8/ckpts,Adam,rank_at hitcounts recall precision ...
