In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import os
%load_ext autoreload
%autoreload 2
# `tf.test.is_gpu_available()` is deprecated; listing the physical GPU devices
# is the supported way to check whether TensorFlow can see a GPU.
gpu_available = bool(tf.config.experimental.list_physical_devices('GPU'))
print('TF version:', tf.__version__ , '\nGPU available:', gpu_available)

TF version: 2.0.0 
GPU available: True


# Read Data
- all datasets are datetime sorted

In [2]:
# Root directory that later cells prepend to the dataset and results locations.
# The commented-out lines are alternative, machine-specific roots.
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '../' # Paperspace

## Amazon Fashion 

In [3]:
# Dataset folder (relative to `path`) and the name of the pickle file to load.
# The first two characters of `file_name` are reused later as the results tag.
data_path = 'datasets/' # Paperspace
# data_path = 'Data/Am/'
file_name = 'Amazon_01_users'

## MovieLens 

In [4]:
# data_path = 'datasets/' # Paperspace
# data_path = 'Data/ML/'
# file_name = 'ML_05_users'
# file_name = 'ML_01_users'

In [5]:
# Read the pickled interactions frame for the selected dataset.
df = pd.read_pickle(''.join((path, data_path, file_name)))

# Encode the raw item / user identifiers as integer category codes.
for raw_col, code_col in (('item', 'item_id'), ('user', 'user_id')):
    df[code_col] = df[raw_col].astype('category').cat.codes

df.head()

Unnamed: 0,user,item,datetime,rating,item_id,user_id
4983863,A39ZLL8ILVT2J8,B00FXSELCM,2014-03-24,3.0,104506,73226
7294092,A39ZLL8ILVT2J8,B00VDPQ884,2016-06-29,5.0,175639,73226
4809981,A39ZLL8ILVT2J8,B00EWC0W3W,2016-08-14,5.0,99224,73226
9337932,A39ZLL8ILVT2J8,B01EZKMD64,2016-10-03,5.0,238824,73226
8832820,A39ZLL8ILVT2J8,B01ABS4646,2016-12-22,5.0,222085,73226


# Data Prep

## Init

In [6]:
# Number of distinct item codes; reused below as the model's item count and as the pad id.
total_items = df['item_id'].nunique()

In [13]:
# Two-letter dataset tag taken from the file name; it selects the results folder.
res_ext = file_name[0:2]
# Load the table of previously stored models for this dataset.
all_models = pd.read_pickle('{}results/{}/all_models'.format(path, res_ext))

### New Model?

In [14]:
# Next model id = largest existing numeric prefix + 1, followed by the dataset tag.
# The previous `model_id.max()[0]` compared ids as strings and then kept only
# the first character, so it produced wrong ids once the counter reached 10.
# Assumes stored ids follow the '<int>_<tag>' format built in this notebook.
_existing_numbers = all_models.model_id.astype(str).str.split('_').str[0].astype(int)
new_model_id = str(int(_existing_numbers.max()) + 1) + '_' + res_ext

In [7]:
# Start a fresh series of models: the id becomes '0_<dataset tag>'.
# NOTE(review): this overwrites any `new_model_id` derived from `all_models`
# in the preceding cell — run only one of the two cells.
res_ext = file_name[:2]
new_model_id = '_'.join(['0', res_ext])

In [8]:
params = {
    # Bookkeeping fields (both counters start at 0 here)
    'model_id': new_model_id,
    'train_time': 0,
    'epochs': 0,

    # Training hyper-parameters
    'BATCH_SIZE': 32,
    'learning_rate': 0.1,
    'delta': 0.2,               # Diversity bias
    'max_seq_len': 15,          # Max sequence length (original note "71=median": presumably the median sequence length — TODO confirm)

    # Data split
    'val_perc': 0.1,            # Share of users from df in val and test set
    'test_perc': 0.1,
    'n_items_val': 0,           # Number of last (chronologically) items in val and test set
    'n_items_test': 1,

    # Sequence encoding
    'pad_value': total_items,   # Pad id is total_items itself (one past the largest item code), so item 0 stays a real item; it is passed to the model as mask value
    'shift_targets_by': 1,
}

In [9]:
# Expose the entries of `params` as plain names for the cells below.
BATCH_SIZE, learning_rate, delta, max_seq_len = (
    params[key] for key in ('BATCH_SIZE', 'learning_rate', 'delta', 'max_seq_len')
)

val_perc, test_perc, n_items_val, n_items_test = (
    params[key] for key in ('val_perc', 'test_perc', 'n_items_val', 'n_items_test')
)

pad_value, shift_targets_by = params['pad_value'], params['shift_targets_by']

## Train Test Split

In [10]:
from Data_prep import train_val_test_split

# Split the interactions into train / validation / test parts; the call also
# returns the left-out items for the validation and test parts.
data_split = train_val_test_split(
    df, val_perc, test_perc, n_items_val, n_items_test, seqs=True
)

(train_set,
 val_set, val_left_out_items,
 test_set, test_left_out_items) = data_split

## Create Datasets

In [11]:
from Data_prep import create_seq_batch_dataset

# Settings shared by the train and validation dataset builders.
seq_dataset_kwargs = dict(
    shift=shift_targets_by,
    max_seq_len=max_seq_len,
    pad_value=pad_value,
    batch_size=BATCH_SIZE,
    stats=False,
    drop_remainder=True,
)

# Train set
train_dataset = create_seq_batch_dataset(df=train_set, **seq_dataset_kwargs)

# Validation set
val_dataset = create_seq_batch_dataset(df=val_set, **seq_dataset_kwargs)

# #Test Set
# test_dataset = create_seq_batch_dataset(df=test_set, 
#                                        shift=shift_targets_by, 
#                                        max_seq_len=max_seq_len, 
#                                        pad_value=pad_value, 
#                                        batch_size=BATCH_SIZE, 
#                                        stats=False,
#                                        drop_remainder=True)

---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks
- paper: https://arxiv.org/pdf/1608.07400.pdf
- code: https://github.com/rdevooght/sequence-based-recommendations (in Theano)

## Build Model

In [12]:
# Sizes passed to build_LSTM_model: item embedding width and recurrent units.
embedding_dim, rnn_units = 100, 20

In [13]:
from Models import build_LSTM_model

# The pad id (== total_items) is handed over as the mask value.
# NOTE(review): the summary printed further down shows an embedding with one
# more row than the dense output has units — presumably the builder adds the
# extra padding row itself; confirm in Models.py.
model = build_LSTM_model(
    total_items=total_items,
    embedding_dim=embedding_dim,
    mask_value=pad_value,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

## Add Custom Metric=Recall and Loss=Diversity Bias Loss

In [14]:
from Evaluation import recall_metric, diversity_bias_loss, create_diversity_bias

In [15]:
# Diversity-bias term built from the training set, the item count and `delta`;
# it is passed to the custom loss as `db` in the next cell.
diversity_bias = create_diversity_bias(train_set, total_items, delta)

In [16]:
# `learning_rate` is the documented argument name; `lr` is only a deprecated
# alias that newer Keras releases no longer accept.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)

# Custom loss and metric created by the factories imported from Evaluation.
loss = diversity_bias_loss(db=diversity_bias, total_items=total_items)
metrics = [recall_metric(total_items=total_items)]

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=metrics)

## Summary

In [17]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, None, 100)           24746600  
_________________________________________________________________
masking (Masking)            (32, None, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (32, None, 20)            9680      
_________________________________________________________________
dense (Dense)                (32, None, 247465)        5196765   
Total params: 29,953,045
Trainable params: 29,953,045
Non-trainable params: 0
_________________________________________________________________


---
# Train Model

## Configure Callbacks

In [18]:
ext = '_{}'.format(file_name[:2]) #ML or Am
# directory = './ckpts/ckpts'
directory = '../ckpts/ckpts'

# One checkpoint directory per model id
checkpoint_dir = '{}_{}'.format(directory, params['model_id'])

# Common prefix of the checkpoint files inside that directory
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

In [19]:
from Helpers import TimingCallback

# Checkpointing and early stopping both watch the validation recall and treat
# larger values as better.
monitor_kwargs = {'monitor': 'val_recall', 'mode': 'max'}

# Keep only the weights of the best epoch so far.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_best_only=True,
    save_weights_only=True,
    **monitor_kwargs
)

# Stop after 15 epochs without an improvement of at least 0.0001.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    min_delta=0.0001,
    patience=15,
    **monitor_kwargs
)

timing_callback = TimingCallback()

callbacks = [checkpoint_callback, early_stopping_callback, timing_callback]

## Fit Model

In [21]:
# Upper bound on training epochs (early stopping may end training sooner).
epochs = 100

# Report how many batches one pass over the training dataset contains.
n_batches = tf.data.experimental.cardinality(train_dataset).numpy()
print('#Batches: {}'.format(n_batches))
print('Batch size: {}'.format(BATCH_SIZE))

#Batches: 3034
Batch size: 32


In [None]:
print('Fitting LSTM with max sequence length: {}'.format(max_seq_len))

# Train with validation after each epoch; the callbacks handle checkpointing,
# early stopping and timing.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=epochs,
    callbacks=callbacks,
)

Fitting LSTM with max sequence length: 15
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
 214/3034 [=>............................] - ETA: 8:07 - loss: 1.2666 - recall: 0.0707

## Plot Training Loss, Metrics

In [None]:
# `History.history` is a dict attribute (metric name -> list of per-epoch
# values), not a method; calling it raised "TypeError: 'dict' object is not callable".
results = history.history

# Training vs. validation loss
plt.plot(results['loss'], label='loss')
plt.plot(results['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.legend()
plt.show()

# Training vs. validation recall
plt.plot(results['recall'], label='recall')
plt.plot(results['val_recall'], label='val_recall')
plt.xlabel('epoch')
plt.legend()
plt.show()

---
## Continue training from checkpoint

### Load Weights

In [20]:
from Models import build_LSTM_model

# Rebuild the training architecture so the stored weights can be loaded into it.
model = build_LSTM_model(total_items = total_items,
                         embedding_dim = embedding_dim,
                         mask_value = pad_value,
                         rnn_units = rnn_units,
                         batch_size = BATCH_SIZE)

# `tf.train.latest_checkpoint` returns None when the directory holds no
# checkpoint; fail with a clear message instead of passing None on.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
if latest_ckpt is None:
    raise FileNotFoundError('No checkpoint found in ' + checkpoint_dir)

model.load_weights(latest_ckpt)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f9ea2d03f60>

### Compile

In [21]:
from Evaluation import recall_metric, diversity_bias_loss, create_diversity_bias

# Recreate the diversity-bias term used by the custom loss.
diversity_bias = create_diversity_bias(train_set, total_items, delta)

# `learning_rate` is the documented argument name; `lr` is only a deprecated
# alias that newer Keras releases no longer accept.
optimizer = tf.keras.optimizers.Adagrad(learning_rate=learning_rate)
loss = diversity_bias_loss(db=diversity_bias, total_items=total_items)
metrics = [recall_metric(total_items=total_items)]

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=metrics)

In [24]:
# Epoch to resume from and the final epoch number, both passed to `model.fit`.
# NOTE(review): 57 is hard-coded — adjust it to the epoch of the checkpoint being resumed.
initial_epoch, total_epochs = 57, 100

### Continue Training

In [None]:
# Resume training at `initial_epoch` and run up to `total_epochs`.
# Note that `history` is rebound here, so it only holds the epochs of this call.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=total_epochs,
    callbacks=callbacks,
    initial_epoch=initial_epoch,
)

Epoch 58/100
    144/Unknown - 33s 228ms/step - loss: 1.2349 - recall: 0.0897

---
# Predict Sequences

## Restore Latest Checkpoints

In [None]:
from Models import build_LSTM_model
model = build_LSTM_model(total_items = total_items, 
                         embedding_dim = embedding_dim,
                         mask_value = pad_value,
                         rnn_units = rnn_units,
                         batch_size = 1,
                         return_sequences=False)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

print('Latest checkpoint from:', tf.train.latest_checkpoint(checkpoint_dir))

## Predict

In [None]:
# Ranking depth passed to get_predictions and get_metrics below.
rank_at = 20

In [None]:
from Evaluation import get_predictions
# Predict for the test users; the left-out test items are passed along with
# the test set and the ranking depth.
preds_df = get_predictions(model, test_set, test_left_out_items, rank_at)

In [None]:
# Display the predictions returned by get_predictions.
preds_df

### Load Predictions

In [None]:
# predictions = pd.read_pickle('CFRNN_res_200_ML_01_users')

---
# Evaluate

In [None]:
from Evaluation import get_metrics

In [None]:
# Second argument of get_metrics. The stored validation table at the end of the
# notebook lists ranks 1, 5, 10, 15, 20 for get_metrics(..., 5, 20), so this is
# presumably the spacing between reported ranks — TODO confirm in Evaluation.py.
steps = 5

In [None]:
# Compute the evaluation table for the test predictions and display it.
metrics_test = get_metrics(preds_df, steps, rank_at)
metrics_test

---
# Store Results

In [None]:
from Models import store_LSTM_model

# store_path = path + 'results/CFRNN/' + res_ext + '/all_models'
store_path = '{}results/{}/all_models'.format(path, res_ext)

# Sum of the values collected by the timing callback
# (presumably per-epoch durations — confirm in Helpers.py).
train_time = np.sum(timing_callback.logs)

# Copies are passed so the stored record cannot alias the live objects.
all_models = store_LSTM_model(
    store_path,
    params.copy(),
    history.history.copy(),
    train_time,
    metrics_test,
    store=True,
)

In [None]:
# Display the table returned by store_LSTM_model.
all_models

### Val Set Metrics ML

In [36]:
# NOTE(review): `preds_val` is not defined in any visible cell of this notebook,
# so this cell fails on a fresh kernel. It was presumably produced by calling
# get_predictions on the validation set in a since-removed cell — TODO restore.
# The literals 5 and 20 equal the `steps` and `rank_at` values set earlier.
metrics_val_set_vsl = get_metrics(preds_val, 5, 20)
metrics_val_set_vsl

Obtaining metrics time: 1.12


Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1,0.000601,0.000601
1,5,8,0.004808,0.000962
2,10,15,0.009014,0.000901
3,15,21,0.01262,0.000841
4,20,27,0.016226,0.000811


# Appendix

In [None]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)

In [None]:
# One hot encoded input
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 

In [None]:
# max_seq_len = 30
# min_seq_len = 10
# shift_targets_by = 1

# from Data_prep import get_x_y_sequences, min_padding
# vsl = True # Set for training later

# # Train Set
# user_sequences_x, user_sequences_y, user_order = get_x_y_sequences(train_set, shift_targets_by)
# padded_sequences_x = min_padding(user_sequences_x, BATCH_SIZE, min_seq_len, max_seq_len)
# padded_sequences_y = min_padding(user_sequences_y, BATCH_SIZE, min_seq_len, max_seq_len)

# # Val Set 
# user_sequences_val_x, user_sequences_val_y, user_order = get_x_y_sequences(val_set, shift_targets_by, stats=False)
# padded_sequences_val_x = min_padding(user_sequences_val_x, BATCH_SIZE, min_seq_len, max_seq_len)
# padded_sequences_val_y = min_padding(user_sequences_val_y, BATCH_SIZE, min_seq_len, max_seq_len)

In [None]:
# test_set_u_i = test_set.groupby('user_id')['item_id'].apply(list)
# test_X = []
# test_y = []
# all_predictions = []
# for user_items in test_set_u_i:
#     test_X.append(user_items[-200:-1])
#     test_y.append(user_items[-1:])

# for i, seq in enumerate(test_X): 
#     seq = seq.copy()
#     predictions = []
#     for i in range(20):
#         pred_item_id = model.predict_classes(np.array([seq,]), batch_size=1)[0]
#         seq.append(pred_item_id)
#         predictions.append(pred_item_id)
#     all_predictions.append(predictions)
    
# predictions_df = pd.DataFrame(list(zip(test_set.user_id.unique(), all_predictions, test_y)),
#                               columns=['user', 'pred_items_ranked', 'true_id'])