In [56]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import time
import os
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
# Report the TensorFlow build in use and whether it can see a GPU.
tf_version = tf.__version__
gpu_ready = tf.test.is_gpu_available()
print('version:', tf_version, '\t\tgpu available:', gpu_ready)

version: 2.0.0 		gpu available: True


# Read Data
- all datasets are datetime sorted

In [58]:
# Earlier machine-specific absolute locations, kept for reference only (not used):
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# Directory holding the pickled datasets, relative to the notebook's working directory.
# The trailing slash matters: the load cell concatenates this directly with `file_name`.
data_path = '../data/'

## Amazon Fashion 

In [59]:
# file_name = 'Amazon_full' 
# file_name = 'Amazon_05_users' 
# file_name = 'Amazon_01_users'
# file_name = 'Amazon_005_users'
# file_name = 'Amazon_001_users'

## MovieLens 

In [60]:
# Pick the MovieLens pickle to load; exactly one `file_name` assignment should be active.
# Presumably the suffix encodes the fraction of users sampled -- TODO confirm.
# file_name = 'ML_full' 
# file_name = 'ML_05_users'
file_name = 'ML_01_users'
# file_name = 'ML_005_users'
# file_name = 'ML_001_users'

In [61]:
# Load the selected interaction table from disk and preview its first rows.
dataset_file = data_path + file_name
df = pd.read_pickle(dataset_file)
df.head()

Unnamed: 0,user,item,rating,datetime,item_id,user_id
18590190,120461,2501,5.0,2000-04-25 02:29:35,2410,120460
18590032,120461,252,4.0,2000-04-25 02:29:35,249,120460
18590159,120461,2069,4.0,2000-04-25 02:29:35,1980,120460
18590048,120461,440,4.0,2000-04-25 02:29:35,435,120460
18590145,120461,1959,4.0,2000-04-25 02:29:35,1870,120460


# Data Prep
Create new ids for users and items that match the row and column indices of the user-item interaction matrix

In [62]:
# Batching and hold-out settings passed to train_val_test_split below.
BATCH_SIZE = 32

# Validation and test use the same settings.
val_perc = 0.1
test_perc = 0.1
n_last_items_val = 1
n_last_items_test = 1

## Train Test Split

In [63]:
from Data_prep import train_val_test_split

# Partition the interactions into train / validation / test sets. The helper
# also returns the user and item totals that size the model later on.
split_result = train_val_test_split(
    df, BATCH_SIZE, val_perc, test_perc, n_last_items_val, n_last_items_test
)
total_users, total_items, train_set, val_set, test_set = split_result

Total number of items: 27387
Total users: 16254
Number of train users: 16224
Number of test users: 1632
Number of validation users: 1632 

Users deleted: 30


## Option 1: Variable Sequence Length

In [64]:
# Sequence-length limit handed to min_padding in the next cell.
max_seq_len = 500
# Forwarded to get_x_y_sequences; presumably the offset between each input
# sequence and its target sequence -- TODO confirm in Data_prep.
shift_targets_by = 1

In [65]:
from Data_prep import get_x_y_sequences, min_padding
vsl = True  # Selects the variable-sequence-length branch in the training cells below

# Train Set: build input/target sequences, then pad them with min_padding.
user_sequences_x, user_sequences_y, user_order = get_x_y_sequences(train_set, shift_targets_by)
padded_sequences_x = min_padding(user_sequences_x, BATCH_SIZE, max_seq_len)
padded_sequences_y = min_padding(user_sequences_y, BATCH_SIZE, max_seq_len)

# Val Set: same preparation, with the statistics print-out turned off.
# The third return value gets its own name so the training-set `user_order`
# above is not silently overwritten by the validation ordering.
user_sequences_val_x, user_sequences_val_y, user_order_val = get_x_y_sequences(val_set, shift_targets_by, stats=False)
padded_sequences_val_x = min_padding(user_sequences_val_x, BATCH_SIZE, max_seq_len)
padded_sequences_val_y = min_padding(user_sequences_val_y, BATCH_SIZE, max_seq_len)

Number of sequences x: 16224 
Avg sequence length x: 150.29357741617358 
Std_dev sequence length x: 242.54 
Median of sequence length x: 71.0


## Option 2: Fixed Sequence Length

In [20]:
# Fixed length handed to standard_padding in the next cell.
# Note this rebinds the same `max_seq_len` name used by the Option 1 cells.
max_seq_len = 250
# Forwarded to get_x_y_sequences; presumably the offset between each input
# sequence and its target sequence -- TODO confirm in Data_prep.
shift_targets_by = 1

In [21]:
from Data_prep import get_x_y_sequences, standard_padding
vsl = False  # Selects the fixed-sequence-length branch in the training cells below

# --- Train set: sequences -> padded datasets -> zipped (x, y) batches ---
user_sequences_x, user_sequences_y, median = get_x_y_sequences(
    train_set, shift_targets_by, ordered=False
)
sequences_data_x = standard_padding(user_sequences_x, max_seq_len)
sequences_data_y = standard_padding(user_sequences_y, max_seq_len, stats=False)
dataset = tf.data.Dataset.zip((sequences_data_x, sequences_data_y)).batch(
    BATCH_SIZE, drop_remainder=False
)

# --- Validation set: same pipeline with statistics print-outs disabled ---
# NOTE: `median` is rebound here to the value returned for the validation set.
user_sequences_val_x, user_sequences_val_y, median = get_x_y_sequences(
    val_set, shift_targets_by, ordered=False, stats=False
)
sequences_data_val_x = standard_padding(user_sequences_val_x, max_seq_len, stats=False)
sequences_data_val_y = standard_padding(user_sequences_val_y, max_seq_len, stats=False)
val_dataset = tf.data.Dataset.zip((sequences_data_val_x, sequences_data_val_y)).batch(
    BATCH_SIZE, drop_remainder=False
)

Number of sequences x: 16224 
Avg sequence length x: 150.29357741617358 
Std_dev sequence length x: 242.54 
Median of sequence length x: 71.0
number of sequences: 16224 
avg sequence length: 250.0 
std_dev sequence length: 0.0


---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks
- paper: https://arxiv.org/pdf/1608.07400.pdf
- code: https://github.com/rdevooght/sequence-based-recommendations (in Theano)

## Architecture

In [66]:
def build_model(total_items, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense network.

    Args:
        total_items: Size of the embedding vocabulary and of the output layer.
        embedding_dim: Width of each item embedding vector.
        rnn_units: Number of LSTM units.
        batch_size: Batch dimension fixed in the input shape; the time
            dimension is left as None.

    Returns:
        An uncompiled tf.keras.Sequential model. The final Dense layer has no
        activation, so it emits one raw score per item at every time step.
    """
    item_embedding = tf.keras.layers.Embedding(
        total_items,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # stateful=False: cell states are reset with each batch.
    recurrent = tf.keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,
        stateful=False,
        recurrent_initializer='glorot_uniform',
    )
    item_scores = tf.keras.layers.Dense(total_items)

    return tf.keras.Sequential([item_embedding, recurrent, item_scores])

## Build Model

In [67]:
# Model hyper-parameters passed to build_model.
embedding_dim = 100  # width of each item embedding vector
rnn_units = 20       # number of LSTM units

In [68]:
# Instantiate the network with the training batch size fixed in its input shape.
model = build_model(total_items, embedding_dim, rnn_units, BATCH_SIZE)

## Add Loss
- **Added one hot encoding of the labels to match logits output after dense layer**

In [69]:
# def recall(y_true, y_pred):
#     K = tf.keras.backend
#     y_true = K.one_hot(tf.dtypes.cast(y_true, tf.int32), total_items)
#     y_true = K.ones_like(y_true) 
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
#     recall = true_positives / (all_positives + K.epsilon())
#     return recall

In [70]:
def loss(labels, logits):
    """Per-time-step softmax cross-entropy between item ids and model logits.

    Gives the same values as one-hot encoding `labels` to depth `total_items`
    and applying categorical cross-entropy. The sparse form avoids building a
    dense (batch, seq_len, total_items) one-hot tensor on every step.

    Args:
        labels: Tensor of target item ids; cast to int32 before use.
        logits: Unnormalised scores whose last axis has size `total_items`.

    Returns:
        Tensor of losses with the shape of `labels` (one value per time step).
    """
    item_ids = tf.dtypes.cast(labels, tf.int32)

    # tf one_hot turns an id outside [0, total_items) into an all-zero row, so
    # such positions contributed exactly 0 to the original loss. Keep that by
    # substituting a valid id for the lookup and zeroing the result afterwards.
    in_range = tf.logical_and(item_ids >= 0, item_ids < total_items)
    safe_ids = tf.where(in_range, item_ids, tf.zeros_like(item_ids))

    per_step = tf.keras.losses.sparse_categorical_crossentropy(
        safe_ids, logits, from_logits=True)
    return per_step * tf.dtypes.cast(in_range, per_step.dtype)

In [71]:
# Compile with the Adagrad optimizer and the custom `loss` defined above.
# The trailing comment is a disabled `recall` metric, whose definition is also commented out.
model.compile(optimizer='Adagrad', loss=loss)#, metrics=[recall])

## Summary

In [72]:
# Print the layer-by-layer shapes and parameter counts of the compiled model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (32, None, 100)           2738700   
_________________________________________________________________
lstm_3 (LSTM)                (32, None, 20)            9680      
_________________________________________________________________
dense_3 (Dense)              (32, None, 27387)         575127    
Total params: 3,323,507
Trainable params: 3,323,507
Non-trainable params: 0
_________________________________________________________________


---
# Train Model

### Configure Checkpoints

In [76]:
# Checkpoint directory, named after the dataset with a '_vsl' suffix in
# variable-sequence-length mode.
# NOTE(review): both modes share the 'ckpts_vsl' prefix and there is no
# separator before the dataset name -- confirm this naming is intended.
_ckpt_suffix = '_vsl' if vsl else ''
checkpoint_dir = '../ckpts/ckpts_vsl' + file_name + _ckpt_suffix

# Checkpoint file prefix. It contains no epoch placeholder, so every save
# goes to the same path.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

# Save weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

## Fit Model

In [77]:
# Number of passes over the training data.
epochs = 200

# Batches per epoch as a whole number. Ceiling division counts a final partial
# batch, which matches the fixed-length pipeline's drop_remainder=False.
n_batches = -(-len(user_sequences_x) // BATCH_SIZE)
print('#Batches:', n_batches)
print('Batch size:', BATCH_SIZE)

#Batches: 507.0
Batch size: 32


In [None]:
# Train on whichever input pipeline `vsl` selects; both branches checkpoint via
# `checkpoint_callback` and keep the returned History object in `history`.
# Validation data is prepared above but deliberately not passed to either call.
if not vsl:
    print('Fitting LSTM with Fixed sequence length')
    history = model.fit(
        dataset,
        epochs=epochs,
        callbacks=[checkpoint_callback],
    )
else:
    print('Fitting LSTM with Variable sequence length')
    from Helpers import Batch_Generator

    train_batches = Batch_Generator(
        X=padded_sequences_x,
        y=padded_sequences_y,
        batch_size=BATCH_SIZE,
        shuffle=True,
    )
    # To validate during training, pass a second Batch_Generator over
    # padded_sequences_val_x / padded_sequences_val_y as validation_data.
    # NOTE(review): use_multiprocessing=True is only safe if Batch_Generator is
    # a keras.utils.Sequence -- confirm in Helpers.
    history = model.fit_generator(
        generator=train_batches,
        epochs=epochs,
        callbacks=[checkpoint_callback],
        use_multiprocessing=True,
    )

Fitting LSTM with Variable sequence length
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200

In [None]:
# Display which dataset the model above was trained on.
file_name

In [None]:
# Persist the training history as a pickled DataFrame, one file per mode.
# NOTE(review): the file names hard-code the dataset ('01_ml_users') and epoch
# count ('200_epochs') rather than using `file_name` / `epochs`, so changing
# either setting still writes to these same paths.
results = pd.DataFrame(history.history)
history_path = (
    '../results/his_vsl_01_ml_users_200_epochs'
    if vsl
    else '../results/his_01_ml_users_200_epochs'
)
results.to_pickle(history_path)

---
## Continue training from checkpoint

In [None]:
# Show the architecture of the model currently in memory before resuming training.
model.summary()

In [None]:
# Rebuild the network and restore the most recent weights so training can continue.
latest_ckpt = tf.train.latest_checkpoint(checkpoint_dir)
if latest_ckpt is None:
    # latest_checkpoint returns None when the directory holds no checkpoint;
    # fail with a clear message instead of passing None to load_weights.
    raise FileNotFoundError('No checkpoint found in ' + checkpoint_dir)

# `total_items` is the item count returned by train_val_test_split; `n_items`
# is not defined by any visible cell. The batch size follows the training
# pipeline instead of a separate hard-coded value.
model = build_model(total_items, embedding_dim, rnn_units, batch_size=BATCH_SIZE)
model.load_weights(latest_ckpt)
model.compile(optimizer='Adagrad', loss=loss)

In [None]:
# Number of extra epochs to run when resuming from the restored checkpoint.
# The name's spelling is kept because the next cell references it as-is.
aditional_epochs = 1

In [None]:
# Resume training on the pipeline that `vsl` selects, mirroring the main
# training cell. `dataset` only exists when the fixed-length cells were run,
# so fitting it unconditionally would ignore the variable-length mode.
if vsl:
    from Helpers import Batch_Generator
    history_resumed = model.fit_generator(
        generator=Batch_Generator(
            X=padded_sequences_x,
            y=padded_sequences_y,
            batch_size=BATCH_SIZE,
            shuffle=True),
        epochs=aditional_epochs,
        callbacks=[checkpoint_callback],
        use_multiprocessing=True)
else:
    history_resumed = model.fit(
        dataset,
        epochs=aditional_epochs,
        callbacks=[checkpoint_callback])

---
# Predict Sequences

## Restore Latest Checkpoints
- **TODO: Can keep batches of 64 for evaluation => faster**

In [None]:
# Display the path of the newest checkpoint in `checkpoint_dir` (the weights restored in the next cell).
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# Rebuild the same architecture with a batch dimension of 1, restore the latest
# trained weights, and fix the input shape.
latest_weights = tf.train.latest_checkpoint(checkpoint_dir)
single_sequence_shape = tf.TensorShape([1, None])

model = build_model(total_items, embedding_dim, rnn_units, batch_size=1)
model.load_weights(latest_weights)
model.build(single_sequence_shape)

In [None]:
# Confirm the restored inference model has the expected layer shapes.
model.summary()

## Create Predictions
- **TODO: Can keep batches of 64 for evaluation => faster**

Using train_set sequences to predict test_set / val_set item(s)

In [None]:
from Evaluation import get_predictions

In [None]:
# Display the dataset name that is appended to the prediction file names below.
file_name

### Val Set Predictions

In [None]:
# Produce validation-set predictions from the train-set sequences and pickle
# them under ../results.
# NOTE(review): the file name always contains 'vsl_100', whatever `vsl` and the
# embedding size are -- confirm that is intended.
rank_at = 20
val_preds_path = '../results/preds_val_CFRNN_vsl_100_' + file_name
preds_val = get_predictions(model, train_set, val_set, rank_at)
preds_val.to_pickle(val_preds_path)

### Test Set Predictions

In [None]:
# Produce test-set predictions from the train-set sequences and pickle them
# under ../results.
# NOTE(review): the file name always contains 'vsl_100', whatever `vsl` and the
# embedding size are -- confirm that is intended.
rank_at = 20
test_preds_path = '../results/preds_test_CFRNN_vsl_100_' + file_name
preds_test = get_predictions(model, train_set, test_set, rank_at)
preds_test.to_pickle(test_preds_path)

#### Read Predictions

In [None]:
# predictions = pd.read_pickle('CFRNN_res_200_ML_01_users')

---
# Evaluate

In [None]:
from Evaluation import get_metrics

### Val Set Metrics

In [None]:
# Score the validation predictions with get_metrics and display the result.
# NOTE(review): the positional 5 and 20 are not explained here -- presumably the
# rank step and the maximum rank; confirm in Evaluation.get_metrics.
metrics_val_set_vsl = get_metrics(preds_val, 5, 20)
metrics_val_set_vsl

In [None]:
# Pickle the validation metrics; the file name is suffixed with the dataset name.
metrics_val_set_vsl.to_pickle('../results/metrics_val_CFRNN_vsl_' + file_name)

### Test Set Metrics

In [None]:
# Score the test predictions with get_metrics and display the result.
# NOTE(review): the positional 5 and 20 are not explained here -- presumably the
# rank step and the maximum rank; confirm in Evaluation.get_metrics.
metrics_test_set_vsl = get_metrics(preds_test, 5, 20)
metrics_test_set_vsl

In [None]:
# Pickle the test metrics; the file name is suffixed with the dataset name.
metrics_test_set_vsl.to_pickle('../results/metrics_test_CFRNN_vsl_' + file_name)

In [None]:
# Execute the companion notebook CF_RNN_fixed.ipynb in this kernel's namespace.
# NOTE(review): presumably the fixed-sequence-length variant of this experiment -- confirm.
%run CF_RNN_fixed.ipynb

---
### Older Metrics (to be removed)

In [53]:
# Stale cell: `predictions` is not assigned by any visible cell (the read cell
# above is commented out), so this fails on a fresh kernel.
# It also rebinds `metrics_test_set_vsl`, which the Evaluate section computes from `preds_test`.
metrics_test_set_vsl = get_metrics(predictions, 5, 20)
metrics_test_set_vsl

Obtaining metrics time: 1.21


Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,0,0.0,0.0
1,5,7,0.004207,0.000841
2,10,12,0.007212,0.000721
3,15,21,0.01262,0.000841
4,20,25,0.015024,0.000751


In [54]:
# Stale cell: saves whatever `metrics_val_set_vsl` currently holds to a
# hard-coded '200_e' file name; the recomputation line is commented out.
# metrics_val_set_vsl = get_metrics(predictions, 5, 20)
metrics_val_set_vsl.to_pickle('../results/metrics_val_set_vsl_200_e')

In [62]:
# Stale cell: depends on `predictions`, which no visible cell assigns.
metrics_test_set = get_metrics(predictions, 5, 20)
metrics_test_set

Obtaining metrics time: 1.28


Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1,0.000601,0.000601
1,5,8,0.004808,0.000962
2,10,10,0.00601,0.000601
3,15,12,0.007212,0.000481
4,20,13,0.007812,0.000391


In [91]:
# Stale cell: depends on `predictions`, which no visible cell assigns.
metrics_val_set = get_metrics(predictions, 5, 20)

Obtaining metrics time: 1.15


In [55]:
# metrics_val_set
# NOTE(review): the saved output of this cell is a NameError for
# `metrics_val_set`. Its execution count (55) is lower than that of the cell
# defining the name (91), so it was run out of order.
metrics_val_set.to_pickle('../results/metrics_val_set_200_e')

NameError: name 'metrics_val_set' is not defined

In [50]:
# Stale cell: displays a `metrics` variable that no visible cell assigns.
metrics

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,2,0.001202,0.001202
1,5,8,0.004808,0.000962
2,10,14,0.008413,0.000841
3,15,15,0.009014,0.000601
4,20,17,0.010216,0.000511


# Appendix

In [None]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)

In [None]:
# One hot encoded input
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 