In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import sparse
import os
import random

In [2]:
print('version:', tf.__version__, '\t\tgpu available:', tf.test.is_gpu_available())

version: 2.0.0 		gpu available: True


# Read Data
- all datasets are datetime sorted

In [3]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
data_path = 'data/'

## Amazon Fashion 

In [4]:
# file_name = 'Amazon_full' 
# file_name = 'Amazon_05_users' 
# file_name = 'Amazon_01_users'
# file_name = 'Amazon_005_users'
# file_name = 'Amazon_001_users'

## MovieLens 

In [5]:
# file_name = 'ML_full' 
# file_name = 'ML_05_users'
file_name = 'ML_01_users'
# file_name = 'ML_005_users'
# file_name = 'ML_001_users'

In [6]:
df = pd.read_pickle(data_path + file_name)
df.head()

Unnamed: 0,user,item,rating,datetime,item_id,user_id
18590190,120461,2501,5.0,2000-04-25 02:29:35,2410,120460
18590032,120461,252,4.0,2000-04-25 02:29:35,249,120460
18590159,120461,2069,4.0,2000-04-25 02:29:35,1980,120460
18590048,120461,440,4.0,2000-04-25 02:29:35,435,120460
18590145,120461,1959,4.0,2000-04-25 02:29:35,1870,120460


# Data Prep
Create new ids for users and items that match the row and column indices of the user-item interaction matrix

### Data Sizes

In [8]:
BATCH_SIZE = 32
val_perc = test_perc = 0.1
n_last_items_val = n_last_items_test = 1

### Create Split

In [9]:
import data_prep_split
total_users, total_items, train_set, val_set, test_set = \
data_prep_split.train_val_test_split(df, BATCH_SIZE, val_perc, test_perc, n_last_items_val, n_last_items_test)

Total number of items: 27387
Total users: 16254
Number of train users: 16224
Number of test users: 1632
Number of validation users: 1632 

Users deleted: 30


---
# Data Prep
1. Each (relatively) ordered item sequence per user will be viewed as one time series
2. **Sequences will be batched based on their length such that padding is minimal**
3. Each batch consists out of BATCH_SIZE users sequences



## Train and Target sequences

In [10]:
def get_x_y_sequences_ordered(dataset, n_unknowns_in_y=1, stats=True):
    user_sequences_x = []
    user_sequences_y = []
    lengths = []
    users = dataset.user_id.unique()
    shortest_u_seq_order = list(df.groupby('user_id')['item_id'].count().sort_values().index)
    
    for u in shortest_u_seq_order:
        user_item_seq = np.array(dataset[dataset['user_id']==u]['item_id'])
        user_sequences_x.append(user_item_seq[:-n_unknowns_in_y])
        user_sequences_y.append(user_item_seq[n_unknowns_in_y:])
        lengths.append(len(user_item_seq))
    
    median = np.median(lengths)
    
    if stats:
        print('Number of sequences x:', len(user_sequences_x), 
              '\nAvg sequence length x:', np.average(lengths),
              '\nStd_dev sequence length x:', np.round(np.std(lengths),2),
              '\nMedian of sequence length x:', median)

    return user_sequences_x, user_sequences_y, shortest_u_seq_order

In [11]:
def min_padding(sequences, batch_size, max_len):
    padded_sequences = []
    batch = []
    max_batch_seq_len = 0
    for i, seq in enumerate(sequences):
        batch.append(seq)
        if max_batch_seq_len > max_len:
            max_batch_seq_len = max_len
            
        elif max_batch_seq_len < len(seq):
            max_batch_seq_len = len(seq)
            
        if (i+1)%batch_size == 0:
            padded_sequences.append(tf.keras.preprocessing.sequence.pad_sequences(batch, maxlen=int(max_batch_seq_len), padding='post', truncating='pre'))
            max_batch_seq_len = 0
            batch = [] 
            
    return padded_sequences


### Create minimal padded sequences

In [12]:
max_seq_len = 500
shift_targets_by = 1

In [13]:
user_sequences_x, user_sequences_y, user_order = get_x_y_sequences_ordered(train_set, shift_targets_by)

padded_sequences_x = min_padding(user_sequences_x, BATCH_SIZE, max_seq_len)
padded_sequences_y = min_padding(user_sequences_y, BATCH_SIZE, max_seq_len)

Number of sequences x: 16254 
Avg sequence length x: 150.0161806324597 
Std_dev sequence length x: 242.4 
Median of sequence length x: 70.0


---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks

- paper: https://arxiv.org/pdf/1608.07400.pdf
- code: https://github.com/rdevooght/sequence-based-recommendations (in Theano)

---
## Model Prep

### Architecture

In [14]:
def build_model(total_items, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(total_items, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        
        tf.keras.layers.LSTM(units=rnn_units,
                             return_sequences=True,
                             stateful=False, #Reset cell states with each batch
                             recurrent_initializer='glorot_uniform'),
        
        tf.keras.layers.Dense(total_items)
    ])
    return model

### Build Model

In [15]:
embedding_dim = 100
rnn_units = 20

In [16]:
model = build_model(
total_items = total_items,
embedding_dim = embedding_dim,
rnn_units = rnn_units,
batch_size = BATCH_SIZE)

### Add Loss
- **Added one hot encoding of the labels to match logits output after dense layer**

In [17]:
# def recall(y_true, y_pred):
#     K = tf.keras.backend
#     y_true = K.one_hot(tf.dtypes.cast(y_true, tf.int32), total_items)
#     y_true = K.ones_like(y_true) 
#     true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
#     all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
#     recall = true_positives / (all_positives + K.epsilon())
#     return recall

In [18]:
def loss(labels, logits):
    oh_labels = tf.keras.backend.one_hot(tf.dtypes.cast(labels, tf.int32), total_items)
    return tf.keras.losses.categorical_crossentropy(oh_labels, logits, from_logits=True)

In [19]:
model.compile(optimizer='Adagrad', loss=loss)#, metrics=[recall])

## Summmary

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (32, None, 100)           2738700   
_________________________________________________________________
lstm (LSTM)                  (32, None, 20)            9680      
_________________________________________________________________
dense (Dense)                (32, None, 27387)         575127    
Total params: 3,323,507
Trainable params: 3,323,507
Non-trainable params: 0
_________________________________________________________________


---
## Train Model

### Batch Generator
- Needed for the different size batches 

In [21]:
class Batch_Generator(tf.keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, X, y, batch_size=1, shuffle=True):
        'Initialization'
        self.X = X
        self.y = y
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return len(self.y)#int(np.floor(len(self.y)/self.batch_size))

    def __getitem__(self, index):
        return self.__data_generation(index)

    def on_epoch_end(self):
        'Shuffles indexes after each epoch'
        self.indexes = np.arange(len(self.y))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, index):
        return np.stack(self.X[index]), np.stack(self.y[index])

### Prep Validation Data

In [22]:
user_sequences_val_x, user_sequences_val_y, median = get_x_y_sequences_ordered(val_set, 1, stats=False)

padded_sequences_val_x = min_padding(user_sequences_val_x, 32, 500)
padded_sequences_val_y = min_padding(user_sequences_val_y, 32, 500)

### Configure Checkpoints

In [23]:
# Directory where the checkpoints will be saved
checkpoint_dir = './rnn_train_checkpoints'

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

---
## Fit Model

In [24]:
epochs = 100
print('#Batches:', len(padded_sequences_x))
print('Batch size:', BATCH_SIZE)

#Batches: 507
Batch size: 32


In [29]:
# def train_generator():
#     for X_train, y_train in zip(padded_sequences_x, padded_sequences_y):
#         yield (X_train, y_train)

In [None]:
history = model.fit_generator(generator=Batch_Generator(
                              X=padded_sequences_x, 
                              y=padded_sequences_y, 
                              batch_size=BATCH_SIZE,
                              shuffle=True),
                              
#                               validation_data=Batch_Generator(
#                               X=padded_sequences_val_x,
#                               y=padded_sequences_val_y,
#                               batch_size=BATCH_SIZE,
#                               shuffle=True),
                              
                              epochs=epochs, 
                              callbacks=[checkpoint_callback],
                              use_multiprocessing=True)

Epoch 1/100
 27/507 [>.............................] - ETA: 1:55 - loss: 10.2043

In [31]:
file_name

'ML_01_users'

In [32]:
results = pd.DataFrame(history.history)
results.to_pickle('his_vsl_01_ml_users_200_epochs')

---
## Continue training from checkpoint

In [None]:
model.summary()

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(n_items, embedding_dim, rnn_units, batch_size=100)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='Adagrad', loss=loss)

In [None]:
aditional_epochs = 1

In [None]:
model.fit(dataset, epochs=aditional_epochs, callbacks=[checkpoint_callback])

---
# Predict Sequences

## Restore Latest Checkpoints

In [33]:
tf.train.latest_checkpoint(checkpoint_dir)

'./rnn_train_checkpoints/ckpt'

In [34]:
model = build_model(total_items, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
                   
model.build(tf.TensorShape([1, None]))

In [35]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 100)            2738700   
_________________________________________________________________
lstm_1 (LSTM)                (1, None, 20)             9680      
_________________________________________________________________
dense_1 (Dense)              (1, None, 27387)          575127    
Total params: 3,323,507
Trainable params: 3,323,507
Non-trainable params: 0
_________________________________________________________________


---
## Create Predictions

**Using train_set sequences to predict test_set item(s)**

In [36]:
test_user_seq = np.array(train_set[train_set['user_id']==10278]['item_id']).reshape(-1,1).T
true_items = list(test_set[test_set['user_id']==10278]['item_id'])
predictions = tf.squeeze(model(test_user_seq),0)
predictions = tf.random.categorical(predictions, num_samples=1).numpy()
user_seq = np.array(df[df['user_id']==10278]['item_id']).reshape(-1,1).T

for item, pred_item, real in zip(list(test_user_seq[0]), predictions, user_seq[0]):
    print(item, pred_item, real)

145 [585] 145
578 [368] 578
576 [5796] 576
309 [1237] 309
583 [595] 583
331 [216] 331
9 [2268] 9
156 [1146] 156
0 [2540] 0
203 [12191] 203
180 [153] 180
448 [3835] 448
401 [9701] 401
310 [289] 310
445 [581] 445
423 [1183] 423
356 [16] 356
543 [635] 543
329 [6253] 329
348 [1499] 348
230 [1201] 230
471 [2509] 471
431 [503] 431
181 [9528] 181
16 [9549] 16
137 [708] 137
163 [254] 163
259 [1148] 259
342 [2441] 342
369 [5410] 369
218 [372] 218
500 [585] 500
266 [446] 266
271 [25735] 271
61 [289] 61
491 [1164] 491
349 [695] 349
231 [11411] 231
340 [6874] 340
530 [431] 530
575 [828] 575
585 [230] 585
459 [708] 459
506 [145] 506
499 [10723] 499
57 [254] 57
4 [2724] 4
488 [295] 488
360 [828] 360
630 [2658] 630
482 [1931] 482
363 [1528] 363
190 [4127] 190
343 [1587] 343
582 [22] 582
428 [2341] 428
375 [11208] 375
520 [191] 520
139 [708] 139
466 [113] 466
622 [1237] 622
525 [9764] 525
584 [1159] 584
353 [10991] 353
524 [155] 524
526 [321] 526
528 [1874] 528
374 [1174] 374
403 [2934] 403
529 [1258]

In [37]:
def get_predictions(model, train_set, test_set, rank_at, temp=0.9):

    predictions_df = pd.DataFrame(columns=['user', 'pred_seq', 'true_seq'])
    for u in test_set.user_id.unique():
        test_user_seq = np.array(train_set[train_set['user_id']==u]['item_id'])
        true_items = list(test_set[test_set['user_id']==u]['item_id'])
        generated_predictions = []

        #Predict
        for item in range(rank_at): #could be any number of recommended items you want to predict
            predictions = model(test_user_seq.reshape(-1,1).T)
            predictions = tf.squeeze(predictions, 0)

            predictions = predictions / temp
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            test_user_seq = np.append(test_user_seq, predicted_id).reshape(-1,1).transpose()

    #         half_test_seq = tf.expand_dims([predicted_id], 0)
            generated_predictions.append(predicted_id)

        predictions_df = predictions_df.append({'user':u, 'pred_seq':generated_predictions, 'true_seq':true_items}, ignore_index=True)
        
    return predictions_df

In [46]:
rank_at = 20
predictions_df = get_predictions(model, train_set, test_set, rank_at)

In [47]:
predictions_df

Unnamed: 0,user,pred_seq,true_seq
0,183,"[1020, 518, 203, 3456, 369, 369, 1889, 49, 349...",[1836]
1,8589,"[431, 1317, 5771, 334, 10981, 16979, 1032, 156...",[2724]
2,2862,"[753, 3835, 260, 448, 708, 9528, 1138, 16674, ...",[868]
3,7793,"[580, 735, 254, 137, 341, 2860, 1199, 543, 220...",[181]
4,2679,"[25220, 181, 2303, 1675, 250, 2869, 1009, 1874...",[6871]
...,...,...,...
1659,7761,"[711, 2263, 1467, 3038, 1125, 2976, 4533, 3801...",[1161]
1660,8896,"[2934, 585, 7087, 1216, 1431, 331, 349, 271, 3...",[8994]
1661,11851,"[6665, 9572, 87, 468, 4323, 16279, 3907, 311, ...",[1378]
1662,1447,"[1313, 10735, 1911, 2590, 2516, 2894, 3108, 65...",[10807]


In [48]:
predictions_df.to_pickle('CFRNN_vsl_res_200_ML_01_users')

In [56]:
predictions_df = pd.read_pickle('CFRNN_res_200_ML_01_users')

In [None]:
for i, row in predictions_df.iterrows():
    if len(row['true_seq']) > 1:
        print(row)

# Evaluate

In [57]:
# result_path = 'Results/CFRNN/'
# predictions = pd.read_pickle(path + result_path + 'CFRNN_res_200_ML_01_users')
predictions = predictions_df

In [58]:
def get_metrics(ranked_df, steps, max_rank):
    s = time.time()
    ranks_at = [1] + [i for i in range(steps, max_rank + steps, steps)]
    hitcounts = []
    recs_at = []
    precs_at = []
    metrics = pd.DataFrame(columns=['rank_at', 'hitcounts', 'recall', 'precision'])
    for rank in ranks_at:
        hitcount = 0
        for i, row in ranked_df.iterrows():
            hitcount +=  len(set(row['true_id']) & set(row['pred_items_ranked'][:rank]))

        prec_at = hitcount / rank / len(ranked_df)
        rec_at = hitcount / len(ranked_df.iloc[0]['true_id']) / len(ranked_df)

        hitcounts.append(hitcount)                     
        recs_at.append(rec_at)
        precs_at.append(prec_at)

    metrics['rank_at'] = ranks_at
    metrics['hitcounts'] = hitcounts
    metrics['recall'] = recs_at
    metrics['precision'] = precs_at
    print('Obtaining metrics time:', round(time.time() - s,2))
    return metrics

In [59]:
renames = {'pred_seq':'pred_items_ranked', 'true_seq':'true_id'}
predictions = predictions.rename(renames, axis=1)

In [60]:
import time

In [53]:
metrics_test_set_vsl = get_metrics(predictions, 5, 20)
metrics_test_set_vsl

Obtaining metrics time: 1.21


Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,0,0.0,0.0
1,5,7,0.004207,0.000841
2,10,12,0.007212,0.000721
3,15,21,0.01262,0.000841
4,20,25,0.015024,0.000751


In [54]:
# metrics_val_set_vsl = get_metrics(predictions, 5, 20)
metrics_val_set_vsl.to_pickle('metrics_val_set_vsl_200_e')

In [62]:
metrics_test_set = get_metrics(predictions, 5, 20)
metrics_test_set

Obtaining metrics time: 1.28


Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1,0.000601,0.000601
1,5,8,0.004808,0.000962
2,10,10,0.00601,0.000601
3,15,12,0.007212,0.000481
4,20,13,0.007812,0.000391


In [91]:
metrics_val_set = get_metrics(predictions, 5, 20)

Obtaining metrics time: 1.15


In [55]:
# metrics_val_set
metrics_val_set.to_pickle('metrics_val_set_200_e')

NameError: name 'metrics_val_set' is not defined

In [50]:
metrics

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,2,0.001202,0.001202
1,5,8,0.004808,0.000962
2,10,14,0.008413,0.000841
3,15,15,0.009014,0.000601
4,20,17,0.010216,0.000511


In [61]:
metrics_temp_06

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,2,0.001202,0.001202
1,5,3,0.001803,0.000361
2,10,3,0.001803,0.00018
3,15,3,0.001803,0.00012
4,20,3,0.001803,9e-05


In [71]:
metrics_temp_2

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,0,0.0,0.0
1,5,1,0.000601,0.00012
2,10,2,0.001202,0.00012
3,15,2,0.001202,8e-05
4,20,3,0.001803,9e-05


In [51]:
file_name

'ML_01_users'

In [None]:
metrics.to_pickle(path + 'Results/CFRNN/' + 'metrics_CFRNN_' + file_name)

# Appendix

In [None]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)

In [None]:
# One hot encoded input
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 