In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import sparse
import os
import random

In [2]:
tf.__version__

'2.0.0'

# Read Data
- all datasets datetime sorted

In [3]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = './'

## Amazon Fashion 

In [4]:
# data_path = 'data/Amazon/'
# file_name = 'Amazon_full' 
# file_name = 'Amazon_05_users' 
# file_name = 'Amazon_01_users'
# file_name = 'Amazon_005_users'
# file_name = 'Amazon_001_users'

## MovieLens 

In [7]:
data_path = 'data/ML/'
# file_name = 'ML_full' 
# file_name = 'ML_05_users'
file_name = 'ML_01_users'
# file_name = 'ML_005_users'
# file_name = 'ML_001_users'

In [8]:
df = pd.read_pickle(file_name)
df.head()

Unnamed: 0,user,item,rating,datetime,item_id,user_id
18590190,120461,2501,5.0,2000-04-25 02:29:35,2410,120460
18590032,120461,252,4.0,2000-04-25 02:29:35,249,120460
18590159,120461,2069,4.0,2000-04-25 02:29:35,1980,120460
18590048,120461,440,4.0,2000-04-25 02:29:35,435,120460
18590145,120461,1959,4.0,2000-04-25 02:29:35,1870,120460


# Data Prep
Create new ids for users and items that match the row and column indices of the user-item interaction matrix

In [82]:
df['item_id'] = df.item.astype('category').cat.codes + 1
df['user_id'] = df.user.astype('category').cat.codes + 1

## Train Test Split
### Leave last item out of subset of users

In [83]:
def leave_users_out(full_data, leave_out, seed=1234):
    np.random.seed(seed)
    full_data['index'] = full_data.index
    user_index_df = full_data.groupby('user')['index'].apply(list)
    users = np.random.choice(list(user_index_df.index), leave_out, replace=False)
    users_indices = []
    
    for user in users:
        users_indices.extend(user_index_df.loc[user])
    
    sub_set = full_data.loc[users_indices]
    remaining = full_data.drop(users_indices)
    
    return remaining.drop(columns=['index']), sub_set.drop(columns=['index'])

In [84]:
def leave_last_x_out(full_data, n_users, leave_out=1, seed=1234):
    # Input: data must contain user_id
    # Output: full_data = without all last (time order) entries in leave one out set
    #         leave_one_out_set = data with one user and one item from full_data
    np.random.seed(seed)
    
    full_data['index'] = full_data.index
    user_items_ind = full_data.groupby('user_id')['index'].apply(list)
    users = full_data.user_id.unique()
    leave_out_indices = []
    users_picked = []
    
    for i in range(len(full_data.user_id.unique())):
        random_user = np.random.choice(users)
        item_indices = user_items_ind[random_user] # random user's items indices
        if random_user in users_picked or len(item_indices) <= leave_out:
            random_user = np.random.choice(users)
            item_indices = user_items_ind[random_user] # random user's items indices
        else:
            users_picked.append(random_user)
            leave_out_indices.extend(item_indices[-leave_out:])
        
        if len(users_picked) == n_users:
            break
        
    if len(users_picked) < n_users:
        error = 'Cannot pick ' + str(n_users) + ' users with more than ' + str(leave_out) + ' items'
        solution = '\nTry a smaller test and/or validation percentage of the data'
        raise ValueError(error + solution) 
            
    leave_out_set = full_data.loc[leave_out_indices] # the last items of n_users users with n_item > leave_out
    full_data_leave_one_out = full_data.drop(leave_out_indices) # drops last items for n_users users
    
    return full_data_leave_one_out.drop(columns=['index']), leave_out_set.drop(columns=['index'])

In [85]:
def train_val_test_split(df, batch_size, val_perc, test_perc, n_items_val, n_items_test, stats=True):
    # Input: df with user and item id, batch size for CFRNN data, val and test perc of users
    #        number of last items to leave out for val and test set
    # Output:full_data = total users and items of the original df, 
    #        Train, validation and test sets
    
    total_users = len(df.user_id.unique()) # Need all users for BPR
    total_items = len(df.item_id.unique()) # Need all items for CFRNN
    
    users_to_remove = len(df.user_id.unique())%batch_size #Batch size compatible for CFRNN
    df_new, deleted_users = leave_users_out(df, users_to_remove)

    test_users = int(test_perc*total_users / 64 + 1) * 64 # Number of users to be used for testing
    test_last_items = n_items_test # Items to be removed from test users in train set and used in test set

    val_users = int(val_perc*total_users / 64 + 1) * 64
    val_last_items = n_items_val
    
    train_set, test_set = leave_last_x_out(df_new, test_users, test_last_items)
    train_set, val_set = leave_last_x_out(train_set, val_users, val_last_items)
    
    if stats:
        print('Total number of items:', total_items)
        print('Total users:', total_users)
        print('Number of train users:', len(train_set.user_id.unique()))
        print('Number of test users:', test_users)
        print('Number of validation users:', val_users, '\n')
        print('Users deleted:', len(deleted_users.user_id.unique()))
    
    return total_users, total_items, train_set, val_set, test_set

### Data Sizes

In [86]:
BATCH_SIZE = 64
val_perc = test_perc = 0.1
n_last_items_val = n_last_items_test = 1

### Create Split

In [87]:
total_users, total_items, train_set, val_set, test_set = train_val_test_split(df, BATCH_SIZE, val_perc, test_perc, n_last_items_val, n_last_items_test)

Total number of items: 27387
Total users: 16254
Number of train users: 16192
Number of test users: 1664
Number of validation users: 1664 

Users deleted: 62


---
# Data Prep
1. Each (relatively) ordered item sequence per user will be viewed as one time series
2. **Sequences that do not have the *right* size, will be padded/truncated for now**
3. Each batch consists out of BATCH_SIZE users sequences



---
## Train and Target sequences
Create the **sequences** from the item_ids per user (already sorted)

In [88]:
def get_x_y_sequences(dataset, n_unknowns_in_y=1, stats=True):
    user_sequences_x = []
    user_sequences_y = []
    lengths = []
    users = dataset.user_id.unique()
    
    for u in users:
        user_item_seq = np.array(dataset[dataset['user_id']==u]['item_id'])
        user_sequences_x.append(user_item_seq[:-n_unknowns_in_y])
        user_sequences_y.append(user_item_seq[n_unknowns_in_y:])
        lengths.append(len(user_item_seq))
    
    median = np.median(lengths)
    
    if stats:
        print('Number of sequences x:', len(user_sequences_x), 
              '\nAvg sequence length x:', np.average(lengths),
              '\nStd_dev sequence length x:', np.round(np.std(lengths),2),
              '\nMedian of sequence length x:', median)

    return user_sequences_x, user_sequences_y, median

In [89]:
user_sequences_x, user_sequences_y, median = get_x_y_sequences(train_set, 1)

Number of sequences x: 16192 
Avg sequence length x: 150.41637845849803 
Std_dev sequence length x: 242.73 
Median of sequence length x: 71.0


---
## Padding
- **Using: Median, Mean or Min**

- add zeros if they are too short
- remove item ids from the beginning if they are too long

In [90]:
def pad_sequences(sequences, max_length, stats=True):
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=int(max_length), padding='post', truncating='pre')
    if stats:
        print('number of sequences:', padded_sequences.shape[0], 
              '\navg sequence length:', np.average([i.shape[0] for i in padded_sequences]),
              '\nstd_dev sequence length:', np.std([i.shape[0] for i in padded_sequences]))
        
    return padded_sequences

In [91]:
max_seq_length = 150
padded_sequences_x = pad_sequences(user_sequences_x, max_seq_length)
padded_sequences_y = pad_sequences(user_sequences_y, max_seq_length, stats=False)

number of sequences: 16192 
avg sequence length: 150.0 
std_dev sequence length: 0.0


---
## Create Dataset
- sequences_x inputs
- sequences_y actuals
- batches of size BATCH_SIZE

In [92]:
sequences_data_x = tf.data.Dataset.from_tensor_slices(padded_sequences_x) 
sequences_data_y = tf.data.Dataset.from_tensor_slices(padded_sequences_y) 
dataset = tf.data.Dataset.zip((sequences_data_x, sequences_data_y))
dataset

<ZipDataset shapes: ((150,), (150,)), types: (tf.int32, tf.int32)>

In [93]:
for input_example, target_example in  dataset.take(1):#.as_numpy_iterator():
    print ('Input data:', input_example)
    print ('Target data:', target_example)

Input data: tf.Tensor(
[2154 1431 1480 1544 2581  511   13  995  985  988  975  257  991  994
    8  886  583  597    1 1785 1965 1963 1232 1959 1976  996 1973 1956
 2629 2897 1488 1955 3242 1958 2977 2038  992 2228  307  982  757  584
 1033 2014 2257  357 3243 2169 1911 1961 3244 1896 1924  585  604  951
  977  311  142 1935   60 1932 1926 2017  976 1967  973 1968  980 2272
  981 1883 1845  567  575  154 2019 1970 1314  970 2713  993 1960  972
 1930  850 2000 1974 1916 1931 2273 2695 1892 2302 1621 2039 1411  447
  494 1759 2322  367 2818  236 1331  200 1269 1166 1604 1153 2580 2630
 2498 2555 2869 2592 2727  578  449  157 1529  370  466  576  220   22
 1527 2671 1754 1517  722  760  636  754 1011 1906 1352  361  310  108
  472  255  581  999 3250 1149 1150 1162 1151 1241], shape=(150,), dtype=int32)
Target data: tf.Tensor(
[1431 1480 1544 2581  511   13  995  985  988  975  257  991  994    8
  886  583  597    1 1785 1965 1963 1232 1959 1976  996 1973 1956 2629
 2897 1488 1955 3242 

In [94]:
dataset = dataset.batch(BATCH_SIZE, drop_remainder=False)
for i, o in dataset.take(1):#.as_numpy_iterator():
    print('input:', i.shape, '\n\noutput:', o.shape)

input: (64, 150) 

output: (64, 150)


---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks

- paper: https://arxiv.org/pdf/1608.07400.pdf
- code:https://github.com/rdevooght/sequence-based-recommendations (in Theano)


## Model Prep

### model architecture

In [95]:
def build_model(total_items, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(total_items, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        
        tf.keras.layers.LSTM(units=rnn_units,
                             return_sequences=True,
                             stateful=False, #Reset cell states with each batch
                             recurrent_initializer='glorot_uniform'),
        
        tf.keras.layers.Dense(total_items)
    ])
    return model

---
### build model

In [96]:
embedding_dim = 100
rnn_units = 20

In [97]:
model = build_model(
total_items = total_items,
embedding_dim = embedding_dim,
rnn_units = rnn_units,
batch_size = BATCH_SIZE)

---
### Add Loss and Custom Metric
**Added one hot encoding of the labels to match logits output after dense layer**

In [106]:
target_example_batch

<tf.Tensor: id=13355, shape=(64, 150), dtype=int32, numpy=
array([[ 1431,  1480,  1544, ...,  1151,  1241,   227],
       [ 3586,  2614,   865, ...,     0,     0,     0],
       [ 1238,  1241,  1190, ...,  9611, 16031, 15899],
       ...,
       [ 1021,   348,  1349, ...,     0,     0,     0],
       [  999,  2630,   108, ...,     0,     0,     0],
       [  712,     2,    25, ...,     0,     0,     0]], dtype=int32)>

In [109]:
recall(target_example_batch, example_batch_preds)
# print(target_example_batch[1], '\n', example_batch_preds[1])
example_batch_preds.shape
tf.keras.backend.one_hot(target_example_batch, total_items)[1]
tf.keras.backend.argmin(target_example_batch[0])
target_example_batch[0]
# tf.nn.in_top_k(
#     example_batch_preds,
#     target_example_batch,
#     20,
#     name=None
# )

<tf.Tensor: id=13636, shape=(150,), dtype=int32, numpy=
array([1431, 1480, 1544, 2581,  511,   13,  995,  985,  988,  975,  257,
        991,  994,    8,  886,  583,  597,    1, 1785, 1965, 1963, 1232,
       1959, 1976,  996, 1973, 1956, 2629, 2897, 1488, 1955, 3242, 1958,
       2977, 2038,  992, 2228,  307,  982,  757,  584, 1033, 2014, 2257,
        357, 3243, 2169, 1911, 1961, 3244, 1896, 1924,  585,  604,  951,
        977,  311,  142, 1935,   60, 1932, 1926, 2017,  976, 1967,  973,
       1968,  980, 2272,  981, 1883, 1845,  567,  575,  154, 2019, 1970,
       1314,  970, 2713,  993, 1960,  972, 1930,  850, 2000, 1974, 1916,
       1931, 2273, 2695, 1892, 2302, 1621, 2039, 1411,  447,  494, 1759,
       2322,  367, 2818,  236, 1331,  200, 1269, 1166, 1604, 1153, 2580,
       2630, 2498, 2555, 2869, 2592, 2727,  578,  449,  157, 1529,  370,
        466,  576,  220,   22, 1527, 2671, 1754, 1517,  722,  760,  636,
        754, 1011, 1906, 1352,  361,  310,  108,  472,  255,  581,  

In [98]:
def recall(y_true, y_pred):
    K = tf.keras.backend
    y_true = K.one_hot(tf.dtypes.cast(y_true, tf.int32), total_items)
    y_true = K.ones_like(y_true) 
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    all_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
    recall = true_positives / (all_positives + K.epsilon())
    return recall

In [99]:
def loss(labels, logits):
    oh_labels = tf.keras.backend.one_hot(tf.dtypes.cast(labels, tf.int32), total_items)
    return tf.keras.losses.categorical_crossentropy(oh_labels, logits, from_logits=True)

model.compile(optimizer='Adagrad', loss=loss, metrics=[recall]) #, metrics=[tf.keras.metrics.categorical_accuracy])

---
### Try Model

In [100]:
dataset.take(1)

<TakeDataset shapes: ((None, 150), (None, 150)), types: (tf.int32, tf.int32)>

In [101]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_preds = model(input_example_batch)
    print(example_batch_preds.shape, "# (batch_size, sequence_length, total_items)")

(64, 150, 27387) # (batch_size, sequence_length, total_items)


In [102]:
example_batch_loss = loss(target_example_batch, example_batch_preds)

In [103]:
example_batch_preds.shape

TensorShape([64, 150, 27387])

In [104]:
example_batch_loss.shape

TensorShape([64, 150])

---
**model summmary**

In [105]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (64, None, 100)           2738700   
_________________________________________________________________
lstm_1 (LSTM)                (64, None, 20)            9680      
_________________________________________________________________
dense_1 (Dense)              (64, None, 27387)         575127    
Total params: 3,323,507
Trainable params: 3,323,507
Non-trainable params: 0
_________________________________________________________________


---
## Train Model

### Prep Validation Data

In [43]:
user_sequences_val_x, user_sequences_val_y, median = get_x_y_sequences(val_set, 1, stats=False)

padded_sequences_val_x = pad_sequences(user_sequences_val_x, max_seq_length, stats=False)
padded_sequences_val_y = pad_sequences(user_sequences_val_y, max_seq_length, stats=False)

sequences_data_val_x = tf.data.Dataset.from_tensor_slices(padded_sequences_val_x) 
sequences_data_val_y = tf.data.Dataset.from_tensor_slices(padded_sequences_val_y) 

val_dataset = tf.data.Dataset.zip((sequences_data_val_x, sequences_data_val_y))
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=False)

In [44]:
padded_sequences_val_y.shape

(1664, 150)

In [45]:
padded_sequences_x.shape

(16192, 150)

### Configure Checkpoints

In [46]:
# Directory where the checkpoints will be saved
checkpoint_dir = './rnn_train_checkpoints'

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

---
**Fit Model**

In [47]:
epochs = 25
dataset

<BatchDataset shapes: ((None, 150), (None, 150)), types: (tf.int32, tf.int32)>

In [48]:
history = model.fit(dataset, validation_data=val_dataset, epochs=epochs, callbacks=[checkpoint_callback])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25

KeyboardInterrupt: 

---
## Continue training from checkpoint

In [None]:
model.summary()

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(n_items, embedding_dim, rnn_units, batch_size=100)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='Adagrad', loss=loss)

In [None]:
aditional_epochs = 1

In [None]:
model.fit(dataset, epochs=aditional_epochs, callbacks=[checkpoint_callback])

---
# Predict Sequences

## Restore Latest Checkpoints

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
model = build_model(total_items, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
                   
model.build(tf.TensorShape([1, None]))

In [None]:
model.summary()

---
## Create Predictions

**Using train_set sequences to predict test_set item(s)**

In [None]:
def get_predictions(model, train_set, test_set, rank_at, temp=1):

    predictions_df = pd.DataFrame(columns=['user', 'pred_seq', 'true_seq'])
    for u in test_set.user_id.unique():
        test_user_seq = np.array(train_set[train_set['user_id']==u]['item_id'])
        true_items = list(test_set[test_set['user_id']==u]['item_id'])
        generated_predictions = []

        #Predict
        for item in range(rank_at): #could be any number of recommended items you want to predict
            predictions = model(test_user_seq.reshape(-1,1).T)
            predictions = tf.squeeze(predictions, 0)

            predictions = predictions / temp
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            test_user_seq = np.append(test_user_seq, predicted_id).reshape(-1,1).transpose()

    #         half_test_seq = tf.expand_dims([predicted_id], 0)
            generated_predictions.append(predicted_id)

        predictions_df = predictions_df.append({'user':u, 'pred_seq':generated_predictions, 'true_seq':true_items}, ignore_index=True)
        
    return predictions_df

In [None]:
rank_at = 20
predictions_df = get_predictions(model, train_set, test_set, rank_at)

In [None]:
predictions_df

In [None]:
for i, row in predictions_df.iterrows():
    if len(row['true_seq']) > 1:
        print(row)

# Evaluate

In [None]:
result_path = 'Results/CFRNN/'
predictions = pd.read_pickle(path + result_path + 'CFRNN_res_400_ML_01_users')

In [None]:
def get_metrics(ranked_df, steps, max_rank):
    s = time.time()
    ranks_at = [1] + [i for i in range(steps, max_rank + steps, steps)]
    hitcounts = []
    recs_at = []
    precs_at = []
    metrics = pd.DataFrame(columns=['rank_at', 'hitcounts', 'recall', 'precision'])
    for rank in ranks_at:
        hitcount = 0
        for i, row in ranked_df.iterrows():
            hitcount +=  len(set(row['true_id']) & set(row['pred_items_ranked'][:rank]))

        prec_at = hitcount / rank / len(ranked_df)
        rec_at = hitcount / len(ranked_df.iloc[0]['true_id']) / len(ranked_df)

        hitcounts.append(hitcount)                     
        recs_at.append(rec_at)
        precs_at.append(prec_at)

    metrics['rank_at'] = ranks_at
    metrics['hitcounts'] = hitcounts
    metrics['recall'] = recs_at
    metrics['precision'] = precs_at
    print('Obtaining metrics time:', round(time.time() - s,2))
    return metrics

In [None]:
renames = {'pred_seq':'pred_items_ranked', 'true_seq':'true_id'}
predictions = predictions.rename(renames, axis=1)

In [None]:
import time

In [None]:
metrics = get_metrics(predictions, 5, 20)

In [None]:
metrics

In [None]:
file_name

In [None]:
metrics.to_pickle(path + 'Results/CFRNN/' + 'metrics_CFRNN_' + file_name)

# Appendix

In [None]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)

In [None]:
# One hot encoded input
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 