In [198]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import sparse
import os
import random

In [199]:
tf.__version__

'2.1.0'

# Read Data
- all datasets datetime sorted

In [256]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = './'

## Amazon Fashion 

In [270]:
# data_path = 'data/Amazon/'
# file_name = 'Amazon_full' 
# file_name = 'Amazon_05_users' 
# file_name = 'Amazon_01_users'
# file_name = 'Amazon_005_users'
# file_name = 'Amazon_001_users'

## MovieLens 

In [271]:
data_path = 'data/ML/'
# file_name = 'ML_full' 
# file_name = 'ML_05_users'
file_name = 'ML_01_users'
# file_name = 'ML_005_users'
# file_name = 'ML_001_users'

In [272]:
df = pd.read_pickle(path + data_path + file_name)
df.head()

Unnamed: 0,user,item,rating,datetime,item_id,user_id
18590190,120461,2501,5.0,2000-04-25 02:29:35,2410,120460
18590032,120461,252,4.0,2000-04-25 02:29:35,249,120460
18590159,120461,2069,4.0,2000-04-25 02:29:35,1980,120460
18590048,120461,440,4.0,2000-04-25 02:29:35,435,120460
18590145,120461,1959,4.0,2000-04-25 02:29:35,1870,120460


---
# Data Prep
1. Each (relatively) ordered item sequence per user will be viewed as one time series
2. **Sequences that do not have the *right* size, will be padded/truncated for now**
3. Each batch consists out of BATCH_SIZE users sequences



## Create new ids

In [273]:
df['item_id'] = df.item.astype('category').cat.codes
df['user_id'] = df.user.astype('category').cat.codes

## Prep Functions

In [274]:
def leave_users_out(full_data, leave_out, seed=1234):
    np.random.seed(seed)
    full_data['index'] = full_data.index
    user_index_df = full_data.groupby('user')['index'].apply(list)
    users = np.random.choice(list(user_index_df.index), leave_out, replace=False)
    users_indices = []
    
    for user in users:
        users_indices.extend(user_index_df.loc[user])
    
    sub_set = full_data.loc[users_indices]
    remaining = full_data.drop(users_indices)
    
    return remaining.drop(columns=['index']), sub_set.drop(columns=['index'])

In [275]:
def leave_last_x_out(full_data, n_users, leave_out=1, seed=1234):
    # Input: data must contain user_id
    # Output: full_data = without all last (time order) entries in leave one out set
    #         leave_one_out_set = data with one user and one item from full_data
    np.random.seed(seed)
    
    full_data['index'] = full_data.index
    user_items_ind = full_data.groupby('user_id')['index'].apply(list)
    users = full_data.user_id.unique()
    leave_out_indices = []
    users_picked = []
    
    for i in range(n_users):    
        random_user = np.random.choice(users) # random user's items indices
        item_indices = user_items_ind[random_user]
        while len(item_indices) <= leave_out or random_user in users_picked: # needs to have more items than to leave out, or deleting users
            random_user = np.random.choice(users)
            item_indices = user_items_ind[random_user]
            
        users_picked.append(random_user)
        leave_out_indices.extend(item_indices[-leave_out:])
    
    leave_out_set = full_data.loc[leave_out_indices] # the last items of n_users users with n_item > leave_out
    full_data_leave_one_out = full_data.drop(leave_out_indices) # drops last items for n_users users
    
    return full_data_leave_one_out.drop(columns=['index']), leave_out_set.drop(columns=['index'])

### Determine Sizes
- Batch Size (From CFRNN)
- Test Size
- Validation Size

In [276]:
BATCH_SIZE = 64

In [277]:
df_og = df

In [278]:
users_to_remove = len(df_og.user_id.unique())%BATCH_SIZE #Batch size compatible for CFRNN
df, deleted_users = leave_users_out(df_og, users_to_remove)

In [279]:
total_users = len(df_og.user_id.unique()) # Need all users for BPR
total_items = len(df_og.item_id.unique()) # Need all items for CFRNN

test_users = int(0.1*total_users) # Number of users to be used for testing
test_last_items = 1 # Items to be removed from test users in train set and used in test set

val_users = int(0.1*total_users) -1
val_last_items = 1

### Split

In [280]:
train_set, test_set = leave_last_x_out(df, test_users, test_last_items)
train_set, val_set = leave_last_x_out(train_set, val_users, val_last_items)

In [281]:
print('Total number of items:', total_items)
print('Total users:', total_users)
print('Number of train users:', len(train_set.user_id.unique()))
print('Number of test users:', test_users)
print('Number of validation users:', val_users, '\n')
print('Users deleted:', len(deleted_users.user_id.unique()))

Total number of items: 27387
Total users: 16254
Number of train users: 16192
Number of test users: 1625
Number of validation users: 1624 

Users deleted: 62


---
## Train and Target sequences
Create the **sequences** from the item_ids per user (already sorted)

In [282]:
def get_x_y_sequences(dataset, n_unknowns_in_y=1, stats=True):
    user_sequences_x = []
    user_sequences_y = []
    lengths = []
    users = dataset.user_id.unique()
    
    for u in users:
        user_item_seq = np.array(dataset[dataset['user_id']==u]['item_id'])
        user_sequences_x.append(user_item_seq[:-n_unknowns_in_y])
        user_sequences_y.append(user_item_seq[n_unknowns_in_y:])
        lengths.append(len(user_item_seq))
    
    median = np.median(lengths)
    
    if stats:
        print('Number of sequences x:', len(user_sequences_x), 
              '\nAvg sequence length x:', np.average(lengths),
              '\nStd_dev sequence length x:', np.round(np.std(lengths),2),
              '\nMedian of sequence length x:', median)

    return user_sequences_x, user_sequences_y, median

In [283]:
user_sequences_x, user_sequences_y, median = get_x_y_sequences(train_set, 1)

Number of sequences x: 16192 
Avg sequence length x: 150.4212574110672 
Std_dev sequence length x: 242.73 
Median of sequence length x: 71.0


---
## Padding
- **Using: Median, Mean or Min**

- add zeros if they are too short
- remove item ids from the beginning if they are too long

In [241]:
def pad_sequences(sequences, max_length, stats=True):
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=int(max_length), padding='post', truncating='pre')
    if stats:
        print('number of sequences:', padded_sequences.shape[0], 
              '\navg sequence length:', np.average([i.shape[0] for i in padded_sequences]),
              '\nstd_dev sequence length:', np.std([i.shape[0] for i in padded_sequences]))
        
    return padded_sequences

In [242]:
max_seq_length = median
padded_sequences_x = pad_sequences(user_sequences_x, max_seq_length)
padded_sequences_y = pad_sequences(user_sequences_y, max_seq_length, stats=False)

number of sequences: 121344 
avg sequence length: 6.0 
std_dev sequence length: 0.0


---
## Create Dataset
- sequences_x inputs
- sequences_y actuals
- batches of size BATCH_SIZE

In [243]:
sequences_data_x = tf.data.Dataset.from_tensor_slices(padded_sequences_x) 
sequences_data_y = tf.data.Dataset.from_tensor_slices(padded_sequences_y) 
dataset = tf.data.Dataset.zip((sequences_data_x, sequences_data_y))
dataset

<ZipDataset shapes: ((6,), (6,)), types: (tf.int32, tf.int32)>

In [244]:
for input_example, target_example in  dataset.take(1).as_numpy_iterator():
    print ('Input data:', input_example)
    print ('Target data:', target_example)

Input data: [104506 175639  99224 238824 222085      0]
Target data: [175639  99224 238824 222085   4041      0]


In [245]:
dataset = dataset.batch(BATCH_SIZE, drop_remainder=False)
for i, o in dataset.take(1).as_numpy_iterator():
    print('input:', i.shape, '\n\noutput:', o.shape)

input: (64, 6) 

output: (64, 6)


---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks

- paper: https://arxiv.org/pdf/1608.07400.pdf
- code:https://github.com/rdevooght/sequence-based-recommendations (in Theano)


## Model Prep

### model architecture

In [246]:
def build_model(total_items, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(total_items, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        
        tf.keras.layers.LSTM(units=rnn_units,
                             return_sequences=True,
                             stateful=False, #Reset cell states with each batch
                             recurrent_initializer='glorot_uniform'),
        
        tf.keras.layers.Dense(total_items)
    ])
    return model

---
### build model

In [247]:
embedding_dim = 100
rnn_units = 20

In [248]:
model = build_model(
total_items = total_items,
embedding_dim = embedding_dim,
rnn_units = rnn_units,
batch_size = BATCH_SIZE)

---
### Add Loss
**Added one hot encoding of the labels to match logits output after dense layer**

In [249]:
def loss(labels, logits):
    oh_labels = tf.keras.backend.one_hot(tf.dtypes.cast(labels, tf.int32), total_items)
    return tf.keras.losses.categorical_crossentropy(oh_labels, logits, from_logits=True)

model.compile(optimizer='Adagrad', loss=loss)

---
### Try Model

In [250]:
dataset.take(1)

<TakeDataset shapes: ((None, 6), (None, 6)), types: (tf.int32, tf.int32)>

In [251]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_preds = model(input_example_batch)
    print(example_batch_preds.shape, "# (batch_size, sequence_length, total_items)")

(64, 6, 247465) # (batch_size, sequence_length, total_items)


In [252]:
example_batch_loss = loss(target_example_batch, example_batch_preds)

In [253]:
example_batch_preds.shape

TensorShape([64, 6, 247465])

In [254]:
example_batch_loss.shape

TensorShape([64, 6])

---
**model summmary**

In [255]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (64, None, 100)           24746500  
_________________________________________________________________
lstm_6 (LSTM)                (64, None, 20)            9680      
_________________________________________________________________
dense_6 (Dense)              (64, None, 247465)        5196765   
Total params: 29,952,945
Trainable params: 29,952,945
Non-trainable params: 0
_________________________________________________________________


---
## Train Model

### Configure Checkpoints

In [134]:
# Directory where the checkpoints will be saved
checkpoint_dir = './rnn_train_checkpoints'

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_best_only = True)

---
**Fit Model**

In [135]:
epochs = 2
dataset

<BatchDataset shapes: ((None, 6), (None, 6)), types: (tf.int32, tf.int32)>

In [139]:
history = model.fit(dataset, epochs=epochs, callbacks=[checkpoint_callback])

ValueError: `validation_split` argument is not supported when data adapter is <class 'tensorflow.python.keras.engine.data_adapter.DatasetAdapter'>. Received: x=<BatchDataset shapes: ((None, 6), (None, 6)), types: (tf.int32, tf.int32)>, validation_split=0.2

In [107]:
tf.keras.losses.custom_loss = loss

In [108]:
model.save('test_save_model.h5')

In [110]:
same_model = tf.keras.models.load_model('test_save_model.h5', compile=False)

In [111]:
same_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 100)           6161600   
_________________________________________________________________
lstm_2 (LSTM)                (64, None, 20)            9680      
_________________________________________________________________
dense_2 (Dense)              (64, None, 61616)         1293936   
Total params: 7,465,216
Trainable params: 7,465,216
Non-trainable params: 0
_________________________________________________________________


---
## Continue training from checkpoint

In [None]:
model.summary()

In [None]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(n_items, embedding_dim, rnn_units, batch_size=100)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='Adagrad', loss=loss)

In [None]:
aditional_epochs = 1

In [None]:
model.fit(dataset, epochs=aditional_epochs, callbacks=[checkpoint_callback])

---
# Predict Sequences

## Restore Latest Checkpoints

In [118]:
tf.train.latest_checkpoint(checkpoint_dir)

'./rnn_train_checkpoints/ckpt'

In [119]:
model = build_model(total_items, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
                   
model.build(tf.TensorShape([1, None]))



In [120]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 100)            6161600   
_________________________________________________________________
lstm_3 (LSTM)                (1, None, 20)             9680      
_________________________________________________________________
dense_3 (Dense)              (1, None, 61616)          1293936   
Total params: 7,465,216
Trainable params: 7,465,216
Non-trainable params: 0
_________________________________________________________________


---
## Create Predictions

**Using train_set sequences to predict test_set item(s)**

In [39]:
def get_predictions(model, train_set, test_set, rank_at, temp=1):

    predictions_df = pd.DataFrame(columns=['user', 'pred_seq', 'true_seq'])
    for u in test_set.user_id.unique():
        test_user_seq = np.array(train_set[train_set['user_id']==u]['item_id'])
        true_items = list(test_set[test_set['user_id']==u]['item_id'])
        generated_predictions = []

        #Predict
        for item in range(rank_at): #could be any number of recommended items you want to predict
            predictions = model(test_user_seq.reshape(-1,1).T)
            predictions = tf.squeeze(predictions, 0)

            predictions = predictions / temp
            predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
            test_user_seq = np.append(test_user_seq, predicted_id).reshape(-1,1).transpose()

    #         half_test_seq = tf.expand_dims([predicted_id], 0)
            generated_predictions.append(predicted_id)

        predictions_df = predictions_df.append({'user':u, 'pred_seq':generated_predictions, 'true_seq':true_items}, ignore_index=True)
        
    return predictions_df

In [40]:
rank_at = 20
predictions_df = get_predictions(model, train_set, test_set, rank_at)

In [41]:
predictions_df

Unnamed: 0,user,pred_seq,true_seq
0,8004,"[9844, 7013, 49858, 18502, 59118, 20609, 42601...",[60894]
1,5430,"[29045, 29296, 43388, 4925, 38654, 5391, 32961...",[44397]
2,4794,"[30808, 11637, 5233, 20121, 1346, 4273, 13996,...",[58377]
3,340,"[8808, 12603, 9635, 45144, 45255, 15132, 16999...",[2574]
4,5894,"[21874, 46665, 48823, 2607, 30576, 40088, 3607...",[46022]
...,...,...,...
1208,2833,"[24555, 61064, 14933, 34995, 41070, 20266, 255...",[51379]
1209,5259,"[11833, 10622, 1713, 17195, 51161, 24459, 1910...",[2962]
1210,3916,"[47736, 60959, 57877, 38982, 2521, 26365, 1355...",[57102]
1211,9786,"[19235, 36039, 30496, 54791, 26335, 50582, 433...",[49642]


In [42]:
for i, row in predictions_df.iterrows():
    if len(row['true_seq']) > 1:
        print(row)

# Evaluate

In [189]:
result_path = 'Results/CFRNN/'
predictions = pd.read_pickle(path + result_path + 'CFRNN_res_400_ML_01_users')

In [190]:
def get_metrics(ranked_df, steps, max_rank):
    s = time.time()
    ranks_at = [1] + [i for i in range(steps, max_rank + steps, steps)]
    hitcounts = []
    recs_at = []
    precs_at = []
    metrics = pd.DataFrame(columns=['rank_at', 'hitcounts', 'recall', 'precision'])
    for rank in ranks_at:
        hitcount = 0
        for i, row in ranked_df.iterrows():
            hitcount +=  len(set(row['true_id']) & set(row['pred_items_ranked'][:rank]))

        prec_at = hitcount / rank / len(ranked_df)
        rec_at = hitcount / len(ranked_df.iloc[0]['true_id']) / len(ranked_df)

        hitcounts.append(hitcount)                     
        recs_at.append(rec_at)
        precs_at.append(prec_at)

    metrics['rank_at'] = ranks_at
    metrics['hitcounts'] = hitcounts
    metrics['recall'] = recs_at
    metrics['precision'] = precs_at
    print('Obtaining metrics time:', round(time.time() - s,2))
    return metrics

In [191]:
renames = {'pred_seq':'pred_items_ranked', 'true_seq':'true_id'}
predictions = predictions.rename(renames, axis=1)

In [192]:
import time

In [193]:
metrics = get_metrics(predictions, 5, 20)

Obtaining metrics time: 1.6


In [194]:
metrics

Unnamed: 0,rank_at,hitcounts,recall,precision
0,1,1,0.000615,0.000615
1,5,4,0.002462,0.000492
2,10,7,0.004308,0.000431
3,15,14,0.008615,0.000574
4,20,14,0.008615,0.000431


In [195]:
file_name

'Amazon_01_users'

In [197]:
metrics.to_pickle(path + 'Results/CFRNN/' + 'metrics_CFRNN_' + file_name)

# Appendix

In [None]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)

In [None]:
# One hot encoded input
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 