In [42]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import sparse
import os
import random

In [43]:
tf.__version__

'2.1.0'

# Read Data
- all datasets datetime sorted

In [44]:
# path = 'C:/Users/robin.opdam/Google Drive/Thesis (Msc)/Thesis_shared_files/'
path = '/Users/Robin/Google Drive/Thesis (Msc)/Thesis_shared_files/'
# path = './'

## Amazon Fashion 

In [45]:
data_path = 'data/Amazon/'
# file_name = 'Amazon_full' 
# file_name = 'Amazon_05_users' 
# file_name = 'Amazon_01_users'
# file_name = 'Amazon_005_users'
file_name = 'Amazon_001_users'

## MovieLens 

In [46]:
# data_path = 'data/ML/'
# file_name = 'ML_full' 
# file_name = 'ML_05_users'
# file_name = 'ML_01_users'
# file_name = 'ML_005_users'
# file_name = 'ML_001_users'

In [47]:
df = pd.read_pickle(path + data_path + file_name)
df.head()

Unnamed: 0,user,item,datetime,rating,item_id,user_id
5282697,A1QP7FHEZRZ1LZ,B00HMXYKMM,2017-02-08,5.0,168923,238328
8819596,A1QP7FHEZRZ1LZ,B01A88MEV6,2017-04-10,5.0,335203,238328
3335926,A1QP7FHEZRZ1LZ,B007900UZY,2017-04-10,5.0,77864,238328
6822032,A1QP7FHEZRZ1LZ,B00RI9TL7E,2017-04-10,4.0,245981,238328
4500311,A1QP7FHEZRZ1LZ,B00DBUVIVQ,2017-04-10,5.0,131704,238328


---
# Data Prep
1. Each (relatively) ordered item sequence per user will be viewed as one time series
2. **Sequences that do not have the *right* size, will be padded/truncated for now**
3. Each batch consists out of BATCH_SIZE users sequences



## Prep Functions

In [48]:
def leave_users_out(full_data, leave_out):
    full_data['index'] = full_data.index
    user_index_df = full_data.groupby('user')['index'].apply(list)
    users = np.random.choice(list(user_index_df.index), leave_out, replace=False)
    users_indices = []
    
    for user in users:
        users_indices.extend(user_index_df.loc[user])
    
    sub_set = full_data.loc[users_indices]
    remaining = full_data.drop(users_indices)
    
    return remaining.drop(columns=['index']), sub_set.drop(columns=['index'])

In [49]:
def leave_last_x_out(full_data, n_users, leave_out=1, seed=1234):
    # Input: data must contain user_id
    # Output: full_data = without all entries last entries in leave one out set
    #         leave_one_out_set = data with one user and one item from full_data
    
    full_data['index'] = full_data.index
    user_items_ind = full_data.groupby('user_id')['index'].apply(list)
    np.random.RandomState(seed)
    users = np.random.choice(list(user_items_ind.index), n_users*3, replace=False)
    leave_out_indices = []
    
    user_counter = 0
    indices = user_items_ind.loc[users[user_counter]]
    while user_counter < n_users and len(indices) < leave_out:
        to_leave_out_indices = indices[- leave_out:]
        leave_out_indices.extend(to_leave_out_indices)
        
        user_counter += 1
        indices = user_items_ind.loc[users[user_counter]]
    
    leave_out_set = full_data.loc[leave_out_indices]
    full_data_leave_one_out = full_data.drop(leave_out_indices)
    
    return full_data_leave_one_out.drop(columns=['index']), leave_out_set.drop(columns=['index'])

## Create new ids

In [50]:
df['user_id'] = df.user.astype('category').cat.codes
df['item_id'] = df.item.astype('category').cat.codes

## Train Test Split

### Determine Sizes
- Batch Size
- Test Size
- Validation Size

In [51]:
BATCH_SIZE = 25

In [52]:
df_og = df

users_to_remove = len(df_og.user_id.unique())%BATCH_SIZE
df, delete = leave_users_out(df_og, users_to_remove)

In [53]:
total_users = len(df.user_id.unique())
total_items = len(df_og.item_id.unique())

test_users = int(0.1*total_users) # Number of users to be used for testing
test_last_items = 1 # Items to be removed from test users in train set and used in test set

val_users = int(0.1*total_users) -1
val_last_items = 1

### Create Split

In [54]:
train_set, test_set = leave_last_x_out(df, test_users, test_last_items)
train_set, val_set = leave_last_x_out(train_set, val_users, val_last_items)

In [55]:
print('Total number of items:', total_items)
print('Total users:', total_users)
print('Number of train users:', len(train_set.user_id.unique()))
print('Number of test users:', test_users)
print('Number of validation users:', val_users)

Total number of items: 61616
Total users: 12125
Number of train users: 12125
Number of test users: 1212
Number of validation users: 1211


---
## Train and Target sequences
Create the **sequences** from the item_ids per user (already sorted)

In [56]:
def get_x_y_sequences(dataset, n_unknowns_in_y=1, stats=True):
    user_sequences_x = []
    user_sequences_y = []
    lengths = []
    
    for u in dataset.user_id.unique():
        user_item_seq = np.array(df[df['user_id']==u]['item_id'])
        user_sequences_x.append(user_item_seq[:-n_unknowns_in_y])
        user_sequences_y.append(user_item_seq[n_unknowns_in_y:])
        lengths.append(len(user_item_seq))
    
    median = np.median(lengths)
    
    if stats:
        print('Number of sequences x:', len(user_sequences_x), 
              '\nAvg sequence length x:', np.average(lengths),
              '\nStd_dev sequence length x:', np.round(np.std(lengths),2),
              '\nMedian of sequence length x:', median)

    return user_sequences_x, user_sequences_y, median

In [57]:
user_sequences_x, user_sequences_y, median = get_x_y_sequences(train_set, 1)

Number of sequences x: 12125 
Avg sequence length x: 8.705896907216495 
Std_dev sequence length x: 6.69 
Median of sequence length x: 7.0


---
## Padding
- **Using: Median, Mean or Min**

- add zeros if they are too short
- remove item ids from the beginning if they are too long

In [58]:
def pad_sequences(sequences, max_length, stats=True):
    padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=int(max_length), padding='post', truncating='pre')
    if stats:
        print('number of sequences:', padded_sequences.shape[0], 
              '\navg sequence length:', np.average([i.shape[0] for i in padded_sequences]),
              '\nstd_dev sequence length:', np.std([i.shape[0] for i in padded_sequences]))
        
    return padded_sequences

In [59]:
max_seq_length = median
padded_sequences_x = pad_sequences(user_sequences_x, max_seq_length)
padded_sequences_y = pad_sequences(user_sequences_y, max_seq_length, stats=False)

number of sequences: 12125 
avg sequence length: 7.0 
std_dev sequence length: 0.0


---
## Create Dataset
- sequences_x inputs
- sequences_y actuals
- batches of size BATCH_SIZE

In [60]:
sequences_data_x = tf.data.Dataset.from_tensor_slices(padded_sequences_x) 
sequences_data_y = tf.data.Dataset.from_tensor_slices(padded_sequences_y) 
dataset = tf.data.Dataset.zip((sequences_data_x, sequences_data_y))
dataset

<ZipDataset shapes: ((7,), (7,)), types: (tf.int32, tf.int32)>

In [61]:
for input_example, target_example in  dataset.take(1).as_numpy_iterator():
    print ('Input data:', input_example)
    print ('Target data:', target_example)

Input data: [29969 55489 15358 41759     0     0     0]
Target data: [55489 15358 41759 24154     0     0     0]


In [62]:
dataset = dataset.batch(BATCH_SIZE, drop_remainder=False)
for i, o in dataset.take(1).as_numpy_iterator():
    print('input:', i.shape, '\n\noutput:', o.shape)

input: (25, 7) 

output: (25, 7)


---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks

- paper: https://arxiv.org/pdf/1608.07400.pdf
- code:https://github.com/rdevooght/sequence-based-recommendations (in Theano)


## Model Prep

### model architecture

In [63]:
def build_model(total_items, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(total_items, embedding_dim,
                                  batch_input_shape=[batch_size, None]),
        
        tf.keras.layers.LSTM(units=rnn_units,
                             return_sequences=True,
                             stateful=False, #Reset cell states with each batch
                             recurrent_initializer='glorot_uniform'),
        
        tf.keras.layers.Dense(total_items)
    ])
    return model

---
### build model

In [64]:
embedding_dim = 100
rnn_units = 20

In [65]:
model = build_model(
total_items = total_items,
embedding_dim = embedding_dim,
rnn_units = rnn_units,
batch_size = BATCH_SIZE)

---
### Add Loss
**Added one hot encoding of the labels to match logits output after dense layer**

In [66]:
def loss(labels, logits):
    oh_labels = tf.keras.backend.one_hot(tf.dtypes.cast(labels, tf.int32), total_items)
    return tf.keras.losses.categorical_crossentropy(oh_labels, logits, from_logits=True)

model.compile(optimizer='Adagrad', loss=loss)

---
### Try Model

In [67]:
dataset.take(1)

<TakeDataset shapes: ((None, 7), (None, 7)), types: (tf.int32, tf.int32)>

In [68]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_preds = model(input_example_batch)
    print(example_batch_preds.shape, "# (batch_size, sequence_length, total_items)")

(25, 7, 61616) # (batch_size, sequence_length, total_items)


In [69]:
example_batch_loss = loss(target_example_batch, example_batch_preds)

In [70]:
example_batch_preds.shape

TensorShape([25, 7, 61616])

In [71]:
example_batch_loss.shape

TensorShape([25, 7])

---
**model summmary**

In [72]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (25, None, 100)           6161600   
_________________________________________________________________
lstm_1 (LSTM)                (25, None, 20)            9680      
_________________________________________________________________
dense_1 (Dense)              (25, None, 61616)         1293936   
Total params: 7,465,216
Trainable params: 7,465,216
Non-trainable params: 0
_________________________________________________________________


---
## Train Model

### Configure Checkpoints

In [73]:
# Directory where the checkpoints will be saved
## Laptop
# checkpoint_dir = path + 'Results/rnn_train_checkpoints_' + file_name
checkpoint_dir = './rnn_train_checkpoints'
## Online GPU
# checkpoint_dir = path + 'rnn_train_checkpoints' + file_name

# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = checkpoint_prefix,
    save_weights_only = True)

---
**Fit Model**

In [74]:
epochs = 2
dataset

<BatchDataset shapes: ((None, 7), (None, 7)), types: (tf.int32, tf.int32)>

In [75]:
history = model.fit(dataset, epochs=epochs, callbacks=[checkpoint_callback])

Train for 485 steps
Epoch 1/4000
Epoch 2/4000
 27/485 [>.............................] - ETA: 1:14 - loss: 10.6056

KeyboardInterrupt: 

---
## Continue training from checkpoint

In [0]:
model.summary()

In [0]:
tf.train.latest_checkpoint(checkpoint_dir)
model = build_model(n_items, embedding_dim, rnn_units, batch_size=100)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.compile(optimizer='Adagrad', loss=loss)

In [0]:
aditional_epochs = 1

In [0]:
model.fit(dataset, epochs=aditional_epochs, callbacks=[checkpoint_callback])

---
# Predict Sequences

## Restore Latest Checkpoints

In [0]:
tf.train.latest_checkpoint(checkpoint_dir)

In [0]:
model = build_model(n_items, embedding_dim, rnn_units, batch_size=1)

model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
                   
model.build(tf.TensorShape([1, None]))

In [0]:
model.summary()

---
## Create Predictions

**Using test_df**

In [0]:
print('test users:', len(test_df.user_id.unique()))

In [0]:
predictions_df = pd.DataFrame(columns=['user', 'pred_seq', 'true_seq'])

In [None]:
rank_at = 20

In [0]:
temperature = 1.0

for u in test_df.user_id.unique(): #Note: Can use multiprocessing for this
    generated_predictions = []
    user_item_seq = np.array(test_df[test_df['user_id']==u]['item_id'])
    half_test_seq = user_item_seq[:int(len(user_item_seq)/2)]
    half_test_seq = half_test_seq.reshape(-1,1).transpose()
    other_half = user_item_seq[int(len(user_item_seq)/2):]

    #Predict
    for item in range(rank_athalf_test_seq[0]: #could be any number of recommended items you want to predict
        predictions = model(half_test_seq)
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        half_test_seq = np.append(half_test_seq, predicted_id).reshape(-1,1).transpose()

        half_test_seq = tf.expand_dims([predicted_id], 0)
        generated_predictions.append(predicted_id)
        
    predictions_df = predictions_df.append({'user':u, 'pred_seq':generated_predictions, 'true_seq':other_half}, ignore_index=True)

In [0]:
len(predictions_df.iloc[0]['true_seq'])

# Evaluate

In [0]:
result_path = 'Results/CFRNN/'
predictions = pd.read_pickle(path + result_path + 'CFRNN_ml_6k_users_700_epochs')

In [0]:
list(predictions)

In [0]:
def evaluate(predictions, max_rank, steps):
    hitcounts = []
    recs_at = []
    precs_at = []
    ranks_at = [1] + [i for i in range(steps, max_rank + steps, steps)]
    res = pd.DataFrame(columns=['rank_at', 'hitcounts', 'recall', 'precision'])
    for rank in ranks_at:
        hitcount = 0
        for i, row in predictions.iterrows():
            for true_item in row.true_seq[:rank]:
                if true_item in row.pred_seq:
                    hitcount += 1
        for u in ranked_df.index:
            hitcount +=  len(set(predictions.loc[u]['true_id']) & set(ranked_df.loc[u]['pred_items_ranked'][:rank]))

        prec_at = hitcount / rank / len(ranked_df)
        rec_at = hitcount / len(ranked_df.iloc[0]['true_id']) / len(ranked_df)

        print('rank_at:', rank, '  Hits:', hitcount)
        hitcounts.append(hitcount)                     
        recs_at.append(rec_at)
        precs_at.append(prec_at)

    res['rank_at'] = ranks_at
    res['hitcounts'] = hitcounts
    res['recall'] = recs_at
    res['precision'] = precs_at


In [0]:
hitcount = 0
for i, row in predictions.iterrows():
    for true_item in row.true_seq:
        if true_item in row.pred_seq:
            hitcount += 1

In [0]:
hitcount

# Appendix

In [0]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)

In [0]:
# One hot encoded input
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 