In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from scipy import sparse
import os
import random

In [2]:
# Show the TensorFlow version this notebook runs against.
tf.__version__

'2.1.0'

# Read Data

In [3]:
# Root folder that holds the data directory. Override it per machine with the
# RECSYS_DATA_ROOT environment variable instead of editing this cell; the
# original location stays the default.
path = os.environ.get('RECSYS_DATA_ROOT', 'C:/Users/robin.opdam/Dropbox/')
# The load cell builds the file location from `path` and `data_path`, so make
# sure the root always ends with a separator.
if not path.endswith(('/', '\\')):
    path += '/'
# path = '/Users/Robin/Dropbox'
data_path = 'data/'

## Amazon Fashion

In [4]:
#Full data
# file_name = 'amazon_clothing_shoes_jewelry_data' 

#2m user above 5 ratings
# file_name = 'amazon_csj_2m'

#0.63m user above 5 ratings
# file_name = 'df_amazon_csj_with_styles_0.63m_u_above_5_rui' 

## MovieLens

In [5]:
# Full data
# file_name = 'ml-25m'

# 2m subset
# file_name = '2m-ml'

# 0.7m subset
# file_name = 'ml_0.7_u_above_5'

## Time-sorted MovieLens datasets

In [None]:
# 25m rows, sorted per user by datetime (the active choice)
file_name = 'ml-25m_sorted_u_dt'

# 0.7m rows, sorted per user by datetime
# file_name = 'ml_07m_sorted'

In [None]:
# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that were produced by a trusted source.
# os.path.join copes with a `path` that has no trailing separator, which plain
# string concatenation does not.
df = pd.read_pickle(os.path.join(path, data_path, file_name))
df.head()

In [None]:
# Map the raw user / item identifiers onto integer category codes.
for raw_column, code_column in (('user', 'user_id'), ('item', 'item_id')):
    df[code_column] = df[raw_column].astype('category').cat.codes

## Leave users out 

In [None]:
def leave_users_out(full_data, leave_out):
    """Split a ratings frame by holding out all rows of randomly chosen users.

    Parameters
    ----------
    full_data : pandas.DataFrame
        Frame with a 'user' column. It is not modified.
    leave_out : int
        Number of distinct users to hold out. Users are drawn without
        replacement using numpy's global random state.

    Returns
    -------
    (remaining, sub_set) : tuple of pandas.DataFrame
        `remaining` holds every row of the users that were not drawn,
        `sub_set` holds every row of the held-out users. Both are independent
        copies that keep the original row order, index and columns.

    Raises
    ------
    ValueError
        Raised by numpy when `leave_out` exceeds the number of distinct users.
    """
    # Same candidate list (sorted group keys) as grouping the rows per user,
    # but without building a Python list of row labels for every user.
    candidate_users = list(full_data.groupby('user').size().index)
    held_out_users = np.random.choice(candidate_users, leave_out, replace=False)

    # A boolean mask avoids writing a helper 'index' column into the caller's
    # frame and also works when the frame's index is not unique.
    held_out_mask = full_data['user'].isin(held_out_users)

    # Explicit copies so callers can add columns to either part safely.
    return full_data[~held_out_mask].copy(), full_data[held_out_mask].copy()

In [None]:
# leftovers, new_df = leave_users_out(df, 4500)

# Data Exploration

First filtering active users and rated items with x or more ratings:

In [None]:
# Number of ratings per user and per item.
user_ratings = df.groupby('user')['rating'].count()
item_ratings = df.groupby('item')['rating'].count()

# Mean number of ratings per user / per item.
norpu, norpi = user_ratings.mean(), item_ratings.mean()

# Distinct users and items, and the share of empty user-item cells.
total_users = len(df['user'].unique())
total_items = len(df['item'].unique())
sparseness = 1 - len(df) / (total_users * total_items)

In [None]:
# Summary table. `sparseness` is a fraction, so it is scaled by 100 before
# being printed next to the '%' sign.
print('rows ', len(df),
      '\n#ratings', len(df[df['rating'] != 0]),
      '\n#ratings/user', round(norpu,2),
      '\n#ratings/item', round(norpi,2),
      '\naverage rating', "{0:.2f}".format(np.average(df['rating'])),
      '\n#users ', df['user'].unique().size,
      '\n#items ', df['item'].unique().size,
      '\nsparse ', round(100 * sparseness, 3), '%')

# savefig does not create missing folders, so create the output folder first.
os.makedirs('Plots/Deliverables', exist_ok=True)

# Number of histogram bins, shared by both count distributions and quoted in
# their titles so the two can not drift apart.
hist_bins = 2000

df.hist(column='rating', bins=5, grid=False)
plt.title('Rating Distribution')
plt.xlabel('Rating')
plt.xticks(range(1,6))
plt.savefig('Plots/Deliverables/rating_dist_ml')
plt.show()

# x axis: how many ratings an item received; y axis: how many items have
# that many ratings. Only the 0-100 range is shown.
plt.hist(item_ratings, bins = hist_bins)
plt.xlim([0,100])
plt.title('#ratings per item distribution ({} bins)'.format(hist_bins))
plt.xlabel('#ratings per item')
plt.ylabel('Number of items')
plt.savefig('Plots/Deliverables/#ratings_per_item_dist_ml')
plt.show()

# x axis: how many ratings a user gave; y axis: how many users gave that
# many ratings. Only the 0-50 range is shown.
plt.hist(user_ratings, bins = hist_bins)
plt.xlim([0,50])
plt.title('#ratings per user distribution ({} bins)'.format(hist_bins))
plt.xlabel('#ratings per user')
plt.ylabel('Number of users')
plt.savefig('Plots/Deliverables/#ratings_per_user_dist_ml')
plt.show()

---
# LSTM Model
Collaborative Filtering with Recurrent Neural Networks

- paper: https://arxiv.org/pdf/1608.07400.pdf
- code: https://github.com/rdevooght/sequence-based-recommendations (in Theano)

## Data Prep

Hold out `test_users` users as the **test df**; the remaining users form the **train df**

In [None]:
# Keep the full frame reachable under a second name: the next cell rebinds
# `df` to the training split. This is an alias, not a copy, so in-place
# changes made through either name before that rebinding affect both.
df_og = df

In [None]:
# Number of users whose complete history is held out for testing.
test_users = 500
df, test_df = leave_users_out(df_og, test_users)

# item_id is deliberately NOT re-encoded on the training split. The codes were
# assigned on the full frame, test_df keeps them, and n_items below is counted
# on the full frame. Re-encoding only the training rows would shift the codes
# whenever an item occurs for held-out users only, so the same id would mean
# different items in train and test.
# Re-encoding user_id is harmless: it is only used to group rows per user.
df['user_id'] = df.user_id.astype('category').cat.codes
n_items = len(df_og.item_id.unique())

print('total number of items:', n_items)
print('total users:', len(df_og.user_id.unique()))
print('number of train users:', len(df.user_id.unique()))
print('number of test users:', test_users)

---
### Train and Target sequences
Create the **sequences** from the item_ids per user (already sorted)

In [None]:
# Per user: inputs are all items but the last, targets are the same sequence
# shifted one step ahead (all items but the first).
user_sequences_x = []
user_sequences_y = []

# Full sequence length per user (inputs and targets are one element shorter).
lengths = []

# A single groupby pass replaces one boolean filter over the whole frame per
# user. sort=False yields users in order of first appearance, the same order
# as df.user_id.unique(), and rows keep their original order inside a group.
for u, user_rows in df.groupby('user_id', sort=False):
    user_item_seq = np.array(user_rows['item_id'])
    user_sequences_x.append(user_item_seq[:-1])
    user_sequences_y.append(user_item_seq[1:])
    lengths.append(len(user_item_seq))
print('number of sequences x:', len(user_sequences_x), 
      '\navg sequence length x:', np.average(lengths),
      '\nstd_dev sequence length x:', np.round(np.std(lengths),2))

In [None]:
# Input sequence of the first user (every item except the last one).
user_sequences_x[0]

In [None]:
# Target sequence of the first user (every item except the first one).
user_sequences_y[0]

---
### Padding
**pad** the sequences (needed for rectangular tf.data.Dataset):
- add zeros if they are too short
- remove item ids from the beginning if they are too long

In [None]:
# Every sequence is brought to exactly this many steps.
max_length = 250

# Shared settings: pad at the end, cut surplus items from the start.
# NOTE(review): the padding value is the pad_sequences default (presumably 0);
# if item ids are zero-based codes it collides with a real item - TODO confirm.
pad_options = dict(maxlen=max_length, padding='post', truncating='pre')
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences

padded_sequences_x = pad_sequences(user_sequences_x, **pad_options)
padded_sequences_y = pad_sequences(user_sequences_y, **pad_options)

# Row lengths after padding, used only for the report below.
padded_lengths_x = [row.shape[0] for row in padded_sequences_x]
print('number of sequences x:', padded_sequences_x.shape[0], 
      '\navg sequence length x:', np.average(padded_lengths_x),
      '\nstd_dev sequence length x:', np.std(padded_lengths_x))


---
### Create Dataset
**create batch dataset**
- sequences_x inputs
- sequences_y actuals
- batches of size BATCH_SIZE

In [None]:
# Alternative kept for reference: one-hot encoded inputs and targets
# sequences_data_x = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_x, n_items)) 
# sequences_data_y = tf.data.Dataset.from_tensor_slices(tf.keras.backend.one_hot(padded_sequences_y, n_items)) 

# Integer-id sequences: one dataset element per user, for inputs and targets.
sequences_data_x, sequences_data_y = (
    tf.data.Dataset.from_tensor_slices(padded)
    for padded in (padded_sequences_x, padded_sequences_y)
)

# Pair every input sequence with its target sequence.
dataset = tf.data.Dataset.zip((sequences_data_x, sequences_data_y))
dataset

In [None]:
# Print the first (input, target) pair of the dataset as numpy arrays.
for input_example, target_example in  dataset.take(1).as_numpy_iterator():
    print ('Input data: ', input_example)
    print ('Target data:', target_example)

In [None]:
# Number of user sequences per batch.
BATCH_SIZE = 100
# Rebinds `dataset` to its batched version; drop_remainder=False keeps a
# final batch that may be smaller than BATCH_SIZE.
# NOTE(review): this cell is not idempotent - running it a second time would
# batch the already batched dataset again.
dataset = dataset.batch(BATCH_SIZE, drop_remainder=False)
# Print the shapes of the first batch of inputs and targets.
for i, o in dataset.take(1).as_numpy_iterator():
    print('input:', i.shape, '\n\noutput:', o.shape)

--- 
## Model Prep

**model architecture**

In [None]:
def build_model(n_items, embedding_dim, rnn_units, batch_size):
    """Assemble the Embedding -> LSTM -> Dense next-item model.

    Parameters
    ----------
    n_items : int
        Size of the embedding vocabulary and of the output layer.
    embedding_dim : int
        Length of each item embedding vector.
    rnn_units : int
        Number of units in the LSTM layer.
    batch_size : int
        Batch dimension of the model's input shape; the sequence dimension
        is left open.

    Returns
    -------
    tf.keras.Sequential
        Uncompiled model that emits, for every time step, one value per item
        from a dense layer without activation.
    """
    item_embedding = tf.keras.layers.Embedding(
        n_items,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    sequence_encoder = tf.keras.layers.LSTM(
        units=rnn_units,
        return_sequences=True,   # one output per time step
        stateful=False,          # cell states are reset with each batch
        recurrent_initializer='glorot_uniform',
    )
    item_scores = tf.keras.layers.Dense(n_items)

    return tf.keras.Sequential([item_embedding, sequence_encoder, item_scores])

---
**build model**

In [None]:
# Length of each item embedding vector.
embedding_dim = 100
# Number of units in the LSTM layer.
rnn_units = 20

In [None]:
# Training model: same hyperparameters as configured above, batched input.
model_config = {
    'n_items': n_items,
    'embedding_dim': embedding_dim,
    'rnn_units': rnn_units,
    'batch_size': BATCH_SIZE,
}
model = build_model(**model_config)

---
### Add Loss
<br>
The labels are integer item ids, while the dense layer outputs one logit per item.
<br>
**The loss therefore scores each label against the logits as a one-hot target, so the dimensions match.**

In [None]:
def loss(labels, logits):
    """Per-position cross-entropy between integer item labels and logits.

    Gives the same values as one-hot encoding the labels and applying
    categorical cross-entropy, for labels inside [0, n_items). The sparse
    variant avoids building a dense one-hot tensor with one entry per
    batch element, time step and item.

    Parameters
    ----------
    labels : integer tensor of item ids, one per batch element and time step.
    logits : unnormalised scores with one extra, trailing item dimension.

    Returns
    -------
    Tensor with one loss value per label.
    """
    int_labels = tf.dtypes.cast(labels, tf.int32)
    return tf.keras.losses.sparse_categorical_crossentropy(int_labels, logits, from_logits=True)

model.compile(optimizer='Adagrad', loss=loss)

---
## Try Model

In [None]:
# Display the dataset object restricted to its first batch.
dataset.take(1)

In [None]:
# Forward pass on the first batch as a shape check. The loop variables stay
# defined afterwards and are used by the following cells.
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_preds = model(input_example_batch)
    print(example_batch_preds.shape, "# (batch_size, sequence_length, n_items)")

In [None]:
# Apply the custom loss to the example batch and its predictions.
example_batch_loss = loss(target_example_batch, example_batch_preds)

In [None]:
# Shape of the prediction tensor for the example batch.
example_batch_preds.shape

In [None]:
# Shape of the loss tensor for the example batch.
example_batch_loss.shape

---
**model summary**

In [None]:
# Layer-by-layer overview of the training model.
model.summary()

---
## Train Model

---
**Configure Checkpoints**

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './rnn_train_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

---
**Fit Model**

In [None]:
# Number of epochs passed to model.fit in the next cell.
epochs = 1
# Echo the dataset object that is used for training.
dataset

In [None]:
# Train on the batched dataset with the checkpoint callback attached; the
# return value of fit is kept in `history`.
history = model.fit(dataset, epochs=epochs, callbacks=[checkpoint_callback])

---
# Predict Sequences

## Restore Latest Checkpoints

In [None]:
# Show which checkpoint in checkpoint_dir is treated as the latest one.
tf.train.latest_checkpoint(checkpoint_dir)

In [None]:
# Resolve the checkpoint once and fail with a clear message when there is
# none, instead of passing None on to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir))

# Same architecture as the training model, but with a batch dimension of 1 so
# a single user sequence can be fed at prediction time.
model = build_model(n_items, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

In [None]:
# Layer-by-layer overview of the restored prediction model.
model.summary()

---
## Try Predictions

**Test user**

In [None]:
# Empty frame for the results: one row per user with the generated item ids
# ('pred_seq') and the held-out items ('true_seq').
predictions_df = pd.DataFrame(columns=['user', 'pred_seq', 'true_seq'])

In [None]:
# The logits are divided by this value before sampling.
temperature = 1.0

# Result rows are collected in a list and turned into a frame once at the
# end. Appending to a DataFrame row by row copies the frame every time, and
# DataFrame.append no longer exists in recent pandas releases.
prediction_records = []

# One groupby pass instead of filtering the whole frame per user; sort=False
# keeps the users in order of first appearance.
for u, user_rows in test_df.groupby('user_id', sort=False): #Note: Can use multiprocessing for this
    generated_predictions = []
    user_item_seq = np.array(user_rows['item_id'])
    split_at = len(user_item_seq) // 2
    # First half seeds the model (shape (1, steps)), second half is the truth.
    half_test_seq = user_item_seq[:split_at].reshape(1, -1)
    other_half = user_item_seq[split_at:]

    #Predict as many items as there are in the seed half
    for _ in range(split_at):
        predictions = model(half_test_seq)
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        # Sample per time step and keep the sample for the last step.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
        # The LSTM is not stateful, so the whole history including every
        # prediction so far is fed again on the next step. Replacing the
        # input with just the last prediction would discard all context.
        half_test_seq = np.append(half_test_seq, predicted_id).reshape(1, -1)
        generated_predictions.append(predicted_id)

    prediction_records.append({'user':u, 'pred_seq':generated_predictions, 'true_seq':other_half})

# Rebuilt from scratch on every run, so re-running the cell does not
# accumulate duplicate rows.
predictions_df = pd.DataFrame(prediction_records, columns=['user', 'pred_seq', 'true_seq'])

In [None]:
# Step-by-step walk-through of the generation loop for a single user.
# NOTE(review): the user id is hard-coded; confirm it is part of the current
# (randomly drawn) test split, otherwise the sequence below is empty.
u = 659
# Fresh list: reusing the list left over from the previous cell would extend
# the predictions already stored for another user.
generated_predictions = []

user_item_seq = np.array(test_df[test_df['user_id']==u]['item_id'])
split_at = len(user_item_seq) // 2
half_test_seq = user_item_seq[:split_at].reshape(1, -1)
other_half = user_item_seq[split_at:]

#Predict as many items as there are in the seed half
for _ in range(split_at):
    predictions = model(half_test_seq)
    predictions = tf.squeeze(predictions, 0)

    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
    # Feed the full history plus the new prediction on the next step.
    half_test_seq = np.append(half_test_seq, predicted_id).reshape(1, -1)
    print('User:', u)
    print(half_test_seq, predicted_id)
    generated_predictions.append(predicted_id)

# Add the walk-through result as one more row; pd.concat is used because
# DataFrame.append no longer exists in recent pandas releases.
single_user_row = pd.DataFrame([{'user':u, 'pred_seq':generated_predictions, 'true_seq':other_half}])
predictions_df = pd.concat([predictions_df, single_user_row], ignore_index=True)

In [None]:
# Length of the held-out sequence stored in the first row of predictions_df.
len(predictions_df.iloc[0]['true_seq'])

# Appendix

In [None]:
# oh_input = tf.keras.backend.one_hot(padded, n_items)
# e = tf.keras.layers.Embedding(n_items, 100, input_length=max_l)