# Module Imports

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Conv1D, Dropout, Dense, GlobalAveragePooling1D, Lambda
import tensorflow.keras.backend as K
import tensorflow_probability as tfp
tfd = tfp.distributions
tfpl = tfp.layers

random.seed(42)
np.random.seed(42)

# Load and Process Data

In [2]:
# Load MovieLens dataset
data_path = 'ml-100k/u.data'
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
data = pd.read_csv(data_path, sep='\t', names=column_names)

In [3]:
movies = pd.read_csv('ml-100k/movies.csv', sep=',')

In [4]:
# Preprocess the dataset
user_ids = data['user_id'].unique()
item_ids = data['item_id'].unique()
uid2idx = {uid: idx for idx, uid in enumerate(user_ids)}
iid2idx = {iid: idx for idx, iid in enumerate(item_ids)}
idx2uid = {idx: uid for uid, idx in uid2idx.items()}
idx2iid = {idx: iid for iid, idx in iid2idx.items()}
data['user_id'] = data['user_id'].map(uid2idx)
data['item_id'] = data['item_id'].map(iid2idx)
movies['item_id'] = movies['movieId'].map(iid2idx)

In [5]:
# Sort data by timestamp
train_data = data.sort_values(by='timestamp')

In [6]:
def generate_sequences_and_labels(data, user_ids, item_ids, seq_length, num_items):
    sequences, labels = [], []

    for user_id in user_ids:
        user_data = data[data['user_id'] == user_id]
        user_items = user_data['item_id'].values.tolist()
        
        # Create sequences of fixed length for each user
        for i in range(len(user_items) - seq_length):
            sequences.append(user_items[i:i + seq_length])
            
            label = np.zeros(num_items)
            if user_items[i + seq_length] in user_items[i:i + seq_length]:
                label[user_items[i + seq_length]] = 1  # Positive feedback
            else:
                label[user_items[i + seq_length]] = 0  # Negative feedback
            labels.append(label)

    sequences = np.array(sequences)
    labels = np.array(labels)
    return sequences, labels

In [7]:
seq_length = 5  # Adjust this value based on your requirements
train_sequences, train_labels = generate_sequences_and_labels(train_data, user_ids, item_ids, seq_length, len(item_ids))

# Visualize Data

In [8]:
data.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,0,3,881250949
1,1,1,3,891717742
2,2,2,1,878887116
3,3,3,2,880606923
4,4,4,1,886397596


In [9]:
print('Number of users: ', len(user_ids))
print('Number of items: ', len(item_ids))
print('Number of interactions: ', len(data))
print('Average rating: ', data['rating'].mean())
print('Start Date: ', pd.to_datetime(data['timestamp'].min(), unit='s'))
print('End Date: ', pd.to_datetime(data['timestamp'].max(), unit='s'))

Number of users:  943
Number of items:  1682
Number of interactions:  100000
Average rating:  3.52986
Start Date:  1997-09-20 03:05:10
End Date:  1998-04-22 23:10:38


# NextItNet Model

In [10]:
def create_nextitnet_model(num_items, embedding_dim, num_layers, kernel_size, dropout_rate):
    input_layer = Input(shape=(None,))
    x = Embedding(num_items, embedding_dim)(input_layer)
    
    for _ in range(num_layers):
        x = Conv1D(embedding_dim, kernel_size, padding='causal', activation='relu')(x)
        x = Dropout(dropout_rate)(x, training=True)  # Enable dropout during inference for uncertainty estimation

    x = GlobalAveragePooling1D()(x)
    output_layer = Dense(num_items, activation='sigmoid')(x)
    
    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    return model

In [11]:
num_items = len(item_ids)
embedding_dim = 64
num_layers = 3
kernel_size = 3
dropout_rate = 0.5

nextitnet = create_nextitnet_model(num_items, embedding_dim, num_layers, kernel_size, dropout_rate)
nextitnet.compile(optimizer='adam', loss='binary_crossentropy')

In [12]:
history = nextitnet.fit(train_sequences, train_labels, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Uncertainty Score

In [42]:
def estimate_uncertainty(model, user_sequence, num_samples=100):
    user_sequence = np.expand_dims(user_sequence, axis=0)
    predictions = np.stack([model.predict(user_sequence) for _ in range(num_samples)], axis=-1)
    means = np.mean(predictions, axis=-1)
    variances = np.var(predictions, axis=-1)
    return means, variances

def get_movie_titles(recommendations, movies_df):
    movie_titles = []
    for index in recommendations:
        movie_title = movies_df.loc[index, 'title']
        movie_titles.append(movie_title)
    return movie_titles

In [43]:
# Grab X random item ids
test_sequence_length = 3
user_sequence = list(np.random.choice(item_ids, test_sequence_length))  # A sample user sequence
means, variances = estimate_uncertainty(nextitnet, user_sequence)

k = 5
top_k_recommendations = np.argsort(means[0])[-k:][::-1]  # Get top 5 recommended items
top_k_uncertainties = variances[0][top_k_recommendations]  # Get corresponding uncertainty scores



In [44]:
top_k_movie_titles = get_movie_titles(top_k_recommendations, movies)
for i in range(len(top_k_movie_titles)):
    print(top_k_movie_titles[i], 'Uncertainty Score: ', top_k_uncertainties[i]) # this doesnt work

Love and Other Catastrophes (1996) Uncertainty Score:  1.502689e-06
Bread and Chocolate (Pane e cioccolata) (1973) Uncertainty Score:  8.9743696e-07
Newton Boys, The (1998) Uncertainty Score:  6.807371e-07
Murder, My Sweet (1944) Uncertainty Score:  7.035755e-07
BASEketball (1998) Uncertainty Score:  7.783811e-07


# NLL Loss

In [29]:
def nll_loss(y_true, y_pred):
    dist = tfp.distributions.Categorical(logits=y_pred)
    return -K.mean(dist.log_prob(tf.argmax(y_true, axis=-1)))

# Hit Rate

In [36]:
def hit_rate_metric(k):
    def hit_rate(y_true, y_pred):
        top_k_items = tf.nn.top_k(y_pred, k=k).indices
        
        # Convert top_k_items to int64 to match the type of y_true
        top_k_items = tf.cast(top_k_items, tf.int64)

        # Get the relevant items (the ones with a 1 in the true labels)
        relevant_items = tf.where(y_true == 1)
        relevant_item_indices = relevant_items[:, 1]

        # Broadcast relevant_item_indices to match the shape of top_k_items
        relevant_item_indices = tf.broadcast_to(tf.expand_dims(relevant_item_indices, -1), tf.shape(top_k_items))

        # Count the number of hits
        hits = K.cast(K.equal(relevant_item_indices, top_k_items), 'float32')

        # Compute the hit rate
        return K.mean(hits)

    hit_rate.__name__ = f'hit_rate@{k}'
    return hit_rate

# Modified NextItNet Model

In [72]:
train_sequences, train_labels

(array([[608, 652, 157,  72, 261],
        [652, 157,  72, 261, 104],
        [157,  72, 261, 104, 719],
        ...,
        [652, 420, 157, 608, 117],
        [420, 157, 608, 117, 184],
        [157, 608, 117, 184, 585]]),
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]))

In [25]:
def create_mod_nextitnet_model(num_items, embedding_dim, num_layers, kernel_size, dropout_rate, num_samples=100):
    input_layer = Input(shape=(None,))
    x = Embedding(num_items, embedding_dim)(input_layer)

    for _ in range(num_layers):
        x = Conv1D(embedding_dim, kernel_size, padding='causal', activation='relu')(x)
        x = Dropout(dropout_rate)(x, training=True)  # Enable dropout during inference for uncertainty estimation

    x = GlobalAveragePooling1D()(x)
    x = Dense(num_items * 2)(x)
    x = tfp.layers.IndependentNormal(num_items)(x)

    def sample_and_softmax(distribution):
        samples = distribution.sample(num_samples)  # shape: (num_samples, batch_size, num_items)
        samples = tf.transpose(samples, perm=[1, 0, 2])  # shape: (batch_size, num_samples, num_items)
        softmax_samples = tf.nn.softmax(samples, axis=-1)  # Apply softmax
        return tf.reduce_mean(softmax_samples, axis=1)  # Average the softmax values across samples
    
    output_layer = Lambda(sample_and_softmax)(x)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)
    return model

In [39]:
nextitnetmod = create_mod_nextitnet_model(num_items, embedding_dim, num_layers, kernel_size, dropout_rate, num_samples=100)
nextitnetmod.compile(optimizer='adam', loss=nll_loss,) #metrics=[hit_rate_metric(5), hit_rate_metric(10), hit_rate_metric(20)])

In [40]:
history2 = nextitnetmod.fit(train_sequences, train_labels, epochs=10, batch_size=128, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [65]:
user_sequence = list(np.random.choice(item_ids, test_sequence_length))
print(user_sequence)
yhat = nextitnetmod.predict(np.expand_dims(user_sequence, axis=0))

[1415, 1016, 1562]


In [66]:
yhat

array([[1.0000000e+00, 8.1224203e-12, 4.1030261e-12, ..., 1.9453430e-12,
        1.6006552e-12, 2.1300719e-12]], dtype=float32)

In [67]:
top_k_rec = np.argsort(yhat[0])[-k:][::-1] 
top_k_movie_titles = get_movie_titles(top_k_rec, movies)
for i in range(len(top_k_movie_titles)):
    print(top_k_movie_titles[i], 'Uncertainty Score: ', yhat[0][top_k_rec[i]])

Toy Story (1995) Uncertainty Score:  1.0
Kramer vs. Kramer (1979) Uncertainty Score:  2.6732146e-11
Great Expectations (1998) Uncertainty Score:  2.5019714e-11
Game, The (1997) Uncertainty Score:  2.4002338e-11
What Happened Was... (1994) Uncertainty Score:  2.3460229e-11
