In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras
from collections import deque

In [15]:
import csv

# Define the file paths
data_path = 'yoochoose_dataset/yoochoose-clicks.dat'
output_path = 'yoochoose_dataset/filtered_clicks.dat'

# Open the input and output files
with open(data_path, 'r') as f_in, open(output_path, 'w', newline='') as f_out:
    reader = csv.reader(f_in, delimiter=',')
    writer = csv.writer(f_out, delimiter=',')
    
    session_dict = {}
    count = 0
    
    # Loop through the rows in the input file
    for row in reader:
        # Extract the session_id and item_id
        session_id = row[0]
        item_id = row[2]
        
        # Check if the session_id already exists in the dictionary
        if session_id in session_dict:
            # If it exists, append the item_id to the existing list
            session_dict[session_id].append(item_id)
        else:
            # If it doesn't exist, create a new list with the current item_id
            session_dict[session_id] = [item_id]
        
        # Check if the session length is at least 2
        if len(session_dict[session_id]) >= 2:
            # If it is, write the row to the output file
            writer.writerow(row)
            count += 1
        
        # Check if we've written 10000 rows to the output file
        if count == 10000:
            break
    print(f"Filtered {count} rows")

Filtered 10000 rows


In [7]:
# Load clickstream file
data = pd.read_csv('yoochoose_dataset/filtered_clicks.dat',
                   names=['session_id', 'timestamp', 'item_id', 'category'],
                   dtype={'session_id': 'int64', 'timestamp': 'str', 'item_id': 'int64', 'category': 'int64'},
                   parse_dates=['timestamp'])

In [4]:
# Create item and session maps
item_map = dict(zip(np.unique(data.item_id), range(len(np.unique(data.item_id)))))
session_map = dict(zip(np.unique(data.session_id), range(len(np.unique(data.session_id)))))

In [5]:
# Map item and session IDs
data['item_id'] = data['item_id'].map(item_map)
data['session_id'] = data['session_id'].map(session_map)

In [6]:
# Sort by session and timestamp
data = data.sort_values(['session_id', 'timestamp'])

In [7]:
# Create next item and session columns
data['next_item_id'] = data.groupby('session_id')['item_id'].shift(-1)
data['next_session_id'] = data.groupby('session_id')['session_id'].shift(-1)
data = data.dropna()

In [8]:
# Convert data to numpy arrays
session_ids = data['session_id'].values
item_ids = data['item_id'].values
next_item_ids = data['next_item_id'].values
next_session_ids = data['next_session_id'].values
timestamps = data['timestamp'].values

In [9]:
# Create graph
graph = nx.Graph()

In [10]:
# Add edges between items that co-occur in the same session
for session_id in np.unique(session_ids):
    items_in_session = item_ids[session_ids == session_id]
    for i in range(len(items_in_session)):
        for j in range(i + 1, len(items_in_session)):
            if not graph.has_edge(items_in_session[i], items_in_session[j]):
                graph.add_edge(items_in_session[i], items_in_session[j], weight=0)
            graph[items_in_session[i]][items_in_session[j]]['weight'] += 1

In [11]:
# Normalize edge weights
for u, v, d in graph.edges(data=True):
    d['weight'] /= np.sqrt(graph.degree(u) * graph.degree(v))

In [42]:
import scipy.sparse as sp

# Create adjacency matrix
adj_matrix = sp.coo_matrix(nx.to_numpy_array(graph, weight='weight', dtype=np.float32))
adj_matrix = tf.sparse.SparseTensor(indices=np.array([adj_matrix.row, adj_matrix.col]).T,
                                    values=adj_matrix.data,
                                    dense_shape=adj_matrix.shape)
print(adj_matrix.shape)

(2273, 2273)


In [34]:
# Define hyperparameters
num_nodes = adj_matrix.shape[0]
embedding_dim = 32
num_layers = 2
temperature = 0.07
learning_rate = 0.001
num_epochs = 10
batch_size = 128
num_classes = len(np.unique(item_ids))
node_ids = tf.keras.Input(shape=(num_nodes,), dtype=tf.int64)

In [35]:
class GNN(tf.keras.layers.Layer):
    def __init__(self, num_hidden=16, num_layers=2, num_classes=2, **kwargs):
        super().__init__(**kwargs)
        self.num_hidden = num_hidden
        self.num_layers = num_layers
        self.num_classes = num_classes
        
        # define dense layers
        self.dense_layers = []
        for i in range(num_layers):
            self.dense_layers.append(tf.keras.layers.Dense(num_hidden, activation="relu"))
            
        self.embedding = tf.keras.layers.Embedding(input_dim=num_nodes, output_dim=num_hidden)
        
        # define final classification layer
        self.classification_layer = tf.keras.layers.Dense(num_classes, activation="softmax")

        
    def call(self, inputs, **kwargs):
        node_ids, adj_matrix = inputs
        
        # create node embeddings
        x = self.embedding(node_ids)
        
        # apply dense layers
        for layer in self.dense_layers:
            # Transpose the feature matrix before multiplying with the adjacency matrix
            x = layer(tf.transpose(x, perm=[0, 2, 1]))
            x = tf.transpose(x, perm=[0, 2, 1])
            
            # apply dropout
            x = tf.keras.layers.Dropout(0.5)(x, training=kwargs.get("training", False))
            
            # apply skip connection
            x = x + self.embedding(node_ids)
            
            # apply normalization
            x = tf.keras.layers.BatchNormalization()(x)
            
            # apply activation
            x = tf.keras.activations.relu(x)
            
            # multiply with adjacency matrix
            x = tf.sparse.sparse_dense_matmul(adj_matrix, x)
            
        return x

In [36]:
# Define contrastive loss function
def contrastive_loss(y_true, y_pred, temperature):
    logits = tf.matmul(y_pred, tf.transpose(y_pred)) / temperature
    labels = tf.one_hot(tf.range(tf.shape(y_pred)[0]), tf.shape(y_pred)[0] * 2)
    mask = 1 - tf.eye(tf.shape(y_pred)[0], dtype=tf.int32)
    labels = tf.reshape(labels, (-1, tf.shape(y_pred)[0] * 2))
    mask = tf.reshape(mask, (-1,))
    labels = tf.boolean_mask(labels, mask)
    logits = tf.boolean_mask(logits, mask)
    loss = tf.nn.softmax_cross_entropy_with_logits(labels, logits)
    return tf.reduce_mean(loss)

In [37]:
# Create GNN model
gnn = GNN(embedding_dim, num_layers, num_classes)

In [38]:
# Define optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate)

In [39]:
print(adj_matrix.shape)

(2273, 2273)


In [40]:
# Train GNN model
for epoch in range(num_epochs):
    for i in range(0, len(session_ids), batch_size):
        batch_node_ids = session_ids[i:i + batch_size]
        # Extract the submatrix corresponding to the batch
        batch_adj_matrix = tf.sparse.slice(adj_matrix, [i, 0], [batch_size, adj_matrix.dense_shape[1]])

        # mask = tf.equal(batch_node_ids[:, tf.newaxis], adj_matrix.indices[:, 0])
        # mask = tf.transpose(mask)  # transpose to match the expected shape
        # indices = tf.boolean_mask(adj_matrix.indices, mask)
        # values = tf.boolean_mask(adj_matrix.values, mask)
        # batch_adj_matrix = tf.sparse.SparseTensor(
        #     indices=indices,
        #     values=values,
        #     dense_shape=[len(batch_node_ids), adj_matrix.dense_shape[1]]
        # )
        batch_features = gnn([batch_node_ids, batch_adj_matrix])
        batch_labels = tf.concat([batch_features, batch_features], axis=0)
        batch_loss = contrastive_loss(None, batch_labels, temperature)
        grads = tf.gradients(batch_loss, gnn.trainable_weights)
        optimizer.apply_gradients(zip(grads, gnn.trainable_weights))

InvalidArgumentError: Exception encountered when calling layer 'gnn_2' (type GNN).

{{function_node __wrapped__Transpose_device_/job:localhost/replica:0/task:0/device:CPU:0}} transpose expects a vector of size 2. But input(1) is a vector of size 3 [Op:Transpose]

Call arguments received by layer 'gnn_2' (type GNN):
  • inputs=['tf.Tensor(shape=(128,), dtype=int64)', 'SparseTensor(indices=tf.Tensor(\n[[  0   1]\n [  1   0]\n [  2   3]\n ...\n [127 135]\n [127 136]\n [127 137]], shape=(3365, 2), dtype=int64), values=tf.Tensor([1.         1.         0.07332356 ... 0.15430336 0.15430336 0.15430336], shape=(3365,), dtype=float32), dense_shape=tf.Tensor([ 128 2273], shape=(2,), dtype=int64))']
  • kwargs={'training': 'None'}

In [29]:
# Train GNN model
for epoch in range(num_epochs):
    batch_size = 64
    num_batches = int(np.ceil(len(session_ids) / batch_size))

    for i in range(num_batches):
        # get batch node ids
        batch_node_ids = session_ids[i:i + batch_size]

        # assuming adj_matrix is a sparse tensor
        dense_adj_matrix = tf.sparse.to_dense(adj_matrix)
        dense_adj_matrix = tf.convert_to_tensor(dense_adj_matrix, dtype=tf.float32)
        print(dense_adj_matrix)
        print(adj_matrix)

        indices = tf.where(tf.not_equal(dense_adj_matrix, 0))
        values = tf.gather_nd(dense_adj_matrix, indices)

        # construct sparse tensor from indices, values and dense shape
        adj = tf.SparseTensor(indices, values, dense_shape=dense_adj_matrix.shape)
        node_ids = np.random.choice(adj_matrix.shape[0], size=batch_size, replace=False)
        
        def get_batch(node_ids, adj_matrix):
            node_ids = np.random.choice(adj_matrix.shape[0], size=batch_size, replace=False)
            min_id = tf.reduce_min(node_ids)
            max_id = tf.reduce_max(node_ids)
            batch_adj_matrix = tf.sparse.slice(adj_matrix, start=[min_id, 0], size=[max_id - min_id + 1, adj_matrix.shape[1]])
            return node_ids, batch_adj_matrix
        batch_node_ids, batch_adj_matrix = get_batch(node_ids, adj_matrix)
        batch_emb = gnn([batch_node_ids, batch_adj_matrix], training=None).numpy()
        batch_labels = tf.concat([batch_emb, batch_emb], axis=0)
        train_step(batch_labels)
        batch_loss = contrastive_loss(None, batch_labels, temperature)
        grads = tf.gradients(batch_loss, gnn.trainable_weights)
        optimizer.apply_gradients(zip(grads, gnn.trainable_weights))

tf.Tensor(
[[0.        1.        0.        ... 0.        0.        0.       ]
 [1.        0.        0.        ... 0.        0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]
 ...
 [0.        0.        0.        ... 0.2       0.5163978 0.       ]
 [0.        0.        0.        ... 0.5163978 0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.25     ]], shape=(2273, 2273), dtype=float32)
SparseTensor(indices=tf.Tensor(
[[   0    1]
 [   1    0]
 [   2    3]
 ...
 [2272 1478]
 [2272 1479]
 [2272 2272]], shape=(30744, 2), dtype=int64), values=tf.Tensor([1.         1.         0.07332356 ... 0.70710677 0.70710677 0.25      ], shape=(30744,), dtype=float32), dense_shape=tf.Tensor([2273 2273], shape=(2,), dtype=int64))


InvalidArgumentError: Exception encountered when calling layer 'gnn_1' (type GNN).

{{function_node __wrapped__SparseTensorDenseMatMul_device_/job:localhost/replica:0/task:0/device:CPU:0}} Cannot multiply A and B because inner dimension does not match: 2273 vs. 64.  Did you forget a transpose?  Dimensions of A: [2132, 2273).  Dimensions of B: [64,32] [Op:SparseTensorDenseMatMul]

Call arguments received by layer 'gnn_1' (type GNN):
  • inputs=['tf.Tensor(shape=(64,), dtype=int64)', 'SparseTensor(indices=tf.Tensor(\n[[   0   85]\n [   0   86]\n [   0   87]\n ...\n [2131 2236]\n [2131 2237]\n [2131 2238]], shape=(27910, 2), dtype=int64), values=tf.Tensor([0.14285715 0.14285715 0.14285715 ... 0.03571429 0.03571429 0.03571429], shape=(27910,), dtype=float32), dense_shape=tf.Tensor([2132 2273], shape=(2,), dtype=int64))']
  • kwargs={'training': 'None'}

In [30]:
# Define session-based recommender system with reinforcement learning
class RecommenderSystem:
    def __init__(self, gnn, item_map, gamma=0.9, alpha=0.1):
        self.gnn = gnn
        self.item_map = item_map
        self.gamma = gamma
        self.alpha = alpha
        self.replay_buffer = deque(maxlen=10000)
        self.session_history = []

    def recommend(self, session_items):
        session_node_ids = [self.item_map[item] for item in session_items if item in self.item_map]
        if len(session_node_ids) == 0:
            return []
        session_adj_matrix = tf.sparse.SparseTensor(indices=adj_matrix.indices,
                                                    values=adj_matrix.values[np.isin(adj_matrix.row, session_node_ids)],
                                                    dense_shape=adj_matrix.dense_shape)
        session_features = gnn([session_node_ids, session_adj_matrix])
        item_scores = np.matmul(session_features, gnn.embedding.weights[0].numpy().T)
        item_scores[np.isin(np.arange(len(item_map)), session_node_ids)] = -np.inf
        item_indices = np.argsort(item_scores)[::-1]
        return [item_map[i] for i in item_indices[:10]]

    def update(self, session_items, reward):
        session_node_ids = [self.item_map[item] for item in session_items if item in self.item_map]
        if len(session_node_ids) == 0:
            return
        session_adj_matrix = tf.sparse.SparseTensor(indices=adj_matrix.indices,
                                                    values=adj_matrix.values[np.isin(adj_matrix.row, session_node_ids)],
                                                    dense_shape=adj_matrix.dense_shape)
        session_features = gnn([session_node_ids, session_adj_matrix])
        item_scores = np.matmul(session_features, gnn.embedding.weights[0].numpy().T)
        item_indices = np.argsort(item_scores)[::-1]
        item_probs = np.exp(item_scores) / np.sum(np.exp(item_scores))
        item_probs[np.isin(np.arange(len(item_map)), session_node_ids)] = 0
        item_probs = item_probs / np.sum(item_probs)
        item_rewards = np.zeros(len(item_map))
        item_rewards[item_indices[:10]] = reward
        self.replay_buffer.append((session_features.numpy(), item_probs, item_rewards))
        self.session_history.append(session_node_ids)

        if len(self.replay_buffer) == self.replay_buffer.maxlen:
            for i in range(self.replay_buffer.maxlen):
                session_features, item_probs, item_rewards = self.replay_buffer[i]
                discounted_rewards = np.zeros(len(item_map))
                running_reward = 0
                for j in range(len(item_map)):
                    if item_rewards[j] != 0:
                        running_reward = item_rewards[j]
                    else:
                        running_reward = running_reward * self.gamma
                    discounted_rewards[j] = running_reward
                item_values = np.sum(np.exp(np.matmul(session_features, gnn.embedding.weights[0].numpy().T)) * discounted_rewards,axis=1)
                item_grads = tf.gradients(tf.math.log(item_probs), gnn.trainable_weights, grad_ys=item_values)
                optimizer.apply_gradients(zip(item_grads, gnn.trainable_weights))

            self.replay_buffer.clear()
            self.session_history.clear()

In [31]:
# Evaluate the performance of the GNN using precision, recall, and mean squared error
buy_events = pd.read_csv('yoochoose_dataset/yoochoose-buys.dat', header=None, usecols=[0, 2, 3], names=['session_id', 'item_id', 'Price'])
buy_events = buy_events[buy_events['item_id'].isin(item_map.keys())].groupby('session_id')['item_id'].apply(set).reset_index()
buy_events['item_id'] = buy_events['item_id'].apply(lambda x: list(x))

test_clickstream = data.sample(frac=0.2)
test_clickstream['item_id'] = test_clickstream['item_id'].apply(lambda x: item_map.get(x, -1))
test_clickstream = test_clickstream[test_clickstream['item_id'] != -1]

# test_sessions = test_clickstream.groupby('session_id')['item_id'].apply(list).reset_index()
test_sessions = test_clickstream.groupby('session_id', group_keys=False)['item_id'].apply(list).reset_index()


test_session_graphs = []
for session_items in test_sessions['item_id']:
    session_nodes = set(session_items)
    session_adj_matrix = adj_matrix.copy()
    session_adj_matrix = tf.sparse.SparseTensor(indices=session_adj_matrix.indices,
                                                values=session_adj_matrix.values[
                                                    np.isin(session_adj_matrix.row, session_nodes)],
                                                dense_shape=session_adj_matrix.dense_shape)
    session_features = gnn([session_nodes, session_adj_matrix])
    test_session_graphs.append(session_features.numpy())
print(test_session_graphs)
precision = 0
recall = 0
mse = 0
item_map_inv = {v: k for k, v in item_map.items()}
for i, session_graph in enumerate(test_session_graphs):
    session_items = test_sessions['item_id'][i]
    item_scores = np.matmul(session_graph, gnn.embedding.weights[0].numpy().T)
    item_indices = np.argsort(item_scores)[::-1]
    recommended_items = item_indices[:10]
    recommended_items = [item_map_inv[x] for x in recommended_items if x in item_map_inv]
    purchased_items = buy_events[buy_events['session_id'] == test_sessions['session_id'][i]]['item_id'].iloc[0]
    true_positives = set(recommended_items).intersection(purchased_items)
    precision += len(true_positives) / len(recommended_items)
    recall += len(true_positives) / len(purchased_items)
    mse += mean_squared_error(item_scores, [1 if x in purchased_items else 0 for x in range(len(item_map))]).numpy()
if len(test_session_graphs) > 0:
    precision /= len(test_session_graphs)
    recall /= len(test_session_graphs)
    mse /= len(test_session_graphs)
else:
    precision, recall, mse = 0.0, 0.0, 0.0
print('Precision:', precision)
print('Recall:', recall)
print('MSE:', mse)

[]
Precision: 0.0
Recall: 0.0
MSE: 0.0
