In [None]:
import pandas as pd

#load data
data = pd.read_csv('Diginetica.csv')

#data preprocessing because we do not have timestamp column in this data, so created one from timeframe and eventdate
def preprocess_data(data):
    data['timestamp'] = pd.to_datetime(data['eventdate'] + ' ' + data['timeframe'].astype(str), errors='coerce')
    data = data.drop(columns=['eventdate', 'timeframe'])
    data = data.dropna(subset=['timestamp'])
    return data

data = preprocess_data(data)

#display first few rows of data
print(data.head())


  data['timestamp'] = pd.to_datetime(data['eventdate'] + ' ' + data['timeframe'].astype(str), errors='coerce')


    session_id  user_id  item_id           timestamp
5            1      NaN    33043 2016-09-05 17:39:12
13           2      NaN    32971 2016-09-05 18:27:59
22           5      NaN    35472 2016-09-05 12:10:36
41          13      NaN     3680 2016-05-04 14:48:50
51          15      NaN    58223 2016-05-04 10:44:26


In [None]:
pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.3
    Uninstalling networkx-3.3:
      Successfully uninstalled networkx-3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.0+cu121 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.0+cu121 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.0+cu121 requires nvidia-

In [None]:
import pandas as pd
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
from node2vec import Node2Vec
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam


#sot data by session_id and timestamp
data.sort_values(by=['session_id', 'timestamp'], inplace=True)

#encode session_id and item_id columns
encoder_session = LabelEncoder()
encoder_item = LabelEncoder()
data['session_id'] = encoder_session.fit_transform(data['session_id'])
data['item_id'] = encoder_item.fit_transform(data['item_id'])

#graph construction
graph = nx.Graph()
for session_id, group in data.groupby('session_id'):
    items = list(group['item_id'])
    for i in range(len(items) - 1):
        graph.add_edge(items[i], items[i + 1])


#generate walks
node2vec = Node2Vec(graph, dimensions=128, walk_length=10, num_walks=20, workers=4)

#model training
model = node2vec.fit(window=10, min_count=1, batch_words=4)

#get embeddings
embeddings = {}
for node in graph.nodes():
    try:
        embeddings[node] = model.wv[node]
    except KeyError:
        #if node is not present in the model vocabulary then assignd a random embedding
        embeddings[node] = np.random.uniform(-1, 1, model.vector_size)

def recommend(session_id, top_n=5):
    session_items = data[data['session_id'] == session_id]['item_id'].unique()
    session_embedding = np.mean([embeddings[item] for item in session_items if item in embeddings], axis=0)

    scores = {}
    for item in graph.nodes():
        if item not in session_items and item in embeddings:
            item_embedding = embeddings[item]
            scores[item] = np.sum(session_embedding * item_embedding)

    top_items = sorted(scores, key=scores.get, reverse=True)[:top_n]
    return [encoder_item.inverse_transform([item])[0] for item in top_items]

#test
test_session_id = 5
recommended_items = recommend(test_session_id)
print(f"Recommended items for session {test_session_id}: {recommended_items}")

#split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

#model defination
input_dim = len(encoder_item.classes_)
embedding_dim = 128
input_layer = Input(shape=(1,))
embedding_layer = Dense(embedding_dim, activation='relu')(input_layer)
output_layer = Dense(input_dim, activation='softmax')(embedding_layer)
model = Model(inputs=input_layer, outputs=output_layer)

model.compile(optimizer=Adam(), loss=SparseCategoricalCrossentropy())

#data for training
X_train = train_data['item_id']
y_train = train_data['item_id']

#data for testing
X_test = test_data['item_id']
y_test = test_data['item_id']

#reshape the input data
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

#model training
model.fit(X_train, X_train, epochs=10, batch_size=32, validation_data=(X_test, X_test), verbose=2)

#model evaluation
predicted_embeddings = model.predict(X_test)
predicted_items = [np.argmax(embedding) for embedding in predicted_embeddings]
actual_items = y_test.tolist()

#MRR calculation
ranks = []
for actual, predicted in zip(actual_items, predicted_items):
    # Check if the predicted item has embeddings
    if predicted < len(predicted_embeddings):
        rank = np.where(np.argsort(-predicted_embeddings[predicted]))[0][actual]
        ranks.append(rank)
mrr = np.mean([1.0 / (rank + 1) for rank in ranks])
print(f"Mean Reciprocal Rank (MRR): {mrr}")

#Recall@5 calculation
recall_at_5 = sum([1 for rank in ranks if rank < 5]) / len(ranks)
print(f"Recall@5: {recall_at_5}")


Computing transition probabilities:   0%|          | 0/256 [00:00<?, ?it/s]

  pid = os.fork()
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Recommended items for session 5: [378729, 381701, 377512, 376526, 5691]
Epoch 1/10
18/18 - 1s - loss: 41.6121 - val_loss: 38.5530 - 832ms/epoch - 46ms/step
Epoch 2/10
18/18 - 0s - loss: 19.9010 - val_loss: 40.4561 - 92ms/epoch - 5ms/step
Epoch 3/10
18/18 - 0s - loss: 12.0189 - val_loss: 43.5615 - 95ms/epoch - 5ms/step
Epoch 4/10
18/18 - 0s - loss: 9.0724 - val_loss: 48.9177 - 95ms/epoch - 5ms/step
Epoch 5/10
18/18 - 0s - loss: 8.0356 - val_loss: 51.7725 - 100ms/epoch - 6ms/step
Epoch 6/10
18/18 - 0s - loss: 7.6753 - val_loss: 54.2862 - 139ms/epoch - 8ms/step
Epoch 7/10
18/18 - 0s - loss: 7.5395 - val_loss: 56.8140 - 130ms/epoch - 7ms/step
Epoch 8/10
18/18 - 0s - loss: 7.5332 - val_loss: 58.7168 - 145ms/epoch - 8ms/step
Epoch 9/10
18/18 - 0s - loss: 7.5391 - val_loss: 60.7459 - 139ms/epoch - 8ms/step
Epoch 10/10
18/18 - 0s - loss: 7.4911 - val_loss: 63.1298 - 138ms/epoch - 8ms/step
Mean Reciprocal Rank (MRR): 0.14166666666666666
Recall@5: 0.5
