In [None]:
!pip install torch
!pip install torch-geometric

This notebook is to visualize the generation of the initial embedding and target embedding from lightGCN.

In [4]:
%load_ext autoreload
%autoreload 2

In [16]:
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(level=logging.ERROR)

import numpy as np
import pandas as pd
import sys
import os

# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

# Import modules from the utils folder
from utils.dataloader import DataLoader
from utils.data_split import train_test_split

In [17]:
movie_data = DataLoader(size="100k")
data = movie_data.load_ratings()

In [19]:
# Calculate user degree
user_degree_df = data.groupby("user").size().reset_index(name="degree")

# Calculate item degree
item_degree_df = data.groupby("item").size().reset_index(name="degree")

# Determine the cutoff for top 50% users and items based on degree
user_cutoff = int(len(user_degree_df) * 0.5)
item_cutoff = int(len(item_degree_df) * 0.5)

# Get top 50% users and items
top_users_df = user_degree_df.nlargest(user_cutoff, 'degree')[['user']]
top_items_df = item_degree_df.nlargest(item_cutoff, 'degree')[['item']]

# Display the DataFrames
print("Top 50% Users DataFrame:")
print(top_users_df.shape)

print("\nTop 50% Items DataFrame:")
print(top_items_df.shape)

Top 50% Users DataFrame:
(471, 1)

Top 50% Items DataFrame:
(841, 1)


In [21]:
filtered_user_movie_rating = data[
    (data['user'].isin(top_users_df['user'])) &
    (data['item'].isin(top_items_df['item']))
]
filtered_user_movie_rating

Unnamed: 0,user,item,rating,timestamp
1,186,302,3,891717742
3,244,51,2,880606923
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
...,...,...,...,...
99994,378,78,3,880056976
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795


In [24]:
import torch
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))

from lightgcn_model import LightGCN, LightGCNModel

filtered_user_movie_rating = filtered_user_movie_rating[['user','item','rating']]

# Assuming filtered_user_movie_rating is already available
# and contains the columns 'user', 'item', and 'rating'.

# Split the filtered dataset into training and testing sets
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(filtered_user_movie_rating, test_size=0.3, random_state=42)

# Instantiate LightGCNModel with the filtered data
model = LightGCNModel(
    train_set=train_set,
    test_set=test_set,
    num_layers=3,        
    embedding_dim=64,    
    learning_rate=0.01,  
    epochs=50,          
    device='cpu'         
)

# Prepare training data (includes building graph and initializing model)
model.prepare_training_data()


with torch.no_grad():  
    user_embeddings, item_embeddings = model.model(model.adj_norm)

print("User Embeddings Shape:", user_embeddings.shape)  # Shape: (num_users, embedding_dim)
print("Item Embeddings Shape:", item_embeddings.shape)  # Shape: (num_items, embedding_dim)


User Embeddings Shape: torch.Size([471, 64])
Item Embeddings Shape: torch.Size([841, 64])


In [25]:
# Convert user_embeddings to DataFrame
target_user_embeddings = pd.DataFrame({
    'user': [model.idx2user[i] for i in range(user_embeddings.shape[0])],  # Map row indices to user IDs
    'embedding': user_embeddings.cpu().numpy().tolist()  # Convert tensor rows to lists
})

# Convert item_embeddings to DataFrame
target_item_embeddings = pd.DataFrame({
    'item': [model.idx2item[i] for i in range(item_embeddings.shape[0])],  # Map row indices to item IDs
    'embedding': item_embeddings.cpu().numpy().tolist()  # Convert tensor rows to lists
})

# Display the DataFrames
print("User Embeddings DataFrame:")
print(target_user_embeddings.head())

print("\nItem Embeddings DataFrame:")
print(target_item_embeddings.head())


User Embeddings DataFrame:
   user                                          embedding
0   846  [0.020096536725759506, 0.016458189114928246, -...
1   712  [-0.017245786264538765, -0.022235743701457977,...
2   295  [-0.022194115445017815, 0.024604925885796547, ...
3   524  [-0.009617995470762253, 0.0221116840839386, 0....
4   456  [-0.0067113605327904224, -0.00901773851364851,...

Item Embeddings DataFrame:
   item                                          embedding
0   229  [0.021624302491545677, -0.0043231043964624405,...
1   739  [0.01615786924958229, 0.011852799914777279, 0....
2   790  [0.014538457617163658, 0.019471891224384308, 0...
3   494  [-0.019489113241434097, 0.0063668289221823215,...
4    80  [0.004681231454014778, 0.01722409389913082, 0....


In [26]:
import torch.nn as nn

# Step 1: Define the number of users and embedding dimension
num_users = len(user_degree_df['user'])  # Assuming user_degree_df contains the user IDs
num_items = len(item_degree_df['item'])
embedding_dim = 64  # Specify embedding dimension

# Step 2: Initialize the embedding layer
init_user_embedding = nn.Embedding(num_users, embedding_dim)
init_item_embedding = nn.Embedding(num_items, embedding_dim)

# Step 3: Apply Xavier Uniform Initialization
nn.init.xavier_uniform_(init_user_embedding.weight)
nn.init.xavier_uniform_(init_item_embedding.weight)

# Step 4: Convert initialized embeddings into a DataFrame
user_ids = user_degree_df['user'].values  # Extract user IDs from user_degree_df
user_embeddings = init_user_embedding.weight.cpu().detach().numpy()  # Convert to numpy array
item_ids = item_degree_df['item'].values  # Extract item IDs from item_degree_df
item_embeddings = init_item_embedding.weight.cpu().detach().numpy()  # Convert to numpy array

# Create DataFrame
user_embeddings_df = pd.DataFrame({
    'user': user_ids,  # Map rows to user IDs
    'embedding': user_embeddings.tolist()  # Convert each embedding row to a list
})

item_embeddings_df = pd.DataFrame({
    'item': item_ids,  # Map rows to item IDs
    'embedding': item_embeddings.tolist()  # Convert each embedding row to a list
})

# Display the resulting DataFrame
print("User Embeddings DataFrame:")
print(user_embeddings_df.head)

print("Item Embeddings DataFrame:")
print(item_embeddings_df.head)


User Embeddings DataFrame:
<bound method NDFrame.head of      user                                          embedding
0       1  [0.06949957460165024, 0.025013238191604614, 0....
1       2  [-0.018330782651901245, -0.006095901131629944,...
2       3  [-0.030082087963819504, -0.07420439273118973, ...
3       4  [0.023689530789852142, -0.02303868904709816, -...
4       5  [0.02666112780570984, 0.07092168182134628, 0.0...
..    ...                                                ...
938   939  [-0.021430332213640213, -0.0345788411796093, 0...
939   940  [0.04179903864860535, 0.07008939236402512, -0....
940   941  [-0.04155192896723747, -0.013902410864830017, ...
941   942  [-0.06330128759145737, 0.03262082487344742, 0....
942   943  [-0.01836545765399933, 0.003199160099029541, 0...

[943 rows x 2 columns]>
Item Embeddings DataFrame:
<bound method NDFrame.head of       item                                          embedding
0        1  [0.009058002382516861, -0.0286041758954525, -0...
1    

In [28]:
# Save to CSV

# Define the folder name
output_folder = "ml_gnn_ebd"

# Create the folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Save the CSV files into the specified folder
user_embeddings_df.to_csv(os.path.join(output_folder, "initial_user_ebds.csv"), index=False)
item_embeddings_df.to_csv(os.path.join(output_folder, "initial_item_ebds.csv"), index=False)
target_user_embeddings.to_csv(os.path.join(output_folder, "target_user_ebds.csv"), index=False)
target_item_embeddings.to_csv(os.path.join(output_folder, "target_item_ebds.csv"), index=False)