In [None]:
!pip install torch
!pip install torch-geometric

This notebook is to visualize the generation of the initial embedding and target embedding from lightGCN.

In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(level=logging.ERROR)

import numpy as np
import pandas as pd
import sys
import os

# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))

# Import modules from the utils folder
from utils.dataloader import DataLoader
from utils.data_split import train_test_split

In [19]:
movie_data = DataLoader(size="100k")
data = movie_data.load_ratings()
train_set, test_set = train_test_split(data)

In [20]:
training_ratings = train_set

In [21]:
len(training_ratings['item'].unique())

1609

In [22]:
# Calculate user degree
user_degree_df = training_ratings.groupby("user").size().reset_index(name="degree")

# Calculate item degree
item_degree_df = training_ratings.groupby("item").size().reset_index(name="degree")

# Determine the cutoff for top 50% users and items based on degree
user_cutoff = int(len(user_degree_df) * 0.5)
item_cutoff = int(len(item_degree_df) * 0.5)

# Get top 50% users and items
top_users_df = user_degree_df.nlargest(user_cutoff, 'degree')[['user']]
top_items_df = item_degree_df.nlargest(item_cutoff, 'degree')[['item']]

# Display the DataFrames
print("Top 50% Users DataFrame:")
print(top_users_df.shape)

print("\nTop 50% Items DataFrame:")
print(top_items_df.shape)

Top 50% Users DataFrame:
(471, 1)

Top 50% Items DataFrame:
(804, 1)


In [23]:
filtered_user_movie_rating = training_ratings[
    (training_ratings['user'].isin(top_users_df['user'])) &
    (training_ratings['item'].isin(top_items_df['item']))
]
filtered_user_movie_rating

Unnamed: 0,user,item,rating,timestamp
85,332,566,4,888360342
86,332,451,5,888360179
87,332,595,4,887938574
88,332,44,3,888360342
90,332,258,5,887916151
...,...,...,...,...
65546,436,427,3,887769105
65547,436,234,3,887769471
65548,436,187,5,887768982
65551,436,856,4,887769952


In [34]:
top_users_df.head()

Unnamed: 0,user
654,655
12,13
415,416
536,537
233,234


In [40]:
filtered_user_movie_rating['user'].unique()

array([332, 894, 500, 308, 883, 437, 533,  64, 766,  87, 588, 327, 650,
       200, 622, 892, 267, 452, 823, 328, 886, 303, 618, 389, 276, 524,
       897, 586, 425, 312, 535, 606, 854, 627, 311, 405, 851, 543, 450,
       833, 216,  83, 466, 345,   3, 102, 454, 632, 232,   6, 907, 788,
       774, 901, 600, 665,  26,  85,  11, 334, 429, 942, 119,  56, 320,
       197, 553,  82, 658, 299, 828, 329, 315, 291, 463, 807, 297, 752,
        60, 343, 880, 409, 399, 548,   8, 421, 577, 534, 293, 363, 483,
       871, 922, 213, 721, 295, 582,   1, 666, 313, 864, 877, 495, 848,
       347,  91, 275, 843,  70, 292, 301, 663, 145, 194, 881, 141, 174,
         7, 605, 870, 239, 908, 235, 221, 918, 934, 186,  42, 868, 109,
       839,  57, 835, 733,  25, 406, 484, 919, 339,  52, 458, 778, 536,
       378,  84,  62, 286, 361, 279,  13, 619, 224, 540, 659, 479, 177,
       501, 933, 457, 164, 249,  75, 537, 391, 115, 532,  90, 932, 867,
       878,  95, 397,  44,  43, 330, 168, 825, 875, 101, 417, 83

In [24]:
import torch
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../")))

from lightgcn_model import LightGCN, LightGCNModel

filtered_user_movie_rating = filtered_user_movie_rating[['user','item','rating']]

# Assuming filtered_user_movie_rating is already available
# and contains the columns 'user', 'item', and 'rating'.

# Split the filtered dataset into training and testing sets
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(filtered_user_movie_rating, test_size=0.3, random_state=42)

# Instantiate LightGCNModel with the filtered data
model = LightGCNModel(
    train_set=train_set,
    test_set=test_set,
    num_layers=3,        
    embedding_dim=64,    
    learning_rate=0.01,  
    epochs=500,          
    device='cpu'         
)

# Prepare training data (includes building graph and initializing model)
model.prepare_training_data()


with torch.no_grad():  
    user_embeddings, item_embeddings = model.model(model.adj_norm)

print("User Embeddings Shape:", user_embeddings.shape)  # Shape: (num_users, embedding_dim)
print("Item Embeddings Shape:", item_embeddings.shape)  # Shape: (num_items, embedding_dim)


User Embeddings Shape: torch.Size([471, 64])
Item Embeddings Shape: torch.Size([804, 64])


In [45]:
def reshuffle_target(initial_embeddings, target_item_embeddings, column):
    """
    Reorder target_item_embeddings to match the order of initial_embeddings based on a given column.
    
    Args:
        initial_embeddings (pd.DataFrame): DataFrame with initial embeddings and a column to determine order.
        target_item_embeddings (pd.DataFrame): DataFrame with target embeddings to be reshuffled.
        column (str): The column name to determine the matching order.
    
    Returns:
        pd.DataFrame: Reordered target_item_embeddings matching the order of initial_embeddings.
    """
    # Merge the two DataFrames on the given column
    reshuffled_embeddings = initial_embeddings[[column]].merge(
        target_item_embeddings,
        on=column,
        how='left'
    )
    return reshuffled_embeddings

# Example usage:
# initial_embeddings = pd.DataFrame({'userid': [1, 2, 3], 'embedding': [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]})
# target_item_embeddings = pd.DataFrame({'userid': [3, 1, 2], 'embedding': [[0.5, 0.6], [0.1, 0.2], [0.3, 0.4]]})
# reshuffled = reshuffle_target(initial_embeddings, target_item_embeddings, 'userid')
# print(reshuffled)


In [47]:
reshuffled_items = reshuffle_target(item_embeddings_df, target_item_embeddings, "item")
reshuffle_users = reshuffle_target(user_embeddings_df, target_user_embeddings, "user")

In [25]:
# Convert user_embeddings to DataFrame
target_user_embeddings = pd.DataFrame({
    'user': [model.idx2user[i] for i in range(user_embeddings.shape[0])],  # Map row indices to user IDs
    'embedding': user_embeddings.cpu().numpy().tolist()  # Convert tensor rows to lists
})

# Convert item_embeddings to DataFrame
target_item_embeddings = pd.DataFrame({
    'item': [model.idx2item[i] for i in range(item_embeddings.shape[0])],  # Map row indices to item IDs
    'embedding': item_embeddings.cpu().numpy().tolist()  # Convert tensor rows to lists
})

# Display the DataFrames
print("User Embeddings DataFrame:")
print(target_user_embeddings.shape)

print("\nItem Embeddings DataFrame:")
print(target_item_embeddings.shape)


User Embeddings DataFrame:
(471, 2)

Item Embeddings DataFrame:
(804, 2)


In [30]:
top_users_df.head

<bound method NDFrame.head of      user
654   655
12     13
415   416
536   537
233   234
..    ...
585   586
779   780
792   793
2       3
122   123

[471 rows x 1 columns]>

In [26]:
import torch.nn as nn

# Step 1: Define the number of users and embedding dimension
num_users = len(top_users_df['user'])  # Assuming user_degree_df contains the user IDs
num_items = len(top_items_df['item'])
embedding_dim = 64  # Specify embedding dimension

# Step 2: Initialize the embedding layer
init_user_embedding = nn.Embedding(num_users, embedding_dim)
init_item_embedding = nn.Embedding(num_items, embedding_dim)

# Step 3: Apply Xavier Uniform Initialization
nn.init.xavier_uniform_(init_user_embedding.weight)
nn.init.xavier_uniform_(init_item_embedding.weight)

# Step 4: Convert initialized embeddings into a DataFrame
user_ids = top_users_df['user'].values  # Extract user IDs from top_users_df
user_embeddings = init_user_embedding.weight.cpu().detach().numpy()  # Convert to numpy array
item_ids = top_items_df['item'].values  # Extract item IDs from top_users_df
item_embeddings = init_item_embedding.weight.cpu().detach().numpy()  # Convert to numpy array

# Create DataFrame
user_embeddings_df = pd.DataFrame({
    'user': user_ids,  # Map rows to user IDs
    'embedding': user_embeddings.tolist()  # Convert each embedding row to a list
})

item_embeddings_df = pd.DataFrame({
    'item': item_ids,  # Map rows to item IDs
    'embedding': item_embeddings.tolist()  # Convert each embedding row to a list
})

# Display the resulting DataFrame
print("User Embeddings DataFrame:")
print(user_embeddings_df.shape)

print("Item Embeddings DataFrame:")
print(item_embeddings_df.shape)


User Embeddings DataFrame:
(471, 2)
Item Embeddings DataFrame:
(804, 2)


In [29]:
print(user_embeddings_df.head)
print(item_embeddings_df.head)

<bound method NDFrame.head of      user                                          embedding
0     655  [0.10007774829864502, 0.018424294888973236, 0....
1      13  [-0.10390898585319519, 0.10543130338191986, 0....
2     416  [-0.022570259869098663, 0.04818075895309448, -...
3     537  [0.05264461040496826, -0.07918736338615417, -0...
4     234  [-0.07219883799552917, -0.004899948835372925, ...
..    ...                                                ...
466   586  [-0.09346890449523926, 0.014625325798988342, 0...
467   780  [0.024634823203086853, 0.0008009001612663269, ...
468   793  [0.025385022163391113, -0.0337277427315712, -0...
469     3  [-0.0256737619638443, -0.010066032409667969, 0...
470   123  [-0.01820867508649826, -0.0662456601858139, -0...

[471 rows x 2 columns]>
<bound method NDFrame.head of      item                                          embedding
0      50  [-0.08057637512683868, 0.06575184315443039, -0...
1     100  [-0.02456614375114441, -0.04976752772927284, -...


In [50]:
# Save to CSV

# Define the folder name
output_folder = "ml_gnn_ebd"

# Create the folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# Save the CSV files into the specified folder
user_embeddings_df.to_csv(os.path.join(output_folder, "initial_user_ebds.csv"), index=False)
item_embeddings_df.to_csv(os.path.join(output_folder, "initial_item_ebds.csv"), index=False)
reshuffle_users.to_csv(os.path.join(output_folder, "target_user_ebds.csv"), index=False)
reshuffled_items.to_csv(os.path.join(output_folder, "target_item_ebds.csv"), index=False)