In [36]:
from utils.dataloader import DataLoader as myDataLoader
import torch
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import numpy as np
import networkx as nx

In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Params settings

In [65]:
class Settings():
    batch_size = 64
    epochs = 20

    embedding_size = 256
    learning_rate = 0.003
    
    # 100k dataset
    num_users = 943
    num_items = 1682

    # Transformer encoder
    dropout_rate = 0
    num_heads = 4
    d_ff = 4
    num_blocks = 2


    negative_num = 99
    # checkpoint_path_user_task = './Checkpoint/user_task/'
    # checkpoint_path_item_task = './Checkpoint/item_task/'
    verbose = 1

    hidden_dim = 256
    user_epoch = 5
    item_epoch = 25

    second_user_epoch = 10
    second_item_epoch = 10

    third_user_epoch = 10
    third_item_epoch = 10

    train_user_dataset = './models/gnn_embedding/ml_gnn_ebd/initial_user_ebds.csv'
    train_item_dataset = './models/gnn_embedding/ml_gnn_ebd/initial_item_ebds.csv'
    valid_user_dataset = './models/gnn_embedding/ml_gnn_ebd/target_user_ebds.csv'
    valid_item_dataset = './models/gnn_embedding/ml_gnn_ebd/target_item_ebds.csv'

    dataset_size = '100k'

    # set device
    if torch.cuda.is_available():
        print("Using CUDA (Nvidia GPU)")
        device = torch.device('cuda')
    else:
        print("CUDA not available, using CPU")
        device = torch.device('cpu')


settings = Settings()

CUDA not available, using CPU


## Data loading and searching for 1st 2nd 3rd order neighbours

In [66]:
# load the target USER embedding
# ['userid', 'embedding']
initial_user_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/initial_user_ebds.csv"
initial_item_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/initial_item_ebds.csv"
target_user_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/target_user_ebds.csv"
target_item_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/target_item_ebds.csv"

initial_user_embedding_df = pd.read_csv(initial_user_embedding_path)
initial_item_embedding_df = pd.read_csv(initial_item_embedding_path)  
target_user_embedding_df = pd.read_csv(target_user_embedding_path)
target_item_embedding_df = pd.read_csv(target_item_embedding_path)

# load the rating 
# ['userId', 'movieId', 'rating', 'timestamp']
ratings_df = pd.read_csv("./data/ml-100k/u.data", sep="\t",header=None, names=["user", "item", "rating", "timestamp"])

filtered_user_ids = target_user_embedding_df['user'].unique()
filtered_item_ids = target_item_embedding_df['item'].unique()

filtered_ratings_df = ratings_df[
    (ratings_df['user'].isin(filtered_user_ids)) &
    (ratings_df['item'].isin(filtered_item_ids))
]

filtered_ratings_user_df = filtered_ratings_df[filtered_ratings_df['user'].isin(filtered_user_ids)]
filtered_ratings_item_df = filtered_ratings_df[filtered_ratings_df['item'].isin(filtered_item_ids)]

In [67]:
len(filtered_user_ids)

471

In [68]:
len(filtered_item_ids)

804

In [69]:
user_graph = nx.Graph()
user_graph.add_nodes_from(filtered_user_ids, bipartite="user")
user_graph.add_nodes_from(filtered_ratings_user_df['item'].unique(), bipartite="item")
user_graph.add_edges_from(zip(filtered_ratings_user_df['user'], filtered_ratings_user_df['item']))

item_graph = nx.Graph()
item_graph.add_nodes_from(filtered_item_ids, bipartite="item")
item_graph.add_nodes_from(filtered_ratings_item_df['user'].unique(), bipartite="user")
item_graph.add_edges_from(zip(filtered_ratings_item_df['item'], filtered_ratings_item_df['user']))


print("Nodes in user_graph:", user_graph.nodes())
print("Edges in user_graph:", user_graph.edges())
print("Nodes in item_graph:", item_graph.nodes())
print("Edges in item_graph:", item_graph.edges())

Nodes in user_graph: [648, 622, 7, 871, 927, 796, 59, 363, 518, 472, 70, 174, 151, 216, 846, 417, 361, 484, 385, 409, 271, 488, 435, 711, 301, 447, 454, 806, 634, 391, 416, 160, 429, 290, 451, 553, 828, 42, 561, 56, 268, 89, 178, 436, 100, 838, 748, 313, 286, 62, 267, 293, 432, 712, 776, 342, 94, 608, 177, 25, 514, 373, 85, 442, 14, 497, 896, 222, 715, 393, 234, 758, 197, 790, 254, 624, 753, 679, 727, 297, 532, 92, 43, 815, 279, 483, 49, 843, 763, 621, 18, 559, 161, 119, 437, 235, 44, 825, 224, 831, 164, 1, 144, 354, 102, 379, 698, 805, 660, 336, 653, 269, 932, 457, 707, 455, 682, 899, 38, 864, 823, 567, 84, 378, 536, 374, 466, 500, 537, 207, 907, 655, 223, 860, 63, 249, 936, 643, 387, 793, 328, 130, 23, 554, 650, 311, 239, 877, 318, 345, 125, 880, 152, 798, 505, 924, 778, 721, 751, 398, 299, 357, 37, 605, 76, 615, 918, 566, 897, 221, 592, 833, 600, 493, 795, 504, 606, 280, 619, 54, 862, 804, 749, 881, 495, 562, 296, 870, 666, 840, 757, 206, 276, 331, 919, 347, 69, 453, 887, 6, 684, 61

In [70]:
def get_order_neighbors_bipartite(graph, node, max_order=3, is_user=True):
    neighbors = {1: set(graph.neighbors(node))}
    for order in range(2, max_order + 1):
        neighbors[order] = set()
        for neighbor in neighbors[order - 1]:
            neighbors[order].update(graph.neighbors(neighbor))

        # Enforce bipartite structure: alternate between users and items
        if order % 2 == 0:  
            neighbors[order] = {n for n in neighbors[order] if (n in filtered_user_ids if is_user else n in filtered_item_ids)}
        else:
            neighbors[order] = {n for n in neighbors[order] if (n in filtered_item_ids if is_user else n in filtered_user_ids)}

    return [list(neighbors[i]) for i in range(1, max_order + 1)]

def compute_user_neighbors(user_graph, target_user_ids, target_embeddings_df):
    data = []
    for user_id in target_user_ids:
        neighbors = get_order_neighbors_bipartite(user_graph, user_id, 3, True)
        embedding = target_embeddings_df.loc[target_embeddings_df['user'] == user_id, 'embedding'].values[0]
        data.append({
            'userid': user_id,
            '1st_order': neighbors[0],
            '2nd_order': neighbors[1],
            '3rd_order': neighbors[2],
            'oracle_embedding': embedding
        })

    return pd.DataFrame(data)

def compute_item_neighbors(item_graph, target_item_ids, target_embeddings_df):
    data = []
    for item_id in target_item_ids:
        neighbors = get_order_neighbors_bipartite(item_graph, item_id, 3, False)
        embedding = target_embeddings_df.loc[target_embeddings_df['item'] == item_id, 'embedding'].values[0]
        data.append({
            'itemid': item_id,
            '1st_order': neighbors[0],
            '2nd_order': neighbors[1],
            '3rd_order': neighbors[2],
            'oracle_embedding': embedding
        })

    return pd.DataFrame(data)

In [71]:
initial_user_input_df = compute_user_neighbors(user_graph, filtered_user_ids, initial_user_embedding_df)
initial_item_input_df = compute_item_neighbors(item_graph, filtered_item_ids, initial_item_embedding_df)
target_user_input_df = compute_user_neighbors(user_graph, filtered_user_ids, target_user_embedding_df)
target_item_input_df = compute_item_neighbors(item_graph, filtered_item_ids, target_item_embedding_df)

# initial_user_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_initial_user_input.csv", index=False)
# initial_item_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_initial_item_input.csv", index=False)
# target_user_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_target_user_input.csv", index=False)
# target_item_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_target_item_input.csv", index=False)

In [74]:
class DataframeToDataset(Dataset):
    def __init__(self, df, task_type):
        self.df = df
        self.task_type=task_type

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        if self.task_type == "user":
            return {
                'user_id': torch.tensor(row['userid'], dtype=torch.long),
                '1st_order': torch.tensor(row['1st_order'], dtype=torch.long),
                '2nd_order': torch.tensor(row['2nd_order'], dtype=torch.long),
                '3rd_order': torch.tensor(row['3rd_order'], dtype=torch.long),
                'oracle_embedding': torch.tensor(row['oracle_embedding'], dtype=torch.float32),
            }
        elif self.task_type == "item":
            return {
                'item_id': torch.tensor(row['itemid'], dtype=torch.long),
                '1st_order': torch.tensor(row['1st_order'], dtype=torch.long),
                '2nd_order': torch.tensor(row['2nd_order'], dtype=torch.long),
                '3rd_order': torch.tensor(row['3rd_order'], dtype=torch.long),
                'oracle_embedding': torch.tensor(row['oracle_embedding'], dtype=torch.float32),
            }

In [76]:
train_user_dataset = DataframeToDataset(initial_user_input_df,"user")
train_item_dataset = DataframeToDataset(initial_item_input_df,"item")
valid_user_dataset = DataframeToDataset(target_user_input_df,"user")
valid_item_dataset = DataframeToDataset(target_item_input_df,"item")

# Create DataLoaders
train_user_loader = DataLoader(train_user_dataset, batch_size=settings.batch_size, shuffle=True)
valid_user_loader = DataLoader(valid_user_dataset, batch_size=settings.batch_size, shuffle=False)

train_item_loader = DataLoader(train_item_dataset, batch_size=settings.batch_size, shuffle=True)
valid_item_loader = DataLoader(valid_item_dataset, batch_size=settings.batch_size, shuffle=False)



In [90]:
print("Debug: train_user_dataset type:", type(train_user_dataset))
print("Debug: train_user_dataset sample:", train_user_dataset[0] if len(train_user_dataset) > 0 else "Empty dataset")

Debug: train_user_dataset type: <class '__main__.DataframeToDataset'>


TypeError: new(): invalid data type 'str'

## Train First Embedding with 1rd, 2nd, 3rd order user item relationship

In [81]:
from models.gnn_embedding.GeneralGNN import GeneralGNN
from models.gnn_embedding.train_helper import train_first_order_task, train_second_order_task, train_third_order_task

In [82]:
model = GeneralGNN(name="GraphSAGE", settings=settings)

In [88]:
# Train for user tasks
num_epochs = settings.epochs
device = settings.device

print("Training user tasks...")
print(" -> Training 1st order user tasks...")
train_first_order_task(
    model=model,
    train_loader=train_user_loader,
    valid_loader=valid_user_loader,
    epochs=num_epochs,
    device=device,
    task="user",
)

print(" -> Training 2nd order user tasks...")
train_second_order_task(
    model=model,
    train_loader=train_user_loader,
    valid_loader=valid_user_loader,
    epochs=num_epochs,
    device=device,
    task="user",
)

print(" -> Training 3rd order user tasks...")
train_third_order_task(
    model=model,
    train_loader=train_user_loader,
    valid_loader=valid_user_loader,
    epochs=num_epochs,
    device=device,
    task="user",
)

# Train for item tasks
print("Training item tasks...")
print(" -> Training 1st order item tasks...")
train_first_order_task(
    model=model,
    train_loader=train_item_loader,
    valid_loader=valid_item_loader,
    epochs=num_epochs,
    device=device,
    task="item",
)

print(" -> Training 2nd order item tasks...")
train_second_order_task(
    model=model,
    train_loader=train_item_loader,
    valid_loader=valid_item_loader,
    epochs=num_epochs,
    device=device,
    task="item",
)

print(" -> Training 3rd order item tasks...")
train_third_order_task(
    model=model,
    train_loader=train_item_loader,
    valid_loader=valid_item_loader,
    epochs=num_epochs,
    device=device,
    task="item",
)

print("Training completed.")

Training user tasks...
 -> Training 1st order user tasks...
entering the loop...
Debug: train_loader: <torch.utils.data.dataloader.DataLoader object at 0x15dc20670>
Debug: train_loader length: 8


Epoch 1/20 - Training First-Order user Task:   0%|          | 0/8 [00:00<?, ?it/s]


TypeError: new(): invalid data type 'str'

## Movie Data aggragation