In [38]:
from utils.dataloader import DataLoader as myDataLoader
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
# from ast import literal_eval
import ast

import pandas as pd
import numpy as np
import networkx as nx


In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parameter settings

In [40]:
class Settings:
    """Static configuration for this notebook.

    Every value is a plain class attribute. The compute device is resolved
    once, while the class body executes, and a message naming the chosen
    device is printed at that moment.
    """

    # --- optimisation ---
    batch_size = 64
    epochs = 20
    learning_rate = 0.003

    # --- representation sizes ---
    embedding_size = 64
    hidden_dim = 256

    # --- dataset dimensions: 100k dataset - active user ---
    num_users = 943
    num_items = 1682
    # num_users = 471
    # num_items = 841
    dataset_size = '100k'

    # --- Transformer encoder ---
    dropout_rate = 0
    num_heads = 4
    d_ff = 4
    num_blocks = 2

    # --- sampling / logging ---
    negative_num = 99
    verbose = 1
    # checkpoint_path_user_task = './Checkpoint/user_task/'
    # checkpoint_path_item_task = './Checkpoint/item_task/'

    # --- per-stage epoch counts ---
    # Not read by any cell visible in this notebook section; presumably used
    # by the training helpers - TODO confirm.
    user_epoch = 5
    item_epoch = 25
    second_user_epoch = 10
    second_item_epoch = 10
    third_user_epoch = 10
    third_item_epoch = 10

    # --- neighbour-table CSV files (train = initial, valid = target) ---
    train_user_dataset = './models/gnn_embedding/ml_gnn_ebd/gnn_initial_user_input.csv'
    train_item_dataset = './models/gnn_embedding/ml_gnn_ebd/gnn_initial_item_input.csv'
    valid_user_dataset = './models/gnn_embedding/ml_gnn_ebd/gnn_target_user_input.csv'
    valid_item_dataset = './models/gnn_embedding/ml_gnn_ebd/gnn_target_item_input.csv'

    # --- compute device: CUDA when available, CPU otherwise ---
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Using CUDA (Nvidia GPU)" if device.type == 'cuda' else "CUDA not available, using CPU")


# Shared configuration instance used by the rest of the notebook.
settings = Settings()

CUDA not available, using CPU


## Data loading and search for 1st-, 2nd- and 3rd-order neighbours

In [41]:
# Load the pre-computed embedding tables (initial and target, users and items).
# The code below keys the user tables by a 'user' column and the item tables by
# an 'item' column; the 'embedding' column is left exactly as read from the CSV
# (it is not parsed into arrays in this cell).
initial_user_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/initial_user_ebds.csv"
initial_item_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/initial_item_ebds.csv"
target_user_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/target_user_ebds.csv"
target_item_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/target_item_ebds.csv"

initial_user_embedding_df = pd.read_csv(initial_user_embedding_path)
initial_item_embedding_df = pd.read_csv(initial_item_embedding_path)
target_user_embedding_df = pd.read_csv(target_user_embedding_path)
target_item_embedding_df = pd.read_csv(target_item_embedding_path)

# Load the raw ratings (tab-separated, no header row).
# Columns: ['user', 'item', 'rating', 'timestamp']
ratings_df = pd.read_csv(
    "./data/ml-100k/u.data",
    sep="\t",
    header=None,
    names=["user", "item", "rating", "timestamp"],
)

# Ids that have a target embedding; only these are kept below.
filtered_user_ids = target_user_embedding_df['user'].unique()
filtered_item_ids = target_item_embedding_df['item'].unique()

user_is_kept = ratings_df['user'].isin(filtered_user_ids)
item_is_kept = ratings_df['item'].isin(filtered_item_ids)

# Ratings where BOTH the user and the item are kept.
filtered_ratings_df = ratings_df[user_is_kept & item_is_kept]

# One-sided filters used to build the graphs: every rating of a kept user, and
# every rating of a kept item. An earlier pair of assignments derived these two
# frames from filtered_ratings_df and was immediately overwritten by the
# one-sided versions; only the effective definitions are kept here.
# NOTE(review): confirm the one-sided filter (not the two-sided one) is intended.
filtered_ratings_user_df = ratings_df[user_is_kept]
filtered_ratings_item_df = ratings_df[item_is_kept]

In [42]:
def _build_interaction_graph(anchor_ids, ratings, anchor_col, partner_col):
    """Build an undirected graph linking ``anchor_col`` ids to ``partner_col`` ids.

    Anchor ids are added first (tagged ``bipartite=anchor_col``), then the
    unique partner ids found in ``ratings`` (tagged ``bipartite=partner_col``),
    then one edge per rating row.

    NOTE(review): both kinds of id are inserted as raw values into the same
    node namespace. If the user and item id ranges overlap (MovieLens ids both
    start at 1 - TODO confirm for the filtered sets), a user and an item with
    the same number collapse into one node and the earlier ``bipartite`` tag
    is overwritten.
    """
    graph = nx.Graph()
    graph.add_nodes_from(anchor_ids, bipartite=anchor_col)
    graph.add_nodes_from(ratings[partner_col].unique(), bipartite=partner_col)
    graph.add_edges_from(zip(ratings[anchor_col], ratings[partner_col]))
    return graph


# User-centred graph: kept users plus every item they rated.
user_graph = _build_interaction_graph(filtered_user_ids, filtered_ratings_user_df, "user", "item")

# Item-centred graph: kept items plus every user who rated them.
item_graph = _build_interaction_graph(filtered_item_ids, filtered_ratings_item_df, "item", "user")


print("Nodes in user_graph:", user_graph.number_of_nodes())
print("Edges in user_graph:", user_graph.number_of_edges())
print("Nodes in item_graph:", item_graph.number_of_nodes())
print("Edges in item_graph:", item_graph.number_of_edges())

Nodes in user_graph: 1676
Edges in user_graph: 79720
Nodes in item_graph: 1043
Edges in item_graph: 87958


In [None]:
def pad_or_truncate(lst, target_length, pad_value=0):
    """Return a new list of exactly ``target_length`` items taken from ``lst``.

    Longer inputs are cut after ``target_length`` items; shorter inputs are
    right-padded with ``pad_value``. The input is never modified.

    Args:
        lst: Any iterable (list, tuple, set, array, ...). It is copied into a
            list first, so non-list sequences are accepted as well.
        target_length: Desired length of the result; must be >= 0.
        pad_value: Filler appended when ``lst`` is too short.

    Returns:
        A list with ``target_length`` elements.

    Raises:
        ValueError: If ``target_length`` is negative. A negative slice bound
            would otherwise silently drop trailing items instead of fixing
            the length.
    """
    if target_length < 0:
        raise ValueError(f"target_length must be non-negative, got {target_length}")
    fitted = list(lst)[:target_length]
    fitted.extend([pad_value] * (target_length - len(fitted)))
    return fitted

def get_order_neighbors(graph, node, max_order=3):
    """Collect the neighbours of ``node`` layer by layer, up to ``max_order`` hops.

    Layer ``k`` holds the nodes first reached after exactly ``k`` hops: nodes
    already present in an earlier layer are excluded, and so is ``node``
    itself. Without that last exclusion the start node is reached again
    through any of its neighbours and ends up in its own 2nd-order list.

    Args:
        graph: Object exposing ``neighbors(n)`` that returns an iterable of the
            nodes adjacent to ``n`` (e.g. a ``networkx.Graph``).
        node: Start node.
        max_order: Number of layers to compute.

    Returns:
        A list of ``max_order`` lists; element ``k - 1`` is the k-th order
        neighbourhood. The order of items inside each list is unspecified.
    """
    visited = {node}
    frontier = {node}
    layers = []
    for _ in range(max_order):
        reached = set()
        for member in frontier:
            reached.update(graph.neighbors(member))
        # Keep only nodes seen for the first time at this hop count.
        reached -= visited
        visited |= reached
        layers.append(list(reached))
        frontier = reached
    return layers

def compute_max_neighbors(graph, ids, max_order=3):
    """Find, for each order, the largest neighbourhood size over all ``ids``.

    Args:
        graph: Graph passed through to ``get_order_neighbors``.
        ids: Iterable of node ids to inspect.
        max_order: Highest neighbourhood order to consider.

    Returns:
        Dict mapping every order ``1..max_order`` to the maximum number of
        neighbours of that order found for any id (0 when ``ids`` is empty).
    """
    largest = dict.fromkeys(range(1, max_order + 1), 0)
    for node in ids:
        per_order = get_order_neighbors(graph, node, max_order=max_order)
        for order in largest:
            largest[order] = max(largest[order], len(per_order[order - 1]))
    return largest

def compute_neighbors_with_padding(graph, ids, embeddings_df, max_neighbors, max_order=3, pad_value=0):
    """Build one row per id with its fixed-length neighbour lists and its embedding.

    Args:
        graph: Graph passed through to ``get_order_neighbors``.
        ids: Iterable of node ids to process.
        embeddings_df: DataFrame with an ``'embedding'`` column and a key
            column named ``'user'`` (used when present) or ``'item'``.
        max_neighbors: Target length of the neighbour lists. Either a single
            int applied to every order, or a dict ``{order: length}`` such as
            the one returned by ``compute_max_neighbors``.
        max_order: Number of neighbourhood orders to include.
        pad_value: Filler used when a neighbour list is shorter than its
            target length.

    Returns:
        DataFrame with columns ``'id'``, one ``'<n>_order'`` column per order
        (``'1st_order'``, ``'2nd_order'``, ``'3rd_order'``, ...) and
        ``'oracle_embedding'``.

    Raises:
        KeyError: If ``max_neighbors`` is a dict lacking one of the orders.
        IndexError: If an id has no row in ``embeddings_df`` (same exception
            type as the positional ``.values[0]`` lookup this replaces).
    """
    # Resolve one target length per order up front so a bad spec fails early.
    if isinstance(max_neighbors, dict):
        lengths = [max_neighbors[order] for order in range(1, max_order + 1)]
    else:
        lengths = [max_neighbors] * max_order

    def _column_name(order):
        # 11th-13th (and 111th, ...) take 'th' despite ending in 1/2/3.
        if 10 <= order % 100 <= 20:
            suffix = 'th'
        else:
            suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(order % 10, 'th')
        return f"{order}{suffix}_order"

    # Index the embeddings once instead of scanning the frame for every id.
    # keep='first' mirrors the previous behaviour of taking the first match.
    key_col = 'user' if 'user' in embeddings_df.columns else 'item'
    first_rows = embeddings_df.drop_duplicates(subset=key_col, keep='first')
    embedding_by_id = dict(zip(first_rows[key_col].values, first_rows['embedding'].values))

    data = []
    for node_id in ids:
        neighbors = get_order_neighbors(graph, node_id, max_order=max_order)
        if node_id not in embedding_by_id:
            raise IndexError(f"no embedding found for {key_col} id {node_id!r}")
        row = {'id': node_id}
        for order, (members, length) in enumerate(zip(neighbors, lengths), start=1):
            row[_column_name(order)] = pad_or_truncate(list(members), length, pad_value=pad_value)
        row['oracle_embedding'] = embedding_by_id[node_id]
        data.append(row)
    return pd.DataFrame(data)

In [44]:
# Optional: measure the real per-order maxima instead of using fixed widths.
# max_user_neighbors = compute_max_neighbors(user_graph, filtered_user_ids, max_order=3)
# max_item_neighbors = compute_max_neighbors(item_graph, filtered_item_ids, max_order=3)
# print("Max 1st order neighbors (user):", max_user_neighbors[1])
# print("Max 2nd order neighbors (user):", max_user_neighbors[2])
# print("Max 3rd order neighbors (user):", max_user_neighbors[3])

# print("Max 1st order neighbors (item):", max_item_neighbors[1])
# print("Max 2nd order neighbors (item):", max_item_neighbors[2])
# print("Max 3rd order neighbors (item):", max_item_neighbors[3])

# Fixed neighbour-list width per table, taken from the configured entity counts.
# NOTE(review): the same width is applied to every order, so a list longer
# than this width is truncated - confirm this is intended for all three orders.
max_user_neighbors = settings.num_users
max_item_neighbors = settings.num_items

# Neighbour tables paired with the initial and the target embeddings.
initial_user_input_df = compute_neighbors_with_padding(
    graph=user_graph,
    ids=filtered_user_ids,
    embeddings_df=initial_user_embedding_df,
    max_neighbors=max_user_neighbors,
    max_order=3,
)
initial_item_input_df = compute_neighbors_with_padding(
    graph=item_graph,
    ids=filtered_item_ids,
    embeddings_df=initial_item_embedding_df,
    max_neighbors=max_item_neighbors,
    max_order=3,
)
target_user_input_df = compute_neighbors_with_padding(
    graph=user_graph,
    ids=filtered_user_ids,
    embeddings_df=target_user_embedding_df,
    max_neighbors=max_user_neighbors,
    max_order=3,
)
target_item_input_df = compute_neighbors_with_padding(
    graph=item_graph,
    ids=filtered_item_ids,
    embeddings_df=target_item_embedding_df,
    max_neighbors=max_item_neighbors,
    max_order=3,
)

# CSV export is disabled. The loader cell further down reads these four paths,
# so the files must already exist on disk for it to run.
# initial_user_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_initial_user_input.csv", index=False)
# initial_item_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_initial_item_input.csv", index=False)
# target_user_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_target_user_input.csv", index=False)
# target_item_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_target_item_input.csv", index=False)

TypeError: 'int' object is not subscriptable

In [58]:
def csv_to_dataloader(file_path, batch_size, column_specs, shuffle=True):
    """Read a neighbour-table CSV and wrap the requested columns in a DataLoader.

    Args:
        file_path: Path of the CSV file to read.
        batch_size: Batch size handed to the DataLoader.
        column_specs: Dict mapping column name to torch dtype. Tensors are
            created in the dict's iteration order. Supported names are
            ``"id"`` (taken as-is) and ``"1st_order"``, ``"2nd_order"``,
            ``"3rd_order"``, ``"oracle_embedding"`` (each cell is parsed with
            ``ast.literal_eval``).
        shuffle: Whether the DataLoader shuffles the rows.

    Returns:
        A DataLoader over a TensorDataset holding one tensor per requested
        column.

    Raises:
        ValueError: If ``column_specs`` names an unsupported column.
    """
    frame = pd.read_csv(file_path)
    literal_columns = {"1st_order", "2nd_order", "3rd_order", "oracle_embedding"}

    tensors = []
    for column, dtype in column_specs.items():
        if column == "id":
            values = frame[column].values
        elif column in literal_columns:
            # Cells hold the textual form of a Python list; turn them back into lists.
            values = frame[column].apply(ast.literal_eval).tolist()
        else:
            raise ValueError(f"error in column: {column}")
        tensors.append(torch.tensor(values, dtype=dtype))

    return DataLoader(TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle)

In [34]:
# Neighbour-table CSVs: "initial" tables feed training, "target" tables feed validation.
initial_user_input_path = "./models/gnn_embedding/ml_gnn_ebd/gnn_initial_user_input.csv"
initial_item_input_path = "./models/gnn_embedding/ml_gnn_ebd/gnn_initial_item_input.csv"
target_user_input_path = "./models/gnn_embedding/ml_gnn_ebd/gnn_target_user_input.csv"
target_item_input_path = "./models/gnn_embedding/ml_gnn_ebd/gnn_target_item_input.csv"

# Column -> dtype spec. Ids and neighbour lists are integer tensors, the
# embedding is float32. Users and items use the same layout; the item spec is
# an independent copy so either can be changed without affecting the other.
columns_to_tensor_user = {
    "id": torch.long,
    "1st_order": torch.long,
    "2nd_order": torch.long,
    "3rd_order": torch.long,
    "oracle_embedding": torch.float32,
}
columns_to_tensor_item = dict(columns_to_tensor_user)

b_size = settings.batch_size

# Training loaders shuffle; validation loaders keep file order.
train_user_loader = csv_to_dataloader(
    file_path=initial_user_input_path, batch_size=b_size, column_specs=columns_to_tensor_user, shuffle=True
)
train_item_loader = csv_to_dataloader(
    file_path=initial_item_input_path, batch_size=b_size, column_specs=columns_to_tensor_item, shuffle=True
)
valid_user_loader = csv_to_dataloader(
    file_path=target_user_input_path, batch_size=b_size, column_specs=columns_to_tensor_user, shuffle=False
)
valid_item_loader = csv_to_dataloader(
    file_path=target_item_input_path, batch_size=b_size, column_specs=columns_to_tensor_item, shuffle=False
)



# for batch in train_user_loader:
#     user_ids, first_order, second_order, third_order, oracle_embeddings = batch
#     print("User IDs:", user_ids)
#     print("First Order Neighbors:", first_order)
#     print("Second Order Neighbors:", second_order)
#     print("Third Order Neighbors:", third_order)
#     print("Oracle Embeddings:", oracle_embeddings)
#     break


## Train the first embedding with 1st-, 2nd- and 3rd-order user–item relationships

In [35]:
from models.gnn_embedding.GeneralGNN import GeneralGNN
from models.gnn_embedding.train_helper import train_first_order_task, train_second_order_task, train_third_order_task

In [60]:
# Build the model from the shared settings object.
# NOTE(review): name="GraphSAGE" presumably selects the GNN variant - confirm against GeneralGNN.
model = GeneralGNN(name="GraphSAGE", settings=settings)

In [61]:
# Train the shared model on every (task, order) combination: all three user
# stages first, then all three item stages. The stages and loaders are listed
# as data so the six calls are issued from one loop instead of six copies.
num_epochs = settings.epochs
device = settings.device

# (label used in the progress message, training helper for that order)
order_stages = (
    ("1st", train_first_order_task),
    ("2nd", train_second_order_task),
    ("3rd", train_third_order_task),
)

# (task name passed to the helpers, training loader, validation loader)
task_loaders = (
    ("user", train_user_loader, valid_user_loader),
    ("item", train_item_loader, valid_item_loader),
)

for task_name, task_train_loader, task_valid_loader in task_loaders:
    print(f"Training {task_name} tasks...")
    for order_label, train_stage in order_stages:
        print(f" -> Training {order_label} order {task_name} tasks...")
        train_stage(
            model=model,
            train_loader=task_train_loader,
            valid_loader=task_valid_loader,
            epochs=num_epochs,
            device=device,
            task=task_name,
        )

print("Training completed.")

Training user tasks...
 -> Training 1st order user tasks...


Epoch 1/20 - Training First-Order user Task: 100%|██████████| 8/8 [00:00<00:00, 135.32it/s]

Epoch 1/20 - First-Order user Task: Train Loss = 1.0013
IndexError during validation: index out of range in self
Support_1st IDs: tensor([[  2,   4,   8,  ...,   0,   0,   0],
        [  4,  10, 524,  ...,   0,   0,   0],
        [513,   1,   4,  ...,   0,   0,   0],
        ...,
        [  1,   4,   7,  ...,   0,   0,   0],
        [512, 515,   4,  ...,   0,   0,   0],
        [512, 513,   1,  ...,   0,   0,   0]])
Support_2nd IDs: tensor([[1, 3, 5,  ..., 0, 0, 0],
        [1, 2, 3,  ..., 0, 0, 0],
        [2, 3, 5,  ..., 0, 0, 0],
        ...,
        [2, 3, 5,  ..., 0, 0, 0],
        [1, 2, 3,  ..., 0, 0, 0],
        [2, 3, 5,  ..., 0, 0, 0]])
Support_3rd IDs: tensor([[ 599,  677,  957,  ...,    0,    0,    0],
        [ 599,  677,  987,  ...,    0,    0,    0],
        [ 814,  850,  852,  ...,    0,    0,    0],
        ...,
        [ 352,  695,  935,  ...,    0,    0,    0],
        [  74, 1080, 1116,  ...,    0,    0,    0],
        [ 599,  677,  957,  ...,    0,    0,    0]])





IndexError: index out of range in self

## Movie data aggregation