In [None]:
from utils.dataloader import DataLoader as myDataLoader
import torch
from torch.utils.data import DataLoader as torchDataLoader
from torch.utils.data import TensorDataset

import pandas as pd
import numpy as np
import networkx as nx
import torch.nn as nn

In [None]:
# Enable IPython's autoreload extension in mode 2 so edited project modules
# are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

## Model Params settings

In [None]:
class Settings():
    """Hyper-parameters, dataset paths and compute device shared by this notebook.

    Every value is a class attribute. The device selection at the bottom of the
    class body runs once, when the class is defined, and prints the chosen device.
    """
    batch_size = 64
    epochs = 200  # epoch count handed to every train_*_order_task call in the training cell

    embedding_size = 64
    learning_rate = 0.001
    
    # presumably the user/item counts of the training sub-graph (471 + 804 equals
    # the 1275 nodes printed for G) — TODO confirm
    num_users = 471
    num_items = 804

    # Transformer encoder
    dropout_rate = 0
    num_heads = 4
    d_ff = 4
    num_blocks = 2


    negative_num = 99
    verbose = 1

    hidden_dim = 256
    # NOTE(review): the per-order epoch counts below are not referenced by the
    # visible cells; presumably they are read from `settings` inside the model or
    # the training helpers — confirm.
    user_epoch = 5
    item_epoch = 25

    second_user_epoch = 200
    second_item_epoch = 200

    third_user_epoch = 200
    third_item_epoch = 200

    # Same CSV files that the data-preparation cell reads as initial/target embeddings.
    train_user_dataset = './models/gnn_embedding/ml_gnn_ebd/initial_user_ebds.csv'
    train_item_dataset = './models/gnn_embedding/ml_gnn_ebd/initial_item_ebds.csv'
    valid_user_dataset = './models/gnn_embedding/ml_gnn_ebd/target_user_ebds.csv'
    valid_item_dataset = './models/gnn_embedding/ml_gnn_ebd/target_item_ebds.csv'

    dataset_size = '100k'

    # Select the compute device: CUDA when available, otherwise CPU.
    if torch.cuda.is_available():
        print("Using CUDA (Nvidia GPU)")
        device = torch.device('cuda')
    else:
        print("CUDA not available, using CPU")
        device = torch.device('cpu')


# Single shared instance; passed to the model and read by the training cell.
settings = Settings()

CUDA not available, using CPU


# Data Preparation

## Data loading and searching for 1st 2nd 3rd order neighbours

In [None]:
from utils.dataloader import DataLoader
from utils.data_split import train_test_split

# Load the initial and target user/item embeddings used for GNN training.
initial_user_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/initial_user_ebds.csv"
initial_item_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/initial_item_ebds.csv"
target_user_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/target_user_ebds.csv"
target_item_embedding_path = "./models/gnn_embedding/ml_gnn_ebd/target_item_ebds.csv"

initial_user_embedding_df = pd.read_csv(initial_user_embedding_path)
initial_item_embedding_df = pd.read_csv(initial_item_embedding_path)  
target_user_embedding_df = pd.read_csv(target_user_embedding_path)
target_item_embedding_df = pd.read_csv(target_item_embedding_path)

# Load the user-movie rating dataframe and split it into train and test parts.
movie_data = DataLoader(size="100k")
data = movie_data.load_ratings()
ratings_df, test_set = train_test_split(data)
# Keep an unfiltered copy of the training split; the full graph for inference
# is built from it later in the notebook.
train_ratings_df = ratings_df.copy()

# Restrict the training ratings to users and items that have an initial embedding.
user_ids = list(initial_user_embedding_df['user'].unique())
item_ids = list(initial_item_embedding_df['item'].unique())
ratings_df = ratings_df[ratings_df['user'].isin(user_ids) & ratings_df['item'].isin(item_ids)]


## Generate the graph

In [None]:
def build_user_item_graph(df):
    """Build an undirected user-item graph from an interaction table.

    Each row of `df` contributes one edge between the node ``u_<user>`` and the
    node ``i_<item>``; repeated (user, item) pairs collapse into a single edge.

    Parameters:
        df (pd.DataFrame): Must provide 'user' and 'item' columns.

    Returns:
        nx.Graph: Graph whose nodes are the prefixed user and item ids.
    """
    G = nx.Graph()

    # Iterate the two id columns directly instead of using `iterrows()`.
    # `iterrows` packs every row into a single-dtype Series, so an integer id
    # turns into e.g. 1.0 (node 'u_1.0') as soon as the frame contains a float
    # column, which no longer matches the 'u_<id>' names built elsewhere.
    # Column-wise iteration keeps each id in its own dtype and is much faster.
    G.add_edges_from(
        (f'u_{user}', f'i_{item}')
        for user, item in zip(df['user'], df['item'])
    )

    return G

# Build the training graph and print its node and edge counts.
G = build_user_item_graph(ratings_df)
print(G.number_of_nodes())
print(G.number_of_edges())

1275
55275


In [None]:
# generate high-order graph information
def get_neighbors(graph, node):
    """Collect the 1st-, 2nd- and 3rd-order neighbours of `node`.

    A node belongs to the lowest order at which it is reached: 2nd-order
    neighbours exclude `node` and its direct neighbours, 3rd-order neighbours
    additionally exclude the 2nd-order ones.

    Parameters:
        graph: Object exposing ``neighbors(node)`` (e.g. a networkx graph) whose
            node names look like ``<prefix>_<integer id>``.
        node: Name of the node to start from.

    Returns:
        list: ``[first_order, second_order, third_order]``; each entry is a list
        of the integer ids parsed from the node names. `first_order` keeps the
        graph's adjacency order, the other two are sorted ascending so the
        result does not depend on set iteration order (which varies between
        interpreter runs for strings).
    """
    first_order = list(graph.neighbors(node))
    # Everything already assigned to a lower order (or the start node itself).
    seen = set(first_order) | {node}

    # 2nd-order neighbours
    second_hop = set()
    for first_neighbor in first_order:
        second_hop.update(graph.neighbors(first_neighbor))
    second_hop -= seen
    seen |= second_hop

    # 3rd-order neighbours
    third_hop = set()
    for second_neighbor in second_hop:
        third_hop.update(graph.neighbors(second_neighbor))
    third_hop -= seen

    def _node_id(name):
        # 'u_12' / 'i_12' -> 12
        return int(name.split('_')[1])

    return [
        [_node_id(n) for n in first_order],
        sorted(_node_id(n) for n in second_hop),
        sorted(_node_id(n) for n in third_hop),
    ]


def _compute_neighbors(graph, target_ids, target_embeddings_df, id_column, node_prefix, id_key):
    """Shared implementation behind compute_user_neighbors / compute_item_neighbors.

    Parameters:
        graph: Graph passed through to `get_neighbors`.
        target_ids: Iterable of raw (un-prefixed) ids to process.
        target_embeddings_df (pd.DataFrame): Must provide `id_column` and 'embedding'.
        id_column (str): Column of `target_embeddings_df` holding the ids.
        node_prefix (str): Prefix that turns an id into a graph node name.
        id_key (str): Name of the id column in the returned frame.

    Returns:
        pd.DataFrame: One row per id with columns `id_key`, '1st_order',
        '2nd_order', '3rd_order' and 'oracle_embedding'.

    Raises:
        IndexError: If an id has no row in `target_embeddings_df`.
    """
    records = []
    for target_id in target_ids:
        neighbors = get_neighbors(graph, f"{node_prefix}{target_id}")
        matches = target_embeddings_df.loc[
            target_embeddings_df[id_column] == target_id, 'embedding'
        ].values
        if len(matches) == 0:
            # Same exception type the bare `.values[0]` lookup raised, but with
            # a message that names the offending id.
            raise IndexError(f"no embedding row found for {id_column} id {target_id!r}")
        records.append({
            id_key: target_id,
            '1st_order': neighbors[0],
            '2nd_order': neighbors[1],
            '3rd_order': neighbors[2],
            # First matching row wins when an id appears more than once.
            'oracle_embedding': matches[0]
        })

    return pd.DataFrame(records)


def compute_user_neighbors(user_graph, target_user_ids, target_embeddings_df):
    """Build the per-user input frame: neighbour lists plus the user's embedding.

    Parameters:
        user_graph: Graph containing ``u_<id>`` nodes for every requested user.
        target_user_ids: Iterable of user ids.
        target_embeddings_df (pd.DataFrame): Columns 'user' and 'embedding'.

    Returns:
        pd.DataFrame: Columns 'userid', '1st_order', '2nd_order', '3rd_order',
        'oracle_embedding'.
    """
    return _compute_neighbors(
        user_graph, target_user_ids, target_embeddings_df,
        id_column='user', node_prefix='u_', id_key='userid',
    )

def compute_item_neighbors(item_graph, target_item_ids, target_embeddings_df):
    """Build the per-item input frame: neighbour lists plus the item's embedding.

    Parameters:
        item_graph: Graph containing ``i_<id>`` nodes for every requested item.
        target_item_ids: Iterable of item ids.
        target_embeddings_df (pd.DataFrame): Columns 'item' and 'embedding'.

    Returns:
        pd.DataFrame: Columns 'itemid', '1st_order', '2nd_order', '3rd_order',
        'oracle_embedding'.
    """
    return _compute_neighbors(
        item_graph, target_item_ids, target_embeddings_df,
        id_column='item', node_prefix='i_', id_key='itemid',
    )

In [None]:
# Build the user and item training inputs: for every id, its 1st/2nd/3rd-order
# neighbour lists in G plus its target embedding (stored as 'oracle_embedding').
target_user_input_df = compute_user_neighbors(G, user_ids, target_user_embedding_df)
target_item_input_df = compute_item_neighbors(G, item_ids, target_item_embedding_df)

# Optional: uncomment to persist the inputs for inspection.
# target_user_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_user_input.csv", index=False)
# target_item_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_item_input.csv", index=False)

# Model Training

## Train First Embedding with 1st, 2nd, 3rd order user/item interactions

In [None]:
from models.gnn_embedding.GeneralGNN import GeneralGNN
from models.gnn_embedding.train_helper_new import train_first_order_task, train_second_order_task, train_third_order_task

In [None]:
# Build the GNN from the initial random user/item embeddings generated by
# init_embedding.ipynb. These are the same files read in the data-preparation cell.

init_user_embedding_path="./models/gnn_embedding/ml_gnn_ebd/initial_user_ebds.csv"
init_item_embedding_path="./models/gnn_embedding/ml_gnn_ebd/initial_item_ebds.csv"

model = GeneralGNN(name="GraphSAGE", settings=settings, init_user_embedding_path=init_user_embedding_path,init_item_embedding_path=init_item_embedding_path)

## Sequential Training Step

In [None]:
# Sequential training: all three neighbourhood orders for users first, then the
# same three orders for items. Every stage shares the epoch count and device
# taken from `settings`.
num_epochs = settings.epochs
device = settings.device

# (banner, task name, input frame, ((stage banner, training routine), ...))
training_plan = (
    (
        "Training user tasks...",
        "user",
        target_user_input_df,
        (
            (" -> Training 1st order user tasks...", train_first_order_task),
            (" -> Training 2nd order user tasks...", train_second_order_task),
            (" -> Training 3rd order user tasks...", train_third_order_task),
        ),
    ),
    (
        "Training item tasks...",
        "item",
        target_item_input_df,
        (
            (" -> Training 1st order item tasks...", train_first_order_task),
            (" -> Training 2nd order item tasks...", train_second_order_task),
            (" -> Training 3rd order item tasks...", train_third_order_task),
        ),
    ),
)

for task_banner, task_name, task_frame, task_stages in training_plan:
    print(task_banner)
    for stage_banner, run_stage in task_stages:
        print(stage_banner)
        run_stage(
            model=model,
            train_data=task_frame,
            epochs=num_epochs,
            device=device,
            task=task_name,
        )

print("Training completed.")

# Inference - predict the embeddings for cold-start user/item

In [None]:
# Load the initial embeddings for the full graph structure.
# NOTE(review): the item file is named "full_user_item_ebds.csv" while the user
# file is "full_user_init_ebds.csv" — looks like a possible typo for
# "full_item_init_ebds.csv"; confirm against the files on disk.
full_user_embedding_init_path = "./models/gnn_embedding/ml_gnn_ebd/full_user_init_ebds.csv"
full_item_embedding_init_path = "./models/gnn_embedding/ml_gnn_ebd/full_user_item_ebds.csv"

full_user_init_embedding = pd.read_csv(full_user_embedding_init_path)
full_item_init_embedding = pd.read_csv(full_item_embedding_init_path)  

In [None]:
# Distinct user/item ids of the filtered training ratings.
# NOTE: both names are reassigned from train_ratings_df in a later cell,
# before the inference inputs are built.
full_user_ids = ratings_df['user'].unique()
full_item_ids = ratings_df['item'].unique()

In [None]:
# Rebuild the complete rating table (train + test) and derive, for users and
# items alike, a position index following first appearance in that table. The
# index keeps the embeddings aligned with the other stages of the model.
movie_data = DataLoader(size="100k")
data = movie_data.load_ratings()
train_list, test_list = train_test_split(data)
ratings = pd.concat([train_list, test_list], axis=0, ignore_index=True)

user_list = ratings['user'].unique().tolist()
item_list = ratings['item'].unique().tolist()

# Position -> id comes straight from enumeration; id -> position is its inverse
# (the id lists hold unique values, so the inversion is lossless).
idx2user = dict(enumerate(user_list))
user2idx = {user: idx for idx, user in idx2user.items()}

idx2item = dict(enumerate(item_list))
item2idx = {item: idx for idx, item in idx2item.items()}

In [None]:
def reorder_dataframe(df, user2idx,column):
    """
    Return the rows of `df` ordered by the position index of their id.

    Parameters:
        df (pd.DataFrame): DataFrame with columns [column, 'embedding'].
        user2idx (dict): Mapping from user id to position; applied only when
            `column` is 'user'.
        column (str): Name of the id column. For any value other than 'user'
            the module-level `item2idx` mapping is used and the `user2idx`
            argument is ignored.

    Returns:
        pd.DataFrame: Columns [column, 'embedding'], sorted by and indexed on
        the mapped position (index name 'new_index'). The input is not modified.
    """
    if column == 'user':
        id_to_position = user2idx
    else:
        id_to_position = item2idx

    # Work on a shuffled copy (fixed seed) so the caller's frame stays untouched.
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Attach each row's target position, then order and index by it.
    shuffled['new_index'] = shuffled[column].map(id_to_position)
    ordered = shuffled.sort_values(by='new_index').set_index('new_index')

    # Keep only the id column and the embedding.
    return ordered[[column, 'embedding']]


In [None]:
# Display the full-graph initial user embeddings loaded above.
full_user_init_embedding

In [None]:
# Reorder the full training user/item embeddings to align with other parts of the model.
# Each call receives the mapping that matches its id column (the item call
# previously passed user2idx).
reordered_user_df = reorder_dataframe(full_user_init_embedding,user2idx,'user')
reordered_item_df = reorder_dataframe(full_item_init_embedding,item2idx,'item')

In [None]:
# Preview the first rows of both reordered frames.
reordered_user_df.head(),reordered_item_df.head()

In [None]:
# Save the reordered initial embeddings for later reference.
import os

# NOTE(review): this folder is relative to the working directory, whereas the
# next cell reads the same file names from
# "./models/gnn_embedding/ml_gnn_ebd/". Confirm which location is intended.
output_folder = "ml_gnn_ebd"
# exist_ok avoids the separate existence check and the race it leaves open.
os.makedirs(output_folder, exist_ok=True)

# index=False: the position index ('new_index') is not written; only the row
# order carries the alignment.
reordered_user_df.to_csv(os.path.join(output_folder, "full_initial_user_ebds.csv"), index=False)
reordered_item_df.to_csv(os.path.join(output_folder, "full_initial_item_ebds.csv"), index=False)

In [None]:
# Load the inputs for model inference; according to the original note they are
# generated in the notebook ./models/gnn_embedding/init_embedding.ipynb.
# The model's embeddings are then reloaded from these full-graph files.

full_init_item_ebd_path = "./models/gnn_embedding/ml_gnn_ebd/full_initial_item_ebds.csv"
full_init_user_ebd_path = "./models/gnn_embedding/ml_gnn_ebd/full_initial_user_ebds.csv"
model.reload_embedding(full_init_user_ebd_path,full_init_item_ebd_path)


In [None]:
import ast

# Final inference task: outputs the inferred embeddings of every user/item in `train_data`.
def inference_3rd_task(model, train_data, device, task="user"):
    """Run `model` over every row of `train_data` and stack the predictions.

    Parameters:
        model: Object providing ``eval()`` and callable as
            ``model(target_id, first, second, third, task=task)``; each call is
            expected to return a 2-D tensor with one column per embedding
            dimension.
        train_data (pd.DataFrame): Columns 'userid' (task "user") or 'itemid'
            (any other task), '1st_order', '2nd_order', '3rd_order' and
            'oracle_embedding' (lists of floats, or their string repr).
        device: Device the oracle embeddings are moved to. Here they only
            determine the expected embedding width.
        task (str): "user" or "item"; forwarded to the model.

    Returns:
        torch.Tensor: All predictions concatenated along dim 0, on the device
        of the model output and detached from autograd.
    """
    id_column = "userid" if task == "user" else "itemid"
    target_ids_train = train_data[id_column].tolist()
    support_1st_train = train_data["1st_order"].tolist()
    support_2nd_train = train_data["2nd_order"].tolist()
    support_3rd_train = train_data["3rd_order"].tolist()

    temp_embedding_list = train_data['oracle_embedding'].tolist()
    if isinstance(temp_embedding_list[0], str):
        # Embeddings read back from CSV arrive as "[0.1, 0.2, ...]" strings.
        oracle_embeddings_train = torch.tensor([ast.literal_eval(s) for s in temp_embedding_list], dtype=torch.float32)
    else:
        oracle_embeddings_train = torch.tensor(temp_embedding_list, dtype=torch.float32)

    oracle_embeddings_train = oracle_embeddings_train.to(device)
    embedding_dim = oracle_embeddings_train.shape[1]
    model.eval()

    # Forward pass. no_grad keeps the result free of the autograd graph (less
    # memory, and the saved tensor carries no gradient history).
    with torch.no_grad():
        predictions = [
            model(target_id, first, second, third, task=task)
            for target_id, first, second, third in zip(
                target_ids_train, support_1st_train, support_2nd_train, support_3rd_train
            )
        ]

    # A zero-row seed with the oracle width, created on the predictions' device
    # instead of always on CPU. Concatenating once replaces the per-row
    # torch.cat, which re-copied the accumulated tensor on every iteration.
    output_device = predictions[0].device if predictions else None
    seed = torch.empty(0, embedding_dim, device=output_device)
    return torch.cat([seed, *predictions], dim=0)

In [None]:
# Create the full graph from the unfiltered training split and print its
# node and edge counts.
full_G = build_user_item_graph(train_ratings_df)
print(full_G.number_of_nodes())
print(full_G.number_of_edges())

2625
75398


In [None]:
# Distinct user and item ids of the unfiltered training split; these replace
# the values assigned from the filtered ratings_df earlier.
full_user_ids = train_ratings_df['user'].unique()
full_item_ids = train_ratings_df['item'].unique()

In [None]:
# Inspection only: number of distinct users and items in the training split.
len(full_user_ids), len(full_item_ids)

(943, 1682)

In [None]:
# Re-read the full initial embeddings from the same files that were handed to
# model.reload_embedding, replacing the frames loaded earlier under these names.
full_user_init_embedding = pd.read_csv(full_init_user_ebd_path)
full_item_init_embedding = pd.read_csv(full_init_item_ebd_path)  

In [None]:
# Preview the first rows of both re-read embedding frames.
full_user_init_embedding.head(),full_item_init_embedding.head()

In [None]:
# Generate the inference inputs with high-order graph information from the full graph.
# The 'oracle_embedding' column holds the *initial* embeddings here;
# inference_3rd_task reads it only to determine the embedding width.
inference_user_input_df = compute_user_neighbors(full_G, full_user_ids, full_user_init_embedding)
inference_item_input_df = compute_item_neighbors(full_G, full_item_ids, full_item_init_embedding)

# Save both frames for inspection and reference.
inference_user_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_inference_user_input.csv", index=False)
inference_item_input_df.to_csv("./models/gnn_embedding/ml_gnn_ebd/gnn_inference_item_input.csv", index=False)

In [None]:
# Final inference result for all user/item embeddings.
# NOTE(review): the device is hard-coded to "cpu" rather than settings.device.
# In inference_3rd_task it only affects where the oracle embeddings are placed;
# confirm this is intended when training ran on CUDA.
inferred_user_embedding = inference_3rd_task(model,inference_user_input_df,"cpu","user")
inferred_item_embedding = inference_3rd_task(model,inference_item_input_df,"cpu","item")

In [None]:
# Output locations for the inferred (pre-trained) user and item embeddings.
pretrain_user_ebd_path = "./data/pretrain_user_embeddings.pt"
pretrain_item_ebd_path = "./data/pretrain_item_embeddings.pt"

In [None]:
# Persist the inferred user embeddings at the configured path.
torch.save(inferred_user_embedding, pretrain_user_ebd_path)
print("User embeddings saved to 'pretrain_user_embeddings.pt'")

User embeddings saved to 'pretrain_user_embeddings.pt'


In [None]:
# Persist the inferred item embeddings at the configured path, next to the
# user embeddings. The cell previously wrote to the working directory and left
# pretrain_item_ebd_path unused.
torch.save(inferred_item_embedding, pretrain_item_ebd_path)
print("Item embeddings saved to 'pretrain_item_embeddings.pt'")

Item embeddings saved to 'pretrain_item_embeddings.pt'
