In [None]:
import torch
from torch_geometric.data import Data
import pandas as pd
import networkx as nx
from torch_geometric.utils.convert import from_networkx
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import graphsage_experiments
import torch.nn.functional as F
import read_data

# Understanding the Dataset

nodes.csv: Contains the node IDs, where each ID represents a unique user (blogger) on BlogCatalog.

groups.csv: Contains group IDs, each representing a different group or community within the BlogCatalog platform.

edges.csv:  Comprises pairs of user IDs, with each pair indicating a friendship link between two users. For example, an entry "1,2" means that the user with ID "1" is friends with the user with ID "2".

group-edges.csv: Each line has two numbers; the first is a user ID, and the second is a group ID. This file links users to the groups they are part of.

In [None]:
# Locations of the four CSV files that make up the dataset,
# bound in one unpacking assignment (order: nodes, edges, groups, group-edges).
nodes_path, edges_path, groups_path, group_edges_path = (
    'datasets/BlogCatalog-dataset/data/nodes.csv',
    'datasets/BlogCatalog-dataset/data/edges.csv',
    'datasets/BlogCatalog-dataset/data/groups.csv',
    'datasets/BlogCatalog-dataset/data/group-edges.csv',
)

# Create the graph object and convert it to a `torch_geometric.data.Data` object

In [None]:
# Read the four CSV files into a NetworkX graph plus the per-node label array.
graph, labels = read_data.read_dataset_arizona_university(
    nodes_path,
    edges_path,
    groups_path,
    group_edges_path,
)
# Convert the NetworkX graph into a PyTorch Geometric data object.
data = from_networkx(graph)

In [None]:
# Attach the labels as the target `y`, converted to a float64 tensor.
data.y = torch.from_numpy(labels.astype(np.float64))
# Attach the features `x`: each node's index is its only feature,
# stored as a float32 column of shape (num_nodes, 1).
node_ids = torch.arange(data.num_nodes)[:, None]
data.x = node_ids.to(torch.float32)
# Add masks through the project helper.
# NOTE(review): the arguments 2, 1, 1 are presumably a train/val/test ratio -- confirm in read_data.
data = read_data.add_masks(data, 2, 1, 1)

# Define hyperparameters

In [None]:
# --- Settings to vary between experiments ---
learning_rate = 1e-4  # i.e. 0.0001, the baseline value
aggregator = 'mean'

# --- Training settings ---
epochs = 10
batch_size = 512
dropout_rate = 0.4

# --- Model settings ---
hidden_layer = 512
embedding_dimension = 128
activation_function = F.relu
normalization = True
bias = True

# --- Neighborhood sizes ---
# NOTE(review): presumably the neighbor-sampling fan-out for the first and
# second hop -- confirm in graphsage_experiments.
neighborhood_1 = 25
neighborhood_2 = 10

In [None]:
# Feature dimensionality, as reported by the data object.
number_features = data.num_features
# Node count, taken from the first dimension of the feature matrix.
number_nodes = data.x.size(0)

# Train and obtain results

In [None]:
# Train via the project helper and keep the returned embedding matrix.
# Every argument is taken from the variables defined in the cells above, so
# changing a hyperparameter there is all that is needed to run a new experiment.
emb_matrix = graphsage_experiments.compute_embedding_matrix(
    data = data,
    number_features = number_features,
    number_nodes = number_nodes,
    batch_size = batch_size,
    hidden_layer = hidden_layer,
    epochs = epochs,
    neighborhood_1 = neighborhood_1,
    neighborhood_2 = neighborhood_2,
    embedding_dimension = embedding_dimension,
    learning_rate = learning_rate,
    dropout_rate = dropout_rate,
    activation_function = activation_function,
    aggregator = aggregator,
    activation_before_normalization = True,
    # Use the `bias` hyperparameter instead of a hard-coded True, so that
    # editing it in the hyperparameter cell actually takes effect.
    bias = bias,
    normalize = normalization,
)
