In [2]:
from itertools import combinations

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import Data
from torch_geometric.utils.convert import from_networkx

import graphsage_experiments
import read_data


# Understanding the Dataset

nodes.csv: Contains blogger IDs. Each blogger in the BlogCatalog is represented by a unique ID.

groups.csv: Contains group IDs. Bloggers can belong to various groups representing different interests or topics.

edges.csv: Represents the friendship network among bloggers. Each pair of IDs indicates a friendship link.

group-edges.csv: Represents group memberships. Each line indicates which group a particular blogger belongs to.

In [3]:
# Paths to the BlogCatalog CSV files.
# The shared directory is spelled out once so the dataset can be relocated by
# editing a single line; the four resulting path strings are unchanged.
DATA_DIR = 'datasets/BlogCatalog-dataset/data'

nodes_path = f'{DATA_DIR}/nodes.csv'              # blogger IDs
edges_path = f'{DATA_DIR}/edges.csv'              # friendship links between bloggers
groups_path = f'{DATA_DIR}/groups.csv'            # group IDs
group_edges_path = f'{DATA_DIR}/group-edges.csv'  # blogger -> group memberships

# Create the graph object and convert it to a `torch_geometric.data.Data` object

In [4]:
# Build the graph and the label array from the four CSV files using the
# project's `read_data` helper, then convert the graph to a PyG data object.
graph, labels = read_data.read_dataset_arizona_university(
    nodes_path,
    edges_path,
    groups_path,
    group_edges_path,
)
data = from_networkx(graph)

In [5]:
# Targets: attach the label array to the graph as a float64 tensor (`data.y`).
data.y = torch.from_numpy(labels.astype(np.float64))
# Features: use an identity matrix of size (num_nodes x num_nodes), i.e. a
# one-hot row per node, stored as float32 on `data.x`.
x_diagonal = torch.eye(data.num_nodes, data.num_nodes)
data.x = x_diagonal.to(torch.float32)

In [14]:
# Display a summary of the assembled Data object (bare last expression).
# NOTE(review): the saved output for this cell reports x=[10312, 1], which does
# not match the (num_nodes x num_nodes) identity matrix assigned to data.x in
# the previous cell, and this cell's execution count (14) is out of sequence.
# Restart the kernel and run all cells to refresh the stored output.
data

Data(edge_index=[2, 667966], y=[10312, 39], id=[10312], num_nodes=10312, x=[10312, 1])

In [6]:
# Materialise the edge list as (source, target) pairs.
# `edge_index` has two rows (sources, targets); unpacking the NumPy view
# row-wise and zipping the rows pairs them up column by column.
edge_index = data.edge_index
edges = list(zip(*edge_index.numpy()))
len(edges)

667966

In [7]:
# Count edges ignoring direction: sort each pair so that (u, v) and (v, u)
# collapse to the same tuple, then de-duplicate through a set.
unique_edges = list({tuple(sorted(pair)) for pair in edges})
len(unique_edges)

333983

In [15]:
# NOTE(review): imports normally live in the first cell of the notebook;
# consider moving this one there so a fresh-kernel run shows all dependencies.
from torch_geometric.transforms import RemoveDuplicatedEdges

# Run PyG's duplicate-edge transform on the graph and keep the result under a
# separate name so the original `data` object remains available.
# NOTE(review): the saved outputs show the edge count unchanged (667966) after
# this transform, while the sorted-pair set built earlier holds 333983 entries.
# That suggests the transform does not merge (u, v) with (v, u) — confirm
# against the PyG documentation before relying on it for that purpose.
transform = RemoveDuplicatedEdges()
new_data = transform(data)

In [16]:
# Display the transformed object so its edge_index size can be compared with
# the summary of the original `data` object.
new_data

Data(edge_index=[2, 667966], y=[10312, 39], id=[10312], num_nodes=10312, x=[10312, 1])

In [13]:
# Rebuild the (source, target) edge list from the transformed graph to check
# how many directed edges remain. Note this rebinds `edge_index` and `edges`.
edge_index = new_data.edge_index
edges = list(zip(*edge_index.numpy()))
len(edges)

667966

# Define hyperparameters

In [None]:
# --- Settings varied between experiments -----------------------------------
learning_rate = 1e-4  # baseline value: 0.0001
aggregator = 'mean'

# --- Training schedule -----------------------------------------------------
epochs = 10
batch_size = 512

# --- Model configuration ---------------------------------------------------
hidden_layer = 512
embedding_dimension = 128
dropout_rate = 0.4
activation_function = F.relu
normalization = True
bias = True

# --- Neighbourhood settings ------------------------------------------------
# Presumably the per-hop neighbour sample sizes (hop 1, hop 2) — confirm
# against graphsage_experiments.compute_embedding_matrix.
neighborhood_1 = 25
neighborhood_2 = 10

In [None]:
# Sizes handed to the model: feature width per node, and the node count read
# from the first dimension of the feature matrix.
number_features = data.num_features
number_nodes = data.x.size(0)

# Train and obtain results

In [None]:
# Run the project's experiment helper with the hyperparameters defined in the
# cells above and keep the embedding matrix it returns.
# Every tunable value is passed by name from its hyperparameter variable so
# that editing the hyperparameter cell is enough to change an experiment.
emb_matrix = graphsage_experiments.compute_embedding_matrix(
    data=data,
    number_features=number_features,
    number_nodes=number_nodes,
    batch_size=batch_size,
    hidden_layer=hidden_layer,
    epochs=epochs,
    neighborhood_1=neighborhood_1,
    neighborhood_2=neighborhood_2,
    embedding_dimension=embedding_dimension,
    learning_rate=learning_rate,
    dropout_rate=dropout_rate,
    activation_function=activation_function,
    aggregator=aggregator,
    # Fixed here: the hyperparameter cell defines no variable for this flag.
    activation_before_normalization=True,
    # Previously hard-coded to True, which silently ignored the `bias`
    # hyperparameter; forward the variable instead.
    bias=bias,
    normalize=normalization,
)
