In [1]:
import torch
from torch_geometric.data import Data
import pandas as pd
import networkx as nx
from torch_geometric.utils.convert import from_networkx
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import graphsage_experiments
import torch.nn.functional as F
import read_data

# Understanding the Dataset

nodes.csv: Contains blogger IDs. Each blogger in the BlogCatalog is represented by a unique ID.

groups.csv: Contains group IDs. Bloggers can belong to various groups representing different interests or topics.

edges.csv: Represents the friendship network among bloggers. Each pair of IDs indicates a friendship link.

group-edges.csv: Represents group memberships. Each line indicates which group a particular blogger belongs to.

In [2]:
# Locations of the four BlogCatalog CSV files (all relative paths).
nodes_path, edges_path, groups_path, group_edges_path = (
    'datasets/BlogCatalog-dataset/data/nodes.csv',
    'datasets/BlogCatalog-dataset/data/edges.csv',
    'datasets/BlogCatalog-dataset/data/groups.csv',
    'datasets/BlogCatalog-dataset/data/group-edges.csv',
)

# Create the graph object and convert it to a torch_geometric `Data` object

In [5]:
# Read the dataset from the four CSV files; the reader hands back the graph
# together with its labels.
dataset_files = (nodes_path, edges_path, groups_path, group_edges_path)
graph, labels = read_data.read_dataset_arizona_university(*dataset_files)

# Convert the graph into a torch_geometric object for the model code below.
data = from_networkx(graph)

In [6]:
# Add y variable: the labels returned by the dataset reader, as the training target.
# astype(float) produces float64, and torch.from_numpy keeps that dtype.
data.y = torch.from_numpy(labels.astype(float))
# Add x variable which contains the features per node. The only feature is the
# node's own index, so x has shape (num_nodes, 1) and is cast to float32.
node_ids = torch.arange(data.num_nodes).unsqueeze(1)
data.x = node_ids.float()
# Add masks to data.
# NOTE(review): the arguments 2, 1, 1 are presumably the train/val/test split
# proportions — confirm against read_data.add_masks.
data = read_data.add_masks(data, 2, 1, 1)

# Define hyperparameters

In [7]:
# --- Settings to change/play around with between experiments ---
learning_rate = 1e-4
aggregator = 'mean'

# --- Training loop ---
epochs = 10
batch_size = 512
dropout_rate = 0.4

# --- Layer options ---
activation_function = F.relu
normalization = True
bias = True

# --- Neighborhood settings ---
# NOTE(review): presumably the neighbor sample sizes for the first and second
# hop — confirm in graphsage_experiments.
neighborhood_1 = 25
neighborhood_2 = 10

# --- Layer widths ---
hidden_layer = 512
embedding_dimension = 128

In [8]:
# Sizes handed to the training routine: the feature count reported by the
# data object and the number of rows in the feature matrix x.
number_features = data.num_features
number_nodes = data.x.shape[0]

# Train and obtain results

In [9]:
# Run the experiment with the hyperparameters defined in the cells above.
# Every tunable value is passed by name from those variables, so editing the
# hyperparameter cell is enough to change a run. `bias` used to be hard-coded
# to True here, which silently ignored the `bias` hyperparameter.
# NOTE(review): the result is presumably the learned node-embedding matrix —
# confirm against graphsage_experiments.compute_embedding_matrix.
emb_matrix = graphsage_experiments.compute_embedding_matrix(
    data=data,
    number_features=number_features,
    number_nodes=number_nodes,
    batch_size=batch_size,
    hidden_layer=hidden_layer,
    epochs=epochs,
    neighborhood_1=neighborhood_1,
    neighborhood_2=neighborhood_2,
    embedding_dimension=embedding_dimension,
    learning_rate=learning_rate,
    dropout_rate=dropout_rate,
    activation_function=activation_function,
    aggregator=aggregator,
    # Not exposed as a hyperparameter above; kept at the original fixed value.
    activation_before_normalization=True,
    bias=bias,
    normalize=normalization,
)


Node classification 
Epoch: 000, Accuracy: 0.0026, Total loss: 41.5042, f1_macro: 0.0030, f1_micro:0.0054 


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Node classification 
Epoch: 001, Accuracy: 0.0000, Total loss: 40.9124, f1_macro: 0.0000, f1_micro:0.0000 


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Node classification 
Epoch: 002, Accuracy: 0.0000, Total loss: 40.2911, f1_macro: 0.0000, f1_micro:0.0000 


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Node classification 
Epoch: 003, Accuracy: 0.0000, Total loss: 40.0841, f1_macro: 0.0000, f1_micro:0.0000 


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Node classification 
Epoch: 004, Accuracy: 0.0000, Total loss: 40.0419, f1_macro: 0.0000, f1_micro:0.0000 


: 

: 