In [1]:
import torch
from torch_geometric.data import Data
import pandas as pd
import networkx as nx
from torch_geometric.utils.convert import from_networkx
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
import graphsage_experiments
import torch.nn.functional as F

# Understanding the Dataset:

nodes.csv: Contains a list of node IDs, each representing a unique user on Flickr.

groups.csv: Contains group IDs representing various groups within the Flickr community.

edges.csv: Consists of pairs of user IDs, each pair representing a friendship link. For example, a line "1,2" in this file signifies that the user with ID "1" is friends with the user with ID "2".

group-edges.csv: Each line has two entries; the first entry is a user ID, and the second entry is a group ID. This file maps which users belong to which groups.

In [5]:
# Locations of the four Flickr CSV files, all living in the same directory.
# A generator is unpacked so no helper variable leaks into the notebook namespace.
nodes_path, edges_path, groups_path, group_edges_path = (
    f'datasets/Flickr-dataset/data/{stem}.csv'
    for stem in ('nodes', 'edges', 'groups', 'group-edges')
)

# Auxiliary methods

In [2]:
def add_masks(data, train_size, val_size, test_size):
    """Attach random, disjoint train/validation/test node masks to ``data``.

    Node indices are shuffled with ``torch.randperm`` (which draws from torch's
    default random generator, so call ``torch.manual_seed`` beforehand for a
    reproducible split). Consecutive slices of that permutation become the
    three splits, so no node belongs to more than one mask. Nodes beyond
    ``train + val + test`` are left out of every mask.

    Args:
        data: object exposing ``num_nodes`` (e.g. a ``torch_geometric.data.Data``);
            it is modified in place.
        train_size: size of the training split, either an ``int`` node count
            or a ``float`` fraction of ``data.num_nodes`` (rounded down).
        val_size: size of the validation split, same convention as ``train_size``.
        test_size: size of the test split, same convention as ``train_size``.

    Returns:
        The same ``data`` object, now carrying boolean ``train_mask``,
        ``val_mask`` and ``test_mask`` tensors of length ``data.num_nodes``.

    Raises:
        AssertionError: if any size is negative or the three sizes together
            exceed ``data.num_nodes``.
    """
    num_nodes = data.num_nodes

    def _to_count(size):
        # Floats are interpreted as fractions of the node set, everything
        # else as an absolute node count. Rounding down guarantees that
        # fractions summing to 1.0 never exceed the number of nodes.
        if isinstance(size, float):
            return int(size * num_nodes)
        return int(size)

    n_train, n_val, n_test = (_to_count(s) for s in (train_size, val_size, test_size))

    assert min(n_train, n_val, n_test) >= 0, "split sizes must be non-negative"
    assert n_train + n_val + n_test <= num_nodes, (
        "train/val/test sizes exceed the number of nodes"
    )

    # Shuffle node indices once and carve the permutation into three slices.
    node_indices = torch.randperm(num_nodes)
    val_end = n_train + n_val
    test_end = val_end + n_test
    splits = {
        'train_mask': node_indices[:n_train],
        'val_mask': node_indices[n_train:val_end],
        # Bounded by test_size; previously every remaining node landed here.
        'test_mask': node_indices[val_end:test_end],
    }

    for mask_name, indices in splits.items():
        mask = torch.zeros(num_nodes, dtype=torch.bool)
        mask[indices] = True
        setattr(data, mask_name, mask)

    return data

In [3]:
def read_dataset(nodes_path, edges_path, groups_path, group_edges_path):
    """Load the Flickr CSV files into a networkx graph plus a binary label matrix.

    Every user in the nodes file becomes a graph node carrying an ``id``
    attribute; every row of the edges file becomes an undirected friendship
    edge. Group memberships are attached to the nodes as a list under the
    ``'y'`` attribute and additionally binarized with ``MultiLabelBinarizer``.

    Args:
        nodes_path: path to the header-less CSV with one node id per line.
        edges_path: path to the header-less CSV with ``id_1,id_2`` friendship pairs.
        groups_path: path to the CSV of group ids. Currently unused; kept so
            the signature stays compatible with existing callers.
        group_edges_path: path to the header-less CSV with ``user_id,group_id`` pairs.

    Returns:
        A tuple ``(graph, labels)`` where ``graph`` is the ``networkx.Graph`` and
        ``labels`` is the multi-hot matrix produced by ``MultiLabelBinarizer``,
        with one row per node in ``graph.nodes`` order. Nodes without any group
        membership get an all-zero row.
    """
    nodes_id = pd.read_csv(nodes_path, header=None, names=['id'])
    edges = pd.read_csv(edges_path, header=None, names=['id_1', 'id_2'])
    user_group_membership = pd.read_csv(group_edges_path, header=None, names=['id', 'group'])

    # Build the friendship graph; each node stores its own id as an attribute.
    G_BC = nx.Graph()
    G_BC.add_nodes_from((node_id, {'id': node_id}) for node_id in nodes_id['id'])
    G_BC.add_edges_from(edges[['id_1', 'id_2']].values)

    # Collect the list of groups per user in one vectorized pass instead of
    # iterating over the frame row by row.
    group_dict = (
        user_group_membership
        .groupby('id', sort=False)['group']
        .agg(list)
        .to_dict()
    )

    # Attach all group lists at once; ids that are not nodes of the graph are
    # ignored by networkx, matching the previous per-user calls.
    nx.set_node_attributes(G_BC, group_dict, 'y')  # 'group belonging'

    # Nodes without any membership have no 'y' attribute. Fall back to an
    # empty label list so the binarizer emits a zero row instead of failing
    # on None.
    labels = [G_BC.nodes[n].get('y', []) for n in G_BC.nodes]

    mlb = MultiLabelBinarizer()
    preprocessed_labels = mlb.fit_transform(labels)

    return G_BC, preprocessed_labels

# Create graph object and transform it to torch_geometric.data

In [6]:
# Parse the CSV files into a networkx graph and its binarized group labels,
# then convert the graph into a torch_geometric data object.
graph, labels = read_dataset(
    nodes_path=nodes_path,
    edges_path=edges_path,
    groups_path=groups_path,
    group_edges_path=group_edges_path,
)
data = from_networkx(graph)

In [11]:
# Targets: the binarized group-membership matrix as a float64 tensor
data.y = torch.from_numpy(labels.astype(np.float64))
# Features: each node's positional index, stored as a single float32 column
node_ids = torch.arange(data.num_nodes)[:, None]
data.x = node_ids.to(torch.float32)

# Define hyperparameters

In [8]:
# --- Knobs varied between experiments ---
aggregator = 'mean'
learning_rate = 1e-4

# --- Training schedule ---
epochs, batch_size = 10, 512

# --- Neighbourhood sizes (presumably neighbours sampled per hop; confirm in graphsage_experiments) ---
neighborhood_1, neighborhood_2 = 25, 10

# --- Model shape ---
hidden_layer = 512
embedding_dimension = 128

# --- Layer options ---
activation_function = F.relu
dropout_rate = 0.4
normalization = True
bias = True

In [9]:
# Input feature width and node count, both read from the data object
number_features = data.num_features
number_nodes = data.x.size(0)

# Train and obtain results

In [10]:
# Run the GraphSAGE experiment with the hyperparameters defined above.
# NOTE(review): the exact training procedure and the shape of the returned
# embedding matrix are defined in graphsage_experiments — confirm there.
emb_matrix = graphsage_experiments.compute_embedding_matrix(
    data = data,
    number_features = number_features,
    number_nodes = number_nodes,
    batch_size = batch_size,
    hidden_layer = hidden_layer, 
    epochs = epochs, 
    neighborhood_1 = neighborhood_1,
    neighborhood_2 = neighborhood_2,
    embedding_dimension = embedding_dimension,
    learning_rate = learning_rate,
    dropout_rate = dropout_rate,
    activation_function = activation_function,
    aggregator = aggregator,
    activation_before_normalization = True, 
    # Use the `bias` hyperparameter instead of a hard-coded literal so that
    # changing it in the hyperparameter cell actually takes effect.
    bias = bias,
    normalize = normalization
)


Node classification 
Epoch: 000, Accuracy: 0.0000, Total loss: 86.4825, f1_macro: 0.0000, f1_micro:0.0000 
Node classification 
Epoch: 001, Accuracy: 0.0000, Total loss: 85.9784, f1_macro: 0.0000, f1_micro:0.0000 


KeyboardInterrupt: 