In [1]:
import locale
import os

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.preprocessing import MultiLabelBinarizer
from torch_geometric.data import Data
from torch_geometric.utils.convert import from_networkx

import graphsage_calculate_embeddings
import read_data
import test_embeddings

# Understanding the Dataset

nodes.csv: Contains the node IDs, where each ID represents a unique user on YouTube.

groups.csv:  Contains group IDs, each representing a different group or community within the YouTube platform.

edges.csv:  Comprises pairs of user IDs, with each pair indicating a friendship link between two users. For example, an entry "1,2" means that the user with ID "1" is friends with the user with ID "2".

group-edges.csv: Each line has two numbers; the first is a user ID, and the second is a group ID. This file links users to the groups they are part of.

In [2]:
# Paths to data files.
# All four files live in the same directory, so every path is derived from one
# base directory; this keeps the casing of the directory name identical, which
# matters on case-sensitive file systems.
data_dir = 'datasets/YouTube-dataset/data'
nodes_path = f'{data_dir}/nodes.csv'
edges_path = f'{data_dir}/edges.csv'
groups_path = f'{data_dir}/groups.csv'
group_edges_path = f'{data_dir}/group-edges.csv'

# Create the graph object and convert it to a torch_geometric `Data` object

In [8]:
def read_dataset(nodes_path, edges_path, groups_path, group_edges_path):
    """Build a labelled, undirected friendship graph from the dataset's csv files.

    Nodes that do not belong to any group are removed from the returned graph,
    so every remaining node carries at least one label.

    Args:
        nodes_path: path to the csv file with one node (user) id per line.
        edges_path: path to the csv file with one "id_1,id_2" pair per line.
        groups_path: path to the csv file with the group ids. Currently unused;
            kept so that existing callers do not have to change.
        group_edges_path: path to the csv file with one "user_id,group_id"
            membership per line.

    Returns:
        A tuple ``(graph, labels)``. ``graph`` is a ``networkx.Graph`` whose
        nodes store their list of group ids in the ``'y'`` attribute.
        ``labels`` is the indicator matrix returned by
        ``MultiLabelBinarizer.fit_transform``, with one row per node in the
        iteration order of ``graph.nodes``.
    """
    nodes_id = pd.read_csv(nodes_path, header=None, names=['id'])
    edges = pd.read_csv(edges_path, header=None, names=['id_1', 'id_2'])
    user_group_membership = pd.read_csv(group_edges_path, header=None, names=['id', 'group'])

    # Sort each node pair so that (a, b) and (b, a) collapse into a single row.
    edges[['id_1', 'id_2']] = np.sort(edges[['id_1', 'id_2']], axis=1)
    edges = edges.drop_duplicates()

    # Nodes first (each one stores its own id as an attribute), then friendships.
    graph = nx.Graph()
    for node_id in nodes_id['id']:
        graph.add_node(node_id, id=node_id)
    graph.add_edges_from(edges[['id_1', 'id_2']].values)

    # Map every user id to the list of groups it belongs to. Iterating the two
    # columns directly avoids building one Series per row as iterrows() does.
    group_dict = {}
    for user_id, group_id in zip(user_group_membership['id'], user_group_membership['group']):
        group_dict.setdefault(user_id, []).append(group_id)

    # Drop nodes without labels, then attach the labels to the remaining nodes
    # in a single pass.
    unlabelled = [node_id for node_id in graph.nodes if node_id not in group_dict]
    graph.remove_nodes_from(unlabelled)
    nx.set_node_attributes(
        graph,
        {node_id: group_dict[node_id] for node_id in graph.nodes},
        'y',
    )

    # Binarize the per-node group lists; rows follow the order of graph.nodes.
    labels = [graph.nodes[node_id]['y'] for node_id in graph.nodes]
    preprocessed_labels = MultiLabelBinarizer().fit_transform(labels)

    return graph, preprocessed_labels

In [9]:
# Read the csv files into a labelled networkx graph, then convert that graph
# into a torch_geometric object.
graph, labels = read_dataset(
    nodes_path=nodes_path,
    edges_path=edges_path,
    groups_path=groups_path,
    group_edges_path=group_edges_path,
)
data = from_networkx(graph)


# Create input to the model

In [11]:
# Labels: the binarized label matrix, cast to float and wrapped as a tensor.
label_matrix = labels.astype(float)
data.y = torch.from_numpy(label_matrix)

# Features: an identity matrix of size num_nodes x num_nodes, i.e. one one-hot
# row per node. This is a dense matrix, so memory grows quadratically with the
# number of nodes (about 4 GB in float32 for the 31,703 nodes reported below).
x_diagonal = torch.eye(data.num_nodes)
data.x = x_diagonal.float()

# Define hyperparameters

In [12]:
# --- Optimisation ---
learning_rate = 0.0001
epochs = 10
batch_size = 512
dropout_rate = 0.4

# --- Neighbourhood settings (presumably per-layer sample sizes - TODO confirm
# against graphsage_calculate_embeddings) ---
neighborhood_1 = 25
neighborhood_2 = 10

# --- Model architecture ---
aggregator = 'MeanAggregation'
activation_function = F.relu
hidden_layer = 512
embedding_dimension = 128
normalization = True
bias = True
project = False

# Obtain embedding matrix

In [13]:
# Sizes handed to the model: feature dimension and number of rows of x.
number_features = data.num_features
number_nodes = data.x.shape[0]

In [24]:
# Display the number of nodes that remain after unlabelled nodes were removed.
number_nodes

31703

In [25]:
# Display the summary of the Data object, which lists the shape of every
# attribute (edge_index, y, id, x) instead of dumping a full tensor.
data

Data(edge_index=[2, 192722], y=[31703, 47], id=[31703], num_nodes=31703, x=[31703, 31703])

In [14]:
# Train the model and obtain the node embeddings. Every argument comes from the
# hyperparameter cell, except activation_before_normalization, which has no
# corresponding variable and is fixed to True here.
# NOTE(review): with epochs = 10 the recorded output shows 11 iterations
# (Epoch 000 to 010) - confirm the loop bounds in compute_embedding_matrix.
embedding_matrix = graphsage_calculate_embeddings.compute_embedding_matrix(
    data=data,
    number_features=number_features,
    number_nodes=number_nodes,
    batch_size=batch_size,
    hidden_layer=hidden_layer,
    epochs=epochs,
    neighborhood_1=neighborhood_1,
    neighborhood_2=neighborhood_2,
    embedding_dimension=embedding_dimension,
    learning_rate=learning_rate,
    dropout_rate=dropout_rate,
    activation_function=activation_function,
    aggregator=aggregator,
    activation_before_normalization=True,
    bias=bias,  # use the hyperparameter instead of a hard-coded literal
    normalize=normalization,
    project=project
)


Training Progress:   9%|▉         | 1/11 [33:54<5:39:03, 2034.34s/it]

Epoch: 000, Total loss: 3.3590, time_taken: 2034.3418009281158


Training Progress:  18%|█▊        | 2/11 [1:10:24<5:18:54, 2126.01s/it]

Epoch: 001, Total loss: 3.2139, time_taken: 2190.173082113266


Training Progress:  27%|██▋       | 3/11 [1:47:17<4:48:45, 2165.72s/it]

Epoch: 002, Total loss: 3.1632, time_taken: 2212.9766280651093


Training Progress:  36%|███▋      | 4/11 [2:20:55<4:05:51, 2107.42s/it]

Epoch: 003, Total loss: 3.1482, time_taken: 2018.0388810634613


Training Progress:  45%|████▌     | 5/11 [2:54:30<3:27:23, 2073.98s/it]

Epoch: 004, Total loss: 3.1451, time_taken: 2014.6921288967133


Training Progress:  55%|█████▍    | 6/11 [3:28:03<2:51:07, 2053.48s/it]

Epoch: 005, Total loss: 3.1372, time_taken: 2013.6688272953033


Training Progress:  64%|██████▎   | 7/11 [4:01:34<2:15:57, 2039.43s/it]

Epoch: 006, Total loss: 3.1391, time_taken: 2010.5166671276093


Training Progress:  73%|███████▎  | 8/11 [4:35:05<1:41:31, 2030.52s/it]

Epoch: 007, Total loss: 3.1352, time_taken: 2011.4325149059296


Training Progress:  82%|████████▏ | 9/11 [5:08:37<1:07:29, 2024.58s/it]

Epoch: 008, Total loss: 3.1324, time_taken: 2011.5233118534088


Training Progress:  91%|█████████ | 10/11 [5:42:04<33:39, 2019.26s/it] 

Epoch: 009, Total loss: 3.1347, time_taken: 2007.328714132309


Training Progress: 100%|██████████| 11/11 [6:15:37<00:00, 2048.84s/it]

Epoch: 010, Total loss: 3.1301, time_taken: 2012.5234701633453
Median time per epoch: 2013.6693s





# Save embedding

In [15]:
file_name = 'embeddings/youtube.pt'
# torch.save does not create missing directories. Create the target directory
# first so the result of the long training run is not lost to a failed save.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
torch.save(embedding_matrix, file_name)

In [None]:
# Optional: reload the saved embedding from disk instead of re-running the
# training above. This overwrites the in-memory embedding_matrix.
embedding_matrix = torch.load(file_name)

# Testing results

# Node classification

In [16]:
# Evaluate the embeddings against the node labels. The helper returns three
# scores, unpacked here as accuracy, macro-F1 and micro-F1.
acc, f1_macro, f1_micro = test_embeddings.test_node_classification_multi_class(embedding_matrix, data.y)
# The scores are formatted and printed in the next cell.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
def format_percentage(value):
    """Return ``value`` as a percentage with four decimals and a decimal comma."""
    return "{:.4f}".format(value * 100).replace('.', ',')


# The decimal comma is produced by plain string replacement, so this cell does
# not require the 'de_DE' locale to be installed and does not change the
# process-wide locale as a side effect.
formatted_acc = format_percentage(acc)
formatted_f1_macro = format_percentage(f1_macro)
formatted_f1_micro = format_percentage(f1_micro)

print(f"Accuracy: {formatted_acc}, F1_macro: {formatted_f1_macro}, F1_micro: {formatted_f1_micro}")

# Link Prediction

In [18]:
# Split the graph's edges into a training part and a held-out test part.
# TODO: change is_undirected depending on the graph
train_data, test_data = test_embeddings.train_test_split_graph(data=data, is_undirected=True)

# Prepare edges: transpose the label index so that each row is one candidate
# edge (presumably shape (num_edges, 2) - confirm against the helper), and take
# the matching labels.
test_edges = test_data.edge_label_index.numpy().T
y_true = test_data.edge_label.numpy()

# Prepare embeddings. .cpu() returns the tensor unchanged when it already lives
# on the CPU and makes the NumPy conversion work when the embedding was
# computed on a GPU (Tensor.numpy() raises for CUDA tensors).
embedding_detached = embedding_matrix.detach().cpu()
embedding_np = embedding_detached.numpy()

In [19]:
# Run the 5-fold cross-validated link-prediction evaluation on the held-out edges.
# NOTE(review): the result name would shadow sklearn.metrics.roc_auc_score if
# that function were ever imported into this notebook.
roc_auc_score = test_embeddings.k_fold_cross_validation_link_prediction(embedding_np, test_edges, y_true, k=5)


In [None]:
# Report the score as a percentage with four decimals and a decimal comma.
formatted_score = f"{roc_auc_score * 100:.4f}".replace('.', ',')
print("ROC AUC Score:", formatted_score)