In [1]:
import torch
from torch_geometric.data import Data
import pandas as pd
import networkx as nx
from torch_geometric.utils.convert import from_networkx
import read_data
from torch_geometric.datasets import CitationFull
import torch.nn.functional as F
import graphsage_calculate_embeddings
import test_embeddings
import torch.nn.functional as F
import torch
import locale



# Understanding the Dataset

nodes.csv: Contains blogger IDs. Each blogger in the BlogCatalog is represented by a unique ID.

groups.csv: Contains group IDs. Bloggers can belong to various groups representing different interests or topics.

edges.csv: Represents the friendship network among bloggers. Each pair of IDs indicates a friendship link.

group-edges.csv: Represents group memberships. Each line indicates which group a particular blogger belongs to.

In [2]:
# Paths to data files
nodes_path = 'datasets/BlogCatalog-dataset/data/nodes.csv'
edges_path = 'datasets/BlogCatalog-dataset/data/edges.csv'
groups_path = 'datasets/BlogCatalog-dataset/data/groups.csv'
group_edges_path = 'datasets/BlogCatalog-dataset/data/group-edges.csv'

# Create graph object and transform it to torch_geometric.data

In [3]:
graph, labels = read_data.read_dataset_arizona_university(nodes_path, edges_path, groups_path, group_edges_path)
data = from_networkx(graph)

In [4]:
# Adjusting labels and features 
data.y = torch.from_numpy(labels.astype(float))
# Add x variable which diagonal matrix with 1's as entries and size = num_nodes x num_nodes
x_diagonal = torch.eye(data.num_nodes)
data.x = x_diagonal.float()

# Define hyperparameters

In [5]:
learning_rate = 0.0001 
aggregator = 'MeanAggregation'
epochs = 10
dropout_rate = 0.4
normalization = True 
activation_function = F.relu
bias = True
batch_size =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128
hidden_layer = 512
project = False

# Obtain embedding matrix

In [6]:
number_features, number_nodes = data.num_features, data.x.shape[0]

In [7]:
embedding_matrix = graphsage_calculate_embeddings.compute_embedding_matrix(
    data=data,
    number_features=number_features,
    number_nodes=number_nodes,
    batch_size=batch_size,
    hidden_layer=hidden_layer,
    epochs=epochs,
    neighborhood_1=neighborhood_1,
    neighborhood_2=neighborhood_2,
    embedding_dimension=embedding_dimension,
    learning_rate=learning_rate,
    dropout_rate=dropout_rate,
    activation_function=activation_function,
    aggregator=aggregator,
    activation_before_normalization=True, 
    bias=True,
    normalize=normalization, 
    project=project
)


Training Progress:   9%|▉         | 1/11 [16:04<2:40:42, 964.29s/it]

Epoch: 000, Total loss: 37.7631, time_taken: 964.2894449234009


Training Progress:  18%|█▊        | 2/11 [31:49<2:22:57, 953.05s/it]

Epoch: 001, Total loss: 36.8075, time_taken: 945.1835119724274


Training Progress:  27%|██▋       | 3/11 [47:23<2:05:54, 944.37s/it]

Epoch: 002, Total loss: 36.6994, time_taken: 934.0349130630493


Training Progress:  36%|███▋      | 4/11 [1:02:57<1:49:42, 940.43s/it]

Epoch: 003, Total loss: 36.6200, time_taken: 934.3829898834229


Training Progress:  45%|████▌     | 5/11 [1:18:31<1:33:48, 938.08s/it]

Epoch: 004, Total loss: 36.5769, time_taken: 933.9272499084473


Training Progress:  55%|█████▍    | 6/11 [1:34:06<1:18:04, 936.98s/it]

Epoch: 005, Total loss: 36.5975, time_taken: 934.8382959365845


Training Progress:  64%|██████▎   | 7/11 [1:49:43<1:02:28, 937.01s/it]

Epoch: 006, Total loss: 36.5483, time_taken: 937.0564548969269


Training Progress:  73%|███████▎  | 8/11 [2:05:25<46:55, 938.48s/it]  

Epoch: 007, Total loss: 36.5628, time_taken: 941.6169600486755


Training Progress:  82%|████████▏ | 9/11 [2:21:11<31:22, 941.02s/it]

Epoch: 008, Total loss: 36.5311, time_taken: 946.6113018989563


Training Progress:  91%|█████████ | 10/11 [2:36:56<15:41, 941.96s/it]

Epoch: 009, Total loss: 36.5340, time_taken: 944.0725939273834


Training Progress: 100%|██████████| 11/11 [2:52:38<00:00, 941.65s/it]

Epoch: 010, Total loss: 36.4959, time_taken: 942.0991811752319
Median time per epoch: 941.6172s





# Save embedding

In [8]:
file_name = 'embeddings/blogcatalog.pt'
torch.save(embedding_matrix, file_name)

# Testing results

# Node clasification

In [9]:
acc, f1_macro, f1_micro = test_embeddings.test_node_classification_multi_class(embedding_matrix, data.y)
#print(f"Accuracy: {acc*100:.4f}, F1_macro: {f1_macro*100:.4f}, F1_micro: {f1_micro*100:.4f}")

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [10]:
locale.setlocale(locale.LC_ALL, 'de_DE')

# Format the numbers with four digits after the decimal and replace the dot with a comma
formatted_acc = locale.format_string("%.4f", acc * 100).replace('.', ',')
formatted_f1_macro = locale.format_string("%.4f", f1_macro * 100).replace('.', ',')
formatted_f1_micro = locale.format_string("%.4f", f1_micro * 100).replace('.', ',')

print(f"Accuracy: {formatted_acc}, F1_macro: {formatted_f1_macro}, F1_micro: {formatted_f1_micro}")

Accuracy: 1,9104, F1_macro: 1,6378, F1_micro: 4,0256


# Link Prediction

In [11]:
train_data, test_data = test_embeddings.train_test_split_graph(data = data, is_undirected = True) # TODO: change the is_undirected depending on graph

# Prepare edges
test_edges = test_data.edge_label_index.numpy().T
y_true = test_data.edge_label.numpy()

# Prepare embeddings
embedding_detached = embedding_matrix.detach()
embedding_np = embedding_detached.numpy()

In [12]:
roc_auc_score = test_embeddings.k_fold_cross_validation_link_prediction(embedding_np, test_edges, y_true, k=5)


In [13]:
formatted_score = "{:.4f}".format(roc_auc_score * 100).replace('.', ',')
print("ROC AUC Score:", formatted_score)

ROC AUC Score: 87,9573
