In [1]:
import locale
import os

import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid

import graphsage_calculate_embeddings
import test_embeddings

# Read in data

In [2]:
# Load the 'Cora' dataset through Planetoid, using /tmp/Cora as its root directory.
dataset = Planetoid(
    root='/tmp/Cora',
    name='Cora',
)
# Work with the first graph object of the dataset.
data = dataset[0]

In [3]:
# Display the summary representation of the loaded graph object.
data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

# Hyperparameters

In [4]:
# Tunable settings for the embedding run; later cells read these names directly.

# Training settings
learning_rate = 1e-4
epochs = 10
batch_size = 512
dropout_rate = 0.4

# Neighbourhood settings (two values, one per `neighborhood_*` argument)
neighborhood_1, neighborhood_2 = 25, 10

# Model settings
aggregator = 'MeanAggregation'
activation_function = F.relu
hidden_layer = 512
embedding_dimension = 128

# Boolean switches
normalization = True
bias = True
project = False

# Obtain embedding matrix

In [5]:
# Capture the feature dimension and the number of rows of the node feature matrix.
number_features = data.num_features
number_nodes = data.x.size(0)

# Replace `data` with its sorted version.
# NOTE(review): sort_by_row=False presumably orders edge_index by its second row
# (column-wise) -- confirm against the torch_geometric Data.sort documentation.
data = data.sort(sort_by_row=False)

In [6]:
# Compute the embedding matrix with the project's GraphSAGE helper, forwarding the
# values from the hyperparameter cell. `bias` is passed through from that cell
# instead of being hard-coded, so changing the hyperparameter actually takes effect.
embedding_matrix = graphsage_calculate_embeddings.compute_embedding_matrix(
    data = data,
    number_features = number_features,
    number_nodes = number_nodes,
    batch_size = batch_size,
    hidden_layer = hidden_layer,
    epochs = epochs,
    neighborhood_1 = neighborhood_1,
    neighborhood_2 = neighborhood_2,
    embedding_dimension = embedding_dimension,
    learning_rate = learning_rate,
    dropout_rate = dropout_rate,
    activation_function = activation_function,
    aggregator = aggregator,
    # Not exposed in the hyperparameter cell; fixed to True for this run.
    activation_before_normalization = True,
    bias = bias,
    normalize = normalization,
    project = project
)


Training Progress:   9%|▉         | 1/11 [00:01<00:14,  1.40s/it]

Epoch: 000, Total loss: 64.1893, time_taken: 1.4009928703308105


Training Progress:  18%|█▊        | 2/11 [00:02<00:12,  1.37s/it]

Epoch: 001, Total loss: 57.1141, time_taken: 1.3403441905975342


Training Progress:  27%|██▋       | 3/11 [00:04<00:10,  1.37s/it]

Epoch: 002, Total loss: 56.6268, time_taken: 1.3798871040344238


Training Progress:  36%|███▋      | 4/11 [00:05<00:09,  1.38s/it]

Epoch: 003, Total loss: 56.5518, time_taken: 1.4023737907409668


Training Progress:  45%|████▌     | 5/11 [00:06<00:08,  1.40s/it]

Epoch: 004, Total loss: 56.5114, time_taken: 1.416949987411499


Training Progress:  55%|█████▍    | 6/11 [00:08<00:06,  1.40s/it]

Epoch: 005, Total loss: 56.4673, time_taken: 1.400909185409546


Training Progress:  64%|██████▎   | 7/11 [00:09<00:05,  1.40s/it]

Epoch: 006, Total loss: 56.4377, time_taken: 1.4050261974334717


Training Progress:  73%|███████▎  | 8/11 [00:11<00:04,  1.39s/it]

Epoch: 007, Total loss: 56.3943, time_taken: 1.3631949424743652


Training Progress:  82%|████████▏ | 9/11 [00:12<00:02,  1.39s/it]

Epoch: 008, Total loss: 56.3622, time_taken: 1.3899309635162354


Training Progress:  91%|█████████ | 10/11 [00:13<00:01,  1.39s/it]

Epoch: 009, Total loss: 56.3212, time_taken: 1.3924119472503662


Training Progress: 100%|██████████| 11/11 [00:15<00:00,  1.39s/it]

Epoch: 010, Total loss: 56.3060, time_taken: 1.3747568130493164
Median time per epoch: 1.3925s





# Save embedding matrix

In [None]:
# Persist the embedding matrix to disk. The target directory is created first
# because torch.save does not create missing parent directories and would fail
# on a fresh checkout without an `embeddings/` folder.
os.makedirs('embeddings', exist_ok=True)
torch.save(embedding_matrix, 'embeddings/cora_small.pt')

In [None]:
# Reload a previously saved embedding matrix from disk (same path as the save cell).
# This overwrites `embedding_matrix` in the current kernel.
embedding_matrix = torch.load('embeddings/cora_small.pt')

# Testing results

# Node classification

In [7]:
# Evaluate the embeddings on multi-class node classification using the labels in data.y.
# The helper's three return values are unpacked as accuracy, macro-F1 and micro-F1.
acc, f1_macro, f1_micro = test_embeddings.test_node_classification_multi_class(embedding_matrix, data.y)

In [8]:
# Report the metrics as percentages with four decimals and a decimal comma.
# Plain string formatting is used instead of locale.setlocale(LC_ALL, 'de_DE'):
# that call raises locale.Error on systems where the locale is not installed and
# changes the process-wide locale, while the '.' -> ',' replacement below already
# produces the same text on its own.
formatted_acc = f"{acc * 100:.4f}".replace('.', ',')
formatted_f1_macro = f"{f1_macro * 100:.4f}".replace('.', ',')
formatted_f1_micro = f"{f1_micro * 100:.4f}".replace('.', ',')

print(f"Accuracy: {formatted_acc}, F1_macro: {formatted_f1_macro}, F1_micro: {formatted_f1_micro}")

Accuracy: 82,1266, F1_macro: 80,3272, F1_micro: 82,1266


# Link Prediction

In [9]:
# Convert the embedding matrix to a NumPy array, detached from autograd.
embedding_detached = embedding_matrix.detach()
embedding_np = embedding_detached.numpy()

# Split the graph into train and test parts for link prediction.
# TODO: change the is_undirected depending on graph
# NOTE(review): confirm that is_undirected=False is the intended setting for this dataset.
train_data, test_data = test_embeddings.train_test_split_graph(
    data = data,
    is_undirected = False,
)

# Test edges (transposed edge_label_index) and their labels as NumPy arrays.
y_true = test_data.edge_label.numpy()
test_edges = test_data.edge_label_index.numpy().T

In [10]:
# Score link prediction on the test edges with the project's k-fold cross-validation helper (k=5).
# The result is formatted as a ROC AUC value in the next cell.
roc_auc_score = test_embeddings.k_fold_cross_validation_link_prediction(embedding_np, test_edges, y_true, k=5)


In [11]:
# Show the ROC AUC as a percentage with four decimals and a decimal comma.
score_percent = format(roc_auc_score * 100, ".4f")
formatted_score = score_percent.replace('.', ',')
print("ROC AUC Score:", formatted_score)

ROC AUC Score: 98,4350
