In [5]:
import graphsage_calculate_embeddings
import test_embeddings
from torch_geometric.datasets import Planetoid
import torch.nn.functional as F
import torch
import time
import traceback
import locale
import graph_information

# Read in data

In [2]:
dataset = Planetoid(root='/tmp/PubMed', name='PubMed')
data = dataset[0]

In [4]:
graph_information.visualize_information_graph(dataset)

Dataset: PubMed()
-------------------
Number of graphs: 1
Number of nodes: 19717
Number of features: 500
Number of classes: 3

Graph:
------
Training nodes: 60
Evaluation nodes: 500
Test nodes: 1000
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


# This script was used to experiment with the learning rates and the aggregators types

In [None]:
learning_rates = [0.001, 0.0001, 0.00002] 
aggregators = ['MeanAggregation', 'MaxAggregation', 'LSTMAggregation']
projects = [True, False]
directed_graph = True

# FIXED PARAMS 
epochs = 10
dropout_rate = 0.4
normalization = True 
activation_function = F.relu
bias = True
batch_size =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128
hidden_layer = 512
#project = True # layer applies a linear transformation followed by an activation function before aggreagation, as described in EQ. 3 of paper 

# Obtain embedding matrix

In [None]:
number_features, number_nodes = data.num_features, data.x.shape[0]
data = data.sort(sort_by_row=False)

In [None]:
results = []
broken_experiments = []

for lr in learning_rates:
    for aggregator in aggregators:
        for project in projects:
            try: 
                start_time = time.time()
                # Compute the embedding matrix for the current set of hyperparameters
                embedding_matrix = graphsage_calculate_embeddings.compute_embedding_matrix(
                    data=data,
                    number_features=number_features,
                    number_nodes=number_nodes,
                    batch_size=batch_size,
                    hidden_layer=hidden_layer,
                    epochs=epochs,
                    neighborhood_1=neighborhood_1,
                    neighborhood_2=neighborhood_2,
                    embedding_dimension=embedding_dimension,
                    learning_rate=lr,
                    dropout_rate=dropout_rate,
                    activation_function=activation_function,
                    aggregator=aggregator,
                    activation_before_normalization=True, 
                    bias=True,
                    normalize=normalization, 
                    project=project
                )
                
                # Store the embedding matrix and corresponding hyperparameters
                results.append({
                    'learning_rate': lr,
                    'aggregator': aggregator,
                    'embedding_matrix': embedding_matrix, 
                    'time': time.time() - start_time
                })
                torch.save(embedding_matrix, f"embeddings/pubmed/{lr}_{aggregator}_{project}_.pt")
            except Exception as e:
                broken_experiments.append({
                    'learning_rate': lr,
                    'aggregator': aggregator,
                    'embedding_matrix': embedding_matrix, 
                    'time': time.time() - start_time,
                    'error': traceback.format_exc()
                })


# Testing results

In [100]:
# Change file_name to test other resuls 
file_name = 'embeddings/pubmed/0.0001_MaxAggregation_False_.pt'

embedding_matrix = torch.load(file_name)


# Node clasification

In [101]:
acc, f1_macro, f1_micro = test_embeddings.test_node_classification_multi_class(embedding_matrix, data.y)
#print(f"Accuracy: {acc*100:.4f}, F1_macro: {f1_macro*100:.4f}, F1_micro: {f1_micro*100:.4f}")

In [102]:
locale.setlocale(locale.LC_ALL, 'de_DE')

# Format the numbers with four digits after the decimal and replace the dot with a comma
formatted_acc = locale.format_string("%.4f", acc * 100).replace('.', ',')
formatted_f1_macro = locale.format_string("%.4f", f1_macro * 100).replace('.', ',')
formatted_f1_micro = locale.format_string("%.4f", f1_micro * 100).replace('.', ',')

print(f"Accuracy: {formatted_acc}, F1_macro: {formatted_f1_macro}, F1_micro: {formatted_f1_micro}")

Accuracy: 81,2041, F1_macro: 80,0528, F1_micro: 81,2041


# Link Prediction

In [64]:
train_data, test_data = test_embeddings.train_test_split_graph(data = data, is_undirected = False) # TODO: change the is_undirected depending on graph

# Prepare edges
test_edges = test_data.edge_label_index.numpy().T
y_true = test_data.edge_label.numpy()

# Prepare embeddings
embedding_detached = embedding_matrix.detach()
embedding_np = embedding_detached.numpy()

In [65]:
roc_auc_score = test_embeddings.k_fold_cross_validation_link_prediction(embedding_np, test_edges, y_true, k=5)


In [66]:
formatted_score = "{:.4f}".format(roc_auc_score * 100).replace('.', ',')
print("ROC AUC Score:", formatted_score)

ROC AUC Score: 96.1560
