In [1]:
from torch_geometric.datasets import HeterophilousGraphDataset
import torch.nn.functional as F
import graphsage_calculate_embeddings
import test_embeddings
import torch.nn.functional as F
import torch
import locale
import graph_information


# Read in data

In [2]:
dataset = HeterophilousGraphDataset(root="./", name='amazon_ratings')
data = dataset[0]

# Hyperparameters

In [20]:
learning_rate = 0.0001 
aggregator = 'MeanAggregation'

epochs = 10
dropout_rate = 0.4
normalization = True 
activation_function = F.relu
bias = True
batch_size =  512
neighborhood_1 = 25
neighborhood_2 = 10
embedding_dimension = 128
hidden_layer = 512
project = False

# Obtain embedding matrix

In [21]:
number_features, number_nodes = data.num_features, data.x.shape[0]
data = data.sort(sort_by_row=False)

In [22]:
embedding_matrix = graphsage_calculate_embeddings.compute_embedding_matrix(
    data = data,
    number_features = number_features,
    number_nodes = number_nodes,
    batch_size = batch_size,
    hidden_layer = hidden_layer, 
    epochs = epochs, 
    neighborhood_1 = neighborhood_1,
    neighborhood_2 = neighborhood_2,
    embedding_dimension = embedding_dimension,
    learning_rate = learning_rate,
    dropout_rate = dropout_rate,
    activation_function = activation_function,
    aggregator = aggregator,
    activation_before_normalization = True, 
    bias= True,
    normalize = normalization, 
    project = project
)


Training Progress:   9%|▉         | 1/11 [01:10<11:43, 70.37s/it]

Epoch: 000, Total loss: 4.2171, time_taken: 70.37032771110535


Training Progress:  18%|█▊        | 2/11 [02:20<10:34, 70.48s/it]

Epoch: 001, Total loss: 3.8929, time_taken: 70.55615377426147


Training Progress:  27%|██▋       | 3/11 [03:30<09:21, 70.14s/it]

Epoch: 002, Total loss: 3.8544, time_taken: 69.72800278663635


Training Progress:  36%|███▋      | 4/11 [04:41<08:13, 70.46s/it]

Epoch: 003, Total loss: 3.8326, time_taken: 70.9565920829773


Training Progress:  45%|████▌     | 5/11 [05:55<07:09, 71.66s/it]

Epoch: 004, Total loss: 3.8240, time_taken: 73.79137802124023


Training Progress:  55%|█████▍    | 6/11 [07:07<05:58, 71.79s/it]

Epoch: 005, Total loss: 3.8227, time_taken: 72.03118681907654


Training Progress:  64%|██████▎   | 7/11 [08:19<04:46, 71.74s/it]

Epoch: 006, Total loss: 3.8099, time_taken: 71.62781000137329


Training Progress:  73%|███████▎  | 8/11 [09:30<03:34, 71.50s/it]

Epoch: 007, Total loss: 3.8059, time_taken: 71.0010838508606


Training Progress:  82%|████████▏ | 9/11 [10:41<02:22, 71.50s/it]

Epoch: 008, Total loss: 3.8031, time_taken: 71.49097490310669


Training Progress:  91%|█████████ | 10/11 [11:52<01:11, 71.19s/it]

Epoch: 009, Total loss: 3.7964, time_taken: 70.50960206985474


Training Progress: 100%|██████████| 11/11 [13:02<00:00, 71.13s/it]

Epoch: 010, Total loss: 3.7927, time_taken: 70.4024338722229
Median time per epoch: 70.9567s





# Save embedding matrix

In [23]:
torch.save(embedding_matrix, 'embeddings/amazon.pt')

In [3]:
# How to load it again: 
embedding_matrix = torch.load('embeddings/amazon.pt')

# Testing results

# Evaluate node classification 

In [4]:
acc, f1_macro, f1_micro = test_embeddings.test_node_classification_multi_class(embedding_matrix, data.y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
locale.setlocale(locale.LC_ALL, 'de_DE')

# Format the numbers with four digits after the decimal and replace the dot with a comma
formatted_acc = locale.format_string("%.4f", acc * 100).replace('.', ',')
formatted_f1_macro = locale.format_string("%.4f", f1_macro * 100).replace('.', ',')
formatted_f1_micro = locale.format_string("%.4f", f1_micro * 100).replace('.', ',')

print(f"Accuracy: {formatted_acc}, F1_macro: {formatted_f1_macro}, F1_micro: {formatted_f1_micro}")

Accuracy: 41,0134, F1_macro: 19,9513, F1_micro: 41,0134


# Link Prediction

In [6]:
train_data, test_data = test_embeddings.train_test_split_graph(data = data, is_undirected = False) # TODO: change the is_undirected depending on graph

# Prepare edges
test_edges = test_data.edge_label_index.numpy().T
y_true = test_data.edge_label.numpy()

# Prepare embeddings
embedding_detached = embedding_matrix.detach()
embedding_np = embedding_detached.numpy()

In [7]:
roc_auc_score = test_embeddings.k_fold_cross_validation_link_prediction(embedding_np, test_edges, y_true, k=5)


In [8]:
formatted_score = "{:.4f}".format(roc_auc_score * 100).replace('.', ',')
print("ROC AUC Score:", formatted_score)

ROC AUC Score: 99,5078
