In [1]:
%config InlineBackend.figure_format = 'svg'

In [2]:
import os
import random
import time 
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from node2vec import Node2Vec
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from torch_geometric.data import Data
import spektral
from spektral.layers import GCNConv, GATConv
from spektral.layers import GraphSageConv
from spektral.data import Graph, Dataset, BatchLoader
from scipy.sparse import csr_matrix
from spektral.datasets import Cora
from torch_geometric.nn import DeepGraphInfomax, VGAE
from torch_geometric.utils import from_networkx
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh
from collections import Counter
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed
from torch_geometric.nn import GCNConv as PyG_GCNConv, VGAE as PyG_VGAE
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED = 46

# Set seed for Python's built-in random module
random.seed(SEED)

# Set seed for NumPy
np.random.seed(SEED)

# Set seed for TensorFlow
tf.random.set_seed(SEED)

# Set seed for PyTorch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [4]:
# Create a custom Dataset for the graph
class CoraDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):
        data = Cora()  # Load the dataset
        graph = data.graphs[0]  # Access the first graph in the dataset
        return [Graph(x=graph.x, a=graph.a, y=graph.y)]

In [5]:
embedding_dimensionality=150

## Extracting modularity embedding and using it for classification

In [6]:
# Laplacian Eigenmaps Embedding
def deepwalk_embedding(G, k=2, walk_length=10, num_walks=80, workers=4):
    node2vec = Node2Vec(G, dimensions=k, walk_length=walk_length, num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])

# Node2Vec Embedding
def node2vec_embedding(G, k=2, seed=SEED):
    node2vec = Node2Vec(G, dimensions=k, walk_length=10, num_walks=100, workers=2, seed=seed)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])


# VGAE Embedding 
class VGAEEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
        self.conv_mu = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for mu
        self.conv_logstd = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for logstd

    def forward(self, x, edge_index):
        x = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd

def vgae_embedding(data, k=128):
    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = PyG_VGAE(VGAEEncoder(in_channels, k))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        z = model.encode(x, data.edge_index)  # Use one-hot encoded features
        loss = model.recon_loss(z, data.edge_index) + (1 / data.num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()
    
    return model.encode(x, data.edge_index).detach().numpy()

# DGI Embedding
def dgi_embedding(data, k=128):
    class GCNEncoder(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
            self.conv2 = PyG_GCNConv(2 * out_channels, out_channels)  # Use PyG_GCNConv

        def forward(self, x, edge_index):
            x = torch.relu(self.conv1(x, edge_index))
            return self.conv2(x, edge_index)

    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = DeepGraphInfomax(
        hidden_channels=k,
        encoder=GCNEncoder(in_channels, k),
        summary=lambda z, *args, **kwargs: z.mean(dim=0),  # Ensure `summary` only takes `z`
        corruption=lambda x, edge_index: (x[torch.randperm(x.size(0))], edge_index)  # Correct corruption function
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        pos_z, neg_z, summary = model(x, data.edge_index)  # Use one-hot encoded features
        loss = model.loss(pos_z, neg_z, summary)
        loss.backward()
        optimizer.step()

    return pos_z.detach().numpy()


# Unsupervised gradient ascent for modularity maximization
def gradient_ascent_modularity_unsupervised(G, k=2, eta=0.01, iterations=1000, seed=SEED):
    np.random.seed(seed)  # Ensure deterministic initialization

    A = nx.to_numpy_array(G)
    l = A.sum(axis=1)
    m = np.sum(l) / 2
    B = A - np.outer(l, l) / (2 * m)
    n = B.shape[0]

    S = np.random.randn(n, k)  # Random Initialization
    S, _ = np.linalg.qr(S)  # Ensure initial orthonormality

    for i in tqdm(range(iterations), desc="Gradient Ascent Progress"):
        grad = (1 / (2 * m)) * B @ S
        S += eta * grad
        S, _ = np.linalg.qr(S)  # Orthonormalize using QR decomposition

    return S

In [7]:
def perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled=3):
    walks = {node: [] for node in G.nodes()}
    for node in G.nodes():
        for _ in range(num_walks):
            walk = [node]
            labeled_count = 0
            for _ in range(walk_length - 1):
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if not neighbors:
                    break
                labeled_neighbors = [n for n in neighbors if label_mask[n]]
                if labeled_neighbors and labeled_count < walk_length_labelled:
                    next_node = random.choice(labeled_neighbors)
                    labeled_count += 1
                else:
                    next_node = random.choice(neighbors)
                walk.append(next_node)
            walks[node].extend([n for n in walk if label_mask[n]])
    return walks

def compute_attention_weights(S, labeled_nodes):
    weights = {}
    for node, labeled in labeled_nodes.items():
        if labeled:
            similarities = {n: np.dot(S[node], S[n]) for n in labeled}
            exp_sims = {n: np.exp(sim) for n, sim in similarities.items()}
            total = sum(exp_sims.values())
            weights[node] = {n: exp_sims[n] / total for n in labeled}
    return weights

def semi_supervised_gradient_ascent_modularity(G, labels, label_mask, k=2, eta=0.01, lambda_supervised=1.0, 
                                                      lambda_semi=2.0, iterations=5000, initialization='random',
                                                      num_walks=10, walk_length=5, walk_length_labelled=3):
    # Convert graph to sparse adjacency matrix
    A = csr_matrix(nx.to_scipy_sparse_array(G, format='csr'))
    degrees = np.array(A.sum(axis=1)).flatten()
    m = G.number_of_edges()
    n = A.shape[0]

    # Initialize embeddings
    if initialization == 'random':
        S = np.random.randn(n, k)
    S, _ = np.linalg.qr(S)

    # Compute labeled random walks and attention weights
    labeled_walks = perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled)
    attention_weights = compute_attention_weights(S, labeled_walks)

    for _ in tqdm(range(iterations), desc="Gradient Ascent with Linear Modularity"):
        # Compute modularity gradient using linear approximation
        neighbor_agg = A @ S  # Efficient aggregation of neighbor embeddings
        global_correction = (degrees[:, None] / (2 * m)) * S.sum(axis=0)
        grad_modularity = (1 / (2 * m)) * (neighbor_agg - global_correction)

        # Compute supervised gradient
        grad_supervised = np.zeros_like(S)
        unique_labels = np.unique(labels[label_mask])
        for label in unique_labels:
            mask = (labels == label) & label_mask
            mean_embedding = np.mean(S[mask], axis=0, keepdims=True)
            grad_supervised[mask] = S[mask] - mean_embedding

        # Compute semi-supervised gradient using adaptive attention
        grad_semi_supervised = np.zeros_like(S)
        for i in range(n):
            if not label_mask[i] and i in attention_weights:
                weighted_embedding = sum(weight * S[n] for n, weight in attention_weights[i].items())
                grad_semi_supervised[i] = S[i] - weighted_embedding

        # Update embeddings
        grad_total = grad_modularity - lambda_supervised * grad_supervised - lambda_semi * grad_semi_supervised
        S += eta * grad_total
        S, _ = np.linalg.qr(S)

    return S

In [8]:
def convert_to_networkx(A):
    return nx.from_scipy_sparse_array(A)

In [9]:
dataset = CoraDataset()
ground_truth_labels = dataset[0].y
labels=np.argmax(ground_truth_labels,axis=1)

In [10]:
labels_to_be_masked=np.random.choice(np.arange(len(labels)),int(len(labels)*.7),replace=False)

In [11]:
masked_labels=[]
for i in np.arange(len(labels)):
    if i in labels_to_be_masked:
        masked_labels.append(-1)
    else:
        masked_labels.append(labels[i])
masked_labels=np.array(masked_labels)

In [12]:
label_mask = masked_labels != -1

In [13]:
X = dataset[0].x
A = dataset[0].a
G = convert_to_networkx(A)

In [14]:
print("Adjacency Matrix Shape:", A.shape)
print("Graph Nodes:", G.number_of_nodes())
print("Graph Edges:", G.number_of_edges())

Adjacency Matrix Shape: (2708, 2708)
Graph Nodes: 2708
Graph Edges: 5278


In [15]:
# Convert your preprocessed data into a PyTorch Geometric Data object
X_py = Data(
    x=torch.tensor(X, dtype=torch.float),  # Node features
    edge_index=torch.tensor(np.array(A.nonzero()), dtype=torch.long),  # Edge indices
    y=torch.tensor(labels, dtype=torch.long)  # Labels
)

# Ensure edge_index is in the correct shape (2, num_edges)
X_py.edge_index = X_py.edge_index.to(torch.long)

## Embeddings

In [16]:
# Dictionary for embeddings
embedding_dict = {}
execution_times = []  # List to store execution times

# Compute embeddings and store them with time tracking
def record_time(model_name, func, *args, **kwargs):
    print(f"Computing {model_name} embedding...")
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    execution_times.append((model_name, elapsed_time))
    print(f"{model_name} embedding computed in {elapsed_time:.2f} seconds.")
    return result

X_deepwalk = record_time("DeepWalk", deepwalk_embedding, G, k=embedding_dimensionality)
X_deepwalk = tf.convert_to_tensor(X_deepwalk, dtype=tf.float32)
embedding_dict['deepwalk'] = X_deepwalk

X_vgae = record_time("VGAE", vgae_embedding, X_py, k=embedding_dimensionality)
embedding_dict['vgae'] = X_vgae

X_dgi = record_time("DGI", dgi_embedding, X_py, k=embedding_dimensionality)
embedding_dict['dgi'] = X_dgi

X_modularity = record_time("Modularity", semi_supervised_gradient_ascent_modularity,
                           G, labels, label_mask, k=embedding_dimensionality,
                           eta=0.05, lambda_supervised=1.0, lambda_semi=2.0, iterations=200, initialization='random')
embedding_dict['modularity'] = X_modularity

X_node2vec = record_time("Node2Vec", node2vec_embedding, G, k=embedding_dimensionality)
X_node2vec = tf.convert_to_tensor(X_node2vec, dtype=tf.float32)
embedding_dict['node2vec'] = X_node2vec

# Generate random embedding
print("Generating Random embedding...")
start_time = time.time()
shape = (len(ground_truth_labels), embedding_dimensionality)
X_random = np.random.randn(*shape)
X_random = tf.convert_to_tensor(X_random, dtype=tf.float32)
end_time = time.time()
execution_times.append(("Random", end_time - start_time))
print(f"Random embedding generated in {end_time - start_time:.2f} seconds.")
embedding_dict['random'] = X_random

# Use original node features as 'given' embedding
embedding_dict['given'] = X

print("All embeddings computed and stored in the dictionary successfully.")

# Store execution times in a DataFrame and save
execution_df = pd.DataFrame(execution_times, columns=["Model", "Time (seconds)"])
execution_df.to_csv("./cora_analysis_results/embedding_execution_times_cora_"+str(SEED)+".csv", index=False)

print("\nExecution times saved to 'embedding_execution_times.csv'.")
print(execution_df)

Computing DeepWalk embedding...


Computing transition probabilities: 100%|██████████| 2708/2708 [00:00<00:00, 3713.55it/s]


DeepWalk embedding computed in 96.89 seconds.
Computing VGAE embedding...


100%|██████████| 200/200 [00:14<00:00, 14.16it/s]


VGAE embedding computed in 14.26 seconds.
Computing DGI embedding...


100%|██████████| 200/200 [00:19<00:00, 10.34it/s]


DGI embedding computed in 19.35 seconds.
Computing Modularity embedding...


Gradient Ascent with Linear Modularity: 100%|██████████| 200/200 [00:26<00:00,  7.66it/s]


Modularity embedding computed in 26.95 seconds.
Computing Node2Vec embedding...


Computing transition probabilities: 100%|██████████| 2708/2708 [00:01<00:00, 2494.11it/s]


Node2Vec embedding computed in 143.57 seconds.
Generating Random embedding...
Random embedding generated in 0.02 seconds.
All embeddings computed and stored in the dictionary successfully.

Execution times saved to 'embedding_execution_times.csv'.
        Model  Time (seconds)
0    DeepWalk       96.889719
1        VGAE       14.257295
2         DGI       19.349364
3  Modularity       26.949443
4    Node2Vec      143.572014
5      Random        0.015800


## Helper functions

In [17]:
def visualize_all_embeddings(all_embeddings, labels, label_mask):
    """
    Visualize all embeddings in a grid with 4 columns per row using UMAP.

    Parameters:
    - all_embeddings: Dictionary where keys are embedding methods, and values are embeddings.
    - labels: Labels (numpy array of shape [n_nodes]).
    - label_mask: Boolean array indicating known labels (True for known, False for unknown).
    """
    num_embeddings = len(all_embeddings)
    num_rows = (num_embeddings + 3) // 4  # Ensure enough rows for all embeddings
    fig, axes = plt.subplots(num_rows, 4, figsize=(8.27, 11.69))  # A4 size

    for i, (embedding_type, embedding) in tqdm(enumerate(all_embeddings.items()), 
                                               total=num_embeddings, desc="Visualizing embeddings"):
        row, col = divmod(i, 4)
        ax = axes[row, col] if num_rows > 1 else axes[col]  # Adjust for single-row case

        # Ensure embedding is a NumPy array
        if isinstance(embedding, tf.Tensor):
            embedding = embedding.numpy()

        # Reduce dimensionality using UMAP
        reducer = umap.UMAP(n_components=2)
        embedding_2d = reducer.fit_transform(embedding)

        # Known labels
        ax.scatter(embedding_2d[label_mask, 0], embedding_2d[label_mask, 1], 
                   c=labels[label_mask], cmap="Set1", s=3, alpha=0.7, label="Known Labels",
                   edgecolors='none')

        # Unknown labels
        ax.scatter(embedding_2d[~label_mask, 0], embedding_2d[~label_mask, 1], 
                   c=labels[~label_mask], cmap="Set1", s=5, alpha=0.7, 
                   label="Unknown Labels", edgecolors='black', linewidths=0.2)

        # Title with smaller font size
        ax.set_title(embedding_type.upper(), fontsize=8, pad=2)

        # Remove axis labels, ticks, and frames
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)

    # Remove empty subplots if num_embeddings is not a multiple of 4
    for j in range(i + 1, num_rows * 4):
        row, col = divmod(j, 4)
        fig.delaxes(axes[row, col])

    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.2, hspace=0.2)  # Adjust margins
    save_path = "./cora_analysis_results/embedding_grid_plot_cora.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Visualization saved to {save_path}")
    plt.show()

In [18]:
def evaluate_model(true_labels, predicted_labels):
    """
    Evaluate the model's performance using accuracy, F1-score, and confusion matrix.

    Args:
        true_labels (np.array): Ground truth labels (integers).
        predicted_labels (np.array): Predicted labels (integers).

    Returns:
        dict: A dictionary containing accuracy, F1-score, and confusion matrix.
    """
    # Compute accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    
    # Compute F1-score (macro-averaged)
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    
    # Compute confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    #
    print(cm)
    
    # Return results as a dictionary
    return {
        'accuracy': accuracy,
        'f1_score': f1
    }

## Classifiers

In [19]:
class GCN(tf.keras.Model):
    def __init__(self, n_labels, seed=42):  # Use an explicit seed
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)  # Define initializer
        
        self.conv1 = GCNConv(16, activation='relu', kernel_initializer=initializer)
        self.conv2 = GCNConv(n_labels, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [20]:
# Define the GAT model
class GAT(tf.keras.Model):
    def __init__(self, n_labels, num_heads=8, seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GATConv(16, attn_heads=num_heads, concat_heads=True, activation='elu', kernel_initializer=initializer)
        self.conv2 = GATConv(n_labels, attn_heads=1, concat_heads=False, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [21]:
# Define the GraphSAGE model
class GraphSAGE(tf.keras.Model):
    def __init__(self, n_labels, hidden_dim=16, aggregator='mean', seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GraphSageConv(hidden_dim, activation='relu', aggregator=aggregator, kernel_initializer=initializer)
        self.conv2 = GraphSageConv(n_labels, activation='softmax', aggregator=aggregator, kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [22]:
classifiers=['gcn','gat','graphsage']

## Classification using different node embeddings

In [23]:
def train_and_evaluate(embedding_dict, embedding, classifier, ground_truth_labels=ground_truth_labels, masked_labels=masked_labels):
    "the labels have to be one hot encoded"
    "model take values: gcn, gat, graphsage"
    print('embedding: ' + embedding.upper())
    print('model: ' + classifier.upper())

    X = embedding_dict[embedding]

    print("Processing...")
    # Create boolean mask for training
    train_mask = masked_labels != -1

    # Split the data into training and prediction sets
    X_train = X[train_mask]  # Training node features
    Y_train = ground_truth_labels[train_mask]  # Training labels (one-hot encoded)
    Y_train = tf.cast(Y_train, dtype='int32')
    
    # Reduce the adjacency matrix to only include training nodes
    A_train = A[train_mask, :][:, train_mask]  # Correctly reduce the adjacency matrix
    
    # Convert sparse adjacency matrix to COO format
    A_coo = A_train.tocoo()
    indices = np.column_stack((A_coo.row, A_coo.col))  # Corrected indices format
    values = A_coo.data
    shape = A_coo.shape  # Shape: (num_nodes, num_nodes)
    
    # Create a sparse tensor for the adjacency matrix
    A_train_tensor = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=shape)
    
    # Ensure the sparse tensor is ordered correctly
    A_train_tensor = tf.sparse.reorder(A_train_tensor)

    print("Training...")
    # Initialize the model
    if classifier == 'gcn':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GCN(n_labels)
    elif classifier == 'gat':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GAT(n_labels)
    elif classifier == 'graphsage':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GraphSAGE(n_labels)
    
    # Compile the model (not strictly necessary when using GradientTape, but useful for metrics)
    model.compile(
        optimizer=Adam(learning_rate=0.01),
        loss=CategoricalCrossentropy(),
        metrics=[CategoricalAccuracy()]
    )
    
    # Print shapes for debugging
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of A_train_tensor: {A_train_tensor.shape}")
    print(f"Shape of Y_train: {Y_train.shape}")
    
    # Define the optimizer and loss function
    optimizer = Adam(learning_rate=0.01)
    loss_fn = CategoricalCrossentropy()
    
    # Training loop with GradientTape
    epochs = 200
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            # Forward pass
            predictions, intermediate_embeddings = model([X_train, A_train_tensor])  # Unpack both outputs
                
            # Compute supervised loss (cross-entropy)
            supervised_loss = loss_fn(Y_train, predictions)
            
        # Compute gradients
        gradients = tape.gradient(supervised_loss, model.trainable_variables)
        
        # Update weights
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        # Print loss and accuracy for monitoring
        if epoch % 10 == 0:
            accuracy = CategoricalAccuracy()(Y_train, predictions)
            print(f"Epoch {epoch + 1}, Loss: {supervised_loss.numpy()}, Accuracy: {accuracy.numpy()}")

    print("Predicting...")
    # Prepare the full graph for prediction
    X_full = X  # Full node features
    A_full = A  # Full adjacency matrix
    
    # Convert the full adjacency matrix to COO format
    A_full_coo = A_full.tocoo()
    indices_full = np.column_stack((A_full_coo.row, A_full_coo.col))
    values_full = A_full_coo.data
    shape_full = A_full_coo.shape
    
    # Create a sparse tensor for the full adjacency matrix
    A_full_tensor = tf.sparse.SparseTensor(indices=indices_full, values=values_full, dense_shape=shape_full)
    A_full_tensor = tf.sparse.reorder(A_full_tensor)
    
    # Make predictions for all nodes
    predictions, emb = model([X_full, A_full_tensor])  # Shape: [num_nodes, n_labels]

    # Convert predictions to class labels (integers)
    predicted_labels = tf.argmax(predictions, axis=1).numpy()  # Shape: [num_nodes]
    
    # Extract predictions for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]

    # True labels for the masked nodes
    true_labels_masked = labels[labels_to_be_masked]
    
    # Predicted labels for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]
    
    # Evaluate the model's performance
    results = evaluate_model(true_labels_masked, predicted_labels_masked)
    
    # Print the results
    print(f"Accuracy: {results['accuracy'] * 100:.2f}%")
    print(f"F1-Score: {results['f1_score']:.4f}")

    results['model'] = classifier
    results['embedding'] = embedding

    # Return results and intermediate embeddings for visualization
    return results, emb

In [24]:
all_results=[]
graph_embeddings_dict={}
for emb in embedding_dict.keys():
    for clf in classifiers:
        results, embedding_matrix = train_and_evaluate(embedding_dict, emb, clf)
        all_results.append(results)
        key_string= emb + ' with ' + clf
        graph_embeddings_dict[key_string]=embedding_matrix

embedding: DEEPWALK
model: GCN
Processing...
Training...
Shape of X_train: (813, 150)
Shape of A_train_tensor: (813, 813)
Shape of Y_train: (813, 7)
Epoch 1, Loss: 2.9073357582092285, Accuracy: 0.10947109758853912
Epoch 11, Loss: 1.4460430145263672, Accuracy: 0.47601476311683655
Epoch 21, Loss: 1.2604765892028809, Accuracy: 0.570725679397583
Epoch 31, Loss: 1.1495040655136108, Accuracy: 0.6027060151100159
Epoch 41, Loss: 1.071852207183838, Accuracy: 0.6211562156677246
Epoch 51, Loss: 1.0147699117660522, Accuracy: 0.6359163522720337
Epoch 61, Loss: 0.9690921306610107, Accuracy: 0.6396064162254333
Epoch 71, Loss: 0.9310035705566406, Accuracy: 0.6531365513801575
Epoch 81, Loss: 0.900374710559845, Accuracy: 0.6629766225814819
Epoch 91, Loss: 0.8764164447784424, Accuracy: 0.6691266894340515
Epoch 101, Loss: 0.8621304631233215, Accuracy: 0.6728167533874512
Epoch 111, Loss: 0.8726982474327087, Accuracy: 0.6814268231391907
Epoch 121, Loss: 0.855615496635437, Accuracy: 0.6851168274879456
Epoch 



Epoch 1, Loss: 1.9486311674118042, Accuracy: 0.12546125054359436
Epoch 11, Loss: 0.5940623879432678, Accuracy: 0.8081181049346924
Epoch 21, Loss: 0.3623128831386566, Accuracy: 0.8733087182044983
Epoch 31, Loss: 0.24603459239006042, Accuracy: 0.9175891876220703
Epoch 41, Loss: 0.17582790553569794, Accuracy: 0.9409593939781189
Epoch 51, Loss: 0.1286785751581192, Accuracy: 0.9446494579315186
Epoch 61, Loss: 0.09469079971313477, Accuracy: 0.9618695974349976
Epoch 71, Loss: 0.07153619080781937, Accuracy: 0.9753997325897217
Epoch 81, Loss: 0.0563085600733757, Accuracy: 0.9803197979927063
Epoch 91, Loss: 0.04633421450853348, Accuracy: 0.9852398633956909
Epoch 101, Loss: 0.03937610611319542, Accuracy: 0.9864698648452759
Epoch 111, Loss: 0.034515440464019775, Accuracy: 0.9876998662948608
Epoch 121, Loss: 0.03108414262533188, Accuracy: 0.9864698648452759
Epoch 131, Loss: 0.028477655723690987, Accuracy: 0.9864698648452759
Epoch 141, Loss: 0.02582847699522972, Accuracy: 0.9864698648452759
Epoch 15



Epoch 11, Loss: 0.677680492401123, Accuracy: 0.7945879697799683
Epoch 21, Loss: 0.40751487016677856, Accuracy: 0.8720787167549133
Epoch 31, Loss: 0.25669512152671814, Accuracy: 0.9200491905212402
Epoch 41, Loss: 0.16978532075881958, Accuracy: 0.9495695233345032
Epoch 51, Loss: 0.11168598383665085, Accuracy: 0.9630996584892273
Epoch 61, Loss: 0.0767301619052887, Accuracy: 0.9766297936439514
Epoch 71, Loss: 0.05561944842338562, Accuracy: 0.9790897965431213
Epoch 81, Loss: 0.043246570974588394, Accuracy: 0.9852398633956909
Epoch 91, Loss: 0.035914480686187744, Accuracy: 0.9852398633956909
Epoch 101, Loss: 0.03120969608426094, Accuracy: 0.9864698648452759
Epoch 111, Loss: 0.028057154268026352, Accuracy: 0.9864698648452759
Epoch 121, Loss: 0.025902068242430687, Accuracy: 0.9864698648452759
Epoch 131, Loss: 0.02434992976486683, Accuracy: 0.9876998662948608
Epoch 141, Loss: 0.023176832124590874, Accuracy: 0.9876998662948608
Epoch 151, Loss: 0.022250959649682045, Accuracy: 0.9876998662948608
E



Epoch 11, Loss: 1.7997865676879883, Accuracy: 0.3050430417060852
Epoch 21, Loss: 1.6739847660064697, Accuracy: 0.37392374873161316
Epoch 31, Loss: 1.5588135719299316, Accuracy: 0.39975398778915405
Epoch 41, Loss: 1.4893485307693481, Accuracy: 0.42189422249794006
Epoch 51, Loss: 1.4100397825241089, Accuracy: 0.468634694814682
Epoch 61, Loss: 1.3269370794296265, Accuracy: 0.5153751373291016
Epoch 71, Loss: 1.2815041542053223, Accuracy: 0.5153751373291016
Epoch 81, Loss: 1.1912226676940918, Accuracy: 0.5842558145523071
Epoch 91, Loss: 1.1404781341552734, Accuracy: 0.6223862171173096
Epoch 101, Loss: 1.0913622379302979, Accuracy: 0.6260762810707092
Epoch 111, Loss: 1.041021466255188, Accuracy: 0.661746621131897
Epoch 121, Loss: 1.0127400159835815, Accuracy: 0.659286618232727
Epoch 131, Loss: 1.0210667848587036, Accuracy: 0.6420664191246033
Epoch 141, Loss: 0.9624770283699036, Accuracy: 0.6814268231391907
Epoch 151, Loss: 0.916848361492157, Accuracy: 0.6937269568443298
Epoch 161, Loss: 0.89



Epoch 1, Loss: 1.945801854133606, Accuracy: 0.14268141984939575
Epoch 11, Loss: 1.6649761199951172, Accuracy: 0.27798277139663696
Epoch 21, Loss: 1.1912025213241577, Accuracy: 0.8364083766937256
Epoch 31, Loss: 0.6224966645240784, Accuracy: 0.9212791919708252
Epoch 41, Loss: 0.2840885519981384, Accuracy: 0.9495695233345032
Epoch 51, Loss: 0.15674518048763275, Accuracy: 0.9569495916366577
Epoch 61, Loss: 0.1052955612540245, Accuracy: 0.9630996584892273
Epoch 71, Loss: 0.08005260676145554, Accuracy: 0.9729397296905518
Epoch 81, Loss: 0.0651724636554718, Accuracy: 0.9790897965431213
Epoch 91, Loss: 0.05567852035164833, Accuracy: 0.9790897965431213
Epoch 101, Loss: 0.04846764728426933, Accuracy: 0.9827798008918762
Epoch 111, Loss: 0.04246879369020462, Accuracy: 0.9864698648452759
Epoch 121, Loss: 0.03775561600923538, Accuracy: 0.9889298677444458
Epoch 131, Loss: 0.03434162586927414, Accuracy: 0.9889298677444458
Epoch 141, Loss: 0.03167802095413208, Accuracy: 0.9889298677444458
Epoch 151, L



Epoch 1, Loss: 1.9383519887924194, Accuracy: 0.18696187436580658
Epoch 11, Loss: 0.5836333632469177, Accuracy: 0.814268171787262
Epoch 21, Loss: 0.3503478169441223, Accuracy: 0.8806887865066528
Epoch 31, Loss: 0.24543626606464386, Accuracy: 0.9261992573738098
Epoch 41, Loss: 0.17799623310565948, Accuracy: 0.9335793256759644
Epoch 51, Loss: 0.13310186564922333, Accuracy: 0.9446494579315186
Epoch 61, Loss: 0.10144802927970886, Accuracy: 0.9606395959854126
Epoch 71, Loss: 0.07975242286920547, Accuracy: 0.9729397296905518
Epoch 81, Loss: 0.0651208832859993, Accuracy: 0.9741697311401367
Epoch 91, Loss: 0.054799504578113556, Accuracy: 0.9790897965431213
Epoch 101, Loss: 0.047336023300886154, Accuracy: 0.9827798008918762
Epoch 111, Loss: 0.041484761983156204, Accuracy: 0.9827798008918762
Epoch 121, Loss: 0.036711741238832474, Accuracy: 0.9815497994422913
Epoch 131, Loss: 0.03327847644686699, Accuracy: 0.9815497994422913
Epoch 141, Loss: 0.03076031059026718, Accuracy: 0.9827798008918762
Epoch 



Epoch 11, Loss: 0.7111485600471497, Accuracy: 0.7958179712295532
Epoch 21, Loss: 0.1617964655160904, Accuracy: 0.9630996584892273
Epoch 31, Loss: 0.05152703449130058, Accuracy: 0.9815497994422913
Epoch 41, Loss: 0.02994064800441265, Accuracy: 0.9852398633956909
Epoch 51, Loss: 0.023864224553108215, Accuracy: 0.9864698648452759
Epoch 61, Loss: 0.021367598325014114, Accuracy: 0.9876998662948608
Epoch 71, Loss: 0.020811202004551888, Accuracy: 0.9864698648452759
Epoch 81, Loss: 0.02046360820531845, Accuracy: 0.9864698648452759
Epoch 91, Loss: 0.019534962251782417, Accuracy: 0.9864698648452759
Epoch 101, Loss: 0.019257524982094765, Accuracy: 0.9876998662948608
Epoch 111, Loss: 0.01895175129175186, Accuracy: 0.9901599287986755
Epoch 121, Loss: 0.017561698332428932, Accuracy: 0.9889298677444458
Epoch 131, Loss: 0.01716657355427742, Accuracy: 0.9901599287986755
Epoch 141, Loss: 0.016352863982319832, Accuracy: 0.9901599287986755
Epoch 151, Loss: 0.014760083518922329, Accuracy: 0.990159928798675



Epoch 1, Loss: 1.9469044208526611, Accuracy: 0.11685116589069366
Epoch 11, Loss: 0.1977013200521469, Accuracy: 0.936039388179779
Epoch 21, Loss: 0.06659530103206635, Accuracy: 0.9729397296905518
Epoch 31, Loss: 0.04044291377067566, Accuracy: 0.9778597950935364
Epoch 41, Loss: 0.030810652300715446, Accuracy: 0.984009861946106
Epoch 51, Loss: 0.025495875626802444, Accuracy: 0.9852398633956909
Epoch 61, Loss: 0.022625740617513657, Accuracy: 0.9852398633956909
Epoch 71, Loss: 0.02160685509443283, Accuracy: 0.9864698648452759
Epoch 81, Loss: 0.02071201056241989, Accuracy: 0.9864698648452759
Epoch 91, Loss: 0.018755437806248665, Accuracy: 0.9876998662948608
Epoch 101, Loss: 0.018266931176185608, Accuracy: 0.9876998662948608
Epoch 111, Loss: 0.018040692433714867, Accuracy: 0.9876998662948608
Epoch 121, Loss: 0.017873261123895645, Accuracy: 0.9876998662948608
Epoch 131, Loss: 0.01776740700006485, Accuracy: 0.9852398633956909
Epoch 141, Loss: 0.017682936042547226, Accuracy: 0.9876998662948608
E

## Saving aggregate results

In [25]:
# Convert to DataFrame
df = pd.DataFrame(all_results)

# Define dataset name and seed
dataset_name = "cora"
seed_value = SEED

# Save as CSV file without sorting
filename = f"{dataset_name}_seed{seed_value}_results.csv"
filename='./cora_analysis_results/'+filename
df.to_csv(filename, index=False)

print(f"Results saved as {filename}")

Results saved as ./cora_analysis_results/cora_seed46_results.csv


In [26]:
all_embeddings= embedding_dict | graph_embeddings_dict

In [27]:
def reorder_dict(original_dict, key_order):
    """
    Reorders a dictionary based on a given list of keys.

    Parameters:
    - original_dict (dict): The dictionary to reorder.
    - key_order (list): The list specifying the desired key order.

    Returns:
    - dict: A new dictionary with keys ordered as per key_order.
    """
    return {key: original_dict[key] for key in key_order if key in original_dict}

In [28]:
key_order = ['random', 'random with gcn', 'random with gat', 'random with graphsage', 'deepwalk', 'deepwalk with gcn', 'deepwalk with gat', 'deepwalk with graphsage', 'node2vec','node2vec with gcn', 'node2vec with gat', 'node2vec with graphsage', 'vgae', 'vgae with gcn', 'vgae with gat', 'vgae with graphsage', 'dgi', 'dgi with gcn', 'dgi with gat', 'dgi with graphsage', 'modularity', 'modularity with gcn', 'modularity with gat', 'modularity with graphsage', 'given', 'given with gcn', 'given with gat', 'given with graphsage']

In [29]:
all_embeddings = reorder_dict(all_embeddings, key_order)

