In [1]:
%config InlineBackend.figure_format = 'svg'

In [2]:
import os
import random
import time 
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from node2vec import Node2Vec
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from torch_geometric.data import Data
import spektral
from spektral.layers import GCNConv, GATConv
from spektral.layers import GraphSageConv
from spektral.data import Graph, Dataset, BatchLoader
from scipy.sparse import csr_matrix, lil_matrix
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import DeepGraphInfomax, VGAE
from torch_geometric.utils import from_networkx
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh
from collections import Counter
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed
from torch_geometric.nn import GCNConv as PyG_GCNConv, VGAE as PyG_VGAE
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED = 46

# Set seed for Python's built-in random module
random.seed(SEED)

# Set seed for NumPy
np.random.seed(SEED)

# Set seed for TensorFlow
tf.random.set_seed(SEED)

# Set seed for PyTorch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [4]:
# Create a custom Dataset for the graph
class PubMedDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):
        dataset = Planetoid(root=".", name="PubMed")  # Load CiteSeer dataset
        data = dataset[0]  # Access the first graph
        
        # Convert Torch tensors to NumPy
        x = data.x.numpy()
        edge_index = data.edge_index.numpy()
        y = data.y.numpy()

        # One-hot encode labels
        num_classes = y.max() + 1  # Number of classes
        y_one_hot = np.eye(num_classes)[y]  # One-hot encoding

        # Convert edge_index to a sparse adjacency matrix
        num_nodes = x.shape[0]
        adj = lil_matrix((num_nodes, num_nodes), dtype=np.float32)
        for i in range(edge_index.shape[1]):
            src, dst = edge_index[:, i]
            adj[src, dst] = 1
            adj[dst, src] = 1  # Ensure undirected graph

        return [Graph(x=x, a=adj, y=y_one_hot)]

In [5]:
embedding_dimensionality=150

## Extracting modularity embedding and using it for classification

In [6]:
# Laplacian Eigenmaps Embedding
def deepwalk_embedding(G, k=2, walk_length=10, num_walks=80, workers=4):
    node2vec = Node2Vec(G, dimensions=k, walk_length=walk_length, num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])

# Node2Vec Embedding
def node2vec_embedding(G, k=2, seed=SEED):
    node2vec = Node2Vec(G, dimensions=k, walk_length=10, num_walks=100, workers=2, seed=seed)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])


# VGAE Embedding 
class VGAEEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
        self.conv_mu = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for mu
        self.conv_logstd = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for logstd

    def forward(self, x, edge_index):
        x = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd

def vgae_embedding(data, k=128):
    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = PyG_VGAE(VGAEEncoder(in_channels, k))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        z = model.encode(x, data.edge_index)  # Use one-hot encoded features
        loss = model.recon_loss(z, data.edge_index) + (1 / data.num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()
    
    return model.encode(x, data.edge_index).detach().numpy()

# DGI Embedding
def dgi_embedding(data, k=128):
    class GCNEncoder(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
            self.conv2 = PyG_GCNConv(2 * out_channels, out_channels)  # Use PyG_GCNConv

        def forward(self, x, edge_index):
            x = torch.relu(self.conv1(x, edge_index))
            return self.conv2(x, edge_index)

    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = DeepGraphInfomax(
        hidden_channels=k,
        encoder=GCNEncoder(in_channels, k),
        summary=lambda z, *args, **kwargs: z.mean(dim=0),  # Ensure `summary` only takes `z`
        corruption=lambda x, edge_index: (x[torch.randperm(x.size(0))], edge_index)  # Correct corruption function
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        pos_z, neg_z, summary = model(x, data.edge_index)  # Use one-hot encoded features
        loss = model.loss(pos_z, neg_z, summary)
        loss.backward()
        optimizer.step()

    return pos_z.detach().numpy()


# Unsupervised gradient ascent for modularity maximization
def gradient_ascent_modularity_unsupervised(G, k=2, eta=0.01, iterations=1000, seed=SEED):
    np.random.seed(seed)  # Ensure deterministic initialization

    A = nx.to_numpy_array(G)
    l = A.sum(axis=1)
    m = np.sum(l) / 2
    B = A - np.outer(l, l) / (2 * m)
    n = B.shape[0]

    S = np.random.randn(n, k)  # Random Initialization
    S, _ = np.linalg.qr(S)  # Ensure initial orthonormality

    for i in tqdm(range(iterations), desc="Gradient Ascent Progress"):
        grad = (1 / (2 * m)) * B @ S
        S += eta * grad
        S, _ = np.linalg.qr(S)  # Orthonormalize using QR decomposition

    return S

In [7]:
def perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled=3):
    walks = {node: [] for node in G.nodes()}
    for node in G.nodes():
        for _ in range(num_walks):
            walk = [node]
            labeled_count = 0
            for _ in range(walk_length - 1):
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if not neighbors:
                    break
                labeled_neighbors = [n for n in neighbors if label_mask[n]]
                if labeled_neighbors and labeled_count < walk_length_labelled:
                    next_node = random.choice(labeled_neighbors)
                    labeled_count += 1
                else:
                    next_node = random.choice(neighbors)
                walk.append(next_node)
            walks[node].extend([n for n in walk if label_mask[n]])
    return walks

def compute_attention_weights(S, labeled_nodes):
    weights = {}
    for node, labeled in labeled_nodes.items():
        if labeled:
            similarities = {n: np.dot(S[node], S[n]) for n in labeled}
            exp_sims = {n: np.exp(sim) for n, sim in similarities.items()}
            total = sum(exp_sims.values())
            weights[node] = {n: exp_sims[n] / total for n in labeled}
    return weights

def semi_supervised_gradient_ascent_modularity(G, labels, label_mask, k=2, eta=0.01, lambda_supervised=1.0, 
                                                      lambda_semi=2.0, iterations=5000, initialization='random',
                                                      num_walks=10, walk_length=5, walk_length_labelled=3):
    # Convert graph to sparse adjacency matrix
    A = csr_matrix(nx.to_scipy_sparse_array(G, format='csr'))
    degrees = np.array(A.sum(axis=1)).flatten()
    m = G.number_of_edges()
    n = A.shape[0]

    # Initialize embeddings
    if initialization == 'random':
        S = np.random.randn(n, k)
    S, _ = np.linalg.qr(S)

    # Compute labeled random walks and attention weights
    labeled_walks = perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled)
    attention_weights = compute_attention_weights(S, labeled_walks)

    for _ in tqdm(range(iterations), desc="Gradient Ascent with Linear Modularity"):
        # Compute modularity gradient using linear approximation
        neighbor_agg = A @ S  # Efficient aggregation of neighbor embeddings
        global_correction = (degrees[:, None] / (2 * m)) * S.sum(axis=0)
        grad_modularity = (1 / (2 * m)) * (neighbor_agg - global_correction)

        # Compute supervised gradient
        grad_supervised = np.zeros_like(S)
        unique_labels = np.unique(labels[label_mask])
        for label in unique_labels:
            mask = (labels == label) & label_mask
            mean_embedding = np.mean(S[mask], axis=0, keepdims=True)
            grad_supervised[mask] = S[mask] - mean_embedding

        # Compute semi-supervised gradient using adaptive attention
        grad_semi_supervised = np.zeros_like(S)
        for i in range(n):
            if not label_mask[i] and i in attention_weights:
                weighted_embedding = sum(weight * S[n] for n, weight in attention_weights[i].items())
                grad_semi_supervised[i] = S[i] - weighted_embedding

        # Update embeddings
        grad_total = grad_modularity - lambda_supervised * grad_supervised - lambda_semi * grad_semi_supervised
        S += eta * grad_total
        S, _ = np.linalg.qr(S)

    return S

In [8]:
def convert_to_networkx(A):
    return nx.from_scipy_sparse_array(A)

In [9]:
dataset = PubMedDataset()
ground_truth_labels = dataset[0].y
labels=np.argmax(ground_truth_labels,axis=1)

In [10]:
np.random.seed(42)
labels_to_be_masked=np.random.choice(np.arange(len(labels)),int(len(labels)*.7),replace=False)

In [11]:
masked_labels=[]
for i in np.arange(len(labels)):
    if i in labels_to_be_masked:
        masked_labels.append(-1)
    else:
        masked_labels.append(labels[i])
masked_labels=np.array(masked_labels)

In [12]:
label_mask = masked_labels != -1

In [13]:
X = dataset[0].x
A = dataset[0].a
G = convert_to_networkx(A)

In [14]:
print("Adjacency Matrix Shape:", A.shape)
print("Graph Nodes:", G.number_of_nodes())
print("Graph Edges:", G.number_of_edges())

Adjacency Matrix Shape: (19717, 19717)
Graph Nodes: 19717
Graph Edges: 44324


In [15]:
# Convert your preprocessed data into a PyTorch Geometric Data object
X_py = Data(
    x=torch.tensor(X, dtype=torch.float),  # Node features
    edge_index=torch.tensor(np.array(A.nonzero()), dtype=torch.long),  # Edge indices
    y=torch.tensor(labels, dtype=torch.long)  # Labels
)

# Ensure edge_index is in the correct shape (2, num_edges)
X_py.edge_index = X_py.edge_index.to(torch.long)

## Embeddings

In [16]:
# Dictionary for embeddings
embedding_dict = {}
execution_times = []  # List to store execution times

# Compute embeddings and store them with time tracking
def record_time(model_name, func, *args, **kwargs):
    print(f"Computing {model_name} embedding...")
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    execution_times.append((model_name, elapsed_time))
    print(f"{model_name} embedding computed in {elapsed_time:.2f} seconds.")
    return result

X_deepwalk = record_time("DeepWalk", deepwalk_embedding, G, k=embedding_dimensionality)
X_deepwalk = tf.convert_to_tensor(X_deepwalk, dtype=tf.float32)
embedding_dict['deepwalk'] = X_deepwalk

X_vgae = record_time("VGAE", vgae_embedding, X_py, k=embedding_dimensionality)
embedding_dict['vgae'] = X_vgae

X_dgi = record_time("DGI", dgi_embedding, X_py, k=embedding_dimensionality)
embedding_dict['dgi'] = X_dgi

X_modularity = record_time("Modularity", semi_supervised_gradient_ascent_modularity,
                           G, labels, label_mask, k=embedding_dimensionality,
                           eta=0.05, lambda_supervised=1.0, lambda_semi=2.0, iterations=200, initialization='random')
embedding_dict['modularity'] = X_modularity

X_node2vec = record_time("Node2Vec", node2vec_embedding, G, k=embedding_dimensionality)
X_node2vec = tf.convert_to_tensor(X_node2vec, dtype=tf.float32)
embedding_dict['node2vec'] = X_node2vec

# Generate random embedding
print("Generating Random embedding...")
start_time = time.time()
shape = (len(ground_truth_labels), embedding_dimensionality)
X_random = np.random.randn(*shape)
X_random = tf.convert_to_tensor(X_random, dtype=tf.float32)
end_time = time.time()
execution_times.append(("Random", end_time - start_time))
print(f"Random embedding generated in {end_time - start_time:.2f} seconds.")
embedding_dict['random'] = X_random

# Use original node features as 'given' embedding
embedding_dict['given'] = X

print("All embeddings computed and stored in the dictionary successfully.")

# Store execution times in a DataFrame and save
execution_df = pd.DataFrame(execution_times, columns=["Model", "Time (seconds)"])
execution_df.to_csv("./pubmed_analysis_results/embedding_execution_times_pubmed_"+str(SEED)+".csv", index=False)

print("\nExecution times saved to 'embedding_execution_times.csv'.")
print(execution_df)

Computing DeepWalk embedding...


Computing transition probabilities: 100%|██████████| 19717/19717 [00:09<00:00, 2003.32it/s]


DeepWalk embedding computed in 903.31 seconds.
Computing VGAE embedding...


100%|██████████| 200/200 [05:22<00:00,  1.61s/it]


VGAE embedding computed in 323.60 seconds.
Computing DGI embedding...


100%|██████████| 200/200 [14:00<00:00,  4.20s/it]


DGI embedding computed in 840.77 seconds.
Computing Modularity embedding...


Gradient Ascent with Linear Modularity: 100%|██████████| 200/200 [02:50<00:00,  1.18it/s]


Modularity embedding computed in 176.37 seconds.
Computing Node2Vec embedding...


Computing transition probabilities: 100%|██████████| 19717/19717 [00:10<00:00, 1831.49it/s]


Node2Vec embedding computed in 518.67 seconds.
Generating Random embedding...
Random embedding generated in 0.04 seconds.
All embeddings computed and stored in the dictionary successfully.

Execution times saved to 'embedding_execution_times.csv'.
        Model  Time (seconds)
0    DeepWalk      903.312759
1        VGAE      323.600382
2         DGI      840.769592
3  Modularity      176.374603
4    Node2Vec      518.670925
5      Random        0.039656


## Helper functions

In [17]:
def visualize_all_embeddings(all_embeddings, labels, label_mask):
    """
    Visualize all embeddings in a grid with 4 columns per row using UMAP.

    Parameters:
    - all_embeddings: Dictionary where keys are embedding methods, and values are embeddings.
    - labels: Labels (numpy array of shape [n_nodes]).
    - label_mask: Boolean array indicating known labels (True for known, False for unknown).
    """
    num_embeddings = len(all_embeddings)
    num_rows = (num_embeddings + 3) // 4  # Ensure enough rows for all embeddings
    fig, axes = plt.subplots(num_rows, 4, figsize=(8.27, 11.69))  # A4 size

    for i, (embedding_type, embedding) in tqdm(enumerate(all_embeddings.items()), 
                                               total=num_embeddings, desc="Visualizing embeddings"):
        row, col = divmod(i, 4)
        ax = axes[row, col] if num_rows > 1 else axes[col]  # Adjust for single-row case

        # Ensure embedding is a NumPy array
        if isinstance(embedding, tf.Tensor):
            embedding = embedding.numpy()

        # Reduce dimensionality using UMAP
        reducer = umap.UMAP(n_components=2)
        embedding_2d = reducer.fit_transform(embedding)

        # Known labels
        ax.scatter(embedding_2d[label_mask, 0], embedding_2d[label_mask, 1], 
                   c=labels[label_mask], cmap="Set1", s=3, alpha=0.7, label="Known Labels",
                   edgecolors='none')

        # Unknown labels
        ax.scatter(embedding_2d[~label_mask, 0], embedding_2d[~label_mask, 1], 
                   c=labels[~label_mask], cmap="Set1", s=5, alpha=0.7, 
                   label="Unknown Labels", edgecolors='black', linewidths=0.2)

        # Title with smaller font size
        ax.set_title(embedding_type.upper(), fontsize=8, pad=2)

        # Remove axis labels, ticks, and frames
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)

    # Remove empty subplots if num_embeddings is not a multiple of 4
    for j in range(i + 1, num_rows * 4):
        row, col = divmod(j, 4)
        fig.delaxes(axes[row, col])

    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.2, hspace=0.2)  # Adjust margins
    save_path = "./pubmed_analysis_results/embedding_grid_plot_pubmed.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Visualization saved to {save_path}")
    plt.show()

In [18]:
def evaluate_model(true_labels, predicted_labels):
    """
    Evaluate the model's performance using accuracy, F1-score, and confusion matrix.

    Args:
        true_labels (np.array): Ground truth labels (integers).
        predicted_labels (np.array): Predicted labels (integers).

    Returns:
        dict: A dictionary containing accuracy, F1-score, and confusion matrix.
    """
    # Compute accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    
    # Compute F1-score (macro-averaged)
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    
    # Compute confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    #
    print(cm)
    
    # Return results as a dictionary
    return {
        'accuracy': accuracy,
        'f1_score': f1
    }

## Classifiers

In [19]:
class GCN(tf.keras.Model):
    def __init__(self, n_labels, seed=42):  # Use an explicit seed
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)  # Define initializer
        
        self.conv1 = GCNConv(16, activation='relu', kernel_initializer=initializer)
        self.conv2 = GCNConv(n_labels, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [20]:
# Define the GAT model
class GAT(tf.keras.Model):
    def __init__(self, n_labels, num_heads=8, seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GATConv(16, attn_heads=num_heads, concat_heads=True, activation='elu', kernel_initializer=initializer)
        self.conv2 = GATConv(n_labels, attn_heads=1, concat_heads=False, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [21]:
# Define the GraphSAGE model
class GraphSAGE(tf.keras.Model):
    def __init__(self, n_labels, hidden_dim=16, aggregator='mean', seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GraphSageConv(hidden_dim, activation='relu', aggregator=aggregator, kernel_initializer=initializer)
        self.conv2 = GraphSageConv(n_labels, activation='softmax', aggregator=aggregator, kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [22]:
classifiers=['gcn','gat','graphsage']

## Classification using different node embeddings

In [23]:
def train_and_evaluate(embedding_dict, embedding, classifier, ground_truth_labels=ground_truth_labels, masked_labels=masked_labels):
    "the labels have to be one hot encoded"
    "model take values: gcn, gat, graphsage"
    print('embedding: ' + embedding.upper())
    print('model: ' + classifier.upper())

    X = embedding_dict[embedding]

    print("Processing...")
    # Create boolean mask for training
    train_mask = masked_labels != -1

    # Split the data into training and prediction sets
    X_train = X[train_mask]  # Training node features
    Y_train = ground_truth_labels[train_mask]  # Training labels (one-hot encoded)
    Y_train = tf.cast(Y_train, dtype='int32')
    
    # Reduce the adjacency matrix to only include training nodes
    A_train = A[train_mask, :][:, train_mask]  # Correctly reduce the adjacency matrix
    
    # Convert sparse adjacency matrix to COO format
    A_coo = A_train.tocoo()
    indices = np.column_stack((A_coo.row, A_coo.col))  # Corrected indices format
    values = A_coo.data
    shape = A_coo.shape  # Shape: (num_nodes, num_nodes)
    
    # Create a sparse tensor for the adjacency matrix
    A_train_tensor = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=shape)
    
    # Ensure the sparse tensor is ordered correctly
    A_train_tensor = tf.sparse.reorder(A_train_tensor)

    print("Training...")
    # Initialize the model
    if classifier == 'gcn':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GCN(n_labels)
    elif classifier == 'gat':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GAT(n_labels)
    elif classifier == 'graphsage':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GraphSAGE(n_labels)
    
    # Compile the model (not strictly necessary when using GradientTape, but useful for metrics)
    model.compile(
        optimizer=Adam(learning_rate=0.01),
        loss=CategoricalCrossentropy(),
        metrics=[CategoricalAccuracy()]
    )
    
    # Print shapes for debugging
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of A_train_tensor: {A_train_tensor.shape}")
    print(f"Shape of Y_train: {Y_train.shape}")
    
    # Define the optimizer and loss function
    optimizer = Adam(learning_rate=0.01)
    loss_fn = CategoricalCrossentropy()
    
    # Training loop with GradientTape
    epochs = 200
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            # Forward pass
            predictions, intermediate_embeddings = model([X_train, A_train_tensor])  # Unpack both outputs
                
            # Compute supervised loss (cross-entropy)
            supervised_loss = loss_fn(Y_train, predictions)
            
        # Compute gradients
        gradients = tape.gradient(supervised_loss, model.trainable_variables)
        
        # Update weights
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        # Print loss and accuracy for monitoring
        if epoch % 10 == 0:
            accuracy = CategoricalAccuracy()(Y_train, predictions)
            print(f"Epoch {epoch + 1}, Loss: {supervised_loss.numpy()}, Accuracy: {accuracy.numpy()}")

    print("Predicting...")
    # Prepare the full graph for prediction
    X_full = X  # Full node features
    A_full = A  # Full adjacency matrix
    
    # Convert the full adjacency matrix to COO format
    A_full_coo = A_full.tocoo()
    indices_full = np.column_stack((A_full_coo.row, A_full_coo.col))
    values_full = A_full_coo.data
    shape_full = A_full_coo.shape
    
    # Create a sparse tensor for the full adjacency matrix
    A_full_tensor = tf.sparse.SparseTensor(indices=indices_full, values=values_full, dense_shape=shape_full)
    A_full_tensor = tf.sparse.reorder(A_full_tensor)
    
    # Make predictions for all nodes
    predictions, emb = model([X_full, A_full_tensor])  # Shape: [num_nodes, n_labels]

    # Convert predictions to class labels (integers)
    predicted_labels = tf.argmax(predictions, axis=1).numpy()  # Shape: [num_nodes]
    
    # Extract predictions for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]

    # True labels for the masked nodes
    true_labels_masked = labels[labels_to_be_masked]
    
    # Predicted labels for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]
    
    # Evaluate the model's performance
    results = evaluate_model(true_labels_masked, predicted_labels_masked)
    
    # Print the results
    print(f"Accuracy: {results['accuracy'] * 100:.2f}%")
    print(f"F1-Score: {results['f1_score']:.4f}")

    results['model'] = classifier
    results['embedding'] = embedding

    # Return results and intermediate embeddings for visualization
    return results, emb

In [24]:
all_results=[]
graph_embeddings_dict={}
for emb in embedding_dict.keys():
    for clf in classifiers:
        results, embedding_matrix = train_and_evaluate(embedding_dict, emb, clf)
        all_results.append(results)
        key_string= emb + ' with ' + clf
        graph_embeddings_dict[key_string]=embedding_matrix

embedding: DEEPWALK
model: GCN
Processing...
Training...
Shape of X_train: (5916, 150)
Shape of A_train_tensor: (5916, 5916)
Shape of Y_train: (5916, 3)
Epoch 1, Loss: 3.3151021003723145, Accuracy: 0.3049357533454895
Epoch 11, Loss: 0.8751522898674011, Accuracy: 0.5848546028137207
Epoch 21, Loss: 0.823583722114563, Accuracy: 0.5975320935249329
Epoch 31, Loss: 0.7882698178291321, Accuracy: 0.6071670055389404
Epoch 41, Loss: 0.7688202857971191, Accuracy: 0.6154496073722839
Epoch 51, Loss: 0.7550916075706482, Accuracy: 0.6210277080535889
Epoch 61, Loss: 0.7439541816711426, Accuracy: 0.6249154806137085
Epoch 71, Loss: 0.7352607846260071, Accuracy: 0.6279580593109131
Epoch 81, Loss: 0.7280264496803284, Accuracy: 0.6294793486595154
Epoch 91, Loss: 0.7214968800544739, Accuracy: 0.6320148706436157
Epoch 101, Loss: 0.7155686020851135, Accuracy: 0.6328600645065308
Epoch 111, Loss: 0.7099736332893372, Accuracy: 0.6364097595214844
Epoch 121, Loss: 0.7045320868492126, Accuracy: 0.639621376991272
Ep



Epoch 11, Loss: 0.5341686010360718, Accuracy: 0.8000338077545166
Epoch 21, Loss: 0.4844798147678375, Accuracy: 0.8167681097984314
Epoch 31, Loss: 0.4637574255466461, Accuracy: 0.8236984610557556
Epoch 41, Loss: 0.45135146379470825, Accuracy: 0.8313049077987671
Epoch 51, Loss: 0.44182106852531433, Accuracy: 0.8331642746925354
Epoch 61, Loss: 0.43097200989723206, Accuracy: 0.8351926803588867
Epoch 71, Loss: 0.41623198986053467, Accuracy: 0.8407707810401917
Epoch 81, Loss: 0.39593803882598877, Accuracy: 0.8507437705993652
Epoch 91, Loss: 0.3702104389667511, Accuracy: 0.8578431606292725
Epoch 101, Loss: 0.34406763315200806, Accuracy: 0.8693373799324036
Epoch 111, Loss: 0.31408828496932983, Accuracy: 0.8823529481887817
Epoch 121, Loss: 0.2826573848724365, Accuracy: 0.8941852450370789
Epoch 131, Loss: 0.25692445039749146, Accuracy: 0.9009465575218201
Epoch 141, Loss: 0.22867651283740997, Accuracy: 0.913116991519928
Epoch 151, Loss: 0.20672854781150818, Accuracy: 0.9215686321258545
Epoch 161,



Epoch 11, Loss: 0.6716559529304504, Accuracy: 0.7307302355766296
Epoch 21, Loss: 0.6239644289016724, Accuracy: 0.7447599768638611
Epoch 31, Loss: 0.6083436608314514, Accuracy: 0.7511832118034363
Epoch 41, Loss: 0.6010280251502991, Accuracy: 0.7591277956962585
Epoch 51, Loss: 0.5950042605400085, Accuracy: 0.7592968344688416
Epoch 61, Loss: 0.5881367325782776, Accuracy: 0.7630155682563782
Epoch 71, Loss: 0.5781629681587219, Accuracy: 0.7660581469535828
Epoch 81, Loss: 0.5617530345916748, Accuracy: 0.773833692073822
Epoch 91, Loss: 0.5340700149536133, Accuracy: 0.7876943945884705
Epoch 101, Loss: 0.49340301752090454, Accuracy: 0.8040906190872192
Epoch 111, Loss: 0.4442346394062042, Accuracy: 0.8269100785255432
Epoch 121, Loss: 0.38881343603134155, Accuracy: 0.8551385998725891
Epoch 131, Loss: 0.33058294653892517, Accuracy: 0.8828600645065308
Epoch 141, Loss: 0.2791329324245453, Accuracy: 0.9016227126121521
Epoch 151, Loss: 0.23104162514209747, Accuracy: 0.9213995933532715
Epoch 161, Loss:



Epoch 11, Loss: 1.0628708600997925, Accuracy: 0.4399932324886322
Epoch 21, Loss: 1.0334193706512451, Accuracy: 0.4682217836380005
Epoch 31, Loss: 1.0010277032852173, Accuracy: 0.4881676733493805
Epoch 41, Loss: 0.9745643138885498, Accuracy: 0.5143678188323975
Epoch 51, Loss: 0.956035315990448, Accuracy: 0.5204530358314514
Epoch 61, Loss: 0.944864809513092, Accuracy: 0.5189316868782043
Epoch 71, Loss: 0.9334852695465088, Accuracy: 0.5273833870887756
Epoch 81, Loss: 0.9134989380836487, Accuracy: 0.5425963401794434
Epoch 91, Loss: 0.9107478857040405, Accuracy: 0.5518931746482849
Epoch 101, Loss: 0.9348294734954834, Accuracy: 0.5214672088623047
Epoch 111, Loss: 0.8863257765769958, Accuracy: 0.5706558227539062
Epoch 121, Loss: 0.8772276639938354, Accuracy: 0.5791075229644775
Epoch 131, Loss: 0.8700950145721436, Accuracy: 0.5823191404342651
Epoch 141, Loss: 0.861958384513855, Accuracy: 0.5941514372825623
Epoch 151, Loss: 0.963180661201477, Accuracy: 0.5378634333610535
Epoch 161, Loss: 0.8642



Epoch 11, Loss: 1.0310654640197754, Accuracy: 0.6151115894317627
Epoch 21, Loss: 0.9537420272827148, Accuracy: 0.6228870749473572
Epoch 31, Loss: 0.8258379101753235, Accuracy: 0.6925287246704102
Epoch 41, Loss: 0.6687868237495422, Accuracy: 0.7841446995735168
Epoch 51, Loss: 0.4916967451572418, Accuracy: 0.8973968625068665
Epoch 61, Loss: 0.3335939943790436, Accuracy: 0.9403312802314758
Epoch 71, Loss: 0.2280096560716629, Accuracy: 0.9479377865791321
Epoch 81, Loss: 0.167161226272583, Accuracy: 0.953008770942688
Epoch 91, Loss: 0.13209505379199982, Accuracy: 0.9580797553062439
Epoch 101, Loss: 0.11054950207471848, Accuracy: 0.9643340110778809
Epoch 111, Loss: 0.095212921500206, Accuracy: 0.9678837060928345
Epoch 121, Loss: 0.0831550881266594, Accuracy: 0.9722785949707031
Epoch 131, Loss: 0.07352902740240097, Accuracy: 0.9743069410324097
Epoch 141, Loss: 0.06485848873853683, Accuracy: 0.9785327911376953
Epoch 151, Loss: 0.05769648775458336, Accuracy: 0.9822515249252319
Epoch 161, Loss: 



Epoch 11, Loss: 0.5407021641731262, Accuracy: 0.8044286966323853
Epoch 21, Loss: 0.4912573993206024, Accuracy: 0.8162609934806824
Epoch 31, Loss: 0.4692041575908661, Accuracy: 0.8240365386009216
Epoch 41, Loss: 0.45643848180770874, Accuracy: 0.8297836184501648
Epoch 51, Loss: 0.44697892665863037, Accuracy: 0.8348546028137207
Epoch 61, Loss: 0.43702900409698486, Accuracy: 0.8346856236457825
Epoch 71, Loss: 0.4240882396697998, Accuracy: 0.8390804529190063
Epoch 81, Loss: 0.40615615248680115, Accuracy: 0.8453347086906433
Epoch 91, Loss: 0.38497617840766907, Accuracy: 0.8534482717514038
Epoch 101, Loss: 0.35809338092803955, Accuracy: 0.8649425506591797
Epoch 111, Loss: 0.33017173409461975, Accuracy: 0.8767748475074768
Epoch 121, Loss: 0.3080112338066101, Accuracy: 0.8798174262046814
Epoch 131, Loss: 0.27652883529663086, Accuracy: 0.8931710720062256
Epoch 141, Loss: 0.24603036046028137, Accuracy: 0.9022988677024841
Epoch 151, Loss: 0.2204398512840271, Accuracy: 0.9154834151268005
Epoch 161,



Epoch 11, Loss: 0.9449153542518616, Accuracy: 0.5534144639968872
Epoch 21, Loss: 0.6423506736755371, Accuracy: 0.7466193437576294
Epoch 31, Loss: 0.3132026791572571, Accuracy: 0.9083840250968933
Epoch 41, Loss: 0.10672739893198013, Accuracy: 0.9743069410324097
Epoch 51, Loss: 0.04531536251306534, Accuracy: 0.9879986643791199
Epoch 61, Loss: 0.026792582124471664, Accuracy: 0.9910412430763245
Epoch 71, Loss: 0.01942538283765316, Accuracy: 0.9923934936523438
Epoch 81, Loss: 0.015606238506734371, Accuracy: 0.9937458038330078
Epoch 91, Loss: 0.012461155652999878, Accuracy: 0.9945909380912781
Epoch 101, Loss: 0.010539916343986988, Accuracy: 0.9954361319541931
Epoch 111, Loss: 0.00927830208092928, Accuracy: 0.9957741498947144
Epoch 121, Loss: 0.008350533433258533, Accuracy: 0.9961122274398804
Epoch 131, Loss: 0.007651806343346834, Accuracy: 0.9962812662124634
Epoch 141, Loss: 0.00716338912025094, Accuracy: 0.9964503049850464
Epoch 151, Loss: 0.007279972080141306, Accuracy: 0.9961122274398804




Epoch 11, Loss: 0.6330666542053223, Accuracy: 0.7883705496788025
Epoch 21, Loss: 0.42645367980003357, Accuracy: 0.8291075229644775
Epoch 31, Loss: 0.36713677644729614, Accuracy: 0.8585192561149597
Epoch 41, Loss: 0.3270653784275055, Accuracy: 0.8759296536445618
Epoch 51, Loss: 0.3024786114692688, Accuracy: 0.8853955268859863
Epoch 61, Loss: 0.28416725993156433, Accuracy: 0.8935091495513916
Epoch 71, Loss: 0.2687815725803375, Accuracy: 0.8989182114601135
Epoch 81, Loss: 0.2552790939807892, Accuracy: 0.907031774520874
Epoch 91, Loss: 0.24247705936431885, Accuracy: 0.9099053144454956
Epoch 101, Loss: 0.2292059361934662, Accuracy: 0.9161595702171326
Epoch 111, Loss: 0.215067520737648, Accuracy: 0.9224137663841248
Epoch 121, Loss: 0.1998629868030548, Accuracy: 0.931034505367279
Epoch 131, Loss: 0.18419408798217773, Accuracy: 0.9374577403068542
Epoch 141, Loss: 0.16842998564243317, Accuracy: 0.9425287246704102
Epoch 151, Loss: 0.1528058499097824, Accuracy: 0.9492900371551514
Epoch 161, Loss:

## Saving aggregate results

In [25]:
# Convert to DataFrame
df = pd.DataFrame(all_results)

# Define dataset name and seed
dataset_name = "pubmed"
seed_value = SEED

# Save as CSV file without sorting
filename = f"{dataset_name}_seed{seed_value}_results.csv"
filename='./pubmed_analysis_results/'+filename
df.to_csv(filename, index=False)

print(f"Results saved as {filename}")

Results saved as ./pubmed_analysis_results/pubmed_seed46_results.csv


In [26]:
all_embeddings= embedding_dict | graph_embeddings_dict

In [27]:
def reorder_dict(original_dict, key_order):
    """
    Reorders a dictionary based on a given list of keys.

    Parameters:
    - original_dict (dict): The dictionary to reorder.
    - key_order (list): The list specifying the desired key order.

    Returns:
    - dict: A new dictionary with keys ordered as per key_order.
    """
    return {key: original_dict[key] for key in key_order if key in original_dict}

In [28]:
key_order = ['random', 'random with gcn', 'random with gat', 'random with graphsage', 'deepwalk', 'deepwalk with gcn', 'deepwalk with gat', 'deepwalk with graphsage', 'node2vec','node2vec with gcn', 'node2vec with gat', 'node2vec with graphsage', 'vgae', 'vgae with gcn', 'vgae with gat', 'vgae with graphsage', 'dgi', 'dgi with gcn', 'dgi with gat', 'dgi with graphsage', 'modularity', 'modularity with gcn', 'modularity with gat', 'modularity with graphsage', 'given', 'given with gcn', 'given with gat', 'given with graphsage']

In [29]:
all_embeddings = reorder_dict(all_embeddings, key_order)