In [1]:
%config InlineBackend.figure_format = 'svg'

In [2]:
import os
import random
import time 
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from node2vec import Node2Vec
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from torch_geometric.data import Data
import spektral
from spektral.layers import GCNConv, GATConv
from spektral.layers import GraphSageConv
from spektral.data import Graph, Dataset, BatchLoader
from scipy.sparse import csr_matrix, lil_matrix
from torch_geometric.datasets import WikiCS
from torch_geometric.nn import DeepGraphInfomax, VGAE
from torch_geometric.utils import from_networkx
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh
from collections import Counter
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed
from torch_geometric.nn import GCNConv as PyG_GCNConv, VGAE as PyG_VGAE
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED = 46

# Set seed for Python's built-in random module
random.seed(SEED)

# Set seed for NumPy
np.random.seed(SEED)

# Set seed for TensorFlow
tf.random.set_seed(SEED)

# Set seed for PyTorch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [4]:
# Create a custom Dataset
class WikiCSDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):
        dataset = WikiCS(root="./WikiCS")  # Load WikiCS dataset
        data = dataset[0]  # Access the first graph

        # Convert to NumPy
        x = data.x.numpy()
        edge_index = data.edge_index.numpy()
        y = data.y.numpy()

        # One-hot encode labels
        num_classes = y.max() + 1
        y_one_hot = np.eye(num_classes)[y]

        # Convert edge_index to sparse adjacency matrix
        num_nodes = x.shape[0]
        adj = lil_matrix((num_nodes, num_nodes), dtype=np.float32)
        for i in range(edge_index.shape[1]):
            src, dst = edge_index[:, i]
            adj[src, dst] = 1
            adj[dst, src] = 1  # Ensure undirected graph

        return [Graph(x=x, a=adj, y=y_one_hot)]

In [5]:
embedding_dimensionality=150

## Extracting modularity embedding and using it for classification

In [6]:
# Laplacian Eigenmaps Embedding
def deepwalk_embedding(G, k=2, walk_length=10, num_walks=80, workers=4):
    node2vec = Node2Vec(G, dimensions=k, walk_length=walk_length, num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])

# Node2Vec Embedding
def node2vec_embedding(G, k=2, seed=SEED):
    node2vec = Node2Vec(G, dimensions=k, walk_length=10, num_walks=100, workers=2, seed=seed)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])


# VGAE Embedding 
class VGAEEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
        self.conv_mu = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for mu
        self.conv_logstd = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for logstd

    def forward(self, x, edge_index):
        x = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd

def vgae_embedding(data, k=128):
    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = PyG_VGAE(VGAEEncoder(in_channels, k))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        z = model.encode(x, data.edge_index)  # Use one-hot encoded features
        loss = model.recon_loss(z, data.edge_index) + (1 / data.num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()
    
    return model.encode(x, data.edge_index).detach().numpy()

# DGI Embedding
def dgi_embedding(data, k=128):
    class GCNEncoder(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
            self.conv2 = PyG_GCNConv(2 * out_channels, out_channels)  # Use PyG_GCNConv

        def forward(self, x, edge_index):
            x = torch.relu(self.conv1(x, edge_index))
            return self.conv2(x, edge_index)

    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = DeepGraphInfomax(
        hidden_channels=k,
        encoder=GCNEncoder(in_channels, k),
        summary=lambda z, *args, **kwargs: z.mean(dim=0),  # Ensure `summary` only takes `z`
        corruption=lambda x, edge_index: (x[torch.randperm(x.size(0))], edge_index)  # Correct corruption function
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        pos_z, neg_z, summary = model(x, data.edge_index)  # Use one-hot encoded features
        loss = model.loss(pos_z, neg_z, summary)
        loss.backward()
        optimizer.step()

    return pos_z.detach().numpy()


# Unsupervised gradient ascent for modularity maximization
def gradient_ascent_modularity_unsupervised(G, k=2, eta=0.01, iterations=1000, seed=SEED):
    np.random.seed(seed)  # Ensure deterministic initialization

    A = nx.to_numpy_array(G)
    l = A.sum(axis=1)
    m = np.sum(l) / 2
    B = A - np.outer(l, l) / (2 * m)
    n = B.shape[0]

    S = np.random.randn(n, k)  # Random Initialization
    S, _ = np.linalg.qr(S)  # Ensure initial orthonormality

    for i in tqdm(range(iterations), desc="Gradient Ascent Progress"):
        grad = (1 / (2 * m)) * B @ S
        S += eta * grad
        S, _ = np.linalg.qr(S)  # Orthonormalize using QR decomposition

    return S

In [7]:
def perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled=3):
    walks = {node: [] for node in G.nodes()}
    for node in G.nodes():
        for _ in range(num_walks):
            walk = [node]
            labeled_count = 0
            for _ in range(walk_length - 1):
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if not neighbors:
                    break
                labeled_neighbors = [n for n in neighbors if label_mask[n]]
                if labeled_neighbors and labeled_count < walk_length_labelled:
                    next_node = random.choice(labeled_neighbors)
                    labeled_count += 1
                else:
                    next_node = random.choice(neighbors)
                walk.append(next_node)
            walks[node].extend([n for n in walk if label_mask[n]])
    return walks

def compute_attention_weights(S, labeled_nodes):
    weights = {}
    for node, labeled in labeled_nodes.items():
        if labeled:
            similarities = {n: np.dot(S[node], S[n]) for n in labeled}
            exp_sims = {n: np.exp(sim) for n, sim in similarities.items()}
            total = sum(exp_sims.values())
            weights[node] = {n: exp_sims[n] / total for n in labeled}
    return weights

def semi_supervised_gradient_ascent_modularity(G, labels, label_mask, k=2, eta=0.01, lambda_supervised=1.0, 
                                                      lambda_semi=2.0, iterations=5000, initialization='random',
                                                      num_walks=10, walk_length=5, walk_length_labelled=3):
    # Convert graph to sparse adjacency matrix
    A = csr_matrix(nx.to_scipy_sparse_array(G, format='csr'))
    degrees = np.array(A.sum(axis=1)).flatten()
    m = G.number_of_edges()
    n = A.shape[0]

    # Initialize embeddings
    if initialization == 'random':
        S = np.random.randn(n, k)
    S, _ = np.linalg.qr(S)

    # Compute labeled random walks and attention weights
    labeled_walks = perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled)
    attention_weights = compute_attention_weights(S, labeled_walks)

    for _ in tqdm(range(iterations), desc="Gradient Ascent with Linear Modularity"):
        # Compute modularity gradient using linear approximation
        neighbor_agg = A @ S  # Efficient aggregation of neighbor embeddings
        global_correction = (degrees[:, None] / (2 * m)) * S.sum(axis=0)
        grad_modularity = (1 / (2 * m)) * (neighbor_agg - global_correction)

        # Compute supervised gradient
        grad_supervised = np.zeros_like(S)
        unique_labels = np.unique(labels[label_mask])
        for label in unique_labels:
            mask = (labels == label) & label_mask
            mean_embedding = np.mean(S[mask], axis=0, keepdims=True)
            grad_supervised[mask] = S[mask] - mean_embedding

        # Compute semi-supervised gradient using adaptive attention
        grad_semi_supervised = np.zeros_like(S)
        for i in range(n):
            if not label_mask[i] and i in attention_weights:
                weighted_embedding = sum(weight * S[n] for n, weight in attention_weights[i].items())
                grad_semi_supervised[i] = S[i] - weighted_embedding

        # Update embeddings
        grad_total = grad_modularity - lambda_supervised * grad_supervised - lambda_semi * grad_semi_supervised
        S += eta * grad_total
        S, _ = np.linalg.qr(S)

    return S

In [8]:
def convert_to_networkx(A):
    return nx.from_scipy_sparse_array(A)

In [9]:
dataset = WikiCSDataset()
ground_truth_labels = dataset[0].y
labels=np.argmax(ground_truth_labels,axis=1)



In [10]:
labels_to_be_masked=np.random.choice(np.arange(len(labels)),int(len(labels)*.7),replace=False)

In [11]:
masked_labels=[]
for i in np.arange(len(labels)):
    if i in labels_to_be_masked:
        masked_labels.append(-1)
    else:
        masked_labels.append(labels[i])
masked_labels=np.array(masked_labels)

In [12]:
label_mask = masked_labels != -1

In [13]:
X = dataset[0].x
A = dataset[0].a
G = convert_to_networkx(A)

In [14]:
print("Adjacency Matrix Shape:", A.shape)
print("Graph Nodes:", G.number_of_nodes())
print("Graph Edges:", G.number_of_edges())

Adjacency Matrix Shape: (11701, 11701)
Graph Nodes: 11701
Graph Edges: 216123


In [15]:
# Convert your preprocessed data into a PyTorch Geometric Data object
X_py = Data(
    x=torch.tensor(X, dtype=torch.float),  # Node features
    edge_index=torch.tensor(np.array(A.nonzero()), dtype=torch.long),  # Edge indices
    y=torch.tensor(labels, dtype=torch.long)  # Labels
)

# Ensure edge_index is in the correct shape (2, num_edges)
X_py.edge_index = X_py.edge_index.to(torch.long)

## Embeddings

In [16]:
# Dictionary for embeddings
embedding_dict = {}
execution_times = []  # List to store execution times

# Compute embeddings and store them with time tracking
def record_time(model_name, func, *args, **kwargs):
    print(f"Computing {model_name} embedding...")
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    execution_times.append((model_name, elapsed_time))
    print(f"{model_name} embedding computed in {elapsed_time:.2f} seconds.")
    return result

X_deepwalk = record_time("DeepWalk", deepwalk_embedding, G, k=embedding_dimensionality)
X_deepwalk = tf.convert_to_tensor(X_deepwalk, dtype=tf.float32)
embedding_dict['deepwalk'] = X_deepwalk

X_vgae = record_time("VGAE", vgae_embedding, X_py, k=embedding_dimensionality)
embedding_dict['vgae'] = X_vgae

X_dgi = record_time("DGI", dgi_embedding, X_py, k=embedding_dimensionality)
embedding_dict['dgi'] = X_dgi

X_modularity = record_time("Modularity", semi_supervised_gradient_ascent_modularity,
                           G, labels, label_mask, k=embedding_dimensionality,
                           eta=0.05, lambda_supervised=1.0, lambda_semi=2.0, iterations=200, initialization='random')
embedding_dict['modularity'] = X_modularity

X_node2vec = record_time("Node2Vec", node2vec_embedding, G, k=embedding_dimensionality)
X_node2vec = tf.convert_to_tensor(X_node2vec, dtype=tf.float32)
embedding_dict['node2vec'] = X_node2vec

# Generate random embedding
print("Generating Random embedding...")
start_time = time.time()
shape = (len(ground_truth_labels), embedding_dimensionality)
X_random = np.random.randn(*shape)
X_random = tf.convert_to_tensor(X_random, dtype=tf.float32)
end_time = time.time()
execution_times.append(("Random", end_time - start_time))
print(f"Random embedding generated in {end_time - start_time:.2f} seconds.")
embedding_dict['random'] = X_random

# Use original node features as 'given' embedding
embedding_dict['given'] = X

print("All embeddings computed and stored in the dictionary successfully.")

# Store execution times in a DataFrame and save
execution_df = pd.DataFrame(execution_times, columns=["Model", "Time (seconds)"])
execution_df.to_csv("./wikics_analysis_results/embedding_execution_times_wikics_"+str(SEED)+".csv", index=False)

print("\nExecution times saved to 'embedding_execution_times.csv'.")
print(execution_df)

Computing DeepWalk embedding...


Computing transition probabilities: 100%|██████████| 11701/11701 [08:54<00:00, 21.91it/s]


DeepWalk embedding computed in 1400.99 seconds.
Computing VGAE embedding...


100%|██████████| 200/200 [10:54<00:00,  3.27s/it]


VGAE embedding computed in 655.49 seconds.
Computing DGI embedding...


100%|██████████| 200/200 [04:29<00:00,  1.35s/it]


DGI embedding computed in 269.58 seconds.
Computing Modularity embedding...


Gradient Ascent with Linear Modularity: 100%|██████████| 200/200 [01:17<00:00,  2.59it/s]


Modularity embedding computed in 83.03 seconds.
Computing Node2Vec embedding...


Computing transition probabilities: 100%|██████████| 11701/11701 [03:48<00:00, 51.28it/s]


Node2Vec embedding computed in 561.71 seconds.
Generating Random embedding...
Random embedding generated in 0.02 seconds.
All embeddings computed and stored in the dictionary successfully.

Execution times saved to 'embedding_execution_times.csv'.
        Model  Time (seconds)
0    DeepWalk     1400.991565
1        VGAE      655.488353
2         DGI      269.578326
3  Modularity       83.026353
4    Node2Vec      561.712193
5      Random        0.021474


## Helper functions

In [17]:
def visualize_all_embeddings(all_embeddings, labels, label_mask):
    """
    Visualize all embeddings in a grid with 4 columns per row using UMAP.

    Parameters:
    - all_embeddings: Dictionary where keys are embedding methods, and values are embeddings.
    - labels: Labels (numpy array of shape [n_nodes]).
    - label_mask: Boolean array indicating known labels (True for known, False for unknown).
    """
    num_embeddings = len(all_embeddings)
    num_rows = (num_embeddings + 3) // 4  # Ensure enough rows for all embeddings
    fig, axes = plt.subplots(num_rows, 4, figsize=(8.27, 11.69))  # A4 size

    for i, (embedding_type, embedding) in tqdm(enumerate(all_embeddings.items()), 
                                               total=num_embeddings, desc="Visualizing embeddings"):
        row, col = divmod(i, 4)
        ax = axes[row, col] if num_rows > 1 else axes[col]  # Adjust for single-row case

        # Ensure embedding is a NumPy array
        if isinstance(embedding, tf.Tensor):
            embedding = embedding.numpy()

        # Reduce dimensionality using UMAP
        reducer = umap.UMAP(n_components=2)
        embedding_2d = reducer.fit_transform(embedding)

        # Known labels
        ax.scatter(embedding_2d[label_mask, 0], embedding_2d[label_mask, 1], 
                   c=labels[label_mask], cmap="Set1", s=3, alpha=0.7, label="Known Labels",
                   edgecolors='none')

        # Unknown labels
        ax.scatter(embedding_2d[~label_mask, 0], embedding_2d[~label_mask, 1], 
                   c=labels[~label_mask], cmap="Set1", s=5, alpha=0.7, 
                   label="Unknown Labels", edgecolors='black', linewidths=0.2)

        # Title with smaller font size
        ax.set_title(embedding_type.upper(), fontsize=8, pad=2)

        # Remove axis labels, ticks, and frames
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)

    # Remove empty subplots if num_embeddings is not a multiple of 4
    for j in range(i + 1, num_rows * 4):
        row, col = divmod(j, 4)
        fig.delaxes(axes[row, col])

    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.2, hspace=0.2)  # Adjust margins
    save_path = "./wikics_analysis_results/embedding_grid_plot_wikics.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Visualization saved to {save_path}")
    plt.show()

In [18]:
def evaluate_model(true_labels, predicted_labels):
    """
    Evaluate the model's performance using accuracy, F1-score, and confusion matrix.

    Args:
        true_labels (np.array): Ground truth labels (integers).
        predicted_labels (np.array): Predicted labels (integers).

    Returns:
        dict: A dictionary containing accuracy, F1-score, and confusion matrix.
    """
    # Compute accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    
    # Compute F1-score (macro-averaged)
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    
    # Compute confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    #
    print(cm)
    
    # Return results as a dictionary
    return {
        'accuracy': accuracy,
        'f1_score': f1
    }

## Classifiers

In [19]:
class GCN(tf.keras.Model):
    def __init__(self, n_labels, seed=42):  # Use an explicit seed
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)  # Define initializer
        
        self.conv1 = GCNConv(16, activation='relu', kernel_initializer=initializer)
        self.conv2 = GCNConv(n_labels, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [20]:
# Define the GAT model
class GAT(tf.keras.Model):
    def __init__(self, n_labels, num_heads=8, seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GATConv(16, attn_heads=num_heads, concat_heads=True, activation='elu', kernel_initializer=initializer)
        self.conv2 = GATConv(n_labels, attn_heads=1, concat_heads=False, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [21]:
# Define the GraphSAGE model
class GraphSAGE(tf.keras.Model):
    def __init__(self, n_labels, hidden_dim=16, aggregator='mean', seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GraphSageConv(hidden_dim, activation='relu', aggregator=aggregator, kernel_initializer=initializer)
        self.conv2 = GraphSageConv(n_labels, activation='softmax', aggregator=aggregator, kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [22]:
classifiers=['gcn','gat','graphsage']

## Classification using different node embeddings

In [23]:
def train_and_evaluate(embedding_dict, embedding, classifier, ground_truth_labels=ground_truth_labels, masked_labels=masked_labels):
    "the labels have to be one hot encoded"
    "model take values: gcn, gat, graphsage"
    print('embedding: ' + embedding.upper())
    print('model: ' + classifier.upper())

    X = embedding_dict[embedding]

    print("Processing...")
    # Create boolean mask for training
    train_mask = masked_labels != -1

    # Split the data into training and prediction sets
    X_train = X[train_mask]  # Training node features
    Y_train = ground_truth_labels[train_mask]  # Training labels (one-hot encoded)
    Y_train = tf.cast(Y_train, dtype='int32')
    
    # Reduce the adjacency matrix to only include training nodes
    A_train = A[train_mask, :][:, train_mask]  # Correctly reduce the adjacency matrix
    
    # Convert sparse adjacency matrix to COO format
    A_coo = A_train.tocoo()
    indices = np.column_stack((A_coo.row, A_coo.col))  # Corrected indices format
    values = A_coo.data
    shape = A_coo.shape  # Shape: (num_nodes, num_nodes)
    
    # Create a sparse tensor for the adjacency matrix
    A_train_tensor = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=shape)
    
    # Ensure the sparse tensor is ordered correctly
    A_train_tensor = tf.sparse.reorder(A_train_tensor)

    print("Training...")
    # Initialize the model
    if classifier == 'gcn':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GCN(n_labels)
    elif classifier == 'gat':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GAT(n_labels)
    elif classifier == 'graphsage':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GraphSAGE(n_labels)
    
    # Compile the model (not strictly necessary when using GradientTape, but useful for metrics)
    model.compile(
        optimizer=Adam(learning_rate=0.01),
        loss=CategoricalCrossentropy(),
        metrics=[CategoricalAccuracy()]
    )
    
    # Print shapes for debugging
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of A_train_tensor: {A_train_tensor.shape}")
    print(f"Shape of Y_train: {Y_train.shape}")
    
    # Define the optimizer and loss function
    optimizer = Adam(learning_rate=0.01)
    loss_fn = CategoricalCrossentropy()
    
    # Training loop with GradientTape
    epochs = 200
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            # Forward pass
            predictions, intermediate_embeddings = model([X_train, A_train_tensor])  # Unpack both outputs
                
            # Compute supervised loss (cross-entropy)
            supervised_loss = loss_fn(Y_train, predictions)
            
        # Compute gradients
        gradients = tape.gradient(supervised_loss, model.trainable_variables)
        
        # Update weights
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        # Print loss and accuracy for monitoring
        if epoch % 10 == 0:
            accuracy = CategoricalAccuracy()(Y_train, predictions)
            print(f"Epoch {epoch + 1}, Loss: {supervised_loss.numpy()}, Accuracy: {accuracy.numpy()}")

    print("Predicting...")
    # Prepare the full graph for prediction
    X_full = X  # Full node features
    A_full = A  # Full adjacency matrix
    
    # Convert the full adjacency matrix to COO format
    A_full_coo = A_full.tocoo()
    indices_full = np.column_stack((A_full_coo.row, A_full_coo.col))
    values_full = A_full_coo.data
    shape_full = A_full_coo.shape
    
    # Create a sparse tensor for the full adjacency matrix
    A_full_tensor = tf.sparse.SparseTensor(indices=indices_full, values=values_full, dense_shape=shape_full)
    A_full_tensor = tf.sparse.reorder(A_full_tensor)
    
    # Make predictions for all nodes
    predictions, emb = model([X_full, A_full_tensor])  # Shape: [num_nodes, n_labels]

    # Convert predictions to class labels (integers)
    predicted_labels = tf.argmax(predictions, axis=1).numpy()  # Shape: [num_nodes]
    
    # Extract predictions for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]

    # True labels for the masked nodes
    true_labels_masked = labels[labels_to_be_masked]
    
    # Predicted labels for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]
    
    # Evaluate the model's performance
    results = evaluate_model(true_labels_masked, predicted_labels_masked)
    
    # Print the results
    print(f"Accuracy: {results['accuracy'] * 100:.2f}%")
    print(f"F1-Score: {results['f1_score']:.4f}")

    results['model'] = classifier
    results['embedding'] = embedding

    # Return results and intermediate embeddings for visualization
    return results, emb

In [24]:
all_results=[]
graph_embeddings_dict={}
for emb in embedding_dict.keys():
    for clf in classifiers:
        results, embedding_matrix = train_and_evaluate(embedding_dict, emb, clf)
        all_results.append(results)
        key_string= emb + ' with ' + clf
        graph_embeddings_dict[key_string]=embedding_matrix

embedding: DEEPWALK
model: GCN
Processing...
Training...
Shape of X_train: (3511, 150)
Shape of A_train_tensor: (3511, 3511)
Shape of Y_train: (3511, 10)
Epoch 1, Loss: 115.68543243408203, Accuracy: 0.03674167022109032
Epoch 11, Loss: 8.77559757232666, Accuracy: 0.44232413172721863
Epoch 21, Loss: 2.6286704540252686, Accuracy: 0.4474508762359619
Epoch 31, Loss: 1.9450163841247559, Accuracy: 0.46226146817207336
Epoch 41, Loss: 1.8319092988967896, Accuracy: 0.4389062821865082
Epoch 51, Loss: 1.7532421350479126, Accuracy: 0.44317859411239624
Epoch 61, Loss: 1.6972224712371826, Accuracy: 0.4571347236633301
Epoch 71, Loss: 1.6411986351013184, Accuracy: 0.46254628896713257
Epoch 81, Loss: 1.6094704866409302, Accuracy: 0.4693819284439087
Epoch 91, Loss: 1.5741814374923706, Accuracy: 0.47849616408348083
Epoch 101, Loss: 1.5570604801177979, Accuracy: 0.4776417016983032
Epoch 111, Loss: 1.5384483337402344, Accuracy: 0.486755907535553
Epoch 121, Loss: 1.5238734483718872, Accuracy: 0.4870407283306



Epoch 11, Loss: 0.9300974011421204, Accuracy: 0.7205924391746521
Epoch 21, Loss: 0.7698954939842224, Accuracy: 0.7678723931312561
Epoch 31, Loss: 0.64713454246521, Accuracy: 0.7994873523712158
Epoch 41, Loss: 0.583782970905304, Accuracy: 0.8174309134483337
Epoch 51, Loss: 0.5271232724189758, Accuracy: 0.8336656093597412
Epoch 61, Loss: 0.4785342216491699, Accuracy: 0.8450583815574646
Epoch 71, Loss: 0.43271467089653015, Accuracy: 0.8627171516418457
Epoch 81, Loss: 0.3860606551170349, Accuracy: 0.8709769248962402
Epoch 91, Loss: 0.33824217319488525, Accuracy: 0.8869268298149109
Epoch 101, Loss: 0.2933032214641571, Accuracy: 0.9034463167190552
Epoch 111, Loss: 0.2537045180797577, Accuracy: 0.9188265204429626
Epoch 121, Loss: 0.21726755797863007, Accuracy: 0.9342067837715149
Epoch 131, Loss: 0.19300468266010284, Accuracy: 0.9401879906654358
Epoch 141, Loss: 0.16577138006687164, Accuracy: 0.9501566290855408
Epoch 151, Loss: 0.1486230492591858, Accuracy: 0.953859269618988
Epoch 161, Loss: 0



Epoch 11, Loss: 1.035008430480957, Accuracy: 0.6915408968925476
Epoch 21, Loss: 0.8854212164878845, Accuracy: 0.7262887954711914
Epoch 31, Loss: 0.8051554560661316, Accuracy: 0.7533466219902039
Epoch 41, Loss: 0.7500538229942322, Accuracy: 0.7670179605484009
Epoch 51, Loss: 0.7001828551292419, Accuracy: 0.7761321663856506
Epoch 61, Loss: 0.6540089249610901, Accuracy: 0.7892338633537292
Epoch 71, Loss: 0.60956871509552, Accuracy: 0.8048989176750183
Epoch 81, Loss: 0.5632889270782471, Accuracy: 0.8214184045791626
Epoch 91, Loss: 0.51694655418396, Accuracy: 0.835659384727478
Epoch 101, Loss: 0.4693875014781952, Accuracy: 0.8550270795822144
Epoch 111, Loss: 0.42214736342430115, Accuracy: 0.8706921339035034
Epoch 121, Loss: 0.376056045293808, Accuracy: 0.8883509039878845
Epoch 131, Loss: 0.33207547664642334, Accuracy: 0.9040159583091736
Epoch 141, Loss: 0.28966471552848816, Accuracy: 0.9191113710403442
Epoch 151, Loss: 0.25420019030570984, Accuracy: 0.930219292640686
Epoch 161, Loss: 0.2170



Epoch 11, Loss: 1.7985327243804932, Accuracy: 0.46653375029563904
Epoch 21, Loss: 1.5947238206863403, Accuracy: 0.46112218499183655
Epoch 31, Loss: 1.423924207687378, Accuracy: 0.548561692237854
Epoch 41, Loss: 1.3035420179367065, Accuracy: 0.5952720046043396
Epoch 51, Loss: 1.214573860168457, Accuracy: 0.6183423399925232
Epoch 61, Loss: 1.145402431488037, Accuracy: 0.6493876576423645
Epoch 71, Loss: 1.080390214920044, Accuracy: 0.6693249940872192
Epoch 81, Loss: 1.0236294269561768, Accuracy: 0.6829962730407715
Epoch 91, Loss: 0.9798058867454529, Accuracy: 0.6926801204681396
Epoch 101, Loss: 0.9442342519760132, Accuracy: 0.7043577432632446
Epoch 111, Loss: 0.9176545143127441, Accuracy: 0.7077755331993103
Epoch 121, Loss: 0.8860204219818115, Accuracy: 0.7200227975845337
Epoch 131, Loss: 0.8623431324958801, Accuracy: 0.7251495122909546
Epoch 141, Loss: 0.851526141166687, Accuracy: 0.7308459281921387
Epoch 151, Loss: 0.8291932940483093, Accuracy: 0.7334092855453491
Epoch 161, Loss: 0.8155



Epoch 11, Loss: 2.061417579650879, Accuracy: 0.23639988899230957
Epoch 21, Loss: 1.9427027702331543, Accuracy: 0.23639988899230957
Epoch 31, Loss: 1.7228965759277344, Accuracy: 0.42010822892189026
Epoch 41, Loss: 1.3786962032318115, Accuracy: 0.7288521528244019
Epoch 51, Loss: 0.9974873661994934, Accuracy: 0.8205639123916626
Epoch 61, Loss: 0.6979177594184875, Accuracy: 0.8487610220909119
Epoch 71, Loss: 0.5119797587394714, Accuracy: 0.8601537942886353
Epoch 81, Loss: 0.42193901538848877, Accuracy: 0.8732554912567139
Epoch 91, Loss: 0.3775746822357178, Accuracy: 0.8769581317901611
Epoch 101, Loss: 0.3528193235397339, Accuracy: 0.8823696970939636
Epoch 111, Loss: 0.3342541754245758, Accuracy: 0.8889205455780029
Epoch 121, Loss: 0.3164854645729065, Accuracy: 0.8940472602844238
Epoch 131, Loss: 0.30071553587913513, Accuracy: 0.895756185054779
Epoch 141, Loss: 0.28697705268859863, Accuracy: 0.9023070335388184
Epoch 151, Loss: 0.276190847158432, Accuracy: 0.9071489572525024
Epoch 161, Loss:



Epoch 11, Loss: 0.9254922866821289, Accuracy: 0.7197379469871521
Epoch 21, Loss: 0.7597125172615051, Accuracy: 0.7678723931312561
Epoch 31, Loss: 0.6382719278335571, Accuracy: 0.8009114265441895
Epoch 41, Loss: 0.5707075595855713, Accuracy: 0.821133553981781
Epoch 51, Loss: 0.5164716839790344, Accuracy: 0.8393620252609253
Epoch 61, Loss: 0.4634150266647339, Accuracy: 0.8550270795822144
Epoch 71, Loss: 0.4101620316505432, Accuracy: 0.8715465664863586
Epoch 81, Loss: 0.35885077714920044, Accuracy: 0.8852179050445557
Epoch 91, Loss: 0.31028833985328674, Accuracy: 0.9017373919487
Epoch 101, Loss: 0.26513826847076416, Accuracy: 0.9122757315635681
Epoch 111, Loss: 0.22613245248794556, Accuracy: 0.9276559352874756
Epoch 121, Loss: 0.1904897689819336, Accuracy: 0.941896915435791
Epoch 131, Loss: 0.16311459243297577, Accuracy: 0.9504414796829224
Epoch 141, Loss: 0.14400714635849, Accuracy: 0.952720046043396
Epoch 151, Loss: 0.13312917947769165, Accuracy: 0.9549985527992249
Epoch 161, Loss: 0.13



Epoch 11, Loss: 1.3205773830413818, Accuracy: 0.609512984752655
Epoch 21, Loss: 0.8859952688217163, Accuracy: 0.7274280786514282
Epoch 31, Loss: 0.5803922414779663, Accuracy: 0.8182854056358337
Epoch 41, Loss: 0.35367128252983093, Accuracy: 0.9017373919487
Epoch 51, Loss: 0.2092457115650177, Accuracy: 0.9413272738456726
Epoch 61, Loss: 0.14493437111377716, Accuracy: 0.9598404765129089
Epoch 71, Loss: 0.09907355904579163, Accuracy: 0.9729421734809875
Epoch 81, Loss: 0.0740928202867508, Accuracy: 0.9800626635551453
Epoch 91, Loss: 0.059070561081171036, Accuracy: 0.9837653040885925
Epoch 101, Loss: 0.04915054515004158, Accuracy: 0.9857590198516846
Epoch 111, Loss: 0.04150977358222008, Accuracy: 0.9874679446220398
Epoch 121, Loss: 0.035311248153448105, Accuracy: 0.9897465109825134
Epoch 131, Loss: 0.030131828039884567, Accuracy: 0.9914554357528687
Epoch 141, Loss: 0.02622896246612072, Accuracy: 0.9920250773429871
Epoch 151, Loss: 0.06723947823047638, Accuracy: 0.976360023021698
Epoch 161, 



Epoch 11, Loss: 1.225242018699646, Accuracy: 0.595841646194458
Epoch 21, Loss: 0.9118409156799316, Accuracy: 0.7299914360046387
Epoch 31, Loss: 0.7632795572280884, Accuracy: 0.7544859051704407
Epoch 41, Loss: 0.6725379824638367, Accuracy: 0.7821133732795715
Epoch 51, Loss: 0.6093865633010864, Accuracy: 0.8037596344947815
Epoch 61, Loss: 0.5557217597961426, Accuracy: 0.8194246888160706
Epoch 71, Loss: 0.5083109736442566, Accuracy: 0.8362289667129517
Epoch 81, Loss: 0.4668281674385071, Accuracy: 0.8516092300415039
Epoch 91, Loss: 0.4320566654205322, Accuracy: 0.8641412854194641
Epoch 101, Loss: 0.39699122309684753, Accuracy: 0.8718313574790955
Epoch 111, Loss: 0.36419445276260376, Accuracy: 0.8849330544471741
Epoch 121, Loss: 0.3389788568019867, Accuracy: 0.8920535445213318
Epoch 131, Loss: 0.31090247631073, Accuracy: 0.9020222425460815
Epoch 141, Loss: 0.2741140127182007, Accuracy: 0.9142694473266602
Epoch 151, Loss: 0.25018757581710815, Accuracy: 0.9233836531639099
Epoch 161, Loss: 0.2

## Saving aggregate results

In [25]:
# Convert to DataFrame
df = pd.DataFrame(all_results)

# Define dataset name and seed
dataset_name = "wikics"
seed_value = SEED

# Save as CSV file without sorting
filename = f"{dataset_name}_seed{seed_value}_results.csv"
filename='./wikics_analysis_results/'+filename
df.to_csv(filename, index=False)

print(f"Results saved as {filename}")

Results saved as ./wikics_analysis_results/wikics_seed46_results.csv


In [26]:
all_embeddings= embedding_dict | graph_embeddings_dict

In [27]:
def reorder_dict(original_dict, key_order):
    """
    Reorders a dictionary based on a given list of keys.

    Parameters:
    - original_dict (dict): The dictionary to reorder.
    - key_order (list): The list specifying the desired key order.

    Returns:
    - dict: A new dictionary with keys ordered as per key_order.
    """
    return {key: original_dict[key] for key in key_order if key in original_dict}

In [28]:
key_order = ['random', 'random with gcn', 'random with gat', 'random with graphsage', 'deepwalk', 'deepwalk with gcn', 'deepwalk with gat', 'deepwalk with graphsage', 'node2vec','node2vec with gcn', 'node2vec with gat', 'node2vec with graphsage', 'vgae', 'vgae with gcn', 'vgae with gat', 'vgae with graphsage', 'dgi', 'dgi with gcn', 'dgi with gat', 'dgi with graphsage', 'modularity', 'modularity with gcn', 'modularity with gat', 'modularity with graphsage', 'given', 'given with gcn', 'given with gat', 'given with graphsage']

In [29]:
all_embeddings = reorder_dict(all_embeddings, key_order)