In [1]:
%config InlineBackend.figure_format = 'svg'

In [2]:
import os
import random
import time 
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import umap.umap_ as umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
from node2vec import Node2Vec
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
import torch
from torch_geometric.data import Data
import spektral
from spektral.layers import GCNConv, GATConv
from spektral.layers import GraphSageConv
from spektral.data import Graph, Dataset, BatchLoader
from scipy.sparse import csr_matrix, lil_matrix
from torch_geometric.datasets import Amazon
from torch_geometric.nn import DeepGraphInfomax, VGAE
from torch_geometric.utils import from_networkx
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh
from collections import Counter
from sklearn.preprocessing import normalize
from joblib import Parallel, delayed
from torch_geometric.nn import GCNConv as PyG_GCNConv, VGAE as PyG_VGAE
from torch_geometric.data import Data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
SEED = 46

# Set seed for Python's built-in random module
random.seed(SEED)

# Set seed for NumPy
np.random.seed(SEED)

# Set seed for TensorFlow
tf.random.set_seed(SEED)

# Set seed for PyTorch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)

In [4]:
# Create a custom Dataset for DBLP
class AmazonPhotosDataset(Dataset):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def read(self):
        dataset = Amazon(".", name="photo")  # Load Amazon Computers dataset
        data = dataset[0]

        x = data.x.numpy()
        edge_index = data.edge_index.numpy()
        y = data.y.numpy()

        # One-hot encode labels
        num_classes = y.max() + 1
        y_one_hot = np.eye(num_classes)[y]

        # Convert edge_index to adjacency matrix
        num_nodes = x.shape[0]
        adj = lil_matrix((num_nodes, num_nodes), dtype=np.float32)
        for i in range(edge_index.shape[1]):
            src, dst = edge_index[:, i]
            adj[src, dst] = 1
            adj[dst, src] = 1  

        return [Graph(x=x, a=adj, y=y_one_hot)]


In [5]:
embedding_dimensionality=150

## Extracting modularity embedding and using it for classification

In [6]:
# Laplacian Eigenmaps Embedding
def deepwalk_embedding(G, k=2, walk_length=10, num_walks=80, workers=4):
    node2vec = Node2Vec(G, dimensions=k, walk_length=walk_length, num_walks=num_walks, workers=workers)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])

# Node2Vec Embedding
def node2vec_embedding(G, k=2, seed=SEED):
    node2vec = Node2Vec(G, dimensions=k, walk_length=10, num_walks=100, workers=2, seed=seed)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return np.array([model.wv[str(node)] for node in G.nodes()])


# VGAE Embedding 
class VGAEEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
        self.conv_mu = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for mu
        self.conv_logstd = PyG_GCNConv(2 * out_channels, out_channels)  # Separate layer for logstd

    def forward(self, x, edge_index):
        x = torch.relu(self.conv1(x, edge_index))
        mu = self.conv_mu(x, edge_index)
        logstd = self.conv_logstd(x, edge_index)
        return mu, logstd

def vgae_embedding(data, k=128):
    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = PyG_VGAE(VGAEEncoder(in_channels, k))
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    
    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        z = model.encode(x, data.edge_index)  # Use one-hot encoded features
        loss = model.recon_loss(z, data.edge_index) + (1 / data.num_nodes) * model.kl_loss()
        loss.backward()
        optimizer.step()
    
    return model.encode(x, data.edge_index).detach().numpy()

# DGI Embedding
def dgi_embedding(data, k=128):
    class GCNEncoder(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super().__init__()
            self.conv1 = PyG_GCNConv(in_channels, 2 * out_channels)  # Use PyG_GCNConv
            self.conv2 = PyG_GCNConv(2 * out_channels, out_channels)  # Use PyG_GCNConv

        def forward(self, x, edge_index):
            x = torch.relu(self.conv1(x, edge_index))
            return self.conv2(x, edge_index)

    # Use one-hot encoded node IDs as features
    num_nodes = data.num_nodes
    x = torch.eye(num_nodes)  # One-hot encoded node features

    in_channels = x.shape[1]  # Feature dimension is equal to the number of nodes
    model = DeepGraphInfomax(
        hidden_channels=k,
        encoder=GCNEncoder(in_channels, k),
        summary=lambda z, *args, **kwargs: z.mean(dim=0),  # Ensure `summary` only takes `z`
        corruption=lambda x, edge_index: (x[torch.randperm(x.size(0))], edge_index)  # Correct corruption function
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    for _ in tqdm(range(200)):
        optimizer.zero_grad()
        pos_z, neg_z, summary = model(x, data.edge_index)  # Use one-hot encoded features
        loss = model.loss(pos_z, neg_z, summary)
        loss.backward()
        optimizer.step()

    return pos_z.detach().numpy()


# Unsupervised gradient ascent for modularity maximization
def gradient_ascent_modularity_unsupervised(G, k=2, eta=0.01, iterations=1000, seed=SEED):
    np.random.seed(seed)  # Ensure deterministic initialization

    A = nx.to_numpy_array(G)
    l = A.sum(axis=1)
    m = np.sum(l) / 2
    B = A - np.outer(l, l) / (2 * m)
    n = B.shape[0]

    S = np.random.randn(n, k)  # Random Initialization
    S, _ = np.linalg.qr(S)  # Ensure initial orthonormality

    for i in tqdm(range(iterations), desc="Gradient Ascent Progress"):
        grad = (1 / (2 * m)) * B @ S
        S += eta * grad
        S, _ = np.linalg.qr(S)  # Orthonormalize using QR decomposition

    return S

In [7]:
def perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled=3):
    walks = {node: [] for node in G.nodes()}
    for node in G.nodes():
        for _ in range(num_walks):
            walk = [node]
            labeled_count = 0
            for _ in range(walk_length - 1):
                cur = walk[-1]
                neighbors = list(G.neighbors(cur))
                if not neighbors:
                    break
                labeled_neighbors = [n for n in neighbors if label_mask[n]]
                if labeled_neighbors and labeled_count < walk_length_labelled:
                    next_node = random.choice(labeled_neighbors)
                    labeled_count += 1
                else:
                    next_node = random.choice(neighbors)
                walk.append(next_node)
            walks[node].extend([n for n in walk if label_mask[n]])
    return walks

def compute_attention_weights(S, labeled_nodes):
    weights = {}
    for node, labeled in labeled_nodes.items():
        if labeled:
            similarities = {n: np.dot(S[node], S[n]) for n in labeled}
            exp_sims = {n: np.exp(sim) for n, sim in similarities.items()}
            total = sum(exp_sims.values())
            weights[node] = {n: exp_sims[n] / total for n in labeled}
    return weights

def semi_supervised_gradient_ascent_modularity(G, labels, label_mask, k=2, eta=0.01, lambda_supervised=1.0, 
                                                      lambda_semi=2.0, iterations=5000, initialization='random',
                                                      num_walks=10, walk_length=5, walk_length_labelled=3):
    # Convert graph to sparse adjacency matrix
    A = csr_matrix(nx.to_scipy_sparse_array(G, format='csr'))
    degrees = np.array(A.sum(axis=1)).flatten()
    m = G.number_of_edges()
    n = A.shape[0]

    # Initialize embeddings
    if initialization == 'random':
        S = np.random.randn(n, k)
    S, _ = np.linalg.qr(S)

    # Compute labeled random walks and attention weights
    labeled_walks = perform_labeled_random_walks(G, label_mask, labels, num_walks, walk_length, walk_length_labelled)
    attention_weights = compute_attention_weights(S, labeled_walks)

    for _ in tqdm(range(iterations), desc="Gradient Ascent with Linear Modularity"):
        # Compute modularity gradient using linear approximation
        neighbor_agg = A @ S  # Efficient aggregation of neighbor embeddings
        global_correction = (degrees[:, None] / (2 * m)) * S.sum(axis=0)
        grad_modularity = (1 / (2 * m)) * (neighbor_agg - global_correction)

        # Compute supervised gradient
        grad_supervised = np.zeros_like(S)
        unique_labels = np.unique(labels[label_mask])
        for label in unique_labels:
            mask = (labels == label) & label_mask
            mean_embedding = np.mean(S[mask], axis=0, keepdims=True)
            grad_supervised[mask] = S[mask] - mean_embedding

        # Compute semi-supervised gradient using adaptive attention
        grad_semi_supervised = np.zeros_like(S)
        for i in range(n):
            if not label_mask[i] and i in attention_weights:
                weighted_embedding = sum(weight * S[n] for n, weight in attention_weights[i].items())
                grad_semi_supervised[i] = S[i] - weighted_embedding

        # Update embeddings
        grad_total = grad_modularity - lambda_supervised * grad_supervised - lambda_semi * grad_semi_supervised
        S += eta * grad_total
        S, _ = np.linalg.qr(S)

    return S

In [8]:
def convert_to_networkx(A):
    return nx.from_scipy_sparse_array(A)

In [9]:
dataset = AmazonPhotosDataset()
ground_truth_labels = dataset[0].y
labels=np.argmax(ground_truth_labels,axis=1)

In [10]:
labels_to_be_masked=np.random.choice(np.arange(len(labels)),int(len(labels)*.7),replace=False)

In [11]:
masked_labels=[]
for i in np.arange(len(labels)):
    if i in labels_to_be_masked:
        masked_labels.append(-1)
    else:
        masked_labels.append(labels[i])
masked_labels=np.array(masked_labels)

In [12]:
label_mask = masked_labels != -1

In [13]:
X = dataset[0].x
A = dataset[0].a
G = convert_to_networkx(A)

In [14]:
print("Adjacency Matrix Shape:", A.shape)
print("Graph Nodes:", G.number_of_nodes())
print("Graph Edges:", G.number_of_edges())

Adjacency Matrix Shape: (7650, 7650)
Graph Nodes: 7650
Graph Edges: 119081


In [15]:
# Convert your preprocessed data into a PyTorch Geometric Data object
X_py = Data(
    x=torch.tensor(X, dtype=torch.float),  # Node features
    edge_index=torch.tensor(np.array(A.nonzero()), dtype=torch.long),  # Edge indices
    y=torch.tensor(labels, dtype=torch.long)  # Labels
)

# Ensure edge_index is in the correct shape (2, num_edges)
X_py.edge_index = X_py.edge_index.to(torch.long)

## Embeddings

In [16]:
# Dictionary for embeddings
embedding_dict = {}
execution_times = []  # List to store execution times

# Compute embeddings and store them with time tracking
def record_time(model_name, func, *args, **kwargs):
    print(f"Computing {model_name} embedding...")
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    execution_times.append((model_name, elapsed_time))
    print(f"{model_name} embedding computed in {elapsed_time:.2f} seconds.")
    return result

X_deepwalk = record_time("DeepWalk", deepwalk_embedding, G, k=embedding_dimensionality)
X_deepwalk = tf.convert_to_tensor(X_deepwalk, dtype=tf.float32)
embedding_dict['deepwalk'] = X_deepwalk

X_vgae = record_time("VGAE", vgae_embedding, X_py, k=embedding_dimensionality)
embedding_dict['vgae'] = X_vgae

X_dgi = record_time("DGI", dgi_embedding, X_py, k=embedding_dimensionality)
embedding_dict['dgi'] = X_dgi

X_modularity = record_time("Modularity", semi_supervised_gradient_ascent_modularity,
                           G, labels, label_mask, k=embedding_dimensionality,
                           eta=0.05, lambda_supervised=1.0, lambda_semi=2.0, iterations=200, initialization='random')
embedding_dict['modularity'] = X_modularity

X_node2vec = record_time("Node2Vec", node2vec_embedding, G, k=embedding_dimensionality)
X_node2vec = tf.convert_to_tensor(X_node2vec, dtype=tf.float32)
embedding_dict['node2vec'] = X_node2vec

# Generate random embedding
print("Generating Random embedding...")
start_time = time.time()
shape = (len(ground_truth_labels), embedding_dimensionality)
X_random = np.random.randn(*shape)
X_random = tf.convert_to_tensor(X_random, dtype=tf.float32)
end_time = time.time()
execution_times.append(("Random", end_time - start_time))
print(f"Random embedding generated in {end_time - start_time:.2f} seconds.")
embedding_dict['random'] = X_random

# Use original node features as 'given' embedding
embedding_dict['given'] = X

print("All embeddings computed and stored in the dictionary successfully.")

# Store execution times in a DataFrame and save
execution_df = pd.DataFrame(execution_times, columns=["Model", "Time (seconds)"])
execution_df.to_csv("./photo_analysis_results/embedding_execution_times_photo_"+str(SEED)+".csv", index=False)

print("\nExecution times saved to 'embedding_execution_times.csv'.")
print(execution_df)

Computing DeepWalk embedding...


Computing transition probabilities: 100%|██████████| 7650/7650 [02:34<00:00, 49.49it/s]


DeepWalk embedding computed in 480.52 seconds.
Computing VGAE embedding...


100%|██████████| 200/200 [02:53<00:00,  1.15it/s]


VGAE embedding computed in 174.10 seconds.
Computing DGI embedding...


100%|██████████| 200/200 [02:53<00:00,  1.15it/s]


DGI embedding computed in 173.57 seconds.
Computing Modularity embedding...


Gradient Ascent with Linear Modularity: 100%|██████████| 200/200 [03:56<00:00,  1.18s/it]


Modularity embedding computed in 243.26 seconds.
Computing Node2Vec embedding...


Computing transition probabilities: 100%|██████████| 7650/7650 [03:33<00:00, 35.76it/s]


Node2Vec embedding computed in 1169.81 seconds.
Generating Random embedding...
Random embedding generated in 0.03 seconds.
All embeddings computed and stored in the dictionary successfully.

Execution times saved to 'embedding_execution_times.csv'.
        Model  Time (seconds)
0    DeepWalk      480.520978
1        VGAE      174.098534
2         DGI      173.570651
3  Modularity      243.257231
4    Node2Vec     1169.812063
5      Random        0.030629


## Helper functions

In [17]:
def visualize_all_embeddings(all_embeddings, labels, label_mask):
    """
    Visualize all embeddings in a grid with 4 columns per row using UMAP.

    Parameters:
    - all_embeddings: Dictionary where keys are embedding methods, and values are embeddings.
    - labels: Labels (numpy array of shape [n_nodes]).
    - label_mask: Boolean array indicating known labels (True for known, False for unknown).
    """
    num_embeddings = len(all_embeddings)
    num_rows = (num_embeddings + 3) // 4  # Ensure enough rows for all embeddings
    fig, axes = plt.subplots(num_rows, 4, figsize=(8.27, 11.69))  # A4 size

    for i, (embedding_type, embedding) in tqdm(enumerate(all_embeddings.items()), 
                                               total=num_embeddings, desc="Visualizing embeddings"):
        row, col = divmod(i, 4)
        ax = axes[row, col] if num_rows > 1 else axes[col]  # Adjust for single-row case

        # Ensure embedding is a NumPy array
        if isinstance(embedding, tf.Tensor):
            embedding = embedding.numpy()

        # Reduce dimensionality using UMAP
        reducer = umap.UMAP(n_components=2)
        embedding_2d = reducer.fit_transform(embedding)

        # Known labels
        ax.scatter(embedding_2d[label_mask, 0], embedding_2d[label_mask, 1], 
                   c=labels[label_mask], cmap="Set1", s=3, alpha=0.7, label="Known Labels",
                   edgecolors='none')

        # Unknown labels
        ax.scatter(embedding_2d[~label_mask, 0], embedding_2d[~label_mask, 1], 
                   c=labels[~label_mask], cmap="Set1", s=5, alpha=0.7, 
                   label="Unknown Labels", edgecolors='black', linewidths=0.2)

        # Title with smaller font size
        ax.set_title(embedding_type.upper(), fontsize=8, pad=2)

        # Remove axis labels, ticks, and frames
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_frame_on(False)

    # Remove empty subplots if num_embeddings is not a multiple of 4
    for j in range(i + 1, num_rows * 4):
        row, col = divmod(j, 4)
        fig.delaxes(axes[row, col])

    plt.subplots_adjust(left=0.05, right=0.95, top=0.95, bottom=0.05, wspace=0.2, hspace=0.2)  # Adjust margins
    save_path = "./photo_analysis_results/embedding_grid_plot_photo.png"
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    print(f"Visualization saved to {save_path}")
    plt.show()

In [18]:
def evaluate_model(true_labels, predicted_labels):
    """
    Evaluate the model's performance using accuracy, F1-score, and confusion matrix.

    Args:
        true_labels (np.array): Ground truth labels (integers).
        predicted_labels (np.array): Predicted labels (integers).

    Returns:
        dict: A dictionary containing accuracy, F1-score, and confusion matrix.
    """
    # Compute accuracy
    accuracy = accuracy_score(true_labels, predicted_labels)
    
    # Compute F1-score (macro-averaged)
    f1 = f1_score(true_labels, predicted_labels, average='macro')
    
    # Compute confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    #
    print(cm)
    
    # Return results as a dictionary
    return {
        'accuracy': accuracy,
        'f1_score': f1
    }

## Classifiers

In [19]:
class GCN(tf.keras.Model):
    def __init__(self, n_labels, seed=42):  # Use an explicit seed
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)  # Define initializer
        
        self.conv1 = GCNConv(16, activation='relu', kernel_initializer=initializer)
        self.conv2 = GCNConv(n_labels, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [20]:
# Define the GAT model
class GAT(tf.keras.Model):
    def __init__(self, n_labels, num_heads=8, seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GATConv(16, attn_heads=num_heads, concat_heads=True, activation='elu', kernel_initializer=initializer)
        self.conv2 = GATConv(n_labels, attn_heads=1, concat_heads=False, activation='softmax', kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [21]:
# Define the GraphSAGE model
class GraphSAGE(tf.keras.Model):
    def __init__(self, n_labels, hidden_dim=16, aggregator='mean', seed=42):
        super().__init__()
        initializer = tf.keras.initializers.GlorotUniform(seed=seed)

        self.conv1 = GraphSageConv(hidden_dim, activation='relu', aggregator=aggregator, kernel_initializer=initializer)
        self.conv2 = GraphSageConv(n_labels, activation='softmax', aggregator=aggregator, kernel_initializer=initializer)

    def call(self, inputs):
        x, a = inputs
        intermediate_embeddings = self.conv1([x, a])  # Store intermediate embeddings
        x = self.conv2([intermediate_embeddings, a])
        return x, intermediate_embeddings  # Return both final output and intermediate embeddings

In [22]:
classifiers=['gcn','gat','graphsage']

## Classification using different node embeddings

In [23]:
def train_and_evaluate(embedding_dict, embedding, classifier, ground_truth_labels=ground_truth_labels, masked_labels=masked_labels):
    "the labels have to be one hot encoded"
    "model take values: gcn, gat, graphsage"
    print('embedding: ' + embedding.upper())
    print('model: ' + classifier.upper())

    X = embedding_dict[embedding]

    print("Processing...")
    # Create boolean mask for training
    train_mask = masked_labels != -1

    # Split the data into training and prediction sets
    X_train = X[train_mask]  # Training node features
    Y_train = ground_truth_labels[train_mask]  # Training labels (one-hot encoded)
    Y_train = tf.cast(Y_train, dtype='int32')
    
    # Reduce the adjacency matrix to only include training nodes
    A_train = A[train_mask, :][:, train_mask]  # Correctly reduce the adjacency matrix
    
    # Convert sparse adjacency matrix to COO format
    A_coo = A_train.tocoo()
    indices = np.column_stack((A_coo.row, A_coo.col))  # Corrected indices format
    values = A_coo.data
    shape = A_coo.shape  # Shape: (num_nodes, num_nodes)
    
    # Create a sparse tensor for the adjacency matrix
    A_train_tensor = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=shape)
    
    # Ensure the sparse tensor is ordered correctly
    A_train_tensor = tf.sparse.reorder(A_train_tensor)

    print("Training...")
    # Initialize the model
    if classifier == 'gcn':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GCN(n_labels)
    elif classifier == 'gat':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GAT(n_labels)
    elif classifier == 'graphsage':
        n_labels = ground_truth_labels.shape[1]  # Number of classes
        model = GraphSAGE(n_labels)
    
    # Compile the model (not strictly necessary when using GradientTape, but useful for metrics)
    model.compile(
        optimizer=Adam(learning_rate=0.01),
        loss=CategoricalCrossentropy(),
        metrics=[CategoricalAccuracy()]
    )
    
    # Print shapes for debugging
    print(f"Shape of X_train: {X_train.shape}")
    print(f"Shape of A_train_tensor: {A_train_tensor.shape}")
    print(f"Shape of Y_train: {Y_train.shape}")
    
    # Define the optimizer and loss function
    optimizer = Adam(learning_rate=0.01)
    loss_fn = CategoricalCrossentropy()
    
    # Training loop with GradientTape
    epochs = 200
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            # Forward pass
            predictions, intermediate_embeddings = model([X_train, A_train_tensor])  # Unpack both outputs
                
            # Compute supervised loss (cross-entropy)
            supervised_loss = loss_fn(Y_train, predictions)
            
        # Compute gradients
        gradients = tape.gradient(supervised_loss, model.trainable_variables)
        
        # Update weights
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        
        # Print loss and accuracy for monitoring
        if epoch % 10 == 0:
            accuracy = CategoricalAccuracy()(Y_train, predictions)
            print(f"Epoch {epoch + 1}, Loss: {supervised_loss.numpy()}, Accuracy: {accuracy.numpy()}")

    print("Predicting...")
    # Prepare the full graph for prediction
    X_full = X  # Full node features
    A_full = A  # Full adjacency matrix
    
    # Convert the full adjacency matrix to COO format
    A_full_coo = A_full.tocoo()
    indices_full = np.column_stack((A_full_coo.row, A_full_coo.col))
    values_full = A_full_coo.data
    shape_full = A_full_coo.shape
    
    # Create a sparse tensor for the full adjacency matrix
    A_full_tensor = tf.sparse.SparseTensor(indices=indices_full, values=values_full, dense_shape=shape_full)
    A_full_tensor = tf.sparse.reorder(A_full_tensor)
    
    # Make predictions for all nodes
    predictions, emb = model([X_full, A_full_tensor])  # Shape: [num_nodes, n_labels]

    # Convert predictions to class labels (integers)
    predicted_labels = tf.argmax(predictions, axis=1).numpy()  # Shape: [num_nodes]
    
    # Extract predictions for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]

    # True labels for the masked nodes
    true_labels_masked = labels[labels_to_be_masked]
    
    # Predicted labels for the masked nodes
    predicted_labels_masked = predicted_labels[labels_to_be_masked]
    
    # Evaluate the model's performance
    results = evaluate_model(true_labels_masked, predicted_labels_masked)
    
    # Print the results
    print(f"Accuracy: {results['accuracy'] * 100:.2f}%")
    print(f"F1-Score: {results['f1_score']:.4f}")

    results['model'] = classifier
    results['embedding'] = embedding

    # Return results and intermediate embeddings for visualization
    return results, emb

In [24]:
all_results=[]
graph_embeddings_dict={}
for emb in embedding_dict.keys():
    for clf in classifiers:
        results, embedding_matrix = train_and_evaluate(embedding_dict, emb, clf)
        all_results.append(results)
        key_string= emb + ' with ' + clf
        graph_embeddings_dict[key_string]=embedding_matrix

embedding: DEEPWALK
model: GCN
Processing...
Training...
Shape of X_train: (2295, 150)
Shape of A_train_tensor: (2295, 2295)
Shape of Y_train: (2295, 8)
Epoch 1, Loss: 80.59327697753906, Accuracy: 0.15032680332660675
Epoch 11, Loss: 14.102044105529785, Accuracy: 0.4919390082359314
Epoch 21, Loss: 4.502809047698975, Accuracy: 0.47145968675613403
Epoch 31, Loss: 1.4875774383544922, Accuracy: 0.5381263494491577
Epoch 41, Loss: 1.4364583492279053, Accuracy: 0.43834424018859863
Epoch 51, Loss: 1.2657424211502075, Accuracy: 0.615686297416687
Epoch 61, Loss: 1.0556358098983765, Accuracy: 0.6823529601097107
Epoch 71, Loss: 1.0012435913085938, Accuracy: 0.7163398861885071
Epoch 81, Loss: 0.9449745416641235, Accuracy: 0.7119825482368469
Epoch 91, Loss: 0.9138507843017578, Accuracy: 0.7289760112762451
Epoch 101, Loss: 0.8825042247772217, Accuracy: 0.7272331118583679
Epoch 111, Loss: 0.8615908622741699, Accuracy: 0.7337690591812134
Epoch 121, Loss: 0.8467381596565247, Accuracy: 0.7359477281570435




Epoch 1, Loss: 2.0882253646850586, Accuracy: 0.051851850003004074
Epoch 11, Loss: 0.4419538974761963, Accuracy: 0.8596950173377991
Epoch 21, Loss: 0.35105979442596436, Accuracy: 0.9045751690864563
Epoch 31, Loss: 0.2798238694667816, Accuracy: 0.9241830110549927
Epoch 41, Loss: 0.22318072617053986, Accuracy: 0.9320261478424072
Epoch 51, Loss: 0.1884259283542633, Accuracy: 0.9403049945831299
Epoch 61, Loss: 0.16226136684417725, Accuracy: 0.9459695219993591
Epoch 71, Loss: 0.1393304467201233, Accuracy: 0.9529411792755127
Epoch 81, Loss: 0.11889629811048508, Accuracy: 0.9594771265983582
Epoch 91, Loss: 0.09927824139595032, Accuracy: 0.9681917428970337
Epoch 101, Loss: 0.0809531956911087, Accuracy: 0.9764705896377563
Epoch 111, Loss: 0.0643705278635025, Accuracy: 0.9790849685668945
Epoch 121, Loss: 0.052403468638658524, Accuracy: 0.9821350574493408
Epoch 131, Loss: 0.04501965641975403, Accuracy: 0.984749436378479
Epoch 141, Loss: 0.04034382849931717, Accuracy: 0.984749436378479
Epoch 151, L



Epoch 11, Loss: 0.5036377310752869, Accuracy: 0.8470588326454163
Epoch 21, Loss: 0.3781101107597351, Accuracy: 0.8810457587242126
Epoch 31, Loss: 0.3068903386592865, Accuracy: 0.9058823585510254
Epoch 41, Loss: 0.2633686363697052, Accuracy: 0.9189542531967163
Epoch 51, Loss: 0.23371148109436035, Accuracy: 0.9285402894020081
Epoch 61, Loss: 0.21073397994041443, Accuracy: 0.9363834261894226
Epoch 71, Loss: 0.19086569547653198, Accuracy: 0.94466233253479
Epoch 81, Loss: 0.17239412665367126, Accuracy: 0.9477124214172363
Epoch 91, Loss: 0.15454117953777313, Accuracy: 0.9546840786933899
Epoch 101, Loss: 0.13699974119663239, Accuracy: 0.9599128365516663
Epoch 111, Loss: 0.12024929374456406, Accuracy: 0.9651415944099426
Epoch 121, Loss: 0.10452726483345032, Accuracy: 0.9694989323616028
Epoch 131, Loss: 0.08856122940778732, Accuracy: 0.9734205007553101
Epoch 141, Loss: 0.07418165355920792, Accuracy: 0.9769062995910645
Epoch 151, Loss: 0.06089424341917038, Accuracy: 0.9821350574493408
Epoch 161,



Epoch 11, Loss: 1.711827278137207, Accuracy: 0.399564266204834
Epoch 21, Loss: 1.3537522554397583, Accuracy: 0.4910675287246704
Epoch 31, Loss: 1.0830405950546265, Accuracy: 0.6082788705825806
Epoch 41, Loss: 0.9349703788757324, Accuracy: 0.7198256850242615
Epoch 51, Loss: 0.823497474193573, Accuracy: 0.7529411911964417
Epoch 61, Loss: 0.7289060354232788, Accuracy: 0.7668845057487488
Epoch 71, Loss: 0.6714534163475037, Accuracy: 0.7808278799057007
Epoch 81, Loss: 0.6229240298271179, Accuracy: 0.7943354845046997
Epoch 91, Loss: 0.5915417671203613, Accuracy: 0.8017429113388062
Epoch 101, Loss: 0.5672233700752258, Accuracy: 0.8082788586616516
Epoch 111, Loss: 0.5478813648223877, Accuracy: 0.813507616519928
Epoch 121, Loss: 0.5294991135597229, Accuracy: 0.827015221118927
Epoch 131, Loss: 0.5098995566368103, Accuracy: 0.8322439789772034
Epoch 141, Loss: 0.4965786337852478, Accuracy: 0.842265784740448
Epoch 151, Loss: 0.4823084771633148, Accuracy: 0.8444444537162781
Epoch 161, Loss: 0.470309



Epoch 11, Loss: 1.8640697002410889, Accuracy: 0.2540304958820343
Epoch 21, Loss: 1.6470447778701782, Accuracy: 0.5202614665031433
Epoch 31, Loss: 1.2332122325897217, Accuracy: 0.7385621070861816
Epoch 41, Loss: 0.7295100688934326, Accuracy: 0.9176470637321472
Epoch 51, Loss: 0.3875207304954529, Accuracy: 0.9450980424880981
Epoch 61, Loss: 0.2296510636806488, Accuracy: 0.9529411792755127
Epoch 71, Loss: 0.1703900545835495, Accuracy: 0.9555555582046509
Epoch 81, Loss: 0.14451254904270172, Accuracy: 0.9603486061096191
Epoch 91, Loss: 0.12919367849826813, Accuracy: 0.9603486061096191
Epoch 101, Loss: 0.11708693206310272, Accuracy: 0.9616557955741882
Epoch 111, Loss: 0.1040523499250412, Accuracy: 0.9668845534324646
Epoch 121, Loss: 0.09504732489585876, Accuracy: 0.9725490212440491
Epoch 131, Loss: 0.08489483594894409, Accuracy: 0.9734205007553101
Epoch 141, Loss: 0.07635311037302017, Accuracy: 0.9769062995910645
Epoch 151, Loss: 0.06998712569475174, Accuracy: 0.9786492586135864
Epoch 161, L



Epoch 11, Loss: 0.4419606626033783, Accuracy: 0.8527233004570007
Epoch 21, Loss: 0.34532174468040466, Accuracy: 0.9058823585510254
Epoch 31, Loss: 0.27543315291404724, Accuracy: 0.9224401116371155
Epoch 41, Loss: 0.22975513339042664, Accuracy: 0.929411768913269
Epoch 51, Loss: 0.2002449780702591, Accuracy: 0.9328976273536682
Epoch 61, Loss: 0.1755405217409134, Accuracy: 0.9394335746765137
Epoch 71, Loss: 0.15477000176906586, Accuracy: 0.9485839009284973
Epoch 81, Loss: 0.13439002633094788, Accuracy: 0.9546840786933899
Epoch 91, Loss: 0.11530130356550217, Accuracy: 0.9599128365516663
Epoch 101, Loss: 0.09767153114080429, Accuracy: 0.9660130739212036
Epoch 111, Loss: 0.08121698349714279, Accuracy: 0.972113311290741
Epoch 121, Loss: 0.0667680874466896, Accuracy: 0.9799564480781555
Epoch 131, Loss: 0.05616503581404686, Accuracy: 0.9821350574493408
Epoch 141, Loss: 0.048189662396907806, Accuracy: 0.984749436378479
Epoch 151, Loss: 0.04205676540732384, Accuracy: 0.9869281053543091
Epoch 161,



Epoch 11, Loss: 0.7057252526283264, Accuracy: 0.7969498634338379
Epoch 21, Loss: 0.28550148010253906, Accuracy: 0.9215686321258545
Epoch 31, Loss: 0.1279360055923462, Accuracy: 0.9668845534324646
Epoch 41, Loss: 0.06347998976707458, Accuracy: 0.9816993474960327
Epoch 51, Loss: 0.0325450524687767, Accuracy: 0.9904139637947083
Epoch 61, Loss: 0.016400236636400223, Accuracy: 0.9947712421417236
Epoch 71, Loss: 0.009437181986868382, Accuracy: 0.9978213310241699
Epoch 81, Loss: 0.006672711111605167, Accuracy: 0.9978213310241699
Epoch 91, Loss: 0.005374091677367687, Accuracy: 0.9973856210708618
Epoch 101, Loss: 0.004808951634913683, Accuracy: 0.9978213310241699
Epoch 111, Loss: 0.004534939769655466, Accuracy: 0.9978213310241699
Epoch 121, Loss: 0.0043600136414170265, Accuracy: 0.9978213310241699
Epoch 131, Loss: 0.004213462118059397, Accuracy: 0.9978213310241699
Epoch 141, Loss: 0.004112434573471546, Accuracy: 0.9978213310241699
Epoch 151, Loss: 0.003959450405091047, Accuracy: 0.9978213310241



Epoch 11, Loss: 1.604129433631897, Accuracy: 0.4313725531101227
Epoch 21, Loss: 0.849682629108429, Accuracy: 0.7943354845046997
Epoch 31, Loss: 0.42201489210128784, Accuracy: 0.8945533633232117
Epoch 41, Loss: 0.2776717245578766, Accuracy: 0.9254902005195618
Epoch 51, Loss: 0.20617420971393585, Accuracy: 0.9429193735122681
Epoch 61, Loss: 0.15950222313404083, Accuracy: 0.9546840786933899
Epoch 71, Loss: 0.12834692001342773, Accuracy: 0.9629629850387573
Epoch 81, Loss: 0.10425954312086105, Accuracy: 0.9690631628036499
Epoch 91, Loss: 0.08608011156320572, Accuracy: 0.9747276902198792
Epoch 101, Loss: 0.07297579199075699, Accuracy: 0.9764705896377563
Epoch 111, Loss: 0.06230999156832695, Accuracy: 0.9812636375427246
Epoch 121, Loss: 0.05305098369717598, Accuracy: 0.9838780164718628
Epoch 131, Loss: 0.0453379787504673, Accuracy: 0.986492395401001
Epoch 141, Loss: 0.039342377334833145, Accuracy: 0.9873638153076172
Epoch 151, Loss: 0.034279193729162216, Accuracy: 0.9891067743301392
Epoch 161

## Saving aggregate results

In [25]:
# Convert to DataFrame
df = pd.DataFrame(all_results)

# Define dataset name and seed
dataset_name = "photo"
seed_value = SEED

# Save as CSV file without sorting
filename = f"{dataset_name}_seed{seed_value}_results.csv"
filename='./photo_analysis_results/'+filename
df.to_csv(filename, index=False)

print(f"Results saved as {filename}")

Results saved as ./photo_analysis_results/photo_seed46_results.csv


In [26]:
all_embeddings= embedding_dict | graph_embeddings_dict

In [27]:
def reorder_dict(original_dict, key_order):
    """
    Reorders a dictionary based on a given list of keys.

    Parameters:
    - original_dict (dict): The dictionary to reorder.
    - key_order (list): The list specifying the desired key order.

    Returns:
    - dict: A new dictionary with keys ordered as per key_order.
    """
    return {key: original_dict[key] for key in key_order if key in original_dict}

In [28]:
key_order = ['random', 'random with gcn', 'random with gat', 'random with graphsage', 'deepwalk', 'deepwalk with gcn', 'deepwalk with gat', 'deepwalk with graphsage', 'node2vec','node2vec with gcn', 'node2vec with gat', 'node2vec with graphsage', 'vgae', 'vgae with gcn', 'vgae with gat', 'vgae with graphsage', 'dgi', 'dgi with gcn', 'dgi with gat', 'dgi with graphsage', 'modularity', 'modularity with gcn', 'modularity with gat', 'modularity with graphsage', 'given', 'given with gcn', 'given with gat', 'given with graphsage']

In [29]:
all_embeddings = reorder_dict(all_embeddings, key_order)