In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
%pip install torch-geometric torch-scatter torch-sparse torch-cluster torch-spline-conv pyg-lib \
  -f https://data.pyg.org/whl/torch-2.5.1+cu118.html
%pip install tensorboard
%pip install dask
%pip install dask distributed
%pip install PyMetis
%pip install torch-cluster
%pip install tensorboard
%pip install torch_tb_profiler

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu118.html
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting 

In [1]:
import torch

# Report how many CUDA devices PyTorch can see; the distributed runs below
# use one process per visible GPU.
if not torch.cuda.is_available():
    print("No GPUs available. PyTorch is running on CPU.")
else:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs available: {num_gpus}")

Number of GPUs available: 3


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Reddit
from torch_geometric.nn import GCNConv
import numpy as np
import time
import sys
import os
import cProfile # For general Python profiling (kept for Node2Vec if desired, but not for GCN training)
import pstats # For general Python profiling stats (kept for Node2Vec)
from torch_geometric.datasets import Planetoid

# Try to import Node2Vec and its dependencies
try:
    from torch_geometric.nn import Node2Vec # Import Node2Vec for random walk embeddings
    NODE2VEC_AVAILABLE = True
except ImportError:
    print("Node2Vec or its dependencies ('pyg-lib' or 'torch-cluster') not found.")
    print("Please install them for Node2Vec functionality:")
    print("pip install pyg-lib")
    print("pip install torch-cluster")
    NODE2VEC_AVAILABLE = False

from sklearn.linear_model import LogisticRegression # For classification on embeddings
from sklearn.metrics import accuracy_score, adjusted_rand_score, normalized_mutual_info_score # For evaluating Node2Vec classifier and clustering

from sklearn.cluster import KMeans # For clustering on GCN embeddings

# Import for DistributedDataParallel
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.multiprocessing as mp # Still needed for DDP process group initialization

# Import for Dask
from dask.distributed import Client, LocalCluster, get_worker

# Import for Visualization
import matplotlib.pyplot as plt

# Import PyTorch Profiler
import torch.profiler

# Directory for TensorBoard logs
LOG_DIR = "runs/hpc_gcn_profiler"
os.makedirs(LOG_DIR, exist_ok=True)

# Try to import PyMetis. If not available, provide instructions.
try:
    import pymetis
    METIS_AVAILABLE = True
except ImportError:
    print("PyMetis not found. Please install it for METIS partitioning:")
    print("1. Install METIS library: sudo apt-get install libmetis-dev (on Ubuntu/Debian)")
    print("2. Install PyMetis: pip install PyMetis")
    print("Falling back to only random partitioning if PyMetis is not installed.")
    METIS_AVAILABLE = False

# --- 1. Load the Dataset ---
print("Loading dataset...")
# Using PubMed for quicker demonstration, but can be switched to Reddit.
dataset = Planetoid(root='/tmp/PubMed', name='PubMed')
# dataset = Reddit(root='/tmp/Reddit') # Uncomment to use Reddit dataset
data = dataset[0]
# The name lookup is guarded with isinstance so the same print also works
# when the Reddit line above is enabled instead of Planetoid.
print(f"Dataset loaded: {dataset.name if isinstance(dataset, Planetoid) else 'Reddit'}")
print(f"Number of nodes: {data.num_nodes}")
print(f"Number of edges: {data.num_edges}")
print(f"Number of features: {data.num_node_features}")
print(f"Number of classes: {dataset.num_classes}")
# has_isolated_nodes()/has_self_loops() are the current PyG names; the older
# contains_* spellings are deprecated and may be missing in recent releases.
print(f"Graph has isolated nodes: {data.has_isolated_nodes()}")
print(f"Graph has self-loops: {data.has_self_loops()}")
print(f"Graph is undirected: {data.is_undirected()}")

# For node classification, we typically use data.train_mask, data.val_mask, data.test_mask
# int(...) prints a plain count instead of a 0-dim tensor repr such as "tensor(60)".
print(f"Number of training nodes: {int(data.train_mask.sum())}")
print(f"Number of validation nodes: {int(data.val_mask.sum())}")
print(f"Number of test nodes: {int(data.test_mask.sum())}")

# --- 2. Define the GCN Model ---
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Layer one maps the input features to ``hidden_channels``; layer two maps
    the hidden representation to one logit per class.
    """

    def __init__(self, num_node_features, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, x, edge_index, return_embeddings=False):
        """Run the network on node features ``x`` and connectivity ``edge_index``.

        Returns the class logits by default. With ``return_embeddings=True``
        the hidden-layer activations (after ReLU and dropout) are returned
        instead and the second convolution is skipped.
        """
        hidden = F.relu(self.conv1(x, edge_index))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, p=0.5, training=self.training)

        # Hidden activations double as node embeddings when requested.
        return hidden if return_embeddings else self.conv2(hidden, edge_index)

# --- 3. Graph Partitioning Functions ---

def convert_to_metis_format(edge_index, num_nodes):
    """
    Converts a PyG edge_index to an adjacency list suitable for METIS.

    The result is symmetric (every edge appears in the lists of both of its
    endpoints), contains each neighbor at most once, and has no self-loops.
    This holds whether edge_index stores one or both directions of an
    undirected edge, so neighbors are never duplicated.

    Args:
        edge_index: Integer tensor of shape [2, num_edges] (row 0 = source,
            row 1 = destination).
        num_nodes: Total number of nodes; isolated nodes get an empty list.

    Returns:
        list[list[int]]: adj_list[u] is the sorted list of neighbors of u.
    """
    neighbor_sets = [set() for _ in range(num_nodes)]

    # One bulk tensor->list conversion instead of two .item() calls per edge.
    sources = edge_index[0].tolist()
    targets = edge_index[1].tolist()

    for u, v in zip(sources, targets):
        if u == v:
            # A self-loop can never be a cut edge, so it is irrelevant for partitioning.
            continue
        # Insert both directions; the sets absorb the duplicate when the
        # reverse edge is also present in edge_index.
        neighbor_sets[u].add(v)
        neighbor_sets[v].add(u)

    return [sorted(neighbors) for neighbors in neighbor_sets]

def random_partition_graph(data, num_partitions, seed=None):
    """
    Randomly assigns every node to one of num_partitions partitions.

    Node indices are NOT re-indexed: each partition is a view on the original
    graph expressed through a global node mask and the subset of edges whose
    two endpoints both belong to the partition.

    Args:
        data: Graph object exposing ``num_nodes`` and ``edge_index`` ([2, E]).
        num_partitions: Number of partitions to create.
        seed: Optional integer seed. When given, the assignment is drawn from
            a dedicated generator so the partitioning is reproducible and the
            global RNG state is left untouched. ``None`` keeps the previous
            behavior (global RNG).

    Returns:
        list[dict]: One dict per partition with the keys
            'node_mask' (bool tensor over all nodes),
            'edge_index' (internal edges, global node ids),
            'num_cut_edges' (edges leaving the partition, counted from the source side),
            'num_nodes_in_partition' (int).
    """
    print(f"Performing random partitioning into {num_partitions} parts...")
    start_time = time.time()
    num_nodes = data.num_nodes

    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
    node_partition_ids = torch.randint(0, num_partitions, (num_nodes,), generator=generator)
    # Keep the assignment on the same device as the edges so the indexing below is valid.
    node_partition_ids = node_partition_ids.to(data.edge_index.device)

    # Loop-invariant: partition id of each edge's source and destination.
    src, dst = data.edge_index
    src_part = node_partition_ids[src]
    dst_part = node_partition_ids[dst]

    partitions = []
    for i in range(num_partitions):
        partition_node_mask = (node_partition_ids == i)
        src_in_partition = (src_part == i)

        # Internal edges: both endpoints inside this partition.
        edge_mask_internal = src_in_partition & (dst_part == i)
        partition_edge_index = data.edge_index[:, edge_mask_internal]

        # Cut edges: source inside this partition, destination elsewhere.
        edge_mask_cut = src_in_partition & (dst_part != i)
        num_cut_edges = edge_mask_cut.sum().item()

        partitions.append({
            'node_mask': partition_node_mask,
            'edge_index': partition_edge_index,
            'num_cut_edges': num_cut_edges,
            'num_nodes_in_partition': partition_node_mask.sum().item()
        })
    end_time = time.time()
    print(f"Random partitioning complete in {end_time - start_time:.4f} seconds.")
    return partitions

def metis_partition_graph(data, num_partitions):
    """
    Splits the graph into num_partitions parts with METIS (via PyMetis).

    Returns None when PyMetis could not be imported. Otherwise returns a list
    with one dict per partition holding 'node_mask', 'edge_index' (edges with
    both endpoints in the partition, global node ids), 'num_cut_edges' (edges
    whose source is in the partition and whose destination is not) and
    'num_nodes_in_partition'.
    """
    if not METIS_AVAILABLE:
        print("METIS is not available. Skipping METIS partitioning.")
        return None

    print(f"Performing METIS partitioning into {num_partitions} parts...")
    t_begin = time.time()
    adjacency = convert_to_metis_format(data.edge_index, data.num_nodes)

    # part_graph yields the edge cut found by METIS and one partition id per node.
    edge_cuts, membership = pymetis.part_graph(num_partitions, adjacency)
    membership = torch.tensor(membership, dtype=torch.long)

    # Partition id of both endpoints of every edge, computed once up front.
    src, dst = data.edge_index
    src_part = membership[src]
    dst_part = membership[dst]

    partitions = []
    total_cut_edges = 0
    for part_id in range(num_partitions):
        nodes_here = membership == part_id
        src_here = src_part == part_id
        dst_here = dst_part == part_id

        internal_edges = data.edge_index[:, src_here & dst_here]

        # Edges that start in this partition and end in another one.
        outgoing_cut = (src_here & ~dst_here).sum().item()
        total_cut_edges += outgoing_cut

        partitions.append({
            'node_mask': nodes_here,
            'edge_index': internal_edges,
            'num_cut_edges': outgoing_cut,
            'num_nodes_in_partition': nodes_here.sum().item()
        })

    t_end = time.time()
    print(f"METIS partitioning complete in {t_end - t_begin:.4f} seconds.")
    print(f"Total METIS cut edges (reported by METIS): {edge_cuts}")
    # When edge_index stores both directions of every undirected edge, each
    # cut edge is seen once from either side, hence the "2x" in the message.
    print(f"Total METIS cut edges (calculated from partitions, should be 2x actual): {total_cut_edges}")
    return partitions

# --- 4. Training Function for a Single Partition (Worker) ---
# This function will be executed by each Dask worker.
def train_partition_worker(rank, world_size, partition_data_list,
                           num_node_features, hidden_channels, num_classes,
                           epochs, data_x_cpu, data_y_cpu, global_train_mask_cpu, global_val_mask_cpu, global_test_mask_cpu,
                           global_edge_index_cpu):
    """
    Trains one DistributedDataParallel replica of the GCN inside a Dask worker process.

    Every rank runs the forward pass on the full graph, but only the training
    nodes that fall into the rank's own partition contribute to its loss. DDP
    averages the gradients across ranks during ``backward()``, so all ranks
    MUST execute forward/backward/step on every epoch.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of participating processes.
        partition_data_list: List of partition dicts; entry ``rank`` must
            provide a 'node_mask' tensor over all nodes.
        num_node_features, hidden_channels, num_classes: GCN dimensions.
        epochs: Number of full-graph training epochs.
        data_x_cpu, data_y_cpu: Node features and labels (moved to the GPU here).
        global_train_mask_cpu, global_val_mask_cpu: Global node masks used for
            the local loss (train) and for the accuracy report on rank 0.
        global_test_mask_cpu: Accepted for interface compatibility; not used
            during training.
        global_edge_index_cpu: Full-graph connectivity.

    Returns:
        dict | None: CPU copy of the trained model's state dict on rank 0,
        ``None`` on every other rank.
    """
    # Local import so this block does not depend on the notebook's import cell.
    from contextlib import nullcontext

    # Rendezvous settings for torch.distributed. 'localhost' is correct for
    # single-node training (all GPUs on one machine). For multi-node training
    # MASTER_ADDR must be the address of the rank-0 machine, and MASTER_PORT
    # must be an unused port reachable from every participating machine.
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    device = torch.device(f'cuda:{rank}')
    # Select the device before creating the NCCL group so collectives
    # (including barrier) run on this rank's GPU.
    torch.cuda.set_device(device)

    dist.init_process_group("nccl", rank=rank, world_size=world_size)

    try:
        print(f"Worker {rank}: Starting training on its partition on device {device}...")
        model = GCN(num_node_features, hidden_channels, num_classes).to(device)
        model = DDP(model, device_ids=[rank])

        optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
        criterion = torch.nn.CrossEntropyLoss()

        # Move global data to device once
        data_x_gpu = data_x_cpu.to(device)
        data_y_gpu = data_y_cpu.to(device)
        global_train_mask_gpu = global_train_mask_cpu.to(device)
        global_val_mask_gpu = global_val_mask_cpu.to(device)
        global_edge_index_gpu = global_edge_index_cpu.to(device)

        # Restrict the training mask to the nodes owned by this rank's partition.
        partition_node_mask_gpu = partition_data_list[rank]['node_mask'].to(device)
        local_train_mask = global_train_mask_gpu & partition_node_mask_gpu
        # The mask never changes, so check it once instead of syncing every epoch.
        has_local_train_nodes = bool(local_train_mask.any())

        # Only rank 0 records a profiler trace to avoid redundant traces;
        # the other ranks run the identical loop under a no-op context.
        if rank == 0:
            profiler_ctx = torch.profiler.profile(
                schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
                on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(LOG_DIR, f"distributed_gcn_rank{rank}")),
                with_stack=True
            )
        else:
            profiler_ctx = nullcontext()

        with profiler_ctx as prof:
            for epoch in range(1, epochs + 1):
                model.train()
                optimizer.zero_grad()

                out = model(data_x_gpu, global_edge_index_gpu)

                if has_local_train_nodes:
                    loss = criterion(out[local_train_mask], data_y_gpu[local_train_mask])
                else:
                    # No local training nodes: use a zero loss that is still
                    # connected to the graph. Skipping backward() here would
                    # leave the other ranks waiting forever in DDP's gradient
                    # all-reduce; this rank contributes zero gradients instead.
                    loss = out.sum() * 0.0

                loss.backward()
                # Every rank applies the same averaged gradients, keeping replicas in sync.
                optimizer.step()

                # Reduce a detached copy so the collective does not write
                # in place into a tensor that is part of the autograd graph.
                loss_sum = loss.detach().clone()
                dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM)
                loss_avg = loss_sum.item() / world_size

                if rank == 0:
                    # Only rank 0 reports, to avoid cluttered output.
                    if (epoch % 10 == 0 or epoch == 1):
                        train_acc = evaluate_global(model.module, data_x_gpu, global_edge_index_gpu, data_y_gpu, global_train_mask_gpu)
                        val_acc = evaluate_global(model.module, data_x_gpu, global_edge_index_gpu, data_y_gpu, global_val_mask_gpu)
                        print(f'Worker {rank} (Aggregator) - Epoch: {epoch:03d}, Local Loss: {loss_avg:.4f}, Global Train Acc: {train_acc:.4f}, Global Val Acc: {val_acc:.4f}')

                    prof.step() # Mark the end of the step for the profiler

                dist.barrier() # Ensure all processes are synchronized before moving to next epoch

        if rank == 0:
            # Return CPU tensors so the result can be shipped back to the
            # Dask client without referencing this process's GPU memory.
            final_state = {name: tensor.detach().cpu() for name, tensor in model.module.state_dict().items()}
        else:
            final_state = None
    finally:
        # Always tear the group down, also when training raised.
        dist.destroy_process_group()

    print(f"Worker {rank}: Finished training and destroying process group.")
    return final_state

def evaluate_global(model, x, edge_index, y, mask):
    """
    Evaluates classification accuracy on the nodes selected by a global mask.

    Assumes model and data are already on the same device. The model is
    switched to eval mode and left there; callers that continue training must
    call ``model.train()`` again.

    Args:
        model: Callable module invoked as ``model(x, edge_index)`` returning per-node logits.
        x: Node features.
        edge_index: Graph connectivity.
        y: Ground-truth labels per node.
        mask: Boolean mask over nodes selecting the evaluation set.

    Returns:
        float: Fraction of correctly predicted masked nodes; 0.0 when the mask
        selects no node (instead of raising ZeroDivisionError).
    """
    num_selected = int(mask.sum())
    if num_selected == 0:
        return 0.0

    model.eval()
    with torch.no_grad():
        out = model(x, edge_index)
        pred = out[mask].argmax(dim=1)
        correct = int((pred == y[mask]).sum())
    return correct / num_selected

# --- 5. Main Distributed Training Orchestrator with Dask ---
def run_distributed_training(num_partitions, partition_type='random', epochs=200):
    """
    Partitions the graph, trains one DDP replica per partition on its own GPU
    (each replica runs in a separate Dask worker process) and evaluates the
    resulting model on the test mask.

    Relies on the module-level globals ``data``, ``dataset`` and
    ``hidden_channels`` being defined before the call.

    Args:
        num_partitions: Requested number of partitions / GPUs; capped at the
            number of visible GPUs.
        partition_type: 'random' or 'metis'.
        epochs: Number of training epochs per worker.

    Returns:
        tuple: (training_time_seconds, test_accuracy, trained_model). When
        training cannot run (no GPU, partitioning unavailable, no model state
        returned) the tuple is (0.0, 0.0, None).

    Raises:
        ValueError: If partition_type is neither 'random' nor 'metis'.
    """
    if not torch.cuda.is_available():
        print("No GPUs available. Distributed training across GPUs cannot be performed.")
        return 0.0, 0.0, None # Return dummy values and None for model state

    num_gpus = torch.cuda.device_count()
    if num_partitions > num_gpus:
        print(f"Warning: Requested {num_partitions} partitions but only {num_gpus} GPUs available. Using {num_gpus} GPUs.")
        num_partitions = num_gpus

    if num_partitions <= 0:
        print("No GPUs available to run distributed training.")
        return 0.0, 0.0, None

    print(f"\n--- Running Distributed Training with {num_partitions} Partitions ({partition_type}) across GPUs (Dask Orchestrated) ---")

    partition_start_time = time.time()
    if partition_type == 'random':
        partitions = random_partition_graph(data, num_partitions)
    elif partition_type == 'metis':
        partitions = metis_partition_graph(data, num_partitions)
    else:
        raise ValueError("Invalid partition_type. Choose 'random' or 'metis'.")
    partition_end_time = time.time()
    print(f"Total partitioning time: {partition_end_time - partition_start_time:.4f} seconds.")

    if partitions is None:
        # metis_partition_graph returns None when PyMetis is not installed.
        print(f"Error: '{partition_type}' partitioning is unavailable. Skipping distributed training.")
        return 0.0, 0.0, None

    total_nodes_sum = 0
    total_cut_edges_sum = 0
    print("\nPartition Statistics:")
    for i, p in enumerate(partitions):
        print(f"  Partition {i}: Nodes = {p['num_nodes_in_partition']}, Cut Edges = {p['num_cut_edges']}")
        total_nodes_sum += p['num_nodes_in_partition']
        total_cut_edges_sum += p['num_cut_edges']
    print(f"  Total nodes across partitions (sum): {total_nodes_sum}")
    print(f"  Total cut edges across partitions (sum of individual partition's cuts): {total_cut_edges_sum}")
    print(f"  Note: Total cut edges sum is 2x the actual unique cut edges as each is counted from both sides.")

    cluster = LocalCluster(n_workers=num_partitions, threads_per_worker=1, processes=True)
    client = None
    try:
        client = Client(cluster)
        print(f"Dask Dashboard: {client.dashboard_link}")

        # Every rank blocks in init_process_group until all ranks have joined,
        # so each rank needs its own worker process. Wait for all workers and
        # pin exactly one task to each; otherwise the scheduler could queue two
        # ranks on the same single-threaded worker and the rendezvous would hang.
        client.wait_for_workers(num_partitions)
        # client.run returns a dict keyed by worker address (one entry per worker).
        worker_addresses = sorted(client.run(os.getpid))

        common_args = (partitions, data.num_node_features, hidden_channels, dataset.num_classes,
                       epochs, data.x, data.y, data.train_mask, data.val_mask, data.test_mask, data.edge_index)

        training_start_time = time.time()

        futures = [
            client.submit(train_partition_worker, rank, num_partitions, *common_args,
                          workers=[worker_addresses[rank]], allow_other_workers=False,
                          pure=False)  # training has side effects; never reuse a cached result
            for rank in range(num_partitions)
        ]

        results_list = client.gather(futures)

        # Only rank 0 returns a state dict; every other rank returns None.
        final_model_state_dict = next((res for res in results_list if res is not None), None)

        if final_model_state_dict is None:
            print("Error: Could not retrieve final model state from any worker.")
            return 0.0, 0.0, None

        training_end_time = time.time()
        total_training_time = training_end_time - training_start_time
        print(f"\nTotal Distributed Training Time ({num_partitions} partitions): {total_training_time:.4f} seconds.")
    finally:
        # Release the worker processes (and their GPU contexts) on success and on failure.
        if client is not None:
            client.close()
        cluster.close()
        print("Dask client and cluster closed.")

    # Rebuild the model on the CPU in the driver process and evaluate on the test split.
    global_model = GCN(data.num_node_features, hidden_channels, dataset.num_classes)
    global_model.load_state_dict(final_model_state_dict)

    test_acc = evaluate_global(global_model, data.x, data.edge_index, data.y, data.test_mask)
    print(f'Final Test Accuracy (Aggregated Model): {test_acc:.4f}')
    return total_training_time, test_acc, global_model # Return the trained model for clustering

# # --- New Function: Random Walk based Node Classification (Node2Vec) ---
# def run_node2vec_classification(data, embedding_dim=128, walk_length=20, context_size=10,
#                                 walks_per_node=10, num_negative_samples=1, batch_size=128,
#                                 n2v_epochs=50, lr=0.01, num_classes=None):
#     """
#     Runs Node2Vec to generate embeddings and then trains a Logistic Regression
#     classifier on these embeddings for node classification.
#     """
#     if not NODE2VEC_AVAILABLE:
#         print("\nNode2Vec is not available. Skipping Node2Vec based Node Classification.")
#         return 0.0, 0.0 # Return dummy values
        
#     print("\n--- Running Node2Vec based Node Classification ---")
#     start_time = time.time()

#     device = 'cpu' # Node2Vec training is often done on CPU or can be moved to GPU if desired
    
#     model_n2v = Node2Vec(data.edge_index, embedding_dim=embedding_dim, walk_length=walk_length,
#                          context_size=context_size, walks_per_node=walks_per_node,
#                          num_negative_samples=num_negative_samples, p=1.0, q=1.0, sparse=True).to(device)
    
#     optimizer_n2v = torch.optim.Adam(model_n2v.parameters(), lr=lr)

#     loader = model_n2v.loader(batch_size=batch_size, shuffle=True, num_workers=0)

#     print(f"Training Node2Vec embeddings for {n2v_epochs} epochs...")
#     # --- PyTorch Profiler Integration for Node2Vec (Optional) ---
#     # For Node2Vec, we can also add a profiler.
#     # The `on_trace_ready` handler will save the trace to a file.
#     with torch.profiler.profile(
#         schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
#         on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(LOG_DIR, "node2vec_profiler")),
#         with_stack=True
#     ) as prof:
#         for epoch in range(1, n2v_epochs + 1):
#             model_n2v.train()
#             total_loss = 0
#             for pos_rw, neg_rw in loader:
#                 optimizer_n2v.zero_grad()
#                 loss = model_n2v.loss(pos_rw.to(device), neg_rw.to(device))
#                 loss.backward()
#                 optimizer_n2v.step()
#                 total_loss += loss.item()
#             if epoch % 10 == 0 or epoch == 1:
#                 print(f'Node2Vec Epoch: {epoch:03d}, Loss: {total_loss / len(loader):.4f}')
#             prof.step() # Mark the end of the step for the profiler
    
#     model_n2v.eval()
#     with torch.no_grad():
#         embeddings = model_n2v(torch.arange(data.num_nodes, device=device)).cpu().numpy()

#     embedding_time = time.time() - start_time
#     print(f"Node2Vec embedding generation complete in {embedding_time:.4f} seconds.")

#     print("Training Logistic Regression classifier on Node2Vec embeddings...")
#     train_indices = data.train_mask.nonzero(as_tuple=True)[0].numpy()
#     test_indices = data.test_mask.nonzero(as_tuple=True)[0].numpy()

#     X_train = embeddings[train_indices]
#     y_train = data.y[train_indices].numpy()
#     X_test = embeddings[test_indices]
#     y_test = data.y[test_indices].numpy()

#     classifier = LogisticRegression(solver='liblinear', multi_class='auto', max_iter=200)
#     classifier.fit(X_train, y_train)

#     y_pred = classifier.predict(X_test)
#     test_acc = accuracy_score(y_test, y_pred)

#     classification_time = time.time() - (start_time + embedding_time)
#     total_time = embedding_time + classification_time

#     print(f"Logistic Regression classification complete in {classification_time:.4f} seconds.")
#     print(f"Node2Vec + LR Total Time: {total_time:.4f} seconds, Test Accuracy: {test_acc:.4f}")

#     return total_time, test_acc

# --- New Function: GCN-based Clustering ---
# def run_gcn_clustering(trained_gcn_model, data_x_cpu, data_edge_index_cpu, data_y_cpu, num_classes):
#     """
#     Extracts embeddings from a trained GCN model and performs K-Means clustering.
#     Evaluates clustering quality using Adjusted Rand Index and Normalized Mutual Information.
#     """
#     print("\n--- Running GCN-based Clustering ---")
#     start_time = time.time()

#     device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
#     trained_gcn_model.eval() # Set model to evaluation mode
#     trained_gcn_model.to(device)

#     with torch.no_grad():
#         # Get embeddings from the hidden layer of the GCN
#         # We pass return_embeddings=True to get the output of conv1 (hidden layer)
#         gcn_embeddings = trained_gcn_model(data_x_cpu.to(device), data_edge_index_cpu.to(device), return_embeddings=True).cpu().numpy()

#     embedding_extraction_time = time.time() - start_time
#     print(f"GCN embedding extraction complete in {embedding_extraction_time:.4f} seconds.")

#     # Perform K-Means clustering
#     print(f"Performing K-Means clustering with {num_classes} clusters...")
#     kmeans = KMeans(n_clusters=num_classes, random_state=0, n_init=10) # n_init for robustness
#     cluster_labels = kmeans.fit_predict(gcn_embeddings)

#     clustering_time = time.time() - (start_time + embedding_extraction_time)
#     total_time = embedding_extraction_time + clustering_time

#     # Evaluate clustering quality using ground-truth labels (data.y)
#     # Note: Clustering is unsupervised, so "accuracy" is not directly applicable.
#     # We use metrics that compare the discovered clusters to the true classes.
#     true_labels = data_y_cpu.numpy()

#     ari_score = adjusted_rand_score(true_labels, cluster_labels)
#     nmi_score = normalized_mutual_info_score(true_labels, cluster_labels)

#     print(f"K-Means clustering complete in {clustering_time:.4f} seconds.")
#     print(f"GCN Clustering Total Time: {total_time:.4f} seconds.")
#     print(f"Adjusted Rand Index (ARI): {ari_score:.4f}")
#     print(f"Normalized Mutual Information (NMI): {nmi_score:.4f}")

#     return total_time, ari_score, nmi_score # Return ARI as the primary "accuracy" for plotting


# --- 6. Main Execution Block for Experimentation and Profiling ---
if __name__ == '__main__':
    hidden_channels = 16 # Number of channels in the GCN hidden layer

    if torch.cuda.is_available():
        num_partitions_list = [1, 2,3,4,8] 
        num_gpus_available = torch.cuda.device_count()
        num_partitions_list = [p for p in num_partitions_list if p <= num_gpus_available]
        if not num_partitions_list:
            print("No suitable number of partitions to run multi-GPU training (0 or more partitions than GPUs available).")
            print("Running only sequential CPU training.")
            num_partitions_list = [1]
    else:
        print("No GPUs available. Running only sequential CPU training.")
        num_partitions_list = [1]

    results = {} # Stores {'config_name': {'time': ..., 'accuracy': ..., 'nmi': ...}}

    # --- Run Sequential GCN (1 partition) as Baseline ---
    print("\n--- Running Sequential GCN Training (1 Partition) as Baseline ---")
    # Using PyTorch profiler for sequential GCN training
    with torch.profiler.profile(
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
        on_trace_ready=torch.profiler.tensorboard_trace_handler(os.path.join(LOG_DIR, "sequential_gcn_profiler")),
        with_stack=True
    ) as prof:
        model_sequential = GCN(data.num_node_features, hidden_channels, dataset.num_classes)
        optimizer_sequential = torch.optim.Adam(model_sequential.parameters(), lr=0.01, weight_decay=5e-4)
        criterion_sequential = torch.nn.CrossEntropyLoss()
        
        device_sequential = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        model_sequential.to(device_sequential)
        data_x_seq = data.x.to(device_sequential)
        data_y_seq = data.y.to(device_sequential)
        data_edge_index_seq = data.edge_index.to(device_sequential)
        data_train_mask_seq = data.train_mask.to(device_sequential)
        data_val_mask_seq = data.val_mask.to(device_sequential)
        data_test_mask_seq = data.test_mask.to(device_sequential)

        seq_start_time = time.time()
        for epoch in range(1, 101): # Fewer epochs for sequential for quicker profiling
            model_sequential.train()
            optimizer_sequential.zero_grad()
            out = model_sequential(data_x_seq, data_edge_index_seq)
            loss = criterion_sequential(out[data_train_mask_seq], data_y_seq[data_train_mask_seq])
            loss.backward()
            optimizer_sequential.step()
            if epoch % 10 == 0 or epoch == 1:
                train_acc = evaluate_global(model_sequential, data_x_seq, data_edge_index_seq, data_y_seq, data_train_mask_seq)
                val_acc = evaluate_global(model_sequential, data_x_seq, data_edge_index_seq, data_y_seq, data_val_mask_seq)
                print(f'Sequential GCN - Epoch: {epoch:03d}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}')
            prof.step() # Mark the end of the step for the profiler
        seq_end_time = time.time()
        seq_total_time = seq_end_time - seq_start_time
        seq_test_acc = evaluate_global(model_sequential, data_x_seq, data_edge_index_seq, data_y_seq, data_test_mask_seq)
        print(f"Sequential GCN Training Time (1 partition): {seq_total_time:.4f} seconds.")
        print(f"Sequential GCN Test Accuracy: {seq_test_acc:.4f}")
    
    # No cProfile/pstats for GCN sequential training as torch.profiler is used
    results['GCN_Sequential'] = {'time': seq_total_time, 'accuracy': seq_test_acc, 'nmi': None}


    # --- Run Distributed GCN Training ---
    trained_gcn_model_dist = None # To store the model for clustering
    if any(p > 1 for p in num_partitions_list):
        for n_parts in num_partitions_list:
            if n_parts == 1:
                continue
            print(f"\n--- Running Distributed GCN Training with {n_parts} Partitions (Random Partitioning) ---")
            # Profiling for distributed training is handled inside train_partition_worker on rank 0
            train_time, test_acc, model_for_clustering = run_distributed_training(n_parts, partition_type='random', epochs=100)
            results[f'GCN_Random_{n_parts}_parts'] = {'time': train_time, 'accuracy': test_acc, 'nmi': None}
            if trained_gcn_model_dist is None:
                trained_gcn_model_dist = model_for_clustering

        if METIS_AVAILABLE:
            for n_parts in num_partitions_list:
                if n_parts == 1:
                    continue
                print(f"\n--- Running Distributed GCN Training with {n_parts} Partitions (METIS Partitioning) ---")
                # Profiling for distributed training is handled inside train_partition_worker on rank 0
                train_time, test_acc, model_for_clustering = run_distributed_training(n_parts, partition_type='metis', epochs=100)
                results[f'GCN_METIS_{n_parts}_parts'] = {'time': train_time, 'accuracy': test_acc, 'nmi': None}
                if trained_gcn_model_dist is None:
                    trained_gcn_model_dist = model_for_clustering

    # --- Run Node2Vec based Classification ---
#     n2v_embedding_dim = 128
#     n2v_walk_length = 20
#     n2v_context_size = 10
#     n2v_walks_per_node = 10
#     n2v_num_negative_samples = 1
#     n2v_batch_size = 128
#     n2v_epochs = 50

#     # Profiling for Node2Vec is integrated within run_node2vec_classification
#     n2v_time, n2v_acc = run_node2vec_classification(
#         data,
#         embedding_dim=n2v_embedding_dim,
#         walk_length=n2v_walk_length,
#         context_size=n2v_context_size,
#         walks_per_node=n2v_walks_per_node,
#         num_negative_samples=n2v_num_negative_samples,
#         batch_size=n2v_batch_size,
#         n2v_epochs=n2v_epochs,
#         num_classes=dataset.num_classes
#     )
#     results['Node2Vec_LR'] = {'time': n2v_time, 'accuracy': n2v_acc, 'nmi': None}


    # --- Run GCN-based Clustering (using the sequential GCN model for simplicity) ---
    # if model_sequential is not None:
    #     gcn_cluster_time, gcn_cluster_ari, gcn_cluster_nmi = run_gcn_clustering(
    #         model_sequential, data.x, data.edge_index, data.y, dataset.num_classes
    #     )
    #     results['GCN_Clustering'] = {'time': gcn_cluster_time, 'accuracy': gcn_cluster_ari, 'nmi': gcn_cluster_nmi}

    print("\n--- Summary of All Results ---")
    # Emit one line per configuration. The NMI field is appended only for
    # entries that actually recorded an NMI value.
    for config, res in results.items():
        summary_line = f"Configuration: {config}, Total Time: {res['time']:.4f}s, Test Accuracy/ARI: {res['accuracy']:.4f}"
        if res['nmi'] is not None:
            summary_line += f", NMI: {res['nmi']:.4f}"
        print(summary_line)

    # --- 7. Visualization of Results ---
    # Draws two bar charts from `results`: total time per configuration and
    # performance score per configuration. Bars are coloured by configuration
    # family so related runs are easy to compare.
    print("\n--- Generating Performance Plots ---")
    configs = list(results.keys())
    times = [results[c]['time'] for c in configs]
    performance_scores = [results[c]['accuracy'] for c in configs]

    # --- Colorblind-friendly palette ---
    # Using a subset of the Color Brewer 'Set2' or 'Paired' for distinct categories
    # Or a custom set based on common colorblind-safe recommendations
    colorblind_colors = {
        'GCN_Sequential': '#8DA0CB', # Light Blue/Lavender
        'GCN_Random': '#FC8D62',     # Salmon/Orange
        'GCN_METIS': '#66C2A5',      # Teal/Green
        # Add more if other configurations are uncommented later
    }
    fallback_color = '#A6D854' # Used when a configuration name matches no palette key

    # Each bar takes the colour of the first palette key contained in its
    # configuration name (dict order: Sequential, Random, METIS), so adding a
    # new family only requires a new palette entry.
    colors = [
        next(
            (shade for family, shade in colorblind_colors.items() if family in config_name),
            fallback_color,
        )
        for config_name in configs
    ]

    if not configs:
        # min()/max() below raise ValueError on an empty sequence, and an
        # empty bar chart carries no information, so skip plotting entirely.
        print("No results were recorded; skipping performance plots.")
    else:
        # Plotting Total Times
        plt.figure(figsize=(14, 7))
        plt.bar(configs, times, color=colors)
        plt.xlabel('Configuration', fontsize=12, fontweight='bold')
        plt.ylabel('Total Time (seconds)', fontsize=12, fontweight='bold')
        plt.title('Comparison of Total Time Across Configurations', fontsize=14, fontweight='bold')
        plt.xticks(rotation=45, ha='right', fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold') # Make y-tick labels bold
        plt.tight_layout()
        plt.show()

        # Plotting Performance Scores (Accuracy for Classification, ARI for Clustering)
        lowest_score = min(performance_scores)
        highest_score = max(performance_scores)
        # Pad the y-axis away from the data on both sides. Multiplying a
        # negative score (possible for ARI) by 0.9 / 1.05 would move the limit
        # towards the data and clip the bar, so the factor depends on the sign.
        # For non-negative scores this is exactly min * 0.9 and max * 1.05.
        y_lower = lowest_score * (0.9 if lowest_score >= 0 else 1.1)
        y_upper = highest_score * (1.05 if highest_score >= 0 else 0.95)

        plt.figure(figsize=(14, 7))
        plt.bar(configs, performance_scores, color=colors)
        plt.xlabel('Configuration', fontsize=12, fontweight='bold')
        plt.ylabel('Performance Score (Accuracy)', fontsize=12, fontweight='bold')
        plt.title('Comparison of Performance Scores Across Configurations', fontsize=14, fontweight='bold')
        plt.ylim(y_lower, y_upper)
        plt.xticks(rotation=45, ha='right', fontsize=10, fontweight='bold')
        plt.yticks(fontsize=10, fontweight='bold') # Make y-tick labels bold
        plt.tight_layout()
        plt.show()


    # Plotting Performance Scores (Accuracy for Classification, ARI for Clustering)
    # plt.figure(figsize=(14, 7))
    # plt.bar(configs, performance_scores, color=colors)
    # plt.xlabel('Configuration')
    # plt.ylabel('Performance Score (Accuracy for Classification, ARI for Clustering)')
    # plt.title('Comparison of Performance Scores Across Configurations')
    # plt.ylim(min(performance_scores) * 0.9, max(performance_scores) * 1.05)
    # plt.xticks(rotation=45, ha='right')
    # plt.tight_layout()
    # plt.show()

    # Plotting NMI Scores for Clustering (if applicable)
    # if nmi_scores:
    #     nmi_colors = [colors[configs.index(c)] for c in nmi_configs]
    #     plt.figure(figsize=(8, 6))
    #     plt.bar(nmi_configs, nmi_scores, color=nmi_colors)
    #     plt.xlabel('Clustering Configuration')
    #     plt.ylabel('Normalized Mutual Information (NMI)')
    #     plt.title('Comparison of NMI Scores for Clustering Configurations')
    #     plt.ylim(min(nmi_scores) * 0.9, max(nmi_scores) * 1.05)
    #     plt.xticks(rotation=45, ha='right')
    #     plt.tight_layout()
    #     plt.show()

    #print("\nHPC GCN, Node2Vec, and GCN-based Clustering examples finished.")
    # Closing notes for the reader: what the comparison shows, where the
    # PyTorch profiler traces were written, and how to profile with Nsight Systems.
    print("This comparison highlights different approaches to node classification and clustering on graphs.")
    # The ARI remark only applies when a clustering result was actually
    # recorded under the 'GCN_Clustering' key; otherwise it would be misleading.
    if 'GCN_Clustering' in results:
        print("For GCN Clustering, 'accuracy' in the plot refers to Adjusted Rand Index (ARI).")
    print("\n--- PyTorch Profiler Trace Files ---")
    print(f"Trace files have been saved to the '{LOG_DIR}' directory.")
    print("You can visualize these traces using TensorBoard by running:")
    print(f"tensorboard --logdir {LOG_DIR}")
    print("Then open your web browser and navigate to the address provided by TensorBoard (usually http://localhost:6006).")
    print("In TensorBoard, go to the 'Profile' tab to explore the execution details.")

    print("\n--- NVIDIA Nsight Systems Profiling ---")
    print("NVIDIA Nsight Systems is an external command-line profiler that provides deep insights into GPU utilization, CUDA kernel performance, and CPU-GPU interactions.")
    print("To profile your script with Nsight Systems, ensure it is installed and in your system's PATH.")
    print("Then, run your Python script from the terminal using the `nsys` command. For example:")
    print(f"nsys profile -o {LOG_DIR}/nsys_report --stats=true python your_script_name.py")
    print("Replace 'your_script_name.py' with the actual name of your Python script.")
    # Current Nsight Systems releases write `.nsys-rep`; `.qdrep` is the legacy extension.
    print("This will generate an Nsight Systems report (`.nsys-rep` file, or `.qdrep` with older Nsight Systems releases) in the specified directory.")
    print("You can then open this report using the Nsight Systems GUI for detailed analysis.")
    # Printed once at the very end (it used to be emitted twice).
    print("Further optimizations and hyperparameter tuning would be beneficial for all methods.")


In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard
# Launch TensorBoard on the profiler trace directory.
# NOTE(review): the path is hard-coded here; presumably it matches the LOG_DIR
# used by the training cell — confirm, or the Profile tab will be empty.
%tensorboard --logdir runs/hpc_gcn_profiler