<a href="https://colab.research.google.com/github/SecretPasta/DAGFCN/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Prerequisites
Clean-install the PyTorch dependencies required by Node2Vec, then import everything the notebook uses.

In [None]:
!pip uninstall -y torch-scatter torch-sparse torch-cluster torch-spline-conv


In [None]:
import torch
print(torch.__version__)  # Should match the version in the installation instructions
print(torch.version.cuda)  # Ensure this matches the target version (e.g., '11.8')


In [None]:
# Required installations for PyTorch Geometric
!pip install -q torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-2.5.1+cu121.html


In [None]:
try:
    import torch_scatter
    import torch_sparse
    import torch_cluster
    import torch_spline_conv
    print("All required libraries are installed and working!")
except ImportError as e:
    print(f"Error: {e}")


In [None]:
import torch
from torch_geometric.nn import Node2Vec
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print("PyTorch Geometric is successfully installed!")


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch_geometric.datasets import CoraFull
from torch_geometric.nn import Node2Vec
from torch_geometric.utils import to_torch_csr_tensor
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os

print(torch.cuda.is_available())  # Should return True on a GPU runtime
# get_device_name raises when no CUDA device exists, so only query it on GPU runtimes
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))  # Should display T4
else:
    print("No CUDA device found; the notebook will run on the CPU.")

In [None]:
# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Datasets


In [None]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


Loading CoraFull dataset from torch_geometric

In [None]:
def load_cora_dataset(save_path="/content/drive/My Drive/Dataset/CoraFull_saved.pt"):
    """
    Load the CoraFull dataset, print its statistics, and save it to Google Drive.

    Parameters:
        save_path (str): The file path to save the dataset in Google Drive.

    Returns:
        torch_geometric.data.Data: The first graph of the dataset, moved to the
        notebook-level `device`.
    """
    # torch.save needs the parent directory to exist
    target_dir = os.path.dirname(save_path)
    os.makedirs(target_dir, exist_ok=True)

    cora = CoraFull(root='./data/CoraFull')
    graph = cora[0].to(device)

    # Total bytes held by the tensor attributes of the graph, reported in GB
    total_bytes = sum(
        attr.element_size() * attr.numel()
        for _, attr in graph
        if isinstance(attr, torch.Tensor)
    )
    size_gb = total_bytes / (1024 ** 3)

    report = (
        f"Dataset: {cora}",
        f"Number of nodes: {graph.num_nodes}",
        f"Number of edges: {graph.num_edges}",
        f"Number of features: {graph.num_features}",
        f"Number of classes: {cora.num_classes}",
        f"Dataset size: {size_gb:.4f} GB",
    )
    for line in report:
        print(line)

    # Persist the graph object to the requested location
    torch.save(graph, save_path)
    print(f"Dataset saved to: {save_path}")

    return graph

# Specify the save path in Google Drive
save_path = "/content/drive/My Drive/Dataset/CoraFull_saved.pt"

# Load and save the dataset
data = load_cora_dataset(save_path=save_path)

Loading Pubmed dataset from torch_geometric

In [None]:
from torch_geometric.datasets import Planetoid

def load_and_save_pubmed(save_path="/content/drive/My Drive/Dataset/Pubmed_saved.pt"):
    """
    Load the Pubmed dataset, print its statistics, and save it to Google Drive.

    Parameters:
        save_path (str): The file path to save the dataset in Google Drive.

    Returns:
        torch_geometric.data.Data: The first graph of the dataset, moved to the
        notebook-level `device`.
    """
    # torch.save needs the parent directory to exist
    target_dir = os.path.dirname(save_path)
    os.makedirs(target_dir, exist_ok=True)

    pubmed = Planetoid(root='./data/Pubmed', name='Pubmed')
    graph = pubmed[0].to(device)

    # Total bytes held by the tensor attributes of the graph, reported in GB
    total_bytes = sum(
        attr.element_size() * attr.numel()
        for _, attr in graph
        if isinstance(attr, torch.Tensor)
    )
    size_gb = total_bytes / (1024 ** 3)

    report = (
        f"Dataset: {pubmed}",
        f"Number of nodes: {graph.num_nodes}",
        f"Number of edges: {graph.num_edges}",
        f"Number of features: {graph.num_features}",
        f"Number of classes: {pubmed.num_classes}",
        f"Dataset size: {size_gb:.4f} GB",
    )
    for line in report:
        print(line)

    # Persist the graph object to the requested location
    torch.save(graph, save_path)
    print(f"Dataset saved to: {save_path}")

    return graph

# Specify the save path in Google Drive
save_path = "/content/drive/My Drive/Dataset/Pubmed_saved.pt"

# Load and save the Pubmed dataset
#data = load_and_save_pubmed(save_path=save_path)

Loading CiteSeer dataset from torch_geometric

In [None]:
def load_and_save_citeseer(save_path="/content/drive/My Drive/Dataset/CiteSeer_saved.pt"):
    """
    Load the CiteSeer dataset, print its statistics, and save it to Google Drive.

    Parameters:
        save_path (str): The file path to save the dataset in Google Drive.

    Returns:
        torch_geometric.data.Data: The first graph of the dataset, moved to the
        notebook-level `device`.
    """
    # torch.save needs the parent directory to exist
    target_dir = os.path.dirname(save_path)
    os.makedirs(target_dir, exist_ok=True)

    citeseer = Planetoid(root='./data/CiteSeer', name='CiteSeer')
    graph = citeseer[0].to(device)

    # Total bytes held by the tensor attributes of the graph, reported in GB
    total_bytes = sum(
        attr.element_size() * attr.numel()
        for _, attr in graph
        if isinstance(attr, torch.Tensor)
    )
    size_gb = total_bytes / (1024 ** 3)

    report = (
        f"Dataset: {citeseer}",
        f"Number of nodes: {graph.num_nodes}",
        f"Number of edges: {graph.num_edges}",
        f"Number of features: {graph.num_features}",
        f"Number of classes: {citeseer.num_classes}",
        f"Dataset size: {size_gb:.4f} GB",
    )
    for line in report:
        print(line)

    # Persist the graph object to the requested location
    torch.save(graph, save_path)
    print(f"Dataset saved to: {save_path}")

    return graph

# Specify the save path in Google Drive
save_path = "/content/drive/My Drive/Dataset/CiteSeer_saved.pt"

# Load and save the CiteSeer dataset (disabled; uncomment the call below to run it)
#data = load_and_save_citeseer(save_path=save_path)

Load the datasets from drive

In [None]:
def load_cora_from_drive(load_path="/content/drive/My Drive/Dataset/CoraFull_saved.pt"):
    """
    Load the saved Cora dataset from Google Drive.

    Parameters:
        load_path (str): The file path to load the Cora dataset from Google Drive.

    Returns:
        torch_geometric.data.Data: The loaded Cora dataset.

    Raises:
        FileNotFoundError: If nothing exists at `load_path`.
    """
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"The file at {load_path} does not exist. Please ensure it is saved correctly.")

    # The file may have been written from a GPU session; remap its tensors to a device
    # that exists in this runtime so loading also works on CPU-only machines.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # weights_only=False: the file stores a pickled graph object, not a bare state dict.
    # SECURITY: unpickling can execute arbitrary code -- only load files you saved yourself.
    data = torch.load(load_path, map_location=target_device, weights_only=False)
    print(f"Cora dataset loaded successfully from {load_path}.")
    print(f"Number of nodes: {data.num_nodes}")
    print(f"Number of edges: {data.num_edges}")
    print(f"Number of features: {data.num_features}")
    return data
#data = load_cora_from_drive()

In [None]:
def load_pubmed_from_drive(load_path="/content/drive/My Drive/Dataset/Pubmed_saved.pt"):
    """
    Load the saved Pubmed dataset from Google Drive.

    Parameters:
        load_path (str): The file path to load the Pubmed dataset from Google Drive.

    Returns:
        torch_geometric.data.Data: The loaded Pubmed dataset.

    Raises:
        FileNotFoundError: If nothing exists at `load_path`.
    """
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"The file at {load_path} does not exist. Please ensure it is saved correctly.")

    # The file may have been written from a GPU session; remap its tensors to a device
    # that exists in this runtime so loading also works on CPU-only machines.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # weights_only=False: the file stores a pickled graph object, not a bare state dict.
    # SECURITY: unpickling can execute arbitrary code -- only load files you saved yourself.
    data = torch.load(load_path, map_location=target_device, weights_only=False)
    print(f"Pubmed dataset loaded successfully from {load_path}.")
    print(f"Number of nodes: {data.num_nodes}")
    print(f"Number of edges: {data.num_edges}")
    print(f"Number of features: {data.num_features}")
    return data

#data = load_pubmed_from_drive()

In [None]:
def load_citeseer_from_drive(load_path="/content/drive/My Drive/Dataset/CiteSeer_saved.pt"):
    """
    Load the saved CiteSeer dataset from Google Drive.

    Parameters:
        load_path (str): The file path to load the CiteSeer dataset from Google Drive.

    Returns:
        torch_geometric.data.Data: The loaded CiteSeer dataset.

    Raises:
        FileNotFoundError: If nothing exists at `load_path`.
    """
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"The file at {load_path} does not exist. Please ensure it is saved correctly.")

    # The file may have been written from a GPU session; remap its tensors to a device
    # that exists in this runtime so loading also works on CPU-only machines.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # weights_only=False: the file stores a pickled graph object, not a bare state dict.
    # SECURITY: unpickling can execute arbitrary code -- only load files you saved yourself.
    data = torch.load(load_path, map_location=target_device, weights_only=False)
    print(f"CiteSeer dataset loaded successfully from {load_path}.")
    print(f"Number of nodes: {data.num_nodes}")
    print(f"Number of edges: {data.num_edges}")
    print(f"Number of features: {data.num_features}")
    return data

#data = load_citeseer_from_drive()

# Node2Vec

Passing in the dataset to generate node embeddings

In [None]:
# Node2Vec hyperparameters. The next cell forwards them to the Node2Vec constructor
# (embedding_dim ... sparse), to the optimizer (n2v_lr) and to the walk loader (n2v_bs).
embedding_dim = 128
walk_length = 20
context_size = 10
walks_per_node=10
num_negative_samples=1
# p and q are passed straight through as Node2Vec's `p` and `q` arguments
p=0.5
q=0.25
sparse=True
n2v_lr=0.01 #learning rate
n2v_bs=128 #batch size

In [None]:
# Step 1: Node2Vec for Embedding Initialization
# Build the model from the graph's edge list and the hyperparameters defined in the
# previous cell, then move it to `device`.
node2vec = Node2Vec(
    edge_index=data.edge_index,
    embedding_dim=embedding_dim,
    walk_length=walk_length,
    context_size=context_size,
    walks_per_node=walks_per_node,
    num_negative_samples=num_negative_samples,
    p=p, q=q,
    sparse=sparse
).to(device)

# Optimizer over the model parameters and the loader that train_node2vec iterates over
node2vec_optimizer = torch.optim.SparseAdam(node2vec.parameters(), lr=n2v_lr)
node2vec_loader = node2vec.loader(batch_size=n2v_bs, shuffle=True)

def train_node2vec(epochs=10):
    """
    Train the module-level `node2vec` model.

    Runs `epochs` passes over `node2vec_loader`, stepping `node2vec_optimizer`
    after every batch, and prints the mean batch loss of each epoch.

    Parameters:
        epochs (int): Number of passes over the loader. Default is 10.
    """
    node2vec.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for walks_pos, walks_neg in node2vec_loader:
            node2vec_optimizer.zero_grad()
            batch_loss = node2vec.loss(walks_pos.to(device), walks_neg.to(device))
            batch_loss.backward()
            node2vec_optimizer.step()
            running_loss += batch_loss.item()
        # Average over the number of batches in the loader
        mean_loss = running_loss / len(node2vec_loader)
        print(f"Epoch {epoch_no}/{epochs}, Loss: {mean_loss:.4f}")

# Run the training loop defined above for 10 epochs
train_node2vec(epochs=10)

# Extract embeddings
# The model is called with no arguments; the result is detached from autograd and
# copied to a CPU NumPy array (presumably one row per node -- confirm against the Node2Vec docs).
node_embeddings = node2vec().detach().cpu().numpy()

# Isolation Forest

Passing the node embeddings into the Isolation Forest to isolate anomalies within the dataset

In [None]:
def isolation_forest(node_embeddings, n_estimators=50, contamination=0.2):
    """
    Detect anomalies in node embeddings using Isolation Forest.

    Parameters:
        node_embeddings (numpy.ndarray): A 2D array where each row represents the embedding of a node.
        n_estimators (int): Number of trees in the Isolation Forest. Default is 50.
        contamination (float): The proportion of anomalies in the data. Default is 0.2.

    Returns:
        tuple:
            numpy.ndarray: Labels array where 1 indicates an anomaly and 0 indicates normal.
            numpy.ndarray: Anomaly mask where True indicates an anomaly and False indicates normal.
    """
    # Fixed random_state keeps the forest reproducible between runs
    forest = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=42,
    )

    # Fit on the embeddings, then score the same points: 1 = normal, -1 = anomaly
    raw_predictions = forest.fit(node_embeddings).predict(node_embeddings)

    # Boolean mask of the anomalies and its 0/1 integer counterpart
    anomaly_mask = raw_predictions == -1
    return anomaly_mask.astype(int), anomaly_mask

# GFCN

Defining the Graph Fairing Convolutional Network

In [None]:
# Step 3: Graph Fairing Convolutional Network (GFCN)
class GraphConvolution(nn.Module):
    """
    Single graph-convolution layer: ``adj @ (x @ weight)`` plus an optional bias.

    Parameters:
        in_features (int): Size of each input row.
        out_features (int): Size of each output row.
        bias (bool): When True a learnable bias vector is added to the output.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        # Storage is allocated uninitialised and filled in by reset_parameters()
        self.weight = nn.Parameter(torch.empty(in_features, out_features, dtype=torch.float32))
        if bias:
            self.bias = nn.Parameter(torch.empty(out_features, dtype=torch.float32))
        else:
            self.bias = None
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-uniform weights; the bias, when present, starts at zero."""
        nn.init.xavier_uniform_(self.weight)
        if self.bias is None:
            return
        nn.init.zeros_(self.bias)

    def forward(self, x, adj):
        """Project `x` with the weight matrix, then aggregate through `adj`."""
        projected = torch.mm(x, self.weight)
        aggregated = torch.spmm(adj, projected)
        if self.bias is None:
            return aggregated
        aggregated += self.bias
        return aggregated

class GFCN(nn.Module):
    """
    Three stacked GraphConvolution layers with ReLU and dropout between them.

    Parameters:
        nfeat (int): Input feature size.
        nhid (int): Width of the two hidden layers.
        nclass (int): Output size of the last layer.
        dropout (float): Dropout probability applied after each hidden layer.
    """

    def __init__(self, nfeat, nhid, nclass, dropout):
        super().__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nhid)
        self.gc3 = GraphConvolution(nhid, nclass)
        self.dropout = dropout

    def forward(self, x, adj):
        """Return the raw (un-normalised) outputs of the final layer."""
        # Both hidden layers share the same conv -> ReLU -> dropout pattern
        for hidden_layer in (self.gc1, self.gc2):
            activated = F.relu(hidden_layer(x, adj))
            x = F.dropout(activated, self.dropout, training=self.training)
        return self.gc3(x, adj)

In [None]:
def gfcn(data, node_embeddings, anomaly_mask, device='cuda', epochs=150, train_ratio=0.8):
    """
    Trains a classifier on the node embeddings using the Isolation Forest mask as
    targets and returns a predicted label for every node.

    NOTE(review): the model trained here is a local two-layer MLP whose forward pass
    accepts `adj` but never uses it, so the graph structure does not influence the
    predictions. The module-level `GFCN` (built from `GraphConvolution`) is not used
    by this function -- confirm whether that is intended.

    Parameters:
        data (torch_geometric.data.Data): The graph data containing edge_index and num_nodes.
        node_embeddings (numpy.ndarray): Node embeddings as input features.
        anomaly_mask (numpy.ndarray): Boolean mask indicating anomalies (True = anomaly).
        device (str): Device to run the model on ('cuda' or 'cpu'). When CUDA is
            requested but unavailable the function falls back to the CPU.
        epochs (int): Number of training epochs. Default is 150.
        train_ratio (float): Ratio of training nodes to total nodes. Default is 0.8.

    Returns:
        torch.Tensor: Predicted labels for all nodes (0 = normal, 1 = anomaly).
    """
    # 'cuda' is the default; do not crash on runtimes without a GPU
    if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
        print("CUDA is not available; running GFCN on the CPU instead.")
        device = 'cpu'

    # Convert node_embeddings and anomaly_mask to PyTorch tensors
    features = torch.tensor(node_embeddings, dtype=torch.float32, device=device)
    labels = torch.tensor(anomaly_mask.astype(int), dtype=torch.long, device=device)

    # num_nodes may arrive as a NumPy integer (remove_nodes stores mask.sum()); normalise it
    num_nodes = int(data.num_nodes)

    # Convert edge_index to a PyTorch sparse tensor
    adj = to_torch_csr_tensor(data.edge_index, size=(num_nodes, num_nodes)).to(device)

    # The first `train_ratio` share of the nodes (by index) is used for training;
    # the remaining nodes are only predicted, never used in the loss.
    num_train = int(train_ratio * num_nodes)
    idx_train = torch.arange(num_train, device=device)

    # Local classifier. Named differently from the module-level GFCN so it no longer shadows it.
    class _EmbeddingClassifier(nn.Module):
        def __init__(self, nfeat, nhid, nclass, dropout):
            super().__init__()
            self.gc1 = nn.Linear(nfeat, nhid)
            self.gc2 = nn.Linear(nhid, nclass)
            self.dropout = dropout

        def forward(self, x, adj):
            # `adj` is accepted for interface parity but is not used
            x = torch.relu(self.gc1(x))
            x = torch.dropout(x, p=self.dropout, train=self.training)
            x = self.gc2(x)
            return x

    # Model and optimizer
    gcn = _EmbeddingClassifier(nfeat=features.shape[1], nhid=64, nclass=2, dropout=0.5).to(device)
    optimizer = torch.optim.Adam(gcn.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()

    # Full-batch training: one optimizer step per epoch on the training nodes
    gcn.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        output = gcn(features, adj)
        loss = criterion(output[idx_train], labels[idx_train])
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}")

    # Evaluation: Predict labels for all nodes
    gcn.eval()
    with torch.no_grad():
        output = gcn(features, adj)
        predictions = torch.argmax(output, dim=1)  # Predicted labels

    return predictions


In [None]:
def evaluate_model(data, node_embeddings, true_labels, predicted_labels, iteration):
    """
    Evaluate the model at the end of each iteration.

    Parameters:
        data: Not used by the computation; kept so existing callers keep working.
        node_embeddings: Not used by the computation; kept so existing callers keep working.
        true_labels (array-like): Reference labels of the nodes (1 for anomaly, 0 for normal).
        predicted_labels (array-like): Predicted labels (1 for anomaly, 0 for normal).
        iteration (int): The current iteration number (only used in the printed report).

    Returns:
        dict: 'accuracy', 'f1', 'precision' and 'recall' as Python floats.
        Every metric is 0.0 when its denominator is zero (including empty input).

    Raises:
        ValueError: If the two label arrays do not contain the same number of entries.
    """
    print(f"Evaluating model at iteration {iteration}...")

    # Coerce to flat arrays: a plain Python list compared with `== 1` yields a single
    # False instead of an element-wise mask, which silently produced all-zero metrics.
    true_labels = np.asarray(true_labels).reshape(-1)
    predicted_labels = np.asarray(predicted_labels).reshape(-1)
    if true_labels.shape != predicted_labels.shape:
        raise ValueError(
            f"true_labels has {true_labels.size} entries but predicted_labels has {predicted_labels.size}."
        )

    # Confusion-matrix counts as plain ints so the ratios below are plain floats
    true_positives = int(np.sum((true_labels == 1) & (predicted_labels == 1)))
    true_negatives = int(np.sum((true_labels == 0) & (predicted_labels == 0)))
    false_positives = int(np.sum((true_labels == 0) & (predicted_labels == 1)))
    false_negatives = int(np.sum((true_labels == 1) & (predicted_labels == 0)))

    total = int(true_labels.size)
    predicted_positive = true_positives + false_positives
    actual_positive = true_positives + false_negatives

    accuracy = (true_positives + true_negatives) / total if total > 0 else 0.0
    precision = true_positives / predicted_positive if predicted_positive > 0 else 0.0
    recall = true_positives / actual_positive if actual_positive > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    print(f"Iteration {iteration} - Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    return {
        'accuracy': float(accuracy),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall)
    }

In [None]:
def remove_nodes(data, node_indices):
    """
    Remove nodes from graph data and renumber the remaining ones.

    The object is updated in place (and also returned): `x` keeps only the rows of the
    surviving nodes, `y` is filtered the same way when it is a tensor with one entry
    per node, `edge_index` keeps only edges between surviving nodes, and `num_nodes`
    is set to the new node count. Surviving nodes are renumbered 0..m-1 in their
    original order so that `edge_index` stays consistent with the shrunken `x`.
    Other per-node attributes are not touched.

    Parameters:
        data (torch_geometric.data.Data): The graph data containing edge_index and num_nodes.
        node_indices (numpy.ndarray): Indices of nodes to remove. Indices that are
            >= num_nodes are ignored.

    Returns:
        torch_geometric.data.Data: Updated graph data with specified nodes removed.
    """
    num_nodes = int(data.num_nodes)
    node_indices = np.asarray(node_indices, dtype=np.int64).reshape(-1)

    # keep[i] is True for every node that survives
    keep = np.ones(num_nodes, dtype=bool)
    keep[node_indices[node_indices < num_nodes]] = False  # Ensure indices are within bounds
    keep_tensor = torch.from_numpy(keep)

    # Filter the per-node tensors
    features = getattr(data, 'x', None)
    if features is not None:
        data.x = features[keep_tensor.to(features.device)]
    node_labels = getattr(data, 'y', None)
    if isinstance(node_labels, torch.Tensor) and node_labels.dim() > 0 and node_labels.size(0) == num_nodes:
        data.y = node_labels[keep_tensor.to(node_labels.device)]

    # Keep only the edges whose two endpoints both survive
    edge_device = data.edge_index.device
    edges = data.edge_index.cpu().numpy()
    edges = edges[:, (edges[0] < num_nodes) & (edges[1] < num_nodes)]  # drop out-of-range ids
    edges = edges[:, keep[edges[0]] & keep[edges[1]]]

    # Map old node ids to their new position: the k-th surviving node becomes id k.
    # Without this, edge_index would reference ids >= the new num_nodes.
    new_id = np.cumsum(keep) - 1
    edges = new_id[edges]
    data.edge_index = torch.tensor(edges, dtype=torch.long, device=edge_device)

    # Plain int so downstream size arguments do not receive a NumPy scalar
    data.num_nodes = int(keep.sum())
    return data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Creates a Folder for the experiments

In [None]:
# Function to create a folder
def create_folder_in_drive(base_path, folder_name):
    """
    Make sure a folder exists below a given path in Google Drive.

    :param base_path: Directory under which the folder lives (str)
    :param folder_name: Name of the folder to create (str)
    :return: Full path of the folder, whether it was created now or already existed (str)
    """
    target = os.path.join(base_path, folder_name)

    # Nothing to do when the path is already there
    if os.path.exists(target):
        print(f"Folder already exists: {target}")
        return target

    # Creates intermediate directories as needed
    os.makedirs(target)
    print(f"Folder created: {target}")
    return target

# Path to the folder holding the experiments
base_path = '/content/drive/MyDrive/Final Project - Boris & Omri/Experiments'

# Specify the folder name should be experiment name
folder_name = 'TestRun'

# Create the folder
output_folder  = create_folder_in_drive(base_path, folder_name)
print(f"Folder path: {output_folder }")


In [None]:
import matplotlib.pyplot as plt


def iterative_anomaly_detection(data, node_embeddings, K, device='cuda', output_dir=None):
    """
    Repeatedly flag anomalous nodes and prune them from the graph.

    Each iteration labels the current embeddings with an Isolation Forest, trains the
    `gfcn` classifier on those labels, removes every node the classifier predicts as
    anomalous, and scores the classifier against the Isolation Forest labels.
    The loop ends after `K` iterations, when the classifier flags nothing, or when no
    nodes are left. A score plot and a CSV of the removed nodes are then written.

    Parameters:
        data (torch_geometric.data.Data): The graph. It is pruned in place through
            `remove_nodes`.
        node_embeddings (numpy.ndarray): One embedding row per node of `data`.
        K (int): Maximum number of iterations.
        device (str): Device passed to `gfcn`. When CUDA is requested but unavailable
            the CPU is used instead.
        output_dir (str | None): Folder for the plot and CSV. When None, the
            notebook-level `output_folder` is used.

    Returns:
        tuple:
            numpy.ndarray: Embeddings of the nodes that were never removed.
            list[list[int]]: For each iteration, the removed nodes expressed as row
            indices into the `node_embeddings` array that was originally passed in.
    """
    # Resolve the destination first so a missing folder variable fails before any training
    save_dir = output_folder if output_dir is None else output_dir

    # 'cuda' is the default; do not crash on runtimes without a GPU
    if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
        print("CUDA is not available; running on the CPU instead.")
        device = 'cpu'

    evaluation_scores = []
    removed_nodes_per_iteration = []  # Track nodes removed in each iteration

    # original_ids[i] is the row of the initial embedding matrix that row i of the
    # current (shrinking) matrix came from. Positions in the shrinking matrix are
    # meaningless to callers, who index the original embeddings with the result.
    original_ids = np.arange(node_embeddings.shape[0])

    print(f"Initial Number of Nodes: {data.num_nodes}")
    for iteration in range(K):
        # Step 1: Apply Isolation Forest to detect anomalies (150 trees, contamination 0.3)
        labels, anomaly_mask = isolation_forest(node_embeddings, 150, 0.3)
        num_anomalies_iforest = np.sum(anomaly_mask)
        print(f"Iteration {iteration + 1}/{K}: {num_anomalies_iforest} anomalies detected by Isolation Forest.")

        # Step 2: Apply GFCN to classify anomalies
        predictions = gfcn(data, node_embeddings, anomaly_mask, device=device)
        predicted_labels = predictions.cpu().numpy()
        anomaly_indices = np.where(predicted_labels == 1)[0]
        num_anomalies_gfcn = len(anomaly_indices)
        print(f"Iteration {iteration + 1}/{K}: {num_anomalies_gfcn} anomalies detected by GFCN.")

        if num_anomalies_gfcn == 0:
            print("No anomalies detected by GFCN. Stopping iteration.")
            break

        # Step 3: Remove the flagged nodes from the embeddings, the id map and the graph
        anomaly_indices = anomaly_indices[anomaly_indices < node_embeddings.shape[0]]
        removed_nodes_per_iteration.append(original_ids[anomaly_indices].tolist())
        original_ids = np.delete(original_ids, anomaly_indices)
        node_embeddings = np.delete(node_embeddings, anomaly_indices, axis=0)
        data = remove_nodes(data, anomaly_indices)
        print(f"Iteration {iteration + 1}/{K}: Number of Nodes after anomaly removal: {data.num_nodes}")

        # Step 4: Score the classifier against the Isolation Forest labels of this iteration
        scores = evaluate_model(data, node_embeddings, labels, predicted_labels, iteration + 1)
        evaluation_scores.append(scores)

        # Nothing left to fit another Isolation Forest on
        if node_embeddings.shape[0] == 0:
            print("All nodes have been removed. Stopping iteration.")
            break

    # Plot the final evaluation scores
    iterations = range(1, len(evaluation_scores) + 1)
    fig, ax = plt.subplots(figsize=(10, 6))
    for key, label in (('accuracy', 'Accuracy'), ('f1', 'F1 Score'),
                       ('precision', 'Precision'), ('recall', 'Recall')):
        ax.plot(iterations, [score[key] for score in evaluation_scores], label=label, marker='o')

    ax.set_xlabel('Iteration')
    ax.set_ylabel('Score')
    ax.set_title('Evaluation Scores Across Iterations')
    ax.set_xticks(list(iterations))
    ax.legend()
    ax.grid(True)

    # Save the plot to the output folder
    plot_path = os.path.join(save_dir, 'evaluation_scores_plot.png')
    fig.savefig(plot_path)
    print(f"Plot saved to {plot_path}")
    plt.show()

    # Save removed nodes per iteration to a CSV file
    csv_path = os.path.join(save_dir, 'removed_nodes_per_iteration.csv')
    removed_nodes_df = pd.DataFrame({'Iteration': range(1, len(removed_nodes_per_iteration) + 1),
                                     'RemovedNodes': removed_nodes_per_iteration})
    removed_nodes_df.to_csv(csv_path, index=False)
    print(f"Removed nodes saved to {csv_path}")

    # Print final evaluation scores
    print("\nFinal Evaluation Scores:")
    for i, scores in enumerate(evaluation_scores, 1):
        print(f"Iteration {i} - Accuracy: {scores['accuracy']:.4f}, F1 Score: {scores['f1']:.4f}, Precision: {scores['precision']:.4f}, Recall: {scores['recall']:.4f}")

    return node_embeddings, removed_nodes_per_iteration

In [None]:
#print(f"Initial Number of Nodes: {cora_data.num_nodes}")
#node_embeddings_final, removed_nodes=iterative_anomaly_detection(load_cora_dataset(),node_embeddings, 10)
node_embeddings_final, removed_nodes=iterative_anomaly_detection(data,node_embeddings, 10)
#node_embeddings_final, removed_nodes=iterative_anomaly_detection(load_citeseer_from_drive(),node_embeddings, 50)
#node_embeddings_final, removed_nodes=iterative_anomaly_detection(load_snap_from_drive(),node_embeddings, 10)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np

def plot_remaining_and_removed_embeddings(final_embeddings, removed_nodes, original_embeddings, output_folder,
                                          filename="CoraFull.png"):
    """
    Plots the remaining and removed node embeddings in two different colors with enhanced visibility,
    with PCA applied separately for each group, and saves the plot as an image.

    Because each group is projected with its own PCA, the two point clouds do not share
    a coordinate system; the figure shows the shape of each group, not their relative position.
    A group with fewer than two rows or columns cannot be projected to 2-D and is skipped.

    Parameters:
        final_embeddings (np.ndarray): Array of final embeddings, shape (m, d).
        removed_nodes (list): List of lists containing indices of removed nodes per iteration.
        original_embeddings (np.ndarray): Array of original embeddings, shape (n, d).
        output_folder (str): Path to the folder where the plot image will be saved.
        filename (str): File name of the saved image. Default is "CoraFull.png".
    """
    # Ensure inputs are NumPy arrays
    final_embeddings = np.array(final_embeddings)
    original_embeddings = np.array(original_embeddings)

    # Extract embeddings for removed nodes (explicit integer dtype so an empty list is a valid index)
    removed_indices = np.asarray([idx for iteration in removed_nodes for idx in iteration], dtype=np.intp)
    removed_embeddings = original_embeddings[removed_indices]

    groups = [
        (final_embeddings, dict(color='green', marker='o', s=50, label='Remaining Embeddings',
                                alpha=0.8, edgecolor='black')),
        (removed_embeddings, dict(color='red', marker='x', s=70, label='Removed Embeddings',
                                  alpha=0.8)),
    ]

    # Plot the reduced embeddings
    plt.figure(figsize=(12, 8))
    for points, style in groups:
        # PCA(n_components=2) needs at least 2 samples and 2 features
        if points.ndim != 2 or min(points.shape) < 2:
            print(f"Skipping '{style['label']}': not enough data for a 2-D PCA projection.")
            continue
        reduced = PCA(n_components=2).fit_transform(points)
        plt.scatter(reduced[:, 0], reduced[:, 1], **style)

    # Add labels, legend, and title
    plt.xlabel("PCA Dimension 1")
    plt.ylabel("PCA Dimension 2")
    plt.legend(fontsize=12)
    plt.title("Node Embeddings: Remaining vs Removed", fontsize=14)
    plt.grid(True, linestyle='--', alpha=0.6)

    # Save the plot
    output_path = os.path.join(output_folder, filename)
    plt.savefig(output_path, dpi=300, bbox_inches='tight')

    # Display the plot
    plt.show()

    print(f"Plot saved to {output_path}")

plot_remaining_and_removed_embeddings(node_embeddings_final, removed_nodes, node_embeddings,output_folder)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
import plotly.graph_objects as go

def plot_remaining_and_removed_embeddings_plotly(final_embeddings, removed_nodes, original_embeddings):
    """
    Plots the remaining and removed node embeddings in an interactive 3D plot using Plotly,
    with PCA applied separately for each group.

    Because each group is projected with its own PCA, the two point clouds do not share
    a coordinate system. A group with fewer than three rows or columns cannot be
    projected to 3-D and is skipped.

    Parameters:
        final_embeddings (np.ndarray): Array of final embeddings, shape (m, d).
        removed_nodes (list): List of lists containing indices of removed nodes per iteration.
        original_embeddings (np.ndarray): Array of original embeddings, shape (n, d).
    """
    # Ensure inputs are NumPy arrays
    final_embeddings = np.array(final_embeddings)
    original_embeddings = np.array(original_embeddings)

    # Extract embeddings for removed nodes (explicit integer dtype so an empty list is a valid index)
    removed_indices = np.asarray([idx for iteration in removed_nodes for idx in iteration], dtype=np.intp)
    removed_embeddings = original_embeddings[removed_indices]

    groups = [
        (final_embeddings, 'Remaining Embeddings', dict(size=5, color='green', opacity=0.8)),
        (removed_embeddings, 'Removed Embeddings', dict(size=6, color='red', opacity=0.8)),
    ]

    # Create a Plotly 3D scatter plot with one trace per group
    fig = go.Figure()
    for points, trace_name, marker in groups:
        # PCA(n_components=3) needs at least 3 samples and 3 features
        if points.ndim != 2 or min(points.shape) < 3:
            print(f"Skipping '{trace_name}': not enough data for a 3-D PCA projection.")
            continue
        reduced = PCA(n_components=3).fit_transform(points)
        fig.add_trace(go.Scatter3d(
            x=reduced[:, 0],
            y=reduced[:, 1],
            z=reduced[:, 2],
            mode='markers',
            marker=marker,
            name=trace_name
        ))

    # Update layout
    fig.update_layout(
        scene=dict(
            xaxis_title='PCA Dimension 1',
            yaxis_title='PCA Dimension 2',
            zaxis_title='PCA Dimension 3',
        ),
        title="Interactive 3D Node Embeddings",
        legend=dict(font=dict(size=10)),
        margin=dict(l=0, r=0, b=0, t=40)
    )

    # Show the plot
    fig.show()

plot_remaining_and_removed_embeddings_plotly(node_embeddings_final, removed_nodes, node_embeddings)
