In [None]:
import os
import networkx as nx
from tqdm import tqdm

# Directories
TRAINING_DIR = "Training_data"  # Directory containing training .net files
PROCESSED_DIR = "processed_training_graphs"  # Where preprocessed graphs (.gml) will be stored

# Ensure output directory exists
os.makedirs(PROCESSED_DIR, exist_ok=True)

### **Step 1: Convert .net to .gml and Preprocess Graphs**
def preprocess_graph(file_path, output_path):
    """Converts .net to .gml, removes isolated nodes, and normalizes labels."""
    G = nx.read_pajek(file_path)  # Load .net graph
    G = nx.Graph(G)  # Convert to undirected (if needed)
    G.remove_nodes_from(list(nx.isolates(G)))  # Remove isolated nodes

    # Convert node labels to integers (required for embeddings)
    G = nx.convert_node_labels_to_integers(G, label_attribute="original_label")

    # Save as .gml for better compatibility
    nx.write_gml(G, output_path)

### **Step 2: Process All Graphs in Training Data**
for file in tqdm(os.listdir(TRAINING_DIR)):
    if file.endswith(".net"):
        file_path = os.path.join(TRAINING_DIR, file)
        output_file = os.path.join(PROCESSED_DIR, file.replace(".net", ".gml"))

        # Convert & Preprocess
        preprocess_graph(file_path, output_file)

print("Preprocessing Complete!")
print(f"Processed graphs are stored in: {PROCESSED_DIR}")


100%|██████████| 111/111 [02:23<00:00,  1.30s/it]

✅ Preprocessing Complete!
Processed graphs are stored in: processed_training_graphs





In [None]:
import os
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from node2vec import Node2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Directories
PROCESSED_DIR = "processed_training_graphs"  # Input graphs
OUTPUT_DIR = "node2vec_embeddings"  # Stores only cosine similarity matrices

# Ensure the output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Networks to process
NODE2VEC_NETWORKS = {
    "ERD", "KHN", "LDG", "SMG", "ZWL", "HTC", "CGS",
    "NSC", "GRQ", "HMT", "FBK", "ADV", "EML", "YST"
}

### **Step 1: Generate Node2Vec Embeddings**
def generate_node2vec_embeddings(graph, dimensions=128, walk_length=80, num_walks=10, p=1, q=1):
    """Generates Node2Vec embeddings for a given NetworkX graph."""
    node2vec = Node2Vec(graph, dimensions=dimensions, walk_length=walk_length, num_walks=num_walks, p=p, q=q, workers=4)
    model = node2vec.fit(window=10, min_count=1, batch_words=4)
    return model

### **Step 2: Compute Cosine Similarity**
def compute_similarity(embeddings):
    """Computes cosine similarity with proper diagonal values and symmetry."""
    # Ensure no NaN values in embeddings
    if np.isnan(embeddings).any():
        # Replace NaN values with zeros - or consider better handling
        embeddings = np.nan_to_num(embeddings, nan=0.0)
    
    # Compute cosine similarity
    similarity = cosine_similarity(embeddings)
    
    # Ensure proper diagonal values (should be 1.0)
    np.fill_diagonal(similarity, 1.0)
    
    # Ensure symmetry by averaging with transpose
    # This makes sim(i,j) = sim(j,i)
    similarity = (similarity + similarity.T) / 2
    
    return similarity

# Process all .gml files
for file in tqdm(os.listdir(PROCESSED_DIR)):
    if file.endswith(".gml"):
        base_name = file.split("_train_")[0]
        print(f"Processing {file} -> Detected as {base_name}")

        if base_name in NODE2VEC_NETWORKS:
            file_path = os.path.join(PROCESSED_DIR, file)

            # Load Graph
            G = nx.read_gml(file_path)

            # Skip very small graphs
            if len(G.nodes) < 5:
                print(f"Skipping {file} (Graph too small: {len(G.nodes)} nodes)")
                continue

            try:
                # Generate Node2Vec Embeddings
                model = generate_node2vec_embeddings(G)
            except Exception as e:
                print(f"Node2Vec failed for {file}: {e}")
                continue  # Move to the next network

            # Extract embeddings
            try:
                # Get node list first to ensure order consistency
                node_labels = list(G.nodes())
                
                # More robust embedding extraction with error handling
                node_embeddings = []
                for node in node_labels:
                    try:
                        node_embeddings.append(model.wv[str(node)])
                    except KeyError:
                        print(f"Missing embedding for node {node} in {file}, using zeros")
                        # Use zero vector for missing embeddings
                        node_embeddings.append(np.zeros(model.wv.vector_size))
                
                node_embeddings = np.array(node_embeddings)
                
                # Validate embeddings - check for zero-vectors that would cause NaN
                zero_vectors = np.where(np.all(node_embeddings == 0, axis=1))[0]
                if len(zero_vectors) > 0:
                    print(f"Found {len(zero_vectors)} zero vectors in {file}")
                    # Replace zeros with small random values to avoid NaN in cosine similarity
                    for idx in zero_vectors:
                        node_embeddings[idx] = np.random.normal(0, 0.01, model.wv.vector_size)
                
            except Exception as e:
                print(f"Embedding extraction failed for {file}: {e}")
                continue

            # Compute and Save Cosine Similarity
            try:
                similarity_matrix = compute_similarity(node_embeddings)
                similarity_df = pd.DataFrame(similarity_matrix, index=node_labels, columns=node_labels)

                similarity_filename = file.replace(".gml", "_cosine_similarity.csv")
                similarity_path = os.path.join(OUTPUT_DIR, similarity_filename)
                similarity_df.to_csv(similarity_path)

                print(f"Saved similarity: {similarity_path}")
            except Exception as e:
                print(f"Similarity computation failed for {file}: {e}")

print("\nCosine similarity computed for all networks!")

  from .autonotebook import tqdm as notebook_tqdm
  0%|          | 0/110 [00:00<?, ?it/s]

🔍 Processing ADV_train_0.gml -> Detected as ADV


Computing transition probabilities: 100%|██████████| 4907/4907 [00:11<00:00, 439.92it/s] 
  1%|          | 1/110 [02:10<3:56:35, 130.24s/it]

✅ Saved similarity: node2vec_embeddings\ADV_train_0_cosine_similarity.csv
🔍 Processing ADV_train_1.gml -> Detected as ADV


Computing transition probabilities: 100%|██████████| 4891/4891 [00:11<00:00, 422.33it/s] 
  2%|▏         | 2/110 [04:18<3:52:37, 129.24s/it]

✅ Saved similarity: node2vec_embeddings\ADV_train_1_cosine_similarity.csv
🔍 Processing ADV_train_2.gml -> Detected as ADV


Computing transition probabilities: 100%|██████████| 4903/4903 [00:11<00:00, 424.40it/s] 
  3%|▎         | 3/110 [06:26<3:48:59, 128.40s/it]

✅ Saved similarity: node2vec_embeddings\ADV_train_2_cosine_similarity.csv
🔍 Processing ADV_train_3.gml -> Detected as ADV


Computing transition probabilities: 100%|██████████| 4914/4914 [00:11<00:00, 414.22it/s] 
  4%|▎         | 4/110 [08:34<3:46:58, 128.48s/it]

✅ Saved similarity: node2vec_embeddings\ADV_train_3_cosine_similarity.csv
🔍 Processing ADV_train_4.gml -> Detected as ADV


Computing transition probabilities: 100%|██████████| 4878/4878 [00:11<00:00, 408.78it/s] 
  5%|▍         | 5/110 [10:39<3:42:45, 127.29s/it]

✅ Saved similarity: node2vec_embeddings\ADV_train_4_cosine_similarity.csv
🔍 Processing BUP_train_0.gml -> Detected as BUP
🔍 Processing BUP_train_1.gml -> Detected as BUP
🔍 Processing BUP_train_2.gml -> Detected as BUP
🔍 Processing BUP_train_3.gml -> Detected as BUP
🔍 Processing BUP_train_4.gml -> Detected as BUP
🔍 Processing CDM_train_0.gml -> Detected as CDM
🔍 Processing CDM_train_1.gml -> Detected as CDM
🔍 Processing CDM_train_2.gml -> Detected as CDM
🔍 Processing CDM_train_3.gml -> Detected as CDM
🔍 Processing CDM_train_4.gml -> Detected as CDM
🔍 Processing CEG_train_0.gml -> Detected as CEG
🔍 Processing CEG_train_1.gml -> Detected as CEG
🔍 Processing CEG_train_2.gml -> Detected as CEG
🔍 Processing CEG_train_3.gml -> Detected as CEG
🔍 Processing CEG_train_4.gml -> Detected as CEG
🔍 Processing CGS_train_0.gml -> Detected as CGS


Computing transition probabilities: 100%|██████████| 5648/5648 [00:00<00:00, 7453.96it/s]
 19%|█▉        | 21/110 [12:29<29:29, 19.88s/it]  

✅ Saved similarity: node2vec_embeddings\CGS_train_0_cosine_similarity.csv
🔍 Processing CGS_train_1.gml -> Detected as CGS


Computing transition probabilities: 100%|██████████| 5726/5726 [00:00<00:00, 6922.46it/s]
 20%|██        | 22/110 [15:42<47:56, 32.68s/it]

✅ Saved similarity: node2vec_embeddings\CGS_train_1_cosine_similarity.csv
🔍 Processing CGS_train_2.gml -> Detected as CGS


Computing transition probabilities: 100%|██████████| 5698/5698 [00:02<00:00, 2075.16it/s]
 21%|██        | 23/110 [18:46<1:08:15, 47.07s/it]

✅ Saved similarity: node2vec_embeddings\CGS_train_2_cosine_similarity.csv
🔍 Processing CGS_train_3.gml -> Detected as CGS


Computing transition probabilities: 100%|██████████| 5664/5664 [00:01<00:00, 5129.86it/s]
 22%|██▏       | 24/110 [20:53<1:21:10, 56.63s/it]

✅ Saved similarity: node2vec_embeddings\CGS_train_3_cosine_similarity.csv
🔍 Processing CGS_train_4.gml -> Detected as CGS


Computing transition probabilities: 100%|██████████| 5668/5668 [00:00<00:00, 10198.02it/s]
 23%|██▎       | 25/110 [23:02<1:35:14, 67.23s/it]

✅ Saved similarity: node2vec_embeddings\CGS_train_4_cosine_similarity.csv
🔍 Processing EML_train_0.gml -> Detected as EML


Computing transition probabilities: 100%|██████████| 1087/1087 [00:00<00:00, 3116.04it/s]
 24%|██▎       | 26/110 [23:26<1:23:37, 59.73s/it]

✅ Saved similarity: node2vec_embeddings\EML_train_0_cosine_similarity.csv
🔍 Processing EML_train_1.gml -> Detected as EML


Computing transition probabilities: 100%|██████████| 1102/1102 [00:00<00:00, 3534.64it/s]
 25%|██▍       | 27/110 [23:46<1:11:45, 51.87s/it]

✅ Saved similarity: node2vec_embeddings\EML_train_1_cosine_similarity.csv
🔍 Processing EML_train_2.gml -> Detected as EML


Computing transition probabilities: 100%|██████████| 1099/1099 [00:00<00:00, 3691.33it/s]
 25%|██▌       | 28/110 [24:09<1:02:12, 45.52s/it]

✅ Saved similarity: node2vec_embeddings\EML_train_2_cosine_similarity.csv
🔍 Processing EML_train_3.gml -> Detected as EML


Computing transition probabilities: 100%|██████████| 1092/1092 [00:00<00:00, 2717.00it/s]
 26%|██▋       | 29/110 [24:31<53:55, 39.94s/it]  

✅ Saved similarity: node2vec_embeddings\EML_train_3_cosine_similarity.csv
🔍 Processing EML_train_4.gml -> Detected as EML


Computing transition probabilities: 100%|██████████| 1103/1103 [00:00<00:00, 3064.46it/s]
 27%|██▋       | 30/110 [24:59<49:09, 36.87s/it]

✅ Saved similarity: node2vec_embeddings\EML_train_4_cosine_similarity.csv
🔍 Processing ERD_train_0.gml -> Detected as ERD


Computing transition probabilities: 100%|██████████| 5980/5980 [00:02<00:00, 2033.78it/s]
 28%|██▊       | 31/110 [29:17<2:06:06, 95.78s/it]

✅ Saved similarity: node2vec_embeddings\ERD_train_0_cosine_similarity.csv
🔍 Processing ERD_train_1.gml -> Detected as ERD


Computing transition probabilities: 100%|██████████| 5904/5904 [00:01<00:00, 3276.88it/s]
 29%|██▉       | 32/110 [33:05<2:52:08, 132.42s/it]

✅ Saved similarity: node2vec_embeddings\ERD_train_1_cosine_similarity.csv
🔍 Processing ERD_train_2.gml -> Detected as ERD


Computing transition probabilities: 100%|██████████| 5953/5953 [00:03<00:00, 1878.73it/s]
 30%|███       | 33/110 [37:17<3:33:22, 166.27s/it]

✅ Saved similarity: node2vec_embeddings\ERD_train_2_cosine_similarity.csv
🔍 Processing ERD_train_3.gml -> Detected as ERD


Computing transition probabilities: 100%|██████████| 5926/5926 [00:03<00:00, 1779.31it/s]
 31%|███       | 34/110 [41:09<3:54:22, 185.03s/it]

✅ Saved similarity: node2vec_embeddings\ERD_train_3_cosine_similarity.csv
🔍 Processing ERD_train_4.gml -> Detected as ERD


Computing transition probabilities: 100%|██████████| 5885/5885 [00:03<00:00, 1625.35it/s]
 32%|███▏      | 35/110 [45:05<4:09:46, 199.82s/it]

✅ Saved similarity: node2vec_embeddings\ERD_train_4_cosine_similarity.csv
🔍 Processing FBK_train_0.gml -> Detected as FBK


Computing transition probabilities: 100%|██████████| 4001/4001 [00:52<00:00, 76.20it/s] 
 33%|███▎      | 36/110 [48:35<4:10:15, 202.91s/it]

✅ Saved similarity: node2vec_embeddings\FBK_train_0_cosine_similarity.csv
🔍 Processing FBK_train_1.gml -> Detected as FBK


Computing transition probabilities: 100%|██████████| 4003/4003 [00:22<00:00, 174.69it/s]
 34%|███▎      | 37/110 [50:42<3:39:33, 180.45s/it]

✅ Saved similarity: node2vec_embeddings\FBK_train_1_cosine_similarity.csv
🔍 Processing FBK_train_2.gml -> Detected as FBK


Computing transition probabilities: 100%|██████████| 3998/3998 [00:22<00:00, 175.72it/s]
 35%|███▍      | 38/110 [52:49<3:17:39, 164.72s/it]

✅ Saved similarity: node2vec_embeddings\FBK_train_2_cosine_similarity.csv
🔍 Processing FBK_train_3.gml -> Detected as FBK


Computing transition probabilities: 100%|██████████| 4003/4003 [00:23<00:00, 167.56it/s]
 35%|███▌      | 39/110 [54:59<3:02:30, 154.24s/it]

✅ Saved similarity: node2vec_embeddings\FBK_train_3_cosine_similarity.csv
🔍 Processing FBK_train_4.gml -> Detected as FBK


Computing transition probabilities: 100%|██████████| 4006/4006 [00:22<00:00, 175.60it/s]
 36%|███▋      | 40/110 [57:07<2:50:53, 146.48s/it]

✅ Saved similarity: node2vec_embeddings\FBK_train_4_cosine_similarity.csv
🔍 Processing GRQ_train_0.gml -> Detected as GRQ


Computing transition probabilities: 100%|██████████| 4942/4942 [00:00<00:00, 6608.77it/s]
 37%|███▋      | 41/110 [58:55<2:35:07, 134.89s/it]

✅ Saved similarity: node2vec_embeddings\GRQ_train_0_cosine_similarity.csv
🔍 Processing GRQ_train_1.gml -> Detected as GRQ


Computing transition probabilities: 100%|██████████| 4945/4945 [00:00<00:00, 7607.39it/s]
 38%|███▊      | 42/110 [1:00:43<2:23:57, 127.02s/it]

✅ Saved similarity: node2vec_embeddings\GRQ_train_1_cosine_similarity.csv
🔍 Processing GRQ_train_2.gml -> Detected as GRQ


Computing transition probabilities: 100%|██████████| 4960/4960 [00:00<00:00, 6617.96it/s]
 39%|███▉      | 43/110 [1:02:33<2:16:05, 121.87s/it]

✅ Saved similarity: node2vec_embeddings\GRQ_train_2_cosine_similarity.csv
🔍 Processing GRQ_train_3.gml -> Detected as GRQ


Computing transition probabilities: 100%|██████████| 4973/4973 [00:00<00:00, 6901.63it/s]
 40%|████      | 44/110 [1:04:22<2:09:36, 117.82s/it]

✅ Saved similarity: node2vec_embeddings\GRQ_train_3_cosine_similarity.csv
🔍 Processing GRQ_train_4.gml -> Detected as GRQ


Computing transition probabilities: 100%|██████████| 4941/4941 [00:00<00:00, 7291.88it/s]
 41%|████      | 45/110 [1:06:10<2:04:41, 115.11s/it]

✅ Saved similarity: node2vec_embeddings\GRQ_train_4_cosine_similarity.csv
🔍 Processing HMT_train_0.gml -> Detected as HMT


Computing transition probabilities: 100%|██████████| 2340/2340 [00:01<00:00, 1230.25it/s]
 42%|████▏     | 46/110 [1:07:01<1:42:13, 95.83s/it] 

✅ Saved similarity: node2vec_embeddings\HMT_train_0_cosine_similarity.csv
🔍 Processing HMT_train_1.gml -> Detected as HMT


Computing transition probabilities: 100%|██████████| 2361/2361 [00:01<00:00, 1284.36it/s]
 43%|████▎     | 47/110 [1:07:53<1:26:37, 82.50s/it]

✅ Saved similarity: node2vec_embeddings\HMT_train_1_cosine_similarity.csv
🔍 Processing HMT_train_2.gml -> Detected as HMT


Computing transition probabilities: 100%|██████████| 2359/2359 [00:01<00:00, 1274.64it/s]
 44%|████▎     | 48/110 [1:08:46<1:16:20, 73.87s/it]

✅ Saved similarity: node2vec_embeddings\HMT_train_2_cosine_similarity.csv
🔍 Processing HMT_train_3.gml -> Detected as HMT


Computing transition probabilities: 100%|██████████| 2369/2369 [00:01<00:00, 1273.43it/s]
 45%|████▍     | 49/110 [1:09:38<1:08:15, 67.14s/it]

✅ Saved similarity: node2vec_embeddings\HMT_train_3_cosine_similarity.csv
🔍 Processing HMT_train_4.gml -> Detected as HMT


Computing transition probabilities: 100%|██████████| 2339/2339 [00:01<00:00, 1257.54it/s]
 45%|████▌     | 50/110 [1:10:28<1:02:10, 62.18s/it]

✅ Saved similarity: node2vec_embeddings\HMT_train_4_cosine_similarity.csv
🔍 Processing HPD_train_0.gml -> Detected as HPD
🔍 Processing HPD_train_1.gml -> Detected as HPD
🔍 Processing HPD_train_2.gml -> Detected as HPD
🔍 Processing HPD_train_3.gml -> Detected as HPD
🔍 Processing HPD_train_4.gml -> Detected as HPD
🔍 Processing HTC_train_0.gml -> Detected as HTC


Computing transition probabilities: 100%|██████████| 7174/7174 [00:00<00:00, 12657.72it/s]
 51%|█████     | 56/110 [1:13:31<35:26, 39.38s/it]  

✅ Saved similarity: node2vec_embeddings\HTC_train_0_cosine_similarity.csv
🔍 Processing HTC_train_1.gml -> Detected as HTC


Computing transition probabilities: 100%|██████████| 7181/7181 [00:00<00:00, 14220.89it/s]
 52%|█████▏    | 57/110 [1:16:33<53:10, 60.20s/it]

✅ Saved similarity: node2vec_embeddings\HTC_train_1_cosine_similarity.csv
🔍 Processing HTC_train_2.gml -> Detected as HTC


Computing transition probabilities: 100%|██████████| 7167/7167 [00:00<00:00, 13432.54it/s]
 53%|█████▎    | 58/110 [1:19:38<1:10:54, 81.82s/it]

✅ Saved similarity: node2vec_embeddings\HTC_train_2_cosine_similarity.csv
🔍 Processing HTC_train_3.gml -> Detected as HTC


Computing transition probabilities: 100%|██████████| 7171/7171 [00:00<00:00, 14385.48it/s]
 54%|█████▎    | 59/110 [1:22:39<1:26:09, 101.36s/it]

✅ Saved similarity: node2vec_embeddings\HTC_train_3_cosine_similarity.csv
🔍 Processing HTC_train_4.gml -> Detected as HTC


Computing transition probabilities: 100%|██████████| 7158/7158 [00:00<00:00, 12821.41it/s]
 55%|█████▍    | 60/110 [1:25:40<1:39:05, 118.91s/it]

✅ Saved similarity: node2vec_embeddings\HTC_train_4_cosine_similarity.csv
🔍 Processing INF_train_0.gml -> Detected as INF
🔍 Processing INF_train_1.gml -> Detected as INF
🔍 Processing INF_train_2.gml -> Detected as INF
🔍 Processing INF_train_3.gml -> Detected as INF
🔍 Processing INF_train_4.gml -> Detected as INF
🔍 Processing KHN_train_0.gml -> Detected as KHN


Computing transition probabilities: 100%|██████████| 3595/3595 [00:02<00:00, 1553.37it/s]
 60%|██████    | 66/110 [1:27:09<37:17, 50.84s/it]   

✅ Saved similarity: node2vec_embeddings\KHN_train_0_cosine_similarity.csv
🔍 Processing KHN_train_1.gml -> Detected as KHN


Computing transition probabilities: 100%|██████████| 3629/3629 [00:02<00:00, 1683.84it/s]
 61%|██████    | 67/110 [1:28:38<40:09, 56.05s/it]

✅ Saved similarity: node2vec_embeddings\KHN_train_1_cosine_similarity.csv
🔍 Processing KHN_train_2.gml -> Detected as KHN


Computing transition probabilities: 100%|██████████| 3595/3595 [00:02<00:00, 1637.92it/s]
 62%|██████▏   | 68/110 [1:30:06<42:48, 61.16s/it]

✅ Saved similarity: node2vec_embeddings\KHN_train_2_cosine_similarity.csv
🔍 Processing KHN_train_3.gml -> Detected as KHN


Computing transition probabilities: 100%|██████████| 3592/3592 [00:02<00:00, 1611.45it/s]
 63%|██████▎   | 69/110 [1:31:36<45:28, 66.54s/it]

✅ Saved similarity: node2vec_embeddings\KHN_train_3_cosine_similarity.csv
🔍 Processing KHN_train_4.gml -> Detected as KHN


Computing transition probabilities: 100%|██████████| 3623/3623 [00:02<00:00, 1684.46it/s]
 64%|██████▎   | 70/110 [1:33:04<47:23, 71.09s/it]

✅ Saved similarity: node2vec_embeddings\KHN_train_4_cosine_similarity.csv
🔍 Processing LDG_train_0.gml -> Detected as LDG


Computing transition probabilities: 100%|██████████| 8062/8062 [00:06<00:00, 1171.25it/s]
 65%|██████▍   | 71/110 [1:37:20<1:14:05, 113.99s/it]

✅ Saved similarity: node2vec_embeddings\LDG_train_0_cosine_similarity.csv
🔍 Processing LDG_train_1.gml -> Detected as LDG


Computing transition probabilities: 100%|██████████| 8024/8024 [00:07<00:00, 1138.67it/s]
 65%|██████▌   | 72/110 [1:41:35<1:34:20, 148.96s/it]

✅ Saved similarity: node2vec_embeddings\LDG_train_1_cosine_similarity.csv
🔍 Processing LDG_train_2.gml -> Detected as LDG


Computing transition probabilities: 100%|██████████| 8061/8061 [00:07<00:00, 1105.68it/s]
 66%|██████▋   | 73/110 [1:45:52<1:49:27, 177.51s/it]

✅ Saved similarity: node2vec_embeddings\LDG_train_2_cosine_similarity.csv
🔍 Processing LDG_train_3.gml -> Detected as LDG


Computing transition probabilities: 100%|██████████| 8068/8068 [00:07<00:00, 1095.49it/s]
 67%|██████▋   | 74/110 [1:50:10<1:59:33, 199.26s/it]

✅ Saved similarity: node2vec_embeddings\LDG_train_3_cosine_similarity.csv
🔍 Processing LDG_train_4.gml -> Detected as LDG


Computing transition probabilities: 100%|██████████| 8054/8054 [00:07<00:00, 1055.75it/s]
 68%|██████▊   | 75/110 [1:54:27<2:05:37, 215.36s/it]

✅ Saved similarity: node2vec_embeddings\LDG_train_4_cosine_similarity.csv
🔍 Processing NSC_train_0.gml -> Detected as NSC


Computing transition probabilities: 100%|██████████| 1382/1382 [00:00<00:00, 18522.93it/s]
 69%|██████▉   | 76/110 [1:54:45<1:30:12, 159.20s/it]

✅ Saved similarity: node2vec_embeddings\NSC_train_0_cosine_similarity.csv
🔍 Processing NSC_train_1.gml -> Detected as NSC


Computing transition probabilities: 100%|██████████| 1382/1382 [00:00<00:00, 18958.64it/s]
 70%|███████   | 77/110 [1:55:04<1:05:09, 118.45s/it]

✅ Saved similarity: node2vec_embeddings\NSC_train_1_cosine_similarity.csv
🔍 Processing NSC_train_2.gml -> Detected as NSC


Computing transition probabilities: 100%|██████████| 1388/1388 [00:00<00:00, 17176.79it/s]
 71%|███████   | 78/110 [1:55:23<47:35, 89.23s/it]   

✅ Saved similarity: node2vec_embeddings\NSC_train_2_cosine_similarity.csv
🔍 Processing NSC_train_3.gml -> Detected as NSC


Computing transition probabilities: 100%|██████████| 1387/1387 [00:00<00:00, 16888.07it/s]
 72%|███████▏  | 79/110 [1:55:41<35:19, 68.39s/it]

✅ Saved similarity: node2vec_embeddings\NSC_train_3_cosine_similarity.csv
🔍 Processing NSC_train_4.gml -> Detected as NSC


Computing transition probabilities: 100%|██████████| 1370/1370 [00:00<00:00, 17523.11it/s]
 73%|███████▎  | 80/110 [1:56:00<26:51, 53.70s/it]

✅ Saved similarity: node2vec_embeddings\NSC_train_4_cosine_similarity.csv
🔍 Processing PGP_train_0.gml -> Detected as PGP
🔍 Processing PGP_train_1.gml -> Detected as PGP
🔍 Processing PGP_train_2.gml -> Detected as PGP
🔍 Processing PGP_train_3.gml -> Detected as PGP
🔍 Processing PGP_train_4.gml -> Detected as PGP
🔍 Processing SMG_train_0.gml -> Detected as SMG


Computing transition probabilities: 100%|██████████| 992/992 [00:00<00:00, 1997.58it/s]
 78%|███████▊  | 86/110 [1:56:20<07:01, 17.55s/it]

✅ Saved similarity: node2vec_embeddings\SMG_train_0_cosine_similarity.csv
🔍 Processing SMG_train_1.gml -> Detected as SMG


Computing transition probabilities: 100%|██████████| 999/999 [00:00<00:00, 1681.70it/s]
 79%|███████▉  | 87/110 [1:56:40<06:51, 17.90s/it]

✅ Saved similarity: node2vec_embeddings\SMG_train_1_cosine_similarity.csv
🔍 Processing SMG_train_2.gml -> Detected as SMG


Computing transition probabilities: 100%|██████████| 992/992 [00:00<00:00, 1631.50it/s]
 80%|████████  | 88/110 [1:57:00<06:40, 18.20s/it]

✅ Saved similarity: node2vec_embeddings\SMG_train_2_cosine_similarity.csv
🔍 Processing SMG_train_3.gml -> Detected as SMG


Computing transition probabilities: 100%|██████████| 996/996 [00:00<00:00, 1789.44it/s]
 81%|████████  | 89/110 [1:57:20<06:28, 18.52s/it]

✅ Saved similarity: node2vec_embeddings\SMG_train_3_cosine_similarity.csv
🔍 Processing SMG_train_4.gml -> Detected as SMG


Computing transition probabilities: 100%|██████████| 997/997 [00:00<00:00, 2036.38it/s]
 82%|████████▏ | 90/110 [1:57:39<06:15, 18.80s/it]

✅ Saved similarity: node2vec_embeddings\SMG_train_4_cosine_similarity.csv
🔍 Processing UAL_train_0.gml -> Detected as UAL
🔍 Processing UAL_train_1.gml -> Detected as UAL
🔍 Processing UAL_train_2.gml -> Detected as UAL
🔍 Processing UAL_train_3.gml -> Detected as UAL
🔍 Processing UAL_train_4.gml -> Detected as UAL
🔍 Processing UPG_train_0.gml -> Detected as UPG
🔍 Processing UPG_train_1.gml -> Detected as UPG
🔍 Processing UPG_train_2.gml -> Detected as UPG
🔍 Processing UPG_train_3.gml -> Detected as UPG
🔍 Processing UPG_train_4.gml -> Detected as UPG
🔍 Processing YST_train_0.gml -> Detected as YST


Computing transition probabilities: 100%|██████████| 2112/2112 [00:00<00:00, 6200.56it/s]
 92%|█████████▏| 101/110 [1:58:20<01:04,  7.11s/it]

✅ Saved similarity: node2vec_embeddings\YST_train_0_cosine_similarity.csv
🔍 Processing YST_train_1.gml -> Detected as YST


Computing transition probabilities: 100%|██████████| 2121/2121 [00:00<00:00, 5890.25it/s]
 93%|█████████▎| 102/110 [1:59:03<01:22, 10.32s/it]

✅ Saved similarity: node2vec_embeddings\YST_train_1_cosine_similarity.csv
🔍 Processing YST_train_2.gml -> Detected as YST


Computing transition probabilities: 100%|██████████| 2124/2124 [00:00<00:00, 5383.52it/s]
 94%|█████████▎| 103/110 [1:59:44<01:37, 13.94s/it]

✅ Saved similarity: node2vec_embeddings\YST_train_2_cosine_similarity.csv
🔍 Processing YST_train_3.gml -> Detected as YST


Computing transition probabilities: 100%|██████████| 2110/2110 [00:00<00:00, 6353.01it/s]
 95%|█████████▍| 104/110 [2:00:26<01:47, 17.86s/it]

✅ Saved similarity: node2vec_embeddings\YST_train_3_cosine_similarity.csv
🔍 Processing YST_train_4.gml -> Detected as YST


Computing transition probabilities: 100%|██████████| 2127/2127 [00:00<00:00, 6443.83it/s]
 95%|█████████▌| 105/110 [2:01:09<01:50, 22.06s/it]

✅ Saved similarity: node2vec_embeddings\YST_train_4_cosine_similarity.csv
🔍 Processing ZWL_train_0.gml -> Detected as ZWL


Computing transition probabilities: 100%|██████████| 6568/6568 [00:06<00:00, 1049.80it/s]
 96%|█████████▋| 106/110 [2:04:33<03:49, 57.37s/it]

✅ Saved similarity: node2vec_embeddings\ZWL_train_0_cosine_similarity.csv
🔍 Processing ZWL_train_1.gml -> Detected as ZWL


Computing transition probabilities: 100%|██████████| 6542/6542 [00:06<00:00, 1041.16it/s]
 97%|█████████▋| 107/110 [2:07:53<04:25, 88.36s/it]

✅ Saved similarity: node2vec_embeddings\ZWL_train_1_cosine_similarity.csv
🔍 Processing ZWL_train_2.gml -> Detected as ZWL


Computing transition probabilities: 100%|██████████| 6566/6566 [00:06<00:00, 1028.46it/s]
 98%|█████████▊| 108/110 [2:11:18<03:51, 115.86s/it]

✅ Saved similarity: node2vec_embeddings\ZWL_train_2_cosine_similarity.csv
🔍 Processing ZWL_train_3.gml -> Detected as ZWL


Computing transition probabilities: 100%|██████████| 6554/6554 [00:06<00:00, 1019.96it/s]
 99%|█████████▉| 109/110 [2:14:49<02:20, 140.06s/it]

✅ Saved similarity: node2vec_embeddings\ZWL_train_3_cosine_similarity.csv
🔍 Processing ZWL_train_4.gml -> Detected as ZWL


Computing transition probabilities: 100%|██████████| 6548/6548 [00:06<00:00, 969.33it/s] 
100%|██████████| 110/110 [2:18:14<00:00, 75.41s/it] 

✅ Saved similarity: node2vec_embeddings\ZWL_train_4_cosine_similarity.csv

🚀 Cosine similarity computed for all networks!





In [None]:
import os
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

# Configuration
INPUT_DIR = "processed_training_graph_spectral"
OUTPUT_DIR = "spectral_similarity_embeddings"
SPARSE_NETWORKS = ["UPG", "HPD", "PGP", "CDM"]  # Networks needing spectral embeddings
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --------------------------
# Embedding Generation Functions
# --------------------------

def generate_spectral_embeddings(G, dimensions=32):
    """Generate spectral embeddings using Laplacian Eigenmaps (optimized for large networks)"""
    try:
        if len(G) < 2:
            print("Graph is too small for spectral embeddings.")
            return None, None

        # Use the largest connected component to avoid convergence issues
        if not nx.is_connected(G):
            largest_cc = max(nx.connected_components(G), key=len)
            G = G.subgraph(largest_cc).copy()
            print(f"Using largest connected component with {len(G)} nodes.")

        # Dynamically adjust dimensions
        dimensions = min(dimensions, len(G) - 1)
        
        # Compute normalized Laplacian
        laplacian = nx.normalized_laplacian_matrix(G).asfptype()

        # Eigen decomposition with increased iterations
        eigenvalues, eigenvectors = sp.linalg.eigsh(
            laplacian.astype(float), k=dimensions+1, which='SM', maxiter=100000
        )

        # Discard first eigenvector (trivial solution) and normalize
        embeddings = eigenvectors[:, 1:dimensions+1].real
        
        # Ensure no NaN values in embeddings
        if np.isnan(embeddings).any():
            print(f"Warning: NaN values found in embeddings, replacing with zeros")
            embeddings = np.nan_to_num(embeddings, nan=0.0)
            
        # Normalize embeddings to prevent NaN in cosine similarity
        norms = np.linalg.norm(embeddings, axis=1)
        
        # Handle zero vectors to prevent division by zero
        mask = norms > 1e-10
        normalized_embeddings = np.zeros_like(embeddings)
        normalized_embeddings[mask] = embeddings[mask] / norms[mask, None]
        
        # Replace zero vectors with small random values
        zero_vectors = np.where(~mask)[0]
        if len(zero_vectors) > 0:
            print(f"Found {len(zero_vectors)} zero vectors, replacing with small random values")
            for idx in zero_vectors:
                normalized_embeddings[idx] = np.random.normal(0, 0.01, dimensions)
                # Normalize these random vectors too
                normalized_embeddings[idx] = normalized_embeddings[idx] / np.linalg.norm(normalized_embeddings[idx])
                
        return normalized_embeddings, list(G.nodes())
    
    except Exception as e:
        print(f"Spectral embedding failed: {str(e)}")
        return None, None


# --------------------------
# Modified Processing Pipeline
# --------------------------

def compute_similarity(embeddings):
    """Compute cosine similarity with proper diagonal values and symmetry"""
    try:
        # Compute cosine similarity
        similarity = cosine_similarity(embeddings)
        
        # Ensure proper diagonal values (should be 1.0)
        np.fill_diagonal(similarity, 1.0)
        
        # Ensure symmetry by averaging with transpose
        similarity = (similarity + similarity.T) / 2
        
        return similarity
    except Exception as e:
        print(f"Similarity computation error: {str(e)}")
        return None

# Main processing loop
for file in tqdm(os.listdir(INPUT_DIR), leave=True, mininterval=1):
    if file.endswith(".gml"):
        file_path = os.path.join(INPUT_DIR, file)
        print(f"Processing {file}...")
        
        try:
            G = nx.read_gml(file_path)  # Load graph
            
            # Default to spectral for SPARSE_NETWORKS
            method = "spectral"
            if any(network in file for network in SPARSE_NETWORKS):
                embeddings, nodes = generate_spectral_embeddings(G)
            else:
                # For non-sparse networks, also use spectral but with possibility
                # to add different methods in the future
                embeddings, nodes = generate_spectral_embeddings(G)
                
            if embeddings is None or len(embeddings) == 0:
                print(f"Skipping {file} - embedding generation failed")
                continue
                
            # Validate embeddings before computing similarity
            if np.isnan(embeddings).any():
                print(f"Warning: NaN values in final embeddings for {file}, fixing...")
                embeddings = np.nan_to_num(embeddings, nan=0.0)
                
            # Compute & save similarity
            similarity_matrix = compute_similarity(embeddings)
            if similarity_matrix is not None:
                # Final check for NaN values
                if np.isnan(similarity_matrix).any():
                    print(f"Warning: NaN values in similarity matrix for {file}, fixing...")
                    similarity_matrix = np.nan_to_num(similarity_matrix, nan=0.0)
                    # Re-ensure diagonal is 1.0
                    np.fill_diagonal(similarity_matrix, 1.0)
                
                df = pd.DataFrame(similarity_matrix, index=nodes, columns=nodes)
                output_file = os.path.join(OUTPUT_DIR, f"{file.replace('.gml', '')}_{method}_cosine_similarity.csv")
                df.to_csv(output_file)
                
                # Verify the saved matrix (optional)
                saved_matrix = pd.read_csv(output_file, index_col=0).values
                issues = []
                if np.isnan(saved_matrix).any():
                    issues.append("Contains NaN")
                if not np.allclose(saved_matrix, saved_matrix.T, rtol=1e-5, atol=1e-8):
                    issues.append("Matrix not symmetric")
                if not np.allclose(np.diag(saved_matrix), np.ones(saved_matrix.shape[0]), rtol=1e-5, atol=1e-8):
                    issues.append("Diagonal values not 1")
                
                if issues:
                    print(f"Issues with {output_file}: {'; '.join(issues)}")
                else:
                    print(f"Successfully processed {file} with {method}")
            else:
                print(f"Similarity computation failed for {file}")
        
        except Exception as e:
            print(f"Error processing {file}: {str(e)}")

print("Semantic similarity computation completed!")

  0%|          | 0/20 [00:00<?, ?it/s]

Processing CDM_train_0.gml...
Using largest connected component with 13290 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
  5%|▌         | 1/20 [04:46<1:30:49, 286.82s/it]

✅ Successfully processed CDM_train_0.gml with spectral
Processing CDM_train_1.gml...
Using largest connected component with 13244 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 10%|█         | 2/20 [09:52<1:29:22, 297.91s/it]

✅ Successfully processed CDM_train_1.gml with spectral
Processing CDM_train_2.gml...
Using largest connected component with 13247 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 15%|█▌        | 3/20 [20:16<2:06:32, 446.59s/it]

✅ Successfully processed CDM_train_2.gml with spectral
Processing CDM_train_3.gml...
Using largest connected component with 13250 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 20%|██        | 4/20 [29:32<2:10:39, 489.96s/it]

✅ Successfully processed CDM_train_3.gml with spectral
Processing CDM_train_4.gml...
Using largest connected component with 13315 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 25%|██▌       | 5/20 [38:07<2:04:45, 499.02s/it]

✅ Successfully processed CDM_train_4.gml with spectral
Processing HPD_train_0.gml...
Using largest connected component with 7976 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 30%|███       | 6/20 [41:44<1:34:02, 403.01s/it]

✅ Successfully processed HPD_train_0.gml with spectral
Processing HPD_train_1.gml...
Using largest connected component with 7952 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 35%|███▌      | 7/20 [45:12<1:13:31, 339.33s/it]

✅ Successfully processed HPD_train_1.gml with spectral
Processing HPD_train_2.gml...
Using largest connected component with 7973 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 40%|████      | 8/20 [48:38<59:22, 296.88s/it]  

✅ Successfully processed HPD_train_2.gml with spectral
Processing HPD_train_3.gml...
Using largest connected component with 7971 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 45%|████▌     | 9/20 [51:18<46:36, 254.26s/it]

✅ Successfully processed HPD_train_3.gml with spectral
Processing HPD_train_4.gml...
Using largest connected component with 7933 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 50%|█████     | 10/20 [53:34<36:17, 217.77s/it]

✅ Successfully processed HPD_train_4.gml with spectral
Processing PGP_train_0.gml...
Using largest connected component with 8733 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 55%|█████▌    | 11/20 [57:14<32:46, 218.45s/it]

✅ Successfully processed PGP_train_0.gml with spectral
Processing PGP_train_1.gml...
Using largest connected component with 8787 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 60%|██████    | 12/20 [59:31<25:48, 193.61s/it]

✅ Successfully processed PGP_train_1.gml with spectral
Processing PGP_train_2.gml...
Using largest connected component with 8733 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 65%|██████▌   | 13/20 [1:01:50<20:37, 176.85s/it]

✅ Successfully processed PGP_train_2.gml with spectral
Processing PGP_train_3.gml...
Using largest connected component with 8785 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 70%|███████   | 14/20 [1:04:07<16:30, 165.04s/it]

✅ Successfully processed PGP_train_3.gml with spectral
Processing PGP_train_4.gml...
Using largest connected component with 8481 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 75%|███████▌  | 15/20 [1:06:17<12:52, 154.43s/it]

✅ Successfully processed PGP_train_4.gml with spectral
Processing UPG_train_0.gml...
Using largest connected component with 3980 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 80%|████████  | 16/20 [1:06:51<07:52, 118.17s/it]

✅ Successfully processed UPG_train_0.gml with spectral
Processing UPG_train_1.gml...
Using largest connected component with 3876 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 85%|████████▌ | 17/20 [1:07:23<04:36, 92.18s/it] 

✅ Successfully processed UPG_train_1.gml with spectral
Processing UPG_train_2.gml...
Using largest connected component with 4075 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 90%|█████████ | 18/20 [1:07:59<02:30, 75.35s/it]

✅ Successfully processed UPG_train_2.gml with spectral
Processing UPG_train_3.gml...
Using largest connected component with 3871 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
 95%|█████████▌| 19/20 [1:08:32<01:02, 62.54s/it]

✅ Successfully processed UPG_train_3.gml with spectral
Processing UPG_train_4.gml...
Using largest connected component with 4059 nodes.


  laplacian = nx.normalized_laplacian_matrix(G).asfptype()
100%|██████████| 20/20 [1:09:07<00:00, 207.36s/it]

✅ Successfully processed UPG_train_4.gml with spectral
✅ Semantic similarity computation completed!





In [None]:
import os
import networkx as nx
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from sklearn.manifold import SpectralEmbedding

# Directories
PROCESSED_DIR = "training_spectral_class"  # Input directory for processed graphs
OUTPUT_DIR = "small_network_similarity_results"  # Separate output directory for Group 4

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Group 4: Small-Scale Networks
SMALL_NETWORKS = {"BUP", "CEG", "INF", "UAL"}

def generate_spectral_embeddings(G, dimensions=64):
    """Generate spectral embeddings using sklearn's SpectralEmbedding (for small graphs)."""
    try:
        if len(G) < 2:
            print("Graph is too small for spectral embeddings.")
            return None, None

        # Use sklearn's SpectralEmbedding (better for small graphs)
        model = SpectralEmbedding(n_components=min(dimensions, len(G)-1), 
                                  affinity='precomputed', 
                                  random_state=42)
        adjacency_matrix = nx.to_numpy_array(G)  # Convert graph to adjacency matrix
        embeddings = model.fit_transform(adjacency_matrix)
        
        # Check for and handle NaN values
        if np.isnan(embeddings).any():
            print(f"NaN values found in embeddings, replacing with zeros")
            embeddings = np.nan_to_num(embeddings, nan=0.0)
        
        # Normalize embeddings to unit length for proper cosine similarity
        norms = np.linalg.norm(embeddings, axis=1)
        zero_norm_indices = np.where(norms < 1e-10)[0]
        
        # Handle zero vectors to prevent division by zero
        if len(zero_norm_indices) > 0:
            print(f"Found {len(zero_norm_indices)} zero vectors, replacing with random unit vectors")
            for idx in zero_norm_indices:
                random_vec = np.random.normal(0, 0.01, embeddings.shape[1])
                embeddings[idx] = random_vec / np.linalg.norm(random_vec)
        else:
            # Normalize all non-zero vectors
            embeddings = embeddings / norms[:, np.newaxis]
            
        return embeddings, list(G.nodes())

    except Exception as e:
        print(f"Spectral embedding failed: {e}")
        return None, None

def compute_cosine_similarity(embeddings):
    """Compute cosine similarity matrix with proper diagonal values and symmetry."""
    # Compute cosine similarity
    similarity = cosine_similarity(embeddings)
    
    # Ensure proper diagonal values (should be 1.0)
    np.fill_diagonal(similarity, 1.0)
    
    # Ensure symmetry by averaging with transpose
    similarity = (similarity + similarity.T) / 2
    
    return similarity

# Process all .gml files and apply Spectral Embeddings for Small Networks
for file in tqdm(os.listdir(PROCESSED_DIR)):
    if file.endswith(".gml"):
        base_name = file.split("_train_")[0]
        file_path = os.path.join(PROCESSED_DIR, file)
        
        try:
            G = nx.read_gml(file_path)

            # Skip very small graphs
            if len(G.nodes) < 5:
                print(f"Skipping {file} (Graph too small: {len(G.nodes)} nodes)")
                continue

            if base_name in SMALL_NETWORKS:
                print(f"Processing {file} with Spectral Embeddings")
                embeddings, nodes = generate_spectral_embeddings(G)

                if embeddings is None or len(embeddings) == 0:
                    print(f"Skipping {file} due to embedding failure")
                    continue

                # Final check for NaN values before similarity calculation
                if np.isnan(embeddings).any():
                    print(f"NaN values in embeddings after processing, fixing...")
                    embeddings = np.nan_to_num(embeddings, nan=0.0)
                
                # Compute similarity matrix
                similarity_matrix = compute_cosine_similarity(embeddings)
                
                # Verify the matrix properties
                issues = []
                if np.isnan(similarity_matrix).any():
                    print(f"NaN values in similarity matrix, fixing...")
                    similarity_matrix = np.nan_to_num(similarity_matrix, nan=0.0)
                    issues.append("Contains NaN (fixed)")
                
                if not np.allclose(similarity_matrix, similarity_matrix.T, rtol=1e-5, atol=1e-8):
                    print(f"Matrix not symmetric, enforcing symmetry...")
                    similarity_matrix = (similarity_matrix + similarity_matrix.T) / 2
                    issues.append("Matrix not symmetric (fixed)")
                
                if not np.allclose(np.diag(similarity_matrix), np.ones(similarity_matrix.shape[0]), rtol=1e-5, atol=1e-8):
                    print(f"Diagonal values not 1, fixing...")
                    np.fill_diagonal(similarity_matrix, 1.0)
                    issues.append("Diagonal values not 1 (fixed)")

                # Save as CSV
                df = pd.DataFrame(similarity_matrix, index=nodes, columns=nodes)
                output_path = os.path.join(OUTPUT_DIR, f"{file.replace('.gml', '_cosine_similarity.csv')}")
                df.to_csv(output_path)
                
                status = "Saved" if not issues else f"Saved with fixes: {', '.join(issues)}"
                print(f"{status} {output_path}")
                
        except Exception as e:
            print(f"Error processing {file}: {e}")

print("\n Spectral embeddings and cosine similarity computed for all small networks!")

  0%|          | 0/20 [00:00<?, ?it/s]

🔵 Processing BUP_train_0.gml with Spectral Embeddings


 20%|██        | 4/20 [00:00<00:02,  5.56it/s]

✅ Saved small_network_similarity_results\BUP_train_0_cosine_similarity.csv
🔵 Processing BUP_train_1.gml with Spectral Embeddings
✅ Saved small_network_similarity_results\BUP_train_1_cosine_similarity.csv
🔵 Processing BUP_train_2.gml with Spectral Embeddings
✅ Saved small_network_similarity_results\BUP_train_2_cosine_similarity.csv
🔵 Processing BUP_train_3.gml with Spectral Embeddings
✅ Saved small_network_similarity_results\BUP_train_3_cosine_similarity.csv
🔵 Processing BUP_train_4.gml with Spectral Embeddings
✅ Saved small_network_similarity_results\BUP_train_4_cosine_similarity.csv
🔵 Processing CEG_train_0.gml with Spectral Embeddings


 30%|███       | 6/20 [00:01<00:02,  5.88it/s]

✅ Saved small_network_similarity_results\CEG_train_0_cosine_similarity.csv
🔵 Processing CEG_train_1.gml with Spectral Embeddings


 35%|███▌      | 7/20 [00:01<00:02,  4.95it/s]

✅ Saved small_network_similarity_results\CEG_train_1_cosine_similarity.csv
🔵 Processing CEG_train_2.gml with Spectral Embeddings


 40%|████      | 8/20 [00:01<00:02,  4.47it/s]

✅ Saved small_network_similarity_results\CEG_train_2_cosine_similarity.csv
🔵 Processing CEG_train_3.gml with Spectral Embeddings


 45%|████▌     | 9/20 [00:02<00:02,  4.34it/s]

✅ Saved small_network_similarity_results\CEG_train_3_cosine_similarity.csv
🔵 Processing CEG_train_4.gml with Spectral Embeddings


 50%|█████     | 10/20 [00:02<00:02,  3.42it/s]

✅ Saved small_network_similarity_results\CEG_train_4_cosine_similarity.csv
🔵 Processing INF_train_0.gml with Spectral Embeddings


 55%|█████▌    | 11/20 [00:02<00:02,  3.14it/s]

✅ Saved small_network_similarity_results\INF_train_0_cosine_similarity.csv
🔵 Processing INF_train_1.gml with Spectral Embeddings


 60%|██████    | 12/20 [00:03<00:02,  2.99it/s]

✅ Saved small_network_similarity_results\INF_train_1_cosine_similarity.csv
🔵 Processing INF_train_2.gml with Spectral Embeddings


 65%|██████▌   | 13/20 [00:03<00:02,  2.91it/s]

✅ Saved small_network_similarity_results\INF_train_2_cosine_similarity.csv
🔵 Processing INF_train_3.gml with Spectral Embeddings


 70%|███████   | 14/20 [00:04<00:02,  2.77it/s]

✅ Saved small_network_similarity_results\INF_train_3_cosine_similarity.csv
🔵 Processing INF_train_4.gml with Spectral Embeddings


 75%|███████▌  | 15/20 [00:04<00:01,  2.68it/s]

✅ Saved small_network_similarity_results\INF_train_4_cosine_similarity.csv
🔵 Processing UAL_train_0.gml with Spectral Embeddings


 80%|████████  | 16/20 [00:04<00:01,  2.92it/s]

✅ Saved small_network_similarity_results\UAL_train_0_cosine_similarity.csv
🔵 Processing UAL_train_1.gml with Spectral Embeddings


 85%|████████▌ | 17/20 [00:04<00:00,  3.13it/s]

✅ Saved small_network_similarity_results\UAL_train_1_cosine_similarity.csv
🔵 Processing UAL_train_2.gml with Spectral Embeddings


 90%|█████████ | 18/20 [00:05<00:00,  3.18it/s]

✅ Saved small_network_similarity_results\UAL_train_2_cosine_similarity.csv
🔵 Processing UAL_train_3.gml with Spectral Embeddings


 95%|█████████▌| 19/20 [00:05<00:00,  3.31it/s]

✅ Saved small_network_similarity_results\UAL_train_3_cosine_similarity.csv
🔵 Processing UAL_train_4.gml with Spectral Embeddings


100%|██████████| 20/20 [00:06<00:00,  3.31it/s]

✅ Saved small_network_similarity_results\UAL_train_4_cosine_similarity.csv

🚀 Spectral embeddings and cosine similarity computed for all small networks!





In [None]:
import os
import pandas as pd
import numpy as np

def validate_cosine_similarity_csv(csv_folder):
    results = []

    for filename in os.listdir(csv_folder):
        if filename.endswith(".csv"):
            file_path = os.path.join(csv_folder, filename)
            print(f"🔍 Checking {filename}...")

            try:
                df = pd.read_csv(file_path, index_col=0)
                matrix = df.values

                issues = []
                score = 100

                # Check for NaN/Inf
                if np.isnan(matrix).any():
                    issues.append(" Contains NaN")
                    score -= 25
                if np.isinf(matrix).any():
                    issues.append(" Contains Inf")
                    score -= 25

                # Check symmetry
                if not np.allclose(matrix, matrix.T, atol=1e-5):
                    issues.append(" Matrix not symmetric")
                    score -= 15

                # Check diagonal
                diagonal = np.diag(matrix)
                if not np.allclose(diagonal, 1.0, atol=1e-2):
                    issues.append(" Diagonal values not 1")
                    score -= 10

                # Check range [0, 1]
                if matrix.min() < 0 or matrix.max() > 1:
                    issues.append(" Values not in [0, 1]")
                    score -= 10

                # Summary stats
                mean_val = np.mean(matrix)
                std_val = np.std(matrix)

                if score == 100:
                    issues.append(" OK")

                results.append({
                    "file": filename,
                    "mean": round(mean_val, 4),
                    "std": round(std_val, 4),
                    "score": score,
                    "issues": "; ".join(issues)
                })

            except Exception as e:
                results.append({
                    "file": filename,
                    "mean": "N/A",
                    "std": "N/A",
                    "score": 0,
                    "issues": f" Error reading file: {e}"
                })

    return pd.DataFrame(results)


# Example usage:
csv_folder = "cosine_training"  # <- replace with your folder path
report_df = validate_cosine_similarity_csv(csv_folder)
print("\n Summary Report:\n")
print(report_df.sort_values(by="score"))

# Optionally save the report
report_df.to_csv("cosine_csv_quality_report.csv", index=False)
print(" Quality report saved as 'cosine_csv_quality_report.csv'")

🔍 Checking ADV_train_0_cosine_similarity.csv...
🔍 Checking ADV_train_1_cosine_similarity.csv...
🔍 Checking ADV_train_2_cosine_similarity.csv...
🔍 Checking ADV_train_3_cosine_similarity.csv...
🔍 Checking ADV_train_4_cosine_similarity.csv...
🔍 Checking BUP_train_0_cosine_similarity.csv...
🔍 Checking BUP_train_1_cosine_similarity.csv...
🔍 Checking BUP_train_2_cosine_similarity.csv...
🔍 Checking BUP_train_3_cosine_similarity.csv...
🔍 Checking BUP_train_4_cosine_similarity.csv...
🔍 Checking CDM_train_0_cosine_similarity.csv...
🔍 Checking CDM_train_1_cosine_similarity.csv...
🔍 Checking CDM_train_2_cosine_similarity.csv...
🔍 Checking CDM_train_3_cosine_similarity.csv...
🔍 Checking CDM_train_4_cosine_similarity.csv...
🔍 Checking CEG_train_0_cosine_similarity.csv...
🔍 Checking CEG_train_1_cosine_similarity.csv...
🔍 Checking CEG_train_2_cosine_similarity.csv...
🔍 Checking CEG_train_3_cosine_similarity.csv...
🔍 Checking CEG_train_4_cosine_similarity.csv...
🔍 Checking CGS_train_0_cosine_similarity

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm import trange

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Directories
DATA_DIR = "processed_training_graphs"       # Input graphs (.gml files)
COSINE_SIM_DIR = "cosine_similarity"           # (Optional) cosine similarity matrices
MODEL_DIR = "models"                           # Where to save the trained model
os.makedirs(MODEL_DIR, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ---------------------------
# Define GCN Layer and Model using sparse operations
# ---------------------------
class GCNLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(GCNLayer, self).__init__()
        self.linear = nn.Linear(in_features, out_features)
    def forward(self, x, adj_sparse):
        # adj_sparse is a sparse tensor
        # Compute normalization: D^{-1/2} A D^{-1/2} should have been precomputed.
        support = self.linear(x)
        output = torch.sparse.mm(adj_sparse, support)
        return output

class GCNLinkPredictor(nn.Module):
    def __init__(self, in_features, hidden_dim, out_dim, dropout=0.2):
        super(GCNLinkPredictor, self).__init__()
        self.gc1 = GCNLayer(in_features, hidden_dim)
        self.gc2 = GCNLayer(hidden_dim, out_dim)
        self.dropout = dropout
    def encode(self, x, adj_sparse):
        x = self.gc1(x, adj_sparse)
        x = F.relu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj_sparse)
        return x
    def decode(self, z, edge_index):
        src, dst = edge_index
        return torch.sum(z[src] * z[dst], dim=1)
    def forward(self, x, adj_sparse, edge_index):
        z = self.encode(x, adj_sparse)
        return self.decode(z, edge_index)

# -----------------------------------------------------
# GraphDataset: Convert dense adjacencies to sparse tensors
# -----------------------------------------------------
class GraphDataset:
    def __init__(self, graph_dir, embedding_dir=None, cosine_dir=None):
        self.graph_dir = graph_dir
        self.embedding_dir = embedding_dir
        self.cosine_dir = cosine_dir
        self.graphs = []
        self.node_features = []
        self.adj_matrices = []  # stores sparse tensors now
        self.edge_lists = []
        self.train_pos_edges = []
        self.val_pos_edges = []
        self.val_neg_edges = []
        self.test_pos_edges = []
        self.test_neg_edges = []
        self._load_graphs()
    def _load_graphs(self):
        graph_files = [f for f in os.listdir(self.graph_dir) if f.endswith(".gml")]
        print(f"Found {len(graph_files)} graph files")
        for i, graph_file in enumerate(graph_files):
            graph_path = os.path.join(self.graph_dir, graph_file)
            try:
                G = nx.read_gml(graph_path)
                for node in G.nodes():
                    G.nodes[node]['original_label'] = str(node)
                G = nx.convert_node_labels_to_integers(G, label_attribute='original_label')
                # Create dense adjacency and then convert to torch sparse tensor
                adj_dense = nx.to_numpy_array(G)
                adj_tensor = torch.FloatTensor(adj_dense)
                # Convert to sparse: use coalesce to ensure uniqueness of indices
                adj_sparse = adj_tensor.to_sparse().coalesce()
                num_nodes = len(G.nodes())
                # Basic features as identity matrix
                node_features = torch.eye(num_nodes)
                # (Optional augmentation omitted for brevity)
                edges = list(G.edges())
                edge_index = torch.tensor([[u, v] for u, v in edges], dtype=torch.long).t()
                self._split_edges(edge_index, num_nodes)
                self.graphs.append(G)
                self.node_features.append(node_features)
                self.adj_matrices.append(adj_sparse)
                self.edge_lists.append(edge_index)
                if i < 3:
                    print(f"Successfully loaded {graph_file} with {num_nodes} nodes and {len(edges)} edges")
            except Exception as e:
                print(f"Error processing {graph_file}: {str(e)}")
    def _split_edges(self, edge_index, num_nodes):
        edges = edge_index.t().numpy()
        edge_set = set([(int(u), int(v)) for u, v in edges])
        num_edges = len(edge_set)
        num_val = max(1, int(0.1 * num_edges))
        num_test = max(1, int(0.1 * num_edges))
        edge_list = list(edge_set)
        np.random.shuffle(edge_list)
        test_edges = edge_list[:num_test]
        val_edges = edge_list[num_test:num_test+num_val]
        train_edges = edge_list[num_test+num_val:]
        all_edges = set(edge_list + [(v, u) for u, v in edge_list])
        non_edges = []
        while len(non_edges) < num_val + num_test:
            batch_size = max(1000, (num_val + num_test) - len(non_edges))
            u_samples = np.random.randint(0, num_nodes, size=batch_size)
            v_samples = np.random.randint(0, num_nodes, size=batch_size)
            valid = (u_samples != v_samples)
            for u, v in zip(u_samples[valid], v_samples[valid]):
                if (u, v) not in all_edges and (v, u) not in all_edges:
                    non_edges.append((u, v))
                    all_edges.add((u, v))
                    if len(non_edges) >= num_val + num_test:
                        break
        val_neg_edges = non_edges[:num_val]
        test_neg_edges = non_edges[num_val:num_val+num_test]
        self.train_pos_edges.append(torch.tensor(train_edges, dtype=torch.long).t())
        self.val_pos_edges.append(torch.tensor(val_edges, dtype=torch.long).t())
        self.test_pos_edges.append(torch.tensor(test_edges, dtype=torch.long).t())
        self.val_neg_edges.append(torch.tensor(val_neg_edges, dtype=torch.long).t())
        self.test_neg_edges.append(torch.tensor(test_neg_edges, dtype=torch.long).t())
    def __len__(self):
        return len(self.graphs)
    def __getitem__(self, idx):
        return {
            'graph': self.graphs[idx],
            'features': self.node_features[idx],
            'adj': self.adj_matrices[idx],
            'edges': self.edge_lists[idx],
            'train_pos': self.train_pos_edges[idx],
            'val_pos': self.val_pos_edges[idx],
            'val_neg': self.val_neg_edges[idx],
            'test_pos': self.test_pos_edges[idx],
            'test_neg': self.test_neg_edges[idx]
        }

def generate_negative_edges(pos_edges, num_nodes, num_samples):
    """Vectorized negative edge generator."""
    pos_edge_set = set()
    for i in range(pos_edges.shape[1]):
        u, v = pos_edges[0, i].item(), pos_edges[1, i].item()
        pos_edge_set.add((u, v))
        pos_edge_set.add((v, u))
    neg_edges = []
    while len(neg_edges) < num_samples:
        batch_size = max(1000, num_samples - len(neg_edges))
        u_samples = np.random.randint(0, num_nodes, size=batch_size)
        v_samples = np.random.randint(0, num_nodes, size=batch_size)
        valid = (u_samples != v_samples)
        for u, v in zip(u_samples[valid], v_samples[valid]):
            if (u, v) not in pos_edge_set:
                neg_edges.append([u, v])
                pos_edge_set.add((u, v))
                if len(neg_edges) >= num_samples:
                    break
    return torch.tensor(neg_edges, dtype=torch.long).t()

# --------------------------------------------------
# Training Function: Single Model for Link Prediction (using sparse adjacencies)
# --------------------------------------------------
def train_link_prediction(dataset, epochs=100, hidden_dim=128, out_dim=64, lr=0.01, weight_decay=5e-4):
    """Train a single GCN model for link prediction across the entire dataset and save the model."""
    if len(dataset) == 0:
        raise ValueError("No graphs loaded successfully. Please check your data files.")
    max_features = max([data['features'].shape[1] for data in dataset])
    print(f"Maximum feature dimension: {max_features}")
    model = GCNLinkPredictor(max_features, hidden_dim, out_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_auc = 0
    best_model_state = None
    patience = 10
    patience_counter = 0
    for epoch in trange(1, epochs+1, desc="Training epochs"):
        model.train()
        total_loss = 0
        for i in range(len(dataset)):
            data = dataset[i]
            features = data['features']
            if features.shape[1] < max_features:
                padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                features = torch.cat([features, padding], dim=1)
            features = features.to(device)
            # Use the sparse adjacency tensor
            adj_sparse = data['adj'].to(device)
            train_pos = data['train_pos'].to(device)
            num_nodes = features.shape[0]
            train_neg = generate_negative_edges(train_pos, num_nodes, train_pos.shape[1]).to(device)
            optimizer.zero_grad()
            z = model.encode(features, adj_sparse)
            pos_score = model.decode(z, train_pos)
            neg_score = model.decode(z, train_neg)
            loss = F.binary_cross_entropy_with_logits(
                torch.cat([pos_score, neg_score]),
                torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataset)
        model.eval()
        val_auc = 0
        with torch.no_grad():
            for i in range(len(dataset)):
                data = dataset[i]
                features = data['features']
                if features.shape[1] < max_features:
                    padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                    features = torch.cat([features, padding], dim=1)
                features = features.to(device)
                adj_sparse = data['adj'].to(device)
                val_pos = data['val_pos'].to(device)
                val_neg = data['val_neg'].to(device)
                z = model.encode(features, adj_sparse)
                pos_score = model.decode(z, val_pos).cpu().numpy()
                neg_score = model.decode(z, val_neg).cpu().numpy()
                scores = np.concatenate([pos_score, neg_score])
                labels = np.concatenate([np.ones_like(pos_score), np.zeros_like(neg_score)])
                val_auc += roc_auc_score(labels, scores)
        val_auc /= len(dataset)
        if epoch % 5 == 0:
            print(f"Epoch {epoch}/{epochs}: Loss = {avg_loss:.4f}, Val AUC = {val_auc:.4f}")
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
    model.load_state_dict(best_model_state)
    model.eval()
    test_auc = 0
    test_ap = 0
    with torch.no_grad():
        for i in range(len(dataset)):
            data = dataset[i]
            features = data['features']
            if features.shape[1] < max_features:
                padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                features = torch.cat([features, padding], dim=1)
            features = features.to(device)
            adj_sparse = data['adj'].to(device)
            test_pos = data['test_pos'].to(device)
            test_neg = data['test_neg'].to(device)
            z = model.encode(features, adj_sparse)
            pos_score = model.decode(z, test_pos).cpu().numpy()
            neg_score = model.decode(z, test_neg).cpu().numpy()
            scores = np.concatenate([pos_score, neg_score])
            labels = np.concatenate([np.ones_like(pos_score), np.zeros_like(neg_score)])
            test_auc += roc_auc_score(labels, scores)
            test_ap += average_precision_score(labels, scores)
    test_auc /= len(dataset)
    test_ap /= len(dataset)
    print(f"Test Results: AUC = {test_auc:.4f}, AP = {test_ap:.4f}")
    # Save the model
    model_path = os.path.join(MODEL_DIR, "gcn_link_predictor.pt")
    torch.save({
        'state_dict': best_model_state,
        'in_features': max_features,
        'hidden_dim': hidden_dim,
        'out_dim': out_dim
    }, model_path)
    print(f"Model saved to {model_path}")
    return model, test_auc, test_ap

if __name__ == "__main__":
    dataset = GraphDataset(DATA_DIR, embedding_dir=None, cosine_dir=COSINE_SIM_DIR)
    print(f"Successfully loaded {len(dataset)} graphs")
    if len(dataset) > 0:
        model, test_auc, test_ap = train_link_prediction(
            dataset,
            epochs=100,
            hidden_dim=128,
            out_dim=64,
            lr=0.01
        )
        print(f"\nTraining completed with Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")
    else:
        print("No graphs were successfully loaded. Please check your data files.")



Using device: cpu
Found 110 graph files
Successfully loaded ADV_train_0.gml with 4907 nodes and 31428 edges
Successfully loaded ADV_train_1.gml with 4891 nodes and 31428 edges
Successfully loaded ADV_train_2.gml with 4903 nodes and 31428 edges
Successfully loaded 110 graphs
Maximum feature dimension: 15719


Training epochs:   5%|▌         | 5/100 [12:31<3:53:23, 147.40s/it]

Epoch 5/100: Loss = 0.7181, Val AUC = 0.8352


Training epochs:  10%|█         | 10/100 [26:12<4:08:12, 165.47s/it]

Epoch 10/100: Loss = 1.3587, Val AUC = 0.8354


Training epochs:  15%|█▌        | 15/100 [43:29<4:46:13, 202.04s/it]

Epoch 15/100: Loss = 0.6920, Val AUC = 0.8354


Training epochs:  15%|█▌        | 15/100 [46:27<4:23:16, 185.85s/it]

Early stopping at epoch 16





Test Results: AUC = 0.8357, AP = 0.8384
Model saved to models\gcn_link_predictor.pt

Training completed with Test AUC: 0.8357, Test AP: 0.8384


In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm import trange

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Directories
DATA_DIR = "processed_training_graphs"       # Input graphs (.gml files)
COSINE_SIM_DIR = "cosine_similarity"           # (Optional) cosine similarity matrices
MODEL_DIR = "models"                           # Where to save the trained model
os.makedirs(MODEL_DIR, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ---------------------------
# Define GAT Layer and Model (using dense adjacency in attention)
# ---------------------------
class GATLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2, concat=True):
        super(GATLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.concat = concat
        self.dropout = dropout
        self.alpha = alpha

        self.W = nn.Parameter(torch.empty(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)

        # Instead of a single parameter "a", we create two: one for the source and one for the target.
        self.a_l = nn.Parameter(torch.empty(size=(out_features, 1)))
        self.a_r = nn.Parameter(torch.empty(size=(out_features, 1)))
        nn.init.xavier_uniform_(self.a_l.data, gain=1.414)
        nn.init.xavier_uniform_(self.a_r.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h, adj):
        # h: (N, in_features), adj: (N, N) dense adjacency matrix
        Wh = torch.mm(h, self.W)  # (N, out_features)
        # Compute attention coefficients efficiently:
        # (N,1) from source and (N,1) from target, then broadcast sum:
        f1 = torch.mm(Wh, self.a_l)  # (N, 1)
        f2 = torch.mm(Wh, self.a_r)  # (N, 1)
        # Broadcast: add f1 and f2.T to get (N, N) attention scores.
        e = self.leakyrelu(f1 + f2.t())
        
        # Masked attention: set entries where there's no edge to a huge negative value
        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, Wh)
        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime


    def _prepare_attentional_mechanism_input(self, Wh):
        N = Wh.size()[0]
        Wh_repeated_in = Wh.repeat_interleave(N, dim=0)
        Wh_repeated_out = Wh.repeat(N, 1)
        all_combinations = torch.cat([Wh_repeated_in, Wh_repeated_out], dim=1)
        return all_combinations.view(N, N, 2 * self.out_features)

class GATLinkPredictor(nn.Module):
    def __init__(self, in_features, hidden_dim, out_dim, dropout=0.6, alpha=0.2):
        super(GATLinkPredictor, self).__init__()
        # Use one GAT layer with concatenation followed by output layer (without concat)
        self.gat1 = GATLayer(in_features, hidden_dim, dropout=dropout, alpha=alpha, concat=True)
        self.gat2 = GATLayer(hidden_dim, out_dim, dropout=dropout, alpha=alpha, concat=False)
        self.dropout = dropout

    def encode(self, x, adj_sparse):
        # Convert sparse adjacency to dense
        adj_dense = adj_sparse.to_dense()
        x = self.gat1(x, adj_dense)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gat2(x, adj_dense)
        return x

    def decode(self, z, edge_index):
        src, dst = edge_index
        return torch.sum(z[src] * z[dst], dim=1)

    def forward(self, x, adj_sparse, edge_index):
        z = self.encode(x, adj_sparse)
        return self.decode(z, edge_index)

# -----------------------------------------------------
# GraphDataset: Convert dense adjacencies to sparse tensors
# -----------------------------------------------------
class GraphDataset:
    def __init__(self, graph_dir, embedding_dir=None, cosine_dir=None):
        self.graph_dir = graph_dir
        self.embedding_dir = embedding_dir
        self.cosine_dir = cosine_dir
        self.graphs = []
        self.node_features = []
        self.adj_matrices = []  # stores sparse tensors now
        self.edge_lists = []
        self.train_pos_edges = []
        self.val_pos_edges = []
        self.val_neg_edges = []
        self.test_pos_edges = []
        self.test_neg_edges = []
        self._load_graphs()

    def _load_graphs(self):
        graph_files = [f for f in os.listdir(self.graph_dir) if f.endswith(".gml")]
        print(f"Found {len(graph_files)} graph files")
        for i, graph_file in enumerate(graph_files):
            graph_path = os.path.join(self.graph_dir, graph_file)
            try:
                G = nx.read_gml(graph_path)
                for node in G.nodes():
                    G.nodes[node]['original_label'] = str(node)
                G = nx.convert_node_labels_to_integers(G, label_attribute='original_label')
                # Create dense adjacency and then convert to torch sparse tensor
                adj_dense = nx.to_numpy_array(G)
                adj_tensor = torch.FloatTensor(adj_dense)
                adj_sparse = adj_tensor.to_sparse().coalesce()
                num_nodes = len(G.nodes())
                # Basic features as identity matrix
                node_features = torch.eye(num_nodes)
                # (Optional augmentation omitted for brevity)
                edges = list(G.edges())
                edge_index = torch.tensor([[u, v] for u, v in edges], dtype=torch.long).t()
                self._split_edges(edge_index, num_nodes)
                self.graphs.append(G)
                self.node_features.append(node_features)
                self.adj_matrices.append(adj_sparse)
                self.edge_lists.append(edge_index)
                if i < 3:
                    print(f"Successfully loaded {graph_file} with {num_nodes} nodes and {len(edges)} edges")
            except Exception as e:
                print(f"Error processing {graph_file}: {str(e)}")

    def _split_edges(self, edge_index, num_nodes):
        edges = edge_index.t().numpy()
        edge_set = set([(int(u), int(v)) for u, v in edges])
        num_edges = len(edge_set)
        num_val = max(1, int(0.1 * num_edges))
        num_test = max(1, int(0.1 * num_edges))
        edge_list = list(edge_set)
        np.random.shuffle(edge_list)
        test_edges = edge_list[:num_test]
        val_edges = edge_list[num_test:num_test+num_val]
        train_edges = edge_list[num_test+num_val:]
        all_edges = set(edge_list + [(v, u) for u, v in edge_list])
        non_edges = []
        while len(non_edges) < num_val + num_test:
            batch_size = max(1000, (num_val + num_test) - len(non_edges))
            u_samples = np.random.randint(0, num_nodes, size=batch_size)
            v_samples = np.random.randint(0, num_nodes, size=batch_size)
            valid = (u_samples != v_samples)
            for u, v in zip(u_samples[valid], v_samples[valid]):
                if (u, v) not in all_edges and (v, u) not in all_edges:
                    non_edges.append((u, v))
                    all_edges.add((u, v))
                    if len(non_edges) >= num_val + num_test:
                        break
        val_neg_edges = non_edges[:num_val]
        test_neg_edges = non_edges[num_val:num_val+num_test]
        self.train_pos_edges.append(torch.tensor(train_edges, dtype=torch.long).t())
        self.val_pos_edges.append(torch.tensor(val_edges, dtype=torch.long).t())
        self.test_pos_edges.append(torch.tensor(test_edges, dtype=torch.long).t())
        self.val_neg_edges.append(torch.tensor(val_neg_edges, dtype=torch.long).t())
        self.test_neg_edges.append(torch.tensor(test_neg_edges, dtype=torch.long).t())

    def __len__(self):
        return len(self.graphs)

    def __getitem__(self, idx):
        return {
            'graph': self.graphs[idx],
            'features': self.node_features[idx],
            'adj': self.adj_matrices[idx],
            'edges': self.edge_lists[idx],
            'train_pos': self.train_pos_edges[idx],
            'val_pos': self.val_pos_edges[idx],
            'val_neg': self.val_neg_edges[idx],
            'test_pos': self.test_pos_edges[idx],
            'test_neg': self.test_neg_edges[idx]
        }

def generate_negative_edges(pos_edges, num_nodes, num_samples):
    """Vectorized negative edge generator."""
    pos_edge_set = set()
    for i in range(pos_edges.shape[1]):
        u, v = pos_edges[0, i].item(), pos_edges[1, i].item()
        pos_edge_set.add((u, v))
        pos_edge_set.add((v, u))
    neg_edges = []
    while len(neg_edges) < num_samples:
        batch_size = max(1000, num_samples - len(neg_edges))
        u_samples = np.random.randint(0, num_nodes, size=batch_size)
        v_samples = np.random.randint(0, num_nodes, size=batch_size)
        valid = (u_samples != v_samples)
        for u, v in zip(u_samples[valid], v_samples[valid]):
            if (u, v) not in pos_edge_set:
                neg_edges.append([u, v])
                pos_edge_set.add((u, v))
                if len(neg_edges) >= num_samples:
                    break
    return torch.tensor(neg_edges, dtype=torch.long).t()

# --------------------------------------------------
# Training Function: Single Model for Link Prediction using GAT
# --------------------------------------------------
def train_link_prediction(dataset, epochs=100, hidden_dim=128, out_dim=64, lr=0.01, weight_decay=5e-4):
    """Train a single GAT model for link prediction across the entire dataset and save the model."""
    if len(dataset) == 0:
        raise ValueError("No graphs loaded successfully. Please check your data files.")
    max_features = max([data['features'].shape[1] for data in dataset])
    print(f"Maximum feature dimension: {max_features}")
    model = GATLinkPredictor(max_features, hidden_dim, out_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_auc = 0
    best_model_state = None
    patience = 10
    patience_counter = 0
    for epoch in trange(1, epochs+1, desc="Training epochs"):
        model.train()
        total_loss = 0
        for i in range(len(dataset)):
            data = dataset[i]
            features = data['features']
            if features.shape[1] < max_features:
                padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                features = torch.cat([features, padding], dim=1)
            features = features.to(device)
            # Use the sparse adjacency tensor
            adj_sparse = data['adj'].to(device)
            train_pos = data['train_pos'].to(device)
            num_nodes = features.shape[0]
            train_neg = generate_negative_edges(train_pos, num_nodes, train_pos.shape[1]).to(device)
            optimizer.zero_grad()
            z = model.encode(features, adj_sparse)
            pos_score = model.decode(z, train_pos)
            neg_score = model.decode(z, train_neg)
            loss = F.binary_cross_entropy_with_logits(
                torch.cat([pos_score, neg_score]),
                torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataset)
        model.eval()
        val_auc = 0
        with torch.no_grad():
            for i in range(len(dataset)):
                data = dataset[i]
                features = data['features']
                if features.shape[1] < max_features:
                    padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                    features = torch.cat([features, padding], dim=1)
                features = features.to(device)
                adj_sparse = data['adj'].to(device)
                val_pos = data['val_pos'].to(device)
                val_neg = data['val_neg'].to(device)
                z = model.encode(features, adj_sparse)
                pos_score = model.decode(z, val_pos).cpu().numpy()
                neg_score = model.decode(z, val_neg).cpu().numpy()
                scores = np.concatenate([pos_score, neg_score])
                labels = np.concatenate([np.ones_like(pos_score), np.zeros_like(neg_score)])
                val_auc += roc_auc_score(labels, scores)
        val_auc /= len(dataset)
        if epoch % 5 == 0:
            print(f"Epoch {epoch}/{epochs}: Loss = {avg_loss:.4f}, Val AUC = {val_auc:.4f}")
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
    model.load_state_dict(best_model_state)
    model.eval()
    test_auc = 0
    test_ap = 0
    with torch.no_grad():
        for i in range(len(dataset)):
            data = dataset[i]
            features = data['features']
            if features.shape[1] < max_features:
                padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                features = torch.cat([features, padding], dim=1)
            features = features.to(device)
            adj_sparse = data['adj'].to(device)
            test_pos = data['test_pos'].to(device)
            test_neg = data['test_neg'].to(device)
            z = model.encode(features, adj_sparse)
            pos_score = model.decode(z, test_pos).cpu().numpy()
            neg_score = model.decode(z, test_neg).cpu().numpy()
            scores = np.concatenate([pos_score, neg_score])
            labels = np.concatenate([np.ones_like(pos_score), np.zeros_like(neg_score)])
            test_auc += roc_auc_score(labels, scores)
            test_ap += average_precision_score(labels, scores)
    test_auc /= len(dataset)
    test_ap /= len(dataset)
    print(f"Test Results: AUC = {test_auc:.4f}, AP = {test_ap:.4f}")
    # Save the model
    model_path = os.path.join(MODEL_DIR, "gat_link_predictor.pt")
    torch.save({
        'state_dict': best_model_state,
        'in_features': max_features,
        'hidden_dim': hidden_dim,
        'out_dim': out_dim
    }, model_path)
    print(f"Model saved to {model_path}")
    return model, test_auc, test_ap

if __name__ == "__main__":
    dataset = GraphDataset(DATA_DIR, embedding_dir=None, cosine_dir=COSINE_SIM_DIR)
    print(f"Successfully loaded {len(dataset)} graphs")
    if len(dataset) > 0:
        model, test_auc, test_ap = train_link_prediction(
            dataset,
            epochs=100,
            hidden_dim=128,
            out_dim=64,
            lr=0.01
        )
        print(f"\nTraining completed with Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")
    else:
        print("No graphs were successfully loaded. Please check your data files.")



Using device: cpu
Found 110 graph files
Successfully loaded ADV_train_0.gml with 4907 nodes and 31428 edges
Successfully loaded ADV_train_1.gml with 4891 nodes and 31428 edges
Successfully loaded ADV_train_2.gml with 4903 nodes and 31428 edges
Successfully loaded 110 graphs
Maximum feature dimension: 15719


Training epochs:   5%|▌         | 5/100 [53:31<19:38:55, 744.58s/it]

Epoch 5/100: Loss = 0.6931, Val AUC = 0.6304


Training epochs:  10%|█         | 10/100 [1:39:41<13:19:38, 533.10s/it]

Epoch 10/100: Loss = 0.6931, Val AUC = 0.5000


Training epochs:  12%|█▏        | 12/100 [1:58:58<14:32:29, 594.88s/it]

Early stopping at epoch 13





Test Results: AUC = 0.5000, AP = 0.5000
Model saved to models\gat_link_predictor.pt

Training completed with Test AUC: 0.5000, Test AP: 0.5000


In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score, average_precision_score
from tqdm import trange

# Set seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(42)

# Directories
DATA_DIR = "processed_training_graphs"       # Input graphs (.gml files)
COSINE_SIM_DIR = "cosine_similarity"           # (Optional) cosine similarity matrices
MODEL_DIR = "models"                           # Where to save the trained model
os.makedirs(MODEL_DIR, exist_ok=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# ------------------------------------------------------------------
# Utility: Create low-dimensional node features using node degree
# ------------------------------------------------------------------
def get_node_features(G):
    # Use the node degree as a 1-dimensional feature per node
    degrees = np.array([G.degree(n) for n in G.nodes()], dtype=np.float32)
    mean = degrees.mean()
    std = degrees.std() if degrees.std() > 0 else 1.0
    normalized = (degrees - mean) / std
    # Returns a tensor of shape (num_nodes, 1)
    return torch.tensor(normalized).unsqueeze(1)

# ------------------------------------------------------------------
# Define GraphSage Layer and GraphSage Link Predictor Model
# ------------------------------------------------------------------
class GraphSageLayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(GraphSageLayer, self).__init__()
        # Layer takes concatenation of node's own features and the mean of its neighbors' features.
        self.linear = nn.Linear(in_features * 2, out_features)
        self.activation = nn.ReLU()
    
    def forward(self, x, adj_sparse):
        # x: (N, in_features); adj_sparse: (N, N) sparse tensor.
        # Aggregate neighbors’ features
        agg = torch.sparse.mm(adj_sparse, x)
        # Compute degree for proper mean aggregation (avoid division by zero)
        degrees = torch.sparse.sum(adj_sparse, dim=1).to_dense().unsqueeze(1).clamp(min=1)
        mean_agg = agg / degrees
        # Concatenate node's own features and aggregated neighborhood features
        out = torch.cat([x, mean_agg], dim=1)
        out = self.linear(out)
        out = self.activation(out)
        return out

class GraphSageLinkPredictor(nn.Module):
    def __init__(self, in_features, hidden_dim, out_dim, dropout=0.2):
        super(GraphSageLinkPredictor, self).__init__()
        self.gs1 = GraphSageLayer(in_features, hidden_dim)
        self.gs2 = GraphSageLayer(hidden_dim, out_dim)
        self.dropout = dropout
    # Passes node features through two GraphSage layers
    # and applies dropout between them.
    def encode(self, x, adj_sparse):
        x = self.gs1(x, adj_sparse)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gs2(x, adj_sparse)
        return x
    #computes link prediction scores by taking the dot product of node embeddings
    def decode(self, z, edge_index):
        src, dst = edge_index
        return torch.sum(z[src] * z[dst], dim=1)
    
    def forward(self, x, adj_sparse, edge_index):
        z = self.encode(x, adj_sparse)
        return self.decode(z, edge_index)

# ------------------------------------------------------------------
# GraphDataset: Load graphs and generate features/adjacencies and edge splits.
# Node features now use get_node_features for lower memory usage.
# it loads graphs from the specified directory, generates node features,
# and splits edges into training, validation, and test sets.
# ------------------------------------------------------------------
class GraphDataset:
    def __init__(self, graph_dir, embedding_dir=None, cosine_dir=None):
        self.graph_dir = graph_dir
        self.embedding_dir = embedding_dir
        self.cosine_dir = cosine_dir
        self.graphs = []
        self.node_features = []
        self.adj_matrices = []  # Sparse tensors
        self.edge_lists = []
        self.train_pos_edges = []
        self.val_pos_edges = []
        self.val_neg_edges = []
        self.test_pos_edges = []
        self.test_neg_edges = []
        self._load_graphs()
    #this function loads graphs from the specified directory, generates node features,
    def _load_graphs(self):
        graph_files = [f for f in os.listdir(self.graph_dir) if f.endswith(".gml")]
        print(f"Found {len(graph_files)} graph files")
        for i, graph_file in enumerate(graph_files):
            graph_path = os.path.join(self.graph_dir, graph_file)
            try:
                G = nx.read_gml(graph_path)
                for node in G.nodes():
                    G.nodes[node]['original_label'] = str(node)
                G = nx.convert_node_labels_to_integers(G, label_attribute='original_label')
                # Create dense adjacency then convert to a sparse tensor
                adj_dense = nx.to_numpy_array(G)
                adj_tensor = torch.FloatTensor(adj_dense)
                adj_sparse = adj_tensor.to_sparse().coalesce()
                num_nodes = len(G.nodes())
                # Instead of an identity matrix, use a low-dimensional feature (node degree)
                node_features = get_node_features(G)
                edges = list(G.edges())
                edge_index = torch.tensor([[u, v] for u, v in edges], dtype=torch.long).t()
                self._split_edges(edge_index, num_nodes)
                self.graphs.append(G)
                self.node_features.append(node_features)
                self.adj_matrices.append(adj_sparse)
                self.edge_lists.append(edge_index)
                if i < 3:
                    print(f"Successfully loaded {graph_file} with {num_nodes} nodes and {len(edges)} edges")
            except Exception as e:
                print(f"Error processing {graph_file}: {str(e)}")
    #this function splits edges into training, validation, and test sets.
    def _split_edges(self, edge_index, num_nodes):
        edges = edge_index.t().numpy()
        edge_set = set([(int(u), int(v)) for u, v in edges])
        num_edges = len(edge_set)
        num_val = max(1, int(0.1 * num_edges))
        num_test = max(1, int(0.1 * num_edges))
        edge_list = list(edge_set)
        np.random.shuffle(edge_list)
        test_edges = edge_list[:num_test]
        val_edges = edge_list[num_test:num_test+num_val]
        train_edges = edge_list[num_test+num_val:]
        all_edges = set(edge_list + [(v, u) for u, v in edge_list])
        non_edges = []
        while len(non_edges) < num_val + num_test:
            batch_size = max(1000, (num_val + num_test) - len(non_edges))
            u_samples = np.random.randint(0, num_nodes, size=batch_size)
            v_samples = np.random.randint(0, num_nodes, size=batch_size)
            valid = (u_samples != v_samples)
            for u, v in zip(u_samples[valid], v_samples[valid]):
                if (u, v) not in all_edges and (v, u) not in all_edges:
                    non_edges.append((u, v))
                    all_edges.add((u, v))
                    if len(non_edges) >= num_val + num_test:
                        break
        val_neg_edges = non_edges[:num_val]
        test_neg_edges = non_edges[num_val:num_val+num_test]
        self.train_pos_edges.append(torch.tensor(train_edges, dtype=torch.long).t())
        self.val_pos_edges.append(torch.tensor(val_edges, dtype=torch.long).t())
        self.test_pos_edges.append(torch.tensor(test_edges, dtype=torch.long).t())
        self.val_neg_edges.append(torch.tensor(val_neg_edges, dtype=torch.long).t())
        self.test_neg_edges.append(torch.tensor(test_neg_edges, dtype=torch.long).t())
    #this function returns the number of graphs in the dataset.
    def __len__(self):
        return len(self.graphs)
    
    def __getitem__(self, idx):
        return {
            'graph': self.graphs[idx],
            'features': self.node_features[idx],
            'adj': self.adj_matrices[idx],
            'edges': self.edge_lists[idx],
            'train_pos': self.train_pos_edges[idx],
            'val_pos': self.val_pos_edges[idx],
            'val_neg': self.val_neg_edges[idx],
            'test_pos': self.test_pos_edges[idx],
            'test_neg': self.test_neg_edges[idx]
        }
#
def generate_negative_edges(pos_edges, num_nodes, num_samples):
    pos_edge_set = set()
    for i in range(pos_edges.shape[1]):
        u, v = pos_edges[0, i].item(), pos_edges[1, i].item()
        pos_edge_set.add((u, v))
        pos_edge_set.add((v, u))
    neg_edges = []
    while len(neg_edges) < num_samples:
        batch_size = max(1000, num_samples - len(neg_edges))
        u_samples = np.random.randint(0, num_nodes, size=batch_size)
        v_samples = np.random.randint(0, num_nodes, size=batch_size)
        valid = (u_samples != v_samples)
        for u, v in zip(u_samples[valid], v_samples[valid]):
            if (u, v) not in pos_edge_set:
                neg_edges.append([u, v])
                pos_edge_set.add((u, v))
                if len(neg_edges) >= num_samples:
                    break
    return torch.tensor(neg_edges, dtype=torch.long).t()

# ------------------------------------------------------------------
# Training Function for Link Prediction using GraphSage
# This function trains a GraphSage model for link prediction across the entire dataset.
# It uses the node degree as a low-dimensional feature.
# The model is trained using binary cross-entropy loss.
# The training process includes early stopping based on validation AUC.
# The model is saved after training.
# ------------------------------------------------------------------
def train_link_prediction(dataset, epochs=100, hidden_dim=128, out_dim=64, lr=0.01, weight_decay=5e-4):
    if len(dataset) == 0:
        raise ValueError("No graphs loaded successfully. Please check your data files.")
    max_features = max([data['features'].shape[1] for data in dataset])
    print(f"Maximum feature dimension: {max_features}")
    model = GraphSageLinkPredictor(max_features, hidden_dim, out_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    best_val_auc = 0
    best_model_state = None
    patience = 10
    patience_counter = 0
    for epoch in trange(1, epochs+1, desc="Training epochs"):
        model.train()
        total_loss = 0
        for i in range(len(dataset)):
            data = dataset[i]
            features = data['features']
            if features.shape[1] < max_features:
                padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                features = torch.cat([features, padding], dim=1)
            features = features.to(device)
            adj_sparse = data['adj'].to(device)
            train_pos = data['train_pos'].to(device)
            num_nodes = features.shape[0]
            train_neg = generate_negative_edges(train_pos, num_nodes, train_pos.shape[1]).to(device)
            optimizer.zero_grad()
            z = model.encode(features, adj_sparse)
            pos_score = model.decode(z, train_pos)
            neg_score = model.decode(z, train_neg)
            loss = F.binary_cross_entropy_with_logits(
                torch.cat([pos_score, neg_score]),
                torch.cat([torch.ones_like(pos_score), torch.zeros_like(neg_score)])
            )
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(dataset)
        model.eval()
        val_auc = 0
        with torch.no_grad():
            for i in range(len(dataset)):
                data = dataset[i]
                features = data['features']
                if features.shape[1] < max_features:
                    padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                    features = torch.cat([features, padding], dim=1)
                features = features.to(device)
                adj_sparse = data['adj'].to(device)
                val_pos = data['val_pos'].to(device)
                val_neg = data['val_neg'].to(device)
                z = model.encode(features, adj_sparse)
                pos_score = model.decode(z, val_pos).cpu().numpy()
                neg_score = model.decode(z, val_neg).cpu().numpy()
                scores = np.concatenate([pos_score, neg_score])
                labels = np.concatenate([np.ones_like(pos_score), np.zeros_like(neg_score)])
                val_auc += roc_auc_score(labels, scores)
        val_auc /= len(dataset)
        if epoch % 5 == 0:
            print(f"Epoch {epoch}/{epochs}: Loss = {avg_loss:.4f}, Val AUC = {val_auc:.4f}")
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
    model.load_state_dict(best_model_state)
    model.eval()
    test_auc = 0
    test_ap = 0
    with torch.no_grad():
        for i in range(len(dataset)):
            data = dataset[i]
            features = data['features']
            if features.shape[1] < max_features:
                padding = torch.zeros(features.shape[0], max_features - features.shape[1])
                features = torch.cat([features, padding], dim=1)
            features = features.to(device)
            adj_sparse = data['adj'].to(device)
            test_pos = data['test_pos'].to(device)
            test_neg = data['test_neg'].to(device)
            z = model.encode(features, adj_sparse)
            pos_score = model.decode(z, test_pos).cpu().numpy()
            neg_score = model.decode(z, test_neg).cpu().numpy()
            scores = np.concatenate([pos_score, neg_score])
            labels = np.concatenate([np.ones_like(pos_score), np.zeros_like(neg_score)])
            test_auc += roc_auc_score(labels, scores)
            test_ap += average_precision_score(labels, scores)
    test_auc /= len(dataset)
    test_ap /= len(dataset)
    print(f"Test Results: AUC = {test_auc:.4f}, AP = {test_ap:.4f}")
    model_path = os.path.join(MODEL_DIR, "graphsage_link_predictor.pt")
    torch.save({
        'state_dict': best_model_state,
        'in_features': max_features,
        'hidden_dim': hidden_dim,
        'out_dim': out_dim
    }, model_path)
    print(f"Model saved to {model_path}")
    return model, test_auc, test_ap

if __name__ == "__main__":
    dataset = GraphDataset(DATA_DIR, embedding_dir=None, cosine_dir=COSINE_SIM_DIR)
    print(f"Successfully loaded {len(dataset)} graphs")
    if len(dataset) > 0:
        model, test_auc, test_ap = train_link_prediction(
            dataset,
            epochs=100,
            hidden_dim=128,
            out_dim=64,
            lr=0.01
        )
        print(f"\nTraining completed with Test AUC: {test_auc:.4f}, Test AP: {test_ap:.4f}")
    else:
        print("No graphs were successfully loaded. Please check your data files.")

Using device: cpu
Found 110 graph files
Successfully loaded ADV_train_0.gml with 4907 nodes and 31428 edges
Successfully loaded ADV_train_1.gml with 4891 nodes and 31428 edges
Successfully loaded ADV_train_2.gml with 4903 nodes and 31428 edges
Successfully loaded 110 graphs
Maximum feature dimension: 1


Training epochs:   5%|▌         | 5/100 [05:18<1:42:16, 64.60s/it]

Epoch 5/100: Loss = 0.6259, Val AUC = 0.7568


Training epochs:  10%|█         | 10/100 [10:22<1:36:37, 64.42s/it]

Epoch 10/100: Loss = 0.5915, Val AUC = 0.7823


Training epochs:  15%|█▌        | 15/100 [15:41<1:29:18, 63.04s/it]

Epoch 15/100: Loss = 0.5941, Val AUC = 0.8091


Training epochs:  20%|██        | 20/100 [19:45<1:10:00, 52.51s/it]

Epoch 20/100: Loss = 0.5874, Val AUC = 0.8182


Training epochs:  21%|██        | 21/100 [21:09<1:19:37, 60.48s/it]

Early stopping at epoch 22





Test Results: AUC = 0.7938, AP = 0.8017
Model saved to models\graphsage_link_predictor.pt

Training completed with Test AUC: 0.7938, Test AP: 0.8017


In [None]:
import os
import networkx as nx
from tqdm import tqdm

# Directories
TRAINING_DIR = "Full_data"  # Directory containing training .net files
PROCESSED_DIR = "processed_full_graphs"  # Where preprocessed graphs (.gml) will be stored

# Ensure output directory exists
os.makedirs(PROCESSED_DIR, exist_ok=True)

### **Step 1: Convert .net to .gml and Preprocess Graphs**
def preprocess_graph(file_path, output_path):
    """Converts .net to .gml, removes isolated nodes, and normalizes labels."""
    G = nx.read_pajek(file_path)  # Load .net graph
    G = nx.Graph(G)  # Convert to undirected (if needed)
    G.remove_nodes_from(list(nx.isolates(G)))  # Remove isolated nodes

    # Convert node labels to integers (required for embeddings)
    G = nx.convert_node_labels_to_integers(G, label_attribute="original_label")

    # Save as .gml for better compatibility
    nx.write_gml(G, output_path)

### **Step 2: Process All Graphs in Training Data**
for file in tqdm(os.listdir(TRAINING_DIR)):
    if file.endswith(".net"):
        file_path = os.path.join(TRAINING_DIR, file)
        output_file = os.path.join(PROCESSED_DIR, file.replace(".net", ".gml"))

        # Convert & Preprocess
        preprocess_graph(file_path, output_file)

print(" Preprocessing Complete!")
print(f"Processed graphs are stored in: {PROCESSED_DIR}")


100%|██████████| 23/23 [00:20<00:00,  1.12it/s]

✅ Preprocessing Complete!
Processed graphs are stored in: processed_full_graphs





In [None]:
import os
import networkx as nx
from pyvis.network import Network

# Paths to the graphs – adjust as needed.
original_graph_path = os.path.join("processed_testing_graphs", "CEG_test_0.gml")
predicted_graph_path = os.path.join("processed_testing_graphs", "CEG_test_0_with_predictions_gcn.gml")

# Load graphs with networkx.
G_orig = nx.read_gml(original_graph_path)
G_pred = nx.read_gml(predicted_graph_path)

# Identify predicted edges not in the original graph.
original_edges = set(G_orig.edges())
predicted_edges = set(G_pred.edges())
new_predicted_edges = predicted_edges - original_edges

# Create a PyVis network.
net = Network(height="750px", width="100%", bgcolor="white", font_color="black", notebook=True)
net.force_atlas_2based()
net.barnes_hut()

# Add all nodes from the original graph.
for node in G_orig.nodes():
    net.add_node(node, label=str(node))

# Add original edges (blue, solid).
for u, v in original_edges:
    net.add_edge(u, v, color="blue")

# Add new predicted edges (red, dashed).
for u, v in new_predicted_edges:
    net.add_edge(u, v, color="green", dashes=True)

# Save and show the visualization.
net.show("graph_predictions.html")

graph_predictions.html


In [36]:
import networkx as nx

# Load graphs from GML files
original_graph = nx.read_gml('processed_testing_graphs/CEG_test_0.gml')
predicted_graph = nx.read_gml('processed_testing_graphs/CEG_test_0_with_predictions.gml')

# Convert edge lists to sets of sorted tuples
original_edges = set(tuple(sorted(edge)) for edge in original_graph.edges())
predicted_edges = set(tuple(sorted(edge)) for edge in predicted_graph.edges())

# Comparison
true_positives = predicted_edges & original_edges
false_positives = predicted_edges - original_edges
false_negatives = original_edges - predicted_edges

# Accuracy Metrics
precision = len(true_positives) / len(predicted_edges) if predicted_edges else 0
recall = len(true_positives) / len(original_edges) if original_edges else 0
f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0

# Output
print(f"True Positives (TP): {len(true_positives)}")
print(f"False Positives (FP): {len(false_positives)}")
print(f"False Negatives (FN): {len(false_negatives)}\n")

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1_score:.3f}")


True Positives (TP): 430
False Positives (FP): 15
False Negatives (FN): 0

Precision: 0.966
Recall: 1.000
F1 Score: 0.983
