In [90]:
import numpy as np # linear algebra
import pandas as pd

need a graph dataset where each node can be associated with some text, so you can compute an LLM embedding per node.
Citation Graphs (Cora / PubMed / CiteSeer)
Why this works perfectly

Nodes = papers

Text = title + abstract → LLM embedding

Edges = citations

Labels = research area

In [91]:
!pip install torch_geometric



For the cora dataset , the labels are : "Case Based",
    "Genetic Algorithms",
    "Neural Networks",
    "Probabilistic Methods",
    "Reinforcement Learning",
    "Rule Learning",
    "Theory"

### we are gonna restructure the dataset , for some nodes we are gonna re-do the embedding in a way that changes the class and we should add the mask on those we changed 

it was a bit tricky here cause we are going from pyg to csv and seperating in train_nodes, test nodes that will soely focus on stroing teh nodes embedidng (x) and a different seperate file edges.csv to store the truc (A , adj matrix)
the mask , which is needed for training 


for the corruption we preserve the corrupted version is on the same dimension

In [None]:
import os
import pandas as pd
import torch
import numpy as np
from torch_geometric.datasets import Planetoid

# -------------------------
#  We Load Dataset
# -------------------------
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]

num_nodes, num_features = data.x.shape
num_edges = data.edge_index.shape[1]
num_classes = dataset.num_classes

print(f"Dataset: {dataset.name}")
print(f"Nodes: {num_nodes}, Features: {num_features}, Edges: {num_edges}, Classes: {num_classes}")

Dataset: Cora
Nodes: 2708, Features: 1433, Edges: 10556, Classes: 7


In [None]:
# -------------------------
# We Prepare directories
# -------------------------
os.makedirs("data", exist_ok=True)

In [None]:
def corrupt_embeddings_hybrid(X, labels, corruption_rate=0.3, 
                               swap_ratio=0.7, noise_ratio=0.3, 
                               noise_level=0.5, seed=42):
    """
    Hybrid corruption:
    - swap with wrong class 
    - we add noise 
    
    This creates two types of challenges:
    1. Obvious contradictions (swapped embeddings)
    2. Subtle degradation (noisy embeddings)
    """
    np.random.seed(seed)
    n_nodes = len(X)
    n_corrupt = int(n_nodes * corruption_rate)
    
    # Split corrupted nodes into swap vs noise
    n_swap = int(n_corrupt * swap_ratio)
    n_noise = n_corrupt - n_swap
    
    corrupt_indices = np.random.choice(n_nodes, n_corrupt, replace=False)
    swap_indices = corrupt_indices[:n_swap]
    noise_indices = corrupt_indices[n_swap:]
    
    X_corrupted = X.copy()
    is_corrupted = np.zeros(n_nodes, dtype=bool)
    corruption_type = np.zeros(n_nodes, dtype=int)  
    
  
    for idx in swap_indices:
        true_class = labels[idx]
        wrong_classes = [c for c in range(labels.max()+1) if c != true_class]
        wrong_class = np.random.choice(wrong_classes)
        wrong_class_nodes = np.where(labels == wrong_class)[0]
        donor_node = np.random.choice(wrong_class_nodes)
        
        X_corrupted[idx] = X[donor_node]
        is_corrupted[idx] = True
        corruption_type[idx] = 1
    
  
    for idx in noise_indices:
        noise = np.random.normal(0, noise_level, X.shape[1])
        X_corrupted[idx] = X[idx] + noise
        is_corrupted[idx] = True
        corruption_type[idx] = 2
    
    print(f"Corrupted {n_corrupt}/{n_nodes} nodes:")
    print(f"  - {n_swap} swapped with wrong class ")
    print(f"  - {n_noise} with added noise ")
    
    return X_corrupted, is_corrupted, corruption_type

In [None]:
def corrupt_embeddings_graph_adversarial(
    X,
    labels,
    edge_index,
    corruption_rate=0.3,
    noise_ratio=0.3,
    noise_level=0.4,
    min_same_class_neighbors=3,
    seed=42,
):
    """
    Graph-adversarial corruption:
    - Corrupts nodes that are strongly supported by their neighborhood
    - Forces feature embeddings to contradict local neighborhood class
    - Mix of hard swaps and directional noise
    """

    np.random.seed(seed)
    n_nodes = X.shape[0]

    # adjacency list
    neighbors = [[] for _ in range(n_nodes)]
    for src, dst in edge_index.T:
        neighbors[src].append(dst)
        neighbors[dst].append(src)

    # we select nodes with strong homophily
    candidate_nodes = []
    for i in range(n_nodes):
        nbrs = neighbors[i]
        if len(nbrs) == 0:
            continue
        same_class = sum(labels[j] == labels[i] for j in nbrs)
        if same_class >= min_same_class_neighbors:
            candidate_nodes.append(i)

    n_corrupt = int(corruption_rate * n_nodes)
    corrupt_nodes = np.random.choice(
        candidate_nodes,
        min(len(candidate_nodes), n_corrupt),
        replace=False
    )

    n_noise = int(len(corrupt_nodes) * noise_ratio)
    noise_nodes = set(np.random.choice(corrupt_nodes, n_noise, replace=False))
    swap_nodes = set(corrupt_nodes) - noise_nodes

    X_corrupted = X.copy()
    is_corrupted = np.zeros(n_nodes, dtype=int)
    corruption_type = np.zeros(n_nodes, dtype=int)  # 0 clean, 1 swap, 2 noise

    for idx in swap_nodes:
        nbrs = neighbors[idx]
        neighbor_labels = labels[nbrs]
        majority_class = np.bincount(neighbor_labels).argmax()
        wrong_classes = [
            c for c in range(labels.max() + 1)
            if c != majority_class
        ]
        wrong_class = np.random.choice(wrong_classes)

        donor_candidates = np.where(labels == wrong_class)[0]
        donor = np.random.choice(donor_candidates)

        X_corrupted[idx] = X[donor]
        is_corrupted[idx] = 1
        corruption_type[idx] = 1

    for idx in noise_nodes:
        nbrs = neighbors[idx]
        if len(nbrs) == 0:
            continue

        neighborhood_mean = X[nbrs].mean(axis=0)
        direction = X[idx] - neighborhood_mean
        direction = direction / (np.linalg.norm(direction) + 1e-8)

        noise = np.random.normal(0, noise_level, X.shape[1])
        X_corrupted[idx] = X[idx] + noise + 0.5 * direction

        is_corrupted[idx] = 1
        corruption_type[idx] = 2

    print(f"Graph-adversarial corruption applied:")
    print(f"  - Total corrupted: {is_corrupted.sum()} / {n_nodes}")
    print(f"  - Hard swaps: {(corruption_type == 1).sum()}")
    print(f"  - Directional noise: {(corruption_type == 2).sum()}")

    return X_corrupted, is_corrupted


we corrupt only a small section and keep track of them

In [None]:

# -------------------------
# we save Edges CSV
# -------------------------
edge_index = data.edge_index.numpy()  # shape [2, num_edges]
edges_df = pd.DataFrame({
    'source': edge_index[0],
    'target': edge_index[1]
})
edges_df.to_csv("data/edges.csv", index=False)
print("✓ Saved edges.csv")

✓ Saved edges.csv


In [None]:
# -------------------------
# we apply Corruption
# -------------------------
features = data.x.numpy()  # [num_nodes, num_features]
labels = data.y.numpy()    # [num_nodes]

features_corrupted, corruption_mask = corrupt_embeddings_graph_adversarial(
    features, 
    labels, 
    edge_index,
    corruption_rate=0.35,  # 35% corruption
    seed=42
)

Graph-adversarial corruption applied:
  - Total corrupted: 947 / 2708
  - Hard swaps: 663
  - Directional noise: 284


we make our split of test and train

In [None]:

# -------------------------
#  we Create Better Train/Test Split
# -------------------------
np.random.seed(42)

# Use more nodes for training (80% train,  20% test)
n_train = int(num_nodes * 0.8)

indices = np.random.permutation(num_nodes)
train_indices = indices[:n_train]
test_indices = indices[n_train:]

print(f"Split: {len(train_indices)} train,  {len(test_indices)} test")

Split: 2166 train,  542 test


In [99]:
edges_df

Unnamed: 0,source,target
0,633,0
1,1862,0
2,2582,0
3,2,1
4,652,1
...,...,...
10551,2707,2706
10552,165,2707
10553,598,2707
10554,1473,2707


In [None]:
# -------------------------
# save Adjacency Matrix 
# -------------------------
adjacency = torch.sparse_coo_tensor(
    data.edge_index,
    torch.ones(data.edge_index.shape[1]),
    (num_nodes, num_nodes)
).to_dense().numpy()

np.save("data/adjacency_matrix.npy", adjacency)
print("✓ Saved adjacency_matrix.npy")


✓ Saved adjacency_matrix.npy


In [None]:
# -------------------------
# train CSV 
# -------------------------
train_df = pd.DataFrame(features_corrupted[train_indices])
train_df.columns = [f'feature_{i}' for i in range(num_features)]

train_df.insert(0, 'node_id', train_indices)
train_df['label'] = labels[train_indices]
train_df['is_corrupted'] = corruption_mask[train_indices].astype(int)  # 0 or 1

train_df.to_csv("data/train.csv", index=False)
print(f"✓ Saved train.csv ({len(train_indices)} nodes)")
print(f"  - Corrupted nodes in train: {corruption_mask[train_indices].sum()}")

✓ Saved train.csv (2166 nodes)
  - Corrupted nodes in train: 744


In [102]:
train_df

Unnamed: 0,node_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_1425,feature_1426,feature_1427,feature_1428,feature_1429,feature_1430,feature_1431,feature_1432,label,is_corrupted
0,1044,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3,0
1,439,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,0
2,1729,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2,1
3,296,-0.411609,0.346647,-0.387879,0.775460,0.806573,0.466978,-0.535278,-0.238821,0.239802,...,0.209798,-0.017429,-0.252375,0.585766,-0.131424,-0.168780,0.140918,0.016024,2,1
4,2211,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2161,1852,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3,0
2162,2374,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,5,0
2163,2105,-0.236034,0.760438,-0.205407,0.654937,0.779884,-0.350357,0.295512,0.189864,0.316862,...,0.390506,0.643409,-0.658021,-0.218903,0.235587,-0.173462,0.327836,0.253711,5,1
2164,133,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1,0


In [None]:

# -------------------------
#  Test CSV 
# -------------------------
test_df = pd.DataFrame(features_corrupted[test_indices])
test_df.columns = [f'feature_{i}' for i in range(num_features)]

test_df.insert(0, 'node_id', test_indices)


test_df.to_csv("data/test.csv", index=False)
print(f"✓ Saved test.csv ({len(test_indices)} nodes)")
print(f"  - Corrupted nodes in test: {corruption_mask[test_indices].sum()} (hidden from participants)")

✓ Saved test.csv (542 nodes)
  - Corrupted nodes in test: 203 (hidden from participants)


In [104]:
test_df

Unnamed: 0,node_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_1423,feature_1424,feature_1425,feature_1426,feature_1427,feature_1428,feature_1429,feature_1430,feature_1431,feature_1432
0,933,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,928,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,823,0.207136,0.587077,-0.165267,0.470813,-0.184076,0.163233,-0.477349,0.19668,-0.005473,...,0.339342,-0.827481,0.194771,0.355205,0.280877,-0.156461,0.139473,-0.311867,-0.314919,-0.280396
3,1531,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,1214,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
537,1638,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
538,1095,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
539,1130,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
540,1294,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
# -------------------------
# Save Ground Truth for Evaluation 
# -------------------------
test_ground_truth = pd.DataFrame({
    'node_id': test_indices,
    'label': labels[test_indices],
    'is_corrupted': corruption_mask[test_indices].astype(int)
})
test_ground_truth.to_csv("data/test_ground_truth.csv", index=False)
print("✓ Saved test_ground_truth.csv (KEEP PRIVATE for evaluation)")

✓ Saved test_ground_truth.csv (KEEP PRIVATE for evaluation)


In [106]:
test_ground_truth

Unnamed: 0,node_id,label,is_corrupted
0,933,2,0
1,928,3,0
2,823,2,1
3,1531,4,0
4,1214,2,1
...,...,...,...
537,1638,1,0
538,1095,3,0
539,1130,3,0
540,1294,3,0


In [None]:


# -------------------------
#  save Metadata
# -------------------------
metadata = {
    'num_nodes': num_nodes,
    'num_features': num_features,
    'num_edges': num_edges,
    'num_classes': num_classes,
    'corruption_rate': 0.35,
    'train_size': len(train_indices),
    'test_size': len(test_indices),
}

pd.DataFrame([metadata]).to_csv("data/metadata.csv", index=False)
print("✓ Saved metadata.csv")

print("\n" + "="*50)
print("Dataset preparation complete!")
print("="*50)
print("\nFiles created:")
print("  - edges.csv (graph structure)")
print("  - adjacency_matrix.npy (alternative format)")
print("  - train.csv (with labels + corruption mask)")
print("  - test.csv (only features, no labels/mask)")
print("  - test_ground_truth.csv (PRIVATE - for your evaluation)")
print("  - metadata.csv (dataset statistics)")




✓ Saved metadata.csv

Dataset preparation complete!

Files created:
  - edges.csv (graph structure)
  - adjacency_matrix.npy (alternative format)
  - train.csv (with labels + corruption mask)
  - test.csv (only features, no labels/mask)
  - test_ground_truth.csv (PRIVATE - for your evaluation)
  - metadata.csv (dataset statistics)
