In [None]:
import os
import time
import random
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.nn import SAGEConv

from sklearn.model_selection import StratifiedShuffleSplit


In [None]:
# Identifiers written into every results row and used to build the CSV filename.
DATASET_NAME = "Cora"
MODEL_NAME = "GraphSAGE"

# Experiment grid: every seed is combined with every label rate in the main loop.
SEEDS = [0, 1, 2, 3, 4]
LABEL_RATES = [0.01, 0.03, 0.05, 0.10]

# NOISE_LEVELS scales the noise added by the "Gaussian" feature ablation;
# EDGE_DROP_RATE is the fraction of edge columns removed by the "EdgeDrop" ablation.
NOISE_LEVELS = [0.1, 0.3, 0.5]
EDGE_DROP_RATE = 0.2

# Training budget: hard epoch cap, and the number of consecutive epochs without
# a validation improvement after which training stops early.
MAX_EPOCHS = 300
PATIENCE = 50

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Column order of the results table; each row appended in the experiment loop
# must list its values in exactly this order.
CSV_SCHEMA = [
    "dataset", "model", "seed", "label_rate",
    "ablation", "noise_level",
    "accuracy", "best_epoch", "train_time"
]


In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's global generator, and PyTorch (CPU and
    all CUDA devices), then switches cuDNN to deterministic mode with
    autotuning disabled.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


In [None]:
# Load the Cora Planetoid dataset (files are stored under ./data; the cell
# output shows them being downloaded on first use). The NormalizeFeatures
# transform is applied whenever a graph is read from the dataset.
dataset = Planetoid(
    root="./data",
    name="Cora",
    transform=NormalizeFeatures()
)

# The first (index 0) graph object of the dataset; later cells read its
# x / edge_index / y attributes.
data = dataset[0]


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [None]:
def create_label_splits(data, label_rate, seed):
    """Build stratified train/val/test boolean node masks.

    20% of the nodes are held out as the test set. From the remaining pool,
    ``label_rate`` is passed as ``train_size`` to a second stratified split to
    choose the labeled training nodes; every other pool node becomes validation.

    Args:
        data: Graph object exposing ``y`` (node labels) and ``num_nodes``.
        label_rate: Value forwarded as ``train_size`` of the second split.
        seed: Seed for the global RNGs and for both splitters.

    Returns:
        Dict with keys ``"train"``, ``"val"``, ``"test"`` mapping to boolean
        tensors of length ``data.num_nodes``.
    """
    set_seed(seed)

    labels = data.y.cpu().numpy()
    all_nodes = np.arange(len(labels))

    # Stage 1: carve out the test nodes.
    holdout_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=0.2, random_state=seed
    )
    pool_idx, test_idx = next(holdout_splitter.split(all_nodes, labels))

    # Stage 2: pick the labeled nodes inside the pool. The splitter returns
    # positions within the pool, so map them back to node ids.
    label_splitter = StratifiedShuffleSplit(
        n_splits=1, train_size=label_rate, random_state=seed
    )
    picked_positions, _ = next(label_splitter.split(pool_idx, labels[pool_idx]))
    train_idx = pool_idx[picked_positions]

    # Everything in the pool that was not picked for training is validation.
    val_idx = np.setdiff1d(pool_idx, train_idx)

    def _as_mask(node_ids):
        flags = torch.zeros(data.num_nodes, dtype=torch.bool)
        flags[node_ids] = True
        return flags

    return {
        "train": _as_mask(train_idx),
        "val": _as_mask(val_idx),
        "test": _as_mask(test_idx),
    }


In [None]:
def apply_feature_ablation(x, ablation, noise_level, seed):
    """Return the node-feature matrix for the requested ablation.

    The global RNGs are re-seeded first, so the random ablations are
    repeatable for a given ``seed``.

    Args:
        x: Node-feature tensor; rows are indexed along dimension 0.
        ablation: One of ``"Vanilla"``, ``"EdgeDrop"`` (features returned
            unchanged), ``"Identity"`` (identity matrix with one row per row
            of ``x``), ``"Shuffled"`` (rows of ``x`` randomly permuted) or
            ``"Gaussian"`` (standard-normal noise scaled by ``noise_level``
            added to ``x``).
        noise_level: Multiplier for the Gaussian noise; only read for
            ``"Gaussian"``.
        seed: Seed passed to ``set_seed``.

    Returns:
        The (possibly modified) feature tensor.

    Raises:
        ValueError: If ``ablation`` is not one of the names above.
    """
    set_seed(seed)

    if ablation == "Vanilla" or ablation == "EdgeDrop":
        # Structure-only / baseline runs leave the features untouched.
        features = x
    elif ablation == "Identity":
        features = torch.eye(x.size(0), device=x.device)
    elif ablation == "Shuffled":
        row_order = torch.randperm(x.size(0))
        features = x[row_order]
    elif ablation == "Gaussian":
        features = x + torch.randn_like(x) * noise_level
    else:
        raise ValueError(f"Unknown feature ablation: {ablation}")

    return features


In [None]:
def apply_structure_ablation(edge_index, ablation, seed):
    """Return the edge index for the requested ablation.

    Only ``"EdgeDrop"`` changes the graph: a random subset holding
    ``1 - EDGE_DROP_RATE`` of the edge columns is kept (in permuted order).
    Any other ablation name returns ``edge_index`` unchanged. The global RNGs
    are re-seeded in every case.

    Args:
        edge_index: Tensor whose dimension 1 enumerates edges.
        ablation: Ablation name.
        seed: Seed passed to ``set_seed``.

    Returns:
        The original or sub-sampled edge index.
    """
    set_seed(seed)

    if ablation == "EdgeDrop":
        total_edges = edge_index.size(1)
        num_kept = int((1 - EDGE_DROP_RATE) * total_edges)
        # Columns are sampled independently, so each direction of an edge
        # stored twice is kept or dropped on its own.
        kept_columns = torch.randperm(total_edges)[:num_kept]
        return edge_index[:, kept_columns]

    return edge_index


In [None]:
class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE network: SAGEConv -> ReLU -> SAGEConv.

    The output of the second layer is returned as-is (no softmax), so it can
    be fed directly to ``F.cross_entropy``.
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        """Create the two convolution layers.

        Args:
            in_dim: Size of each input node-feature vector.
            hidden_dim: Width of the hidden representation.
            out_dim: Size of each output vector.
        """
        super().__init__()
        self.conv1 = SAGEConv(in_dim, hidden_dim)
        self.conv2 = SAGEConv(hidden_dim, out_dim)

    def forward(self, x, edge_index):
        """Run both layers and return the second layer's raw output."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)


In [None]:
def train_and_eval(data, masks, seed):
    """Train a GraphSAGE model with early stopping and report test accuracy.

    Changes relative to the previous implementation:
      * Validation accuracy is computed from a fresh forward pass after the
        optimizer step, instead of reusing the pre-step training logits.
      * The weights from the best validation epoch are snapshotted and restored
        before testing, so the returned accuracy matches ``best_epoch``.
      * The number of output classes is derived from ``data.y`` rather than a
        global ``dataset`` object, which can be stale when ``data`` was
        replaced (e.g. by a synthetic graph).
      * The model and the masks follow the device of ``data.x``.

    Args:
        data: Graph object exposing ``x``, ``edge_index`` and ``y``.
        masks: Dict with boolean node masks under ``"train"``, ``"val"`` and
            ``"test"``.
        seed: Seed passed to ``set_seed`` before the model is created.

    Returns:
        Tuple ``(test_acc, best_epoch, train_time)``: test accuracy of the
        best-validation model, the 1-based epoch at which validation accuracy
        peaked (0 if it never exceeded 0), and the wall-clock seconds spent in
        the training loop (including the per-epoch validation pass).
    """
    set_seed(seed)

    device = data.x.device
    train_mask = masks["train"].to(device)
    val_mask = masks["val"].to(device)
    test_mask = masks["test"].to(device)

    # Labels are class indices, so the largest label + 1 is the class count.
    num_classes = int(data.y.max().item()) + 1

    model = GraphSAGE(
        in_dim=data.x.size(1),
        hidden_dim=128,
        out_dim=num_classes
    ).to(device)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=0.01,
        weight_decay=5e-4
    )

    best_val = 0
    best_epoch = 0
    best_state = None
    patience_counter = 0

    start_time = time.time()

    for epoch in range(1, MAX_EPOCHS + 1):
        model.train()
        optimizer.zero_grad()

        out = model(data.x, data.edge_index)
        loss = F.cross_entropy(out[train_mask], data.y[train_mask])
        loss.backward()
        optimizer.step()

        # Evaluate the weights produced by this epoch's update.
        model.eval()
        with torch.no_grad():
            val_out = model(data.x, data.edge_index)
            val_acc = (
                val_out[val_mask].argmax(dim=1)
                == data.y[val_mask]
            ).float().mean().item()

        if val_acc > best_val:
            best_val = val_acc
            best_epoch = epoch
            patience_counter = 0
            # Clone so later optimizer steps do not overwrite the snapshot.
            best_state = {
                key: value.detach().clone()
                for key, value in model.state_dict().items()
            }
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            break

    train_time = time.time() - start_time

    # Test the model selected by validation; if validation never improved,
    # the last-epoch weights are used.
    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        test_acc = (
            model(data.x, data.edge_index)[test_mask]
            .argmax(dim=1)
            == data.y[test_mask]
        ).float().mean().item()

    return test_acc, best_epoch, train_time


In [None]:
# Accumulates one row per run; each row follows the CSV_SCHEMA column order.
results = []

# Full experiment grid: seed x label rate x ablation (x noise level for Gaussian).
for seed in SEEDS:
    for label_rate in LABEL_RATES:
        # The split depends only on (seed, label_rate), so it is shared by
        # every ablation below.
        masks = create_label_splits(data, label_rate, seed)

        for ablation in ["Vanilla", "Identity", "Shuffled", "Gaussian", "EdgeDrop"]:
            # Only the Gaussian ablation sweeps noise levels; the others run
            # once with a None placeholder that is also written to the row.
            noise_levels = NOISE_LEVELS if ablation == "Gaussian" else [None]

            for noise in noise_levels:
                x = apply_feature_ablation(data.x, ablation, noise, seed)
                edge_index = apply_structure_ablation(data.edge_index, ablation, seed)

                # Work on a copy so the ablated features/edges never leak
                # back into the shared `data` object.
                run_data = data.clone()
                run_data.x = x
                run_data.edge_index = edge_index
                run_data = run_data.to(DEVICE)

                acc, best_epoch, train_time = train_and_eval(
                    run_data, masks, seed
                )

                results.append([
                    DATASET_NAME, MODEL_NAME, seed, label_rate,
                    ablation, noise,
                    acc, best_epoch, train_time
                ])


In [None]:
# Turn the accumulated rows into a table with the CSV_SCHEMA column names.
df = pd.DataFrame(results, columns=CSV_SCHEMA)

# The filename is built from the *current* values of MODEL_NAME and
# DATASET_NAME, so this cell must run before those globals are reassigned
# for another dataset.
# NOTE(review): the saved cell output names a SyntheticHeterophilous file,
# which suggests this cell was last run out of order - confirm.
os.makedirs("results", exist_ok=True)
csv_path = f"results/{MODEL_NAME}_{DATASET_NAME}.csv"

df.to_csv(csv_path, index=False)
print(f"Saved results to {csv_path}")


Saved results to results/GraphSAGE_SyntheticHeterophilous.csv


In [None]:
# Re-point the run identifiers at CiteSeer; these globals label the result
# rows and the output CSV filename in the experiment and save cells.
DATASET_NAME = "CiteSeer"
MODEL_NAME = "GraphSAGE"


In [None]:
# Load the CiteSeer Planetoid dataset into ./data, replacing the previously
# bound `dataset` global. The NormalizeFeatures transform is applied whenever
# a graph is read from the dataset.
dataset = Planetoid(
    root="./data",
    name="CiteSeer",
    transform=NormalizeFeatures()
)

# Rebind `data` to the first (index 0) graph object of the new dataset.
data = dataset[0]


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


In [None]:
from torch_geometric.data import Data

def generate_synthetic_heterophilous_graph(
    num_nodes=2000,
    num_classes=5,
    feature_dim=128,
    p_in=0.01,
    p_out=0.05,
    seed=0
):
    """Generate a random graph whose edges mostly connect different classes.

    Each node gets a uniformly random class label and standard-normal
    features; class 0 features are shifted by +0.2 and class 1 features by
    -0.2. Every unordered node pair is connected with probability ``p_in``
    when the two labels match and ``p_out`` otherwise, and each sampled edge
    is stored in both directions.

    Args:
        num_nodes: Number of nodes.
        num_classes: Labels are drawn from ``0 .. num_classes - 1``.
        feature_dim: Length of each node-feature vector.
        p_in: Edge probability for same-class pairs.
        p_out: Edge probability for different-class pairs.
        seed: Seed passed to ``set_seed``.

    Returns:
        A ``Data`` object with ``x`` of shape ``[num_nodes, feature_dim]``,
        ``edge_index`` of shape ``[2, num_edges]`` (also when no edge was
        sampled) and ``y`` of shape ``[num_nodes]``.
    """
    set_seed(seed)

    y = torch.randint(0, num_classes, (num_nodes,))

    # NOTE(review): these class means are drawn but never applied to `x`, so
    # only classes 0 and 1 carry any feature signal - confirm whether that is
    # intended. The draw is kept so the random stream, and therefore `x` and
    # the sampled edges, stay identical to earlier runs.
    class_means = torch.randn(num_classes, feature_dim)
    x = torch.randn(num_nodes, feature_dim)
    x[y == 0] += 0.2
    x[y == 1] -= 0.2

    # Compare plain Python ints in the pair loop instead of building a tensor
    # for every comparison.
    labels = y.tolist()
    edge_list = []

    for i in range(num_nodes):
        label_i = labels[i]
        for j in range(i + 1, num_nodes):
            threshold = p_in if label_i == labels[j] else p_out
            if torch.rand(1).item() < threshold:
                # Store both directions so the graph is undirected.
                edge_list.append([i, j])
                edge_list.append([j, i])

    # The explicit row count keeps the result 2-D ([2, 0]) even when no edge
    # was sampled; a bare torch.tensor([]) would be 1-D.
    edge_index = (
        torch.tensor(edge_list, dtype=torch.long)
        .reshape(len(edge_list), 2)
        .t()
        .contiguous()
    )

    data = Data(
        x=x,
        edge_index=edge_index,
        y=y
    )

    return data

In [None]:
def load_synthetic(seed):
    """Build the synthetic heterophilous graph with default sizes.

    Args:
        seed: Seed forwarded to ``generate_synthetic_heterophilous_graph``.

    Returns:
        The generated graph object.
    """
    synthetic_graph = generate_synthetic_heterophilous_graph(seed=seed)
    return synthetic_graph

In [None]:
# Re-point the run identifiers at the synthetic graph; these globals label
# the result rows and the output CSV filename in the experiment and save cells.
DATASET_NAME = "SyntheticHeterophilous"
MODEL_NAME = "GraphSAGE"


In [None]:
# Replace the Planetoid graph in `data` with the synthetic one.
# Only `data` is rebound here: the global `dataset` still refers to the
# previously loaded Planetoid dataset, so any code that reads `dataset`
# will see information that does not describe this graph.
data = load_synthetic(seed=0)


In [None]:
def create_label_splits(data, label_rate, seed):
    """Build stratified train/val/test boolean node masks.

    This definition replaces the earlier ``create_label_splits`` in the
    notebook once its cell has run. It converts ``label_rate`` to an absolute
    number of labeled nodes before splitting.

    20% of the nodes are held out as the test set. From the remaining pool,
    ``int(label_rate * pool_size)`` nodes are chosen (stratified by class) as
    the training set, but never fewer than the number of classes present in
    the pool, because ``StratifiedShuffleSplit`` rejects an integer
    ``train_size`` smaller than the class count. All other pool nodes become
    the validation set.

    Args:
        data: Graph object exposing ``y`` (node labels) and ``num_nodes``.
        label_rate: Fraction of the non-test pool to label for training.
        seed: Seed for the global RNGs and for both splitters.

    Returns:
        Dict with keys ``"train"``, ``"val"``, ``"test"`` mapping to boolean
        tensors of length ``data.num_nodes``.
    """
    set_seed(seed)

    y = data.y.cpu().numpy()
    idx = np.arange(len(y))

    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=seed)
    train_val_idx, test_idx = next(sss1.split(idx, y))

    pool_labels = y[train_val_idx]

    # A floor of 1 is not enough: the stratified splitter needs at least one
    # training sample per class, so clamp to the class count instead.
    num_classes_in_pool = len(np.unique(pool_labels))
    num_labeled = max(num_classes_in_pool, int(label_rate * len(train_val_idx)))

    sss2 = StratifiedShuffleSplit(
        n_splits=1,
        train_size=num_labeled,
        random_state=seed
    )
    # The splitter returns positions within the pool; map back to node ids.
    train_pos, _ = next(sss2.split(train_val_idx, pool_labels))

    train_idx = train_val_idx[train_pos]
    val_idx = np.setdiff1d(train_val_idx, train_idx)

    masks = {}
    for name, indices in zip(["train", "val", "test"],
                             [train_idx, val_idx, test_idx]):
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[indices] = True
        masks[name] = mask

    return masks
