In [None]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [None]:
import os
import time
import csv
import random
import numpy as np

import torch
import torch.nn.functional as F
from torch import nn

from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures
from torch_geometric.utils import to_dense_adj

from sklearn.linear_model import LogisticRegression
from sklearn.semi_supervised import LabelPropagation
from sklearn.model_selection import StratifiedShuffleSplit


In [None]:
from torch_geometric.data import Data

def generate_synthetic_heterophilous_graph(
    num_nodes=2000,
    num_classes=5,
    feature_dim=128,
    p_in=0.01,
    p_out=0.05,
    seed=0
):
    """Generate a random undirected graph with class-dependent edge probabilities.

    Every unordered node pair is connected independently with probability
    ``p_in`` when both nodes share a label and ``p_out`` otherwise. With the
    defaults (``p_out > p_in``) most edges join different classes.

    Args:
        num_nodes: Number of nodes.
        num_classes: Labels are drawn uniformly from ``[0, num_classes)``.
        feature_dim: Width of the Gaussian feature vectors.
        p_in: Edge probability for same-label pairs.
        p_out: Edge probability for different-label pairs.
        seed: Passed to ``set_seed`` before any sampling.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_nodes, feature_dim)``,
        ``y`` of shape ``(num_nodes,)`` and ``edge_index`` of shape ``(2, E)``
        holding both directions of every sampled edge.
    """
    set_seed(seed)

    # ---- labels ----
    y = torch.randint(0, num_classes, (num_nodes,))

    # ---- features (weakly correlated with labels) ----
    # This draw is not used for the features, but it advances the global RNG;
    # removing it would change `x` and every sampled edge for a given seed.
    torch.randn(num_classes, feature_dim)
    x = torch.randn(num_nodes, feature_dim)
    # Only classes 0 and 1 receive a mean shift; the other classes stay centred.
    x[y == 0] += 0.2
    x[y == 1] -= 0.2

    # ---- edges (heterophily: more inter-class edges) ----
    # Compare plain Python ints instead of 0-d tensors inside the O(n^2) loop.
    labels = y.tolist()
    edge_list = []

    for i in range(num_nodes):
        label_i = labels[i]
        for j in range(i + 1, num_nodes):
            edge_prob = p_in if labels[j] == label_i else p_out
            # Exactly one uniform draw per pair, so the RNG stream is stable.
            if torch.rand(1).item() < edge_prob:
                edge_list.append([i, j])
                edge_list.append([j, i])

    # Explicit (E, 2) reshape so a graph with no edges still yields a (2, 0)
    # edge_index rather than a 1-D empty tensor.
    edge_index = (
        torch.tensor(edge_list, dtype=torch.long)
        .reshape(len(edge_list), 2)
        .t()
        .contiguous()
    )

    data = Data(
        x=x,
        edge_index=edge_index,
        y=y
    )

    return data

In [None]:
def set_seed(seed: int):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also switches cuDNN to deterministic mode and disables its benchmark
    autotuner.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


In [None]:
def load_synthetic(seed):
    """Return a synthetic graph built with ``seed`` and the generator's default sizes."""
    graph = generate_synthetic_heterophilous_graph(seed=seed)
    return graph


In [None]:
def create_label_splits(data, label_rate, seed):
    """Build stratified train / val / test boolean masks over the nodes of ``data``.

    A stratified 20% of the nodes becomes the test set. From the remaining
    pool, a second stratified split picks the labeled training nodes, with
    ``label_rate`` passed as ``train_size`` (so it is relative to the pool,
    not to the whole graph). Pool nodes that were not picked form the
    validation set.

    Args:
        data: Graph object exposing ``y`` (a tensor of labels) and ``num_nodes``.
        label_rate: Forwarded as ``train_size`` of the second split.
        seed: Seeds the global RNGs and both splitters.

    Returns:
        Dict with keys ``"train"``, ``"val"`` and ``"test"``, each a NumPy
        boolean array of length ``data.num_nodes``.
    """
    set_seed(seed)

    labels = data.y.cpu().numpy()
    n_nodes = data.num_nodes
    all_idx = np.arange(n_nodes)

    # Stage 1: hold out the test nodes.
    holdout_splitter = StratifiedShuffleSplit(
        n_splits=1, test_size=0.2, random_state=seed
    )
    pool_idx, test_idx = next(holdout_splitter.split(all_idx, labels))

    # Stage 2: choose the labeled nodes inside the non-test pool. The splitter
    # returns positions into `pool_idx`, which are mapped back to node ids.
    label_splitter = StratifiedShuffleSplit(
        n_splits=1, train_size=label_rate, random_state=seed
    )
    picked_positions, _ = next(label_splitter.split(pool_idx, labels[pool_idx]))
    labeled_idx = pool_idx[picked_positions]

    val_idx = np.setdiff1d(pool_idx, labeled_idx)

    def to_mask(node_ids):
        flags = np.zeros(n_nodes, dtype=bool)
        flags[node_ids] = True
        return flags

    split_members = (
        ("train", labeled_idx),
        ("val", val_idx),
        ("test", test_idx),
    )
    return {split_name: to_mask(node_ids) for split_name, node_ids in split_members}


In [None]:
def apply_feature_ablation(x, ablation, seed, noise_level=None):
    """Return a variant of the feature matrix ``x`` for the requested ablation.

    Args:
        x: Feature tensor with one row per node.
        ablation: One of ``"Vanilla"`` (``x`` itself, not a copy),
            ``"Identity"`` (an identity matrix with one row per node),
            ``"Shuffled"`` (rows of ``x`` in a random order) or
            ``"Gaussian"`` (``x`` plus scaled standard-normal noise).
        seed: Passed to ``set_seed`` before any random draw.
        noise_level: Scale of the Gaussian noise; required for ``"Gaussian"``
            and ignored otherwise.

    Returns:
        The ablated feature tensor.

    Raises:
        ValueError: If ``ablation`` is not recognised, or if ``"Gaussian"`` is
            requested without a ``noise_level``.
    """
    set_seed(seed)

    if ablation == "Vanilla":
        return x

    if ablation == "Identity":
        # Match x's dtype and device so the result can stand in for x directly.
        return torch.eye(x.size(0), dtype=x.dtype, device=x.device)

    if ablation == "Shuffled":
        perm = torch.randperm(x.size(0))
        return x[perm]

    if ablation == "Gaussian":
        # Fail with a clear message instead of a TypeError from `tensor * None`.
        if noise_level is None:
            raise ValueError("Gaussian ablation requires a noise_level")
        noise = torch.randn_like(x) * noise_level
        return x + noise

    raise ValueError("Unknown feature ablation")


In [None]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    The forward pass returns the second layer's output as-is (no softmax).
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        # Attribute names double as state_dict key prefixes, so they stay fixed.
        self.fc1 = nn.Linear(in_features=in_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=out_dim)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return logits


In [None]:
def run_mlp(x, y, masks, seed):
    """Train an MLP on the labeled nodes with early stopping and report test accuracy.

    The model is trained full-batch with Adam (lr 0.01) for at most 1000
    epochs. Training stops once validation accuracy has failed to improve for
    100 consecutive epochs. The weights from the best validation epoch are
    restored before the test set is scored, so the returned accuracy and
    ``best_epoch`` describe the same model.

    Args:
        x: Feature tensor, one row per node.
        y: Integer label tensor; the class count is taken as ``y.max() + 1``.
        masks: Dict with ``"train"``, ``"val"`` and ``"test"`` boolean masks.
        seed: Passed to ``set_seed`` before the model is created.

    Returns:
        Tuple ``(test_acc, best_epoch, train_time)`` where ``train_time`` is
        the wall-clock duration of the training loop in seconds.
    """
    set_seed(seed)

    model = MLP(x.size(1), 64, y.max().item() + 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val, best_epoch = 0, 0
    best_state = None
    patience, wait = 100, 0
    start = time.time()

    for epoch in range(1, 1001):
        model.train()
        optimizer.zero_grad()
        out = model(x)
        loss = F.cross_entropy(
            out[masks["train"]], y[masks["train"]]
        )
        loss.backward()
        optimizer.step()

        # Score validation with a fresh forward pass so it reflects the weights
        # after this epoch's update, not the pre-step training output.
        model.eval()
        with torch.no_grad():
            val_out = model(x)
            val_acc = (
                val_out[masks["val"]].argmax(1) == y[masks["val"]]
            ).float().mean().item()

        if val_acc > best_val:
            best_val, best_epoch = val_acc, epoch
            # Snapshot the weights; state_dict() alone returns live references.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                break

    train_time = time.time() - start

    # best_state stays None only if validation accuracy never rose above 0.
    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        test_acc = (
            model(x)[masks["test"]].argmax(1) == y[masks["test"]]
        ).float().mean().item()

    return test_acc, best_epoch, train_time


In [None]:
def run_logreg(x, y, masks, seed):
    """Fit logistic regression on the labeled nodes and score it on the test nodes.

    Args:
        x: Feature tensor, one row per node.
        y: Label tensor.
        masks: Dict with ``"train"`` and ``"test"`` boolean masks.
        seed: Passed to ``set_seed`` before fitting.

    Returns:
        Tuple ``(test_acc, "NA", elapsed_seconds)``; the middle slot is the
        placeholder used where other runners report a best epoch.
    """
    set_seed(seed)
    start = time.time()

    # detach()/cpu() make the conversion work for CUDA or grad-tracking tensors.
    features = x.detach().cpu().numpy()
    labels = y.detach().cpu().numpy()

    # `multi_class` is left at its default: "auto" was the default value, and
    # passing the argument explicitly is deprecated in recent scikit-learn.
    clf = LogisticRegression(max_iter=1000)

    clf.fit(features[masks["train"]], labels[masks["train"]])

    test_acc = clf.score(features[masks["test"]], labels[masks["test"]])

    return test_acc, "NA", time.time() - start


In [None]:
def run_label_prop(data, masks):
    """Run scikit-learn LabelPropagation on the dense adjacency matrix.

    Labels outside the training mask are replaced by -1 (unlabeled) before
    fitting; accuracy is measured on the test mask against the true labels.

    Args:
        data: Graph object exposing ``edge_index``, ``num_nodes`` and ``y``.
        masks: Dict with ``"train"`` and ``"test"`` NumPy boolean masks.

    Returns:
        Tuple ``(test_acc, "NA", elapsed_seconds)``.
    """
    start = time.time()

    true_labels = data.y.numpy()
    adjacency = to_dense_adj(
        data.edge_index, max_num_nodes=data.num_nodes
    )[0].numpy()

    # Work on a copy so the graph's own label tensor is not overwritten.
    observed = true_labels.copy()
    observed[~masks["train"]] = -1

    # NOTE(review): the adjacency rows are handed to fit() as the feature
    # matrix, so the knn kernel presumably builds its own neighbour graph from
    # row similarity rather than propagating along edge_index -- confirm this
    # is the intended "structure-only" baseline.
    propagator = LabelPropagation(kernel="knn", n_neighbors=10)
    propagator.fit(adjacency, observed)

    test_mask = masks["test"]
    hits = propagator.transduction_[test_mask] == true_labels[test_mask]
    test_acc = hits.mean()

    return test_acc, "NA", time.time() - start


In [None]:
def run_non_gnn_baselines(output_csv):
    """Run the non-GNN baselines on Cora and write one CSV row per run.

    For every seed and label rate, an MLP and a logistic regression are
    evaluated under each feature ablation, and label propagation is evaluated
    once on the graph structure. ``output_csv`` is overwritten.

    Args:
        output_csv: Path of the CSV file to create.
    """
    dataset_name = "Cora"

    # The graph does not depend on the seed, so it is loaded once up front.
    # (No seed exists at this point; it is only bound by the loop below.)
    data = Planetoid(
        root="./data",
        name=dataset_name,
        transform=NormalizeFeatures()
    )[0]
    y = data.y
    x_base = data.x

    seeds = [0, 1, 2, 3, 4]
    label_rates = [0.01, 0.03, 0.05, 0.10]

    # (ablation name, Gaussian noise level or None)
    ablations = [
        ("Vanilla", None),
        ("Identity", None),
        ("Shuffled", None),
        ("Gaussian", 0.1),
        ("Gaussian", 0.3),
        ("Gaussian", 0.5)
    ]

    with open(output_csv, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "dataset", "model", "seed", "label_rate",
            "ablation", "noise_level",
            "accuracy", "best_epoch", "train_time"
        ])

        for seed in seeds:
            for lr in label_rates:
                masks = create_label_splits(data, lr, seed)

                for ab, noise in ablations:
                    x = apply_feature_ablation(x_base, ab, seed, noise)
                    # Ablations without a noise level are recorded as "NA".
                    noise_cell = noise or "NA"

                    acc, ep, t = run_mlp(x, y, masks, seed)
                    writer.writerow([dataset_name, "MLP", seed, lr, ab, noise_cell, acc, ep, t])

                    acc, ep, t = run_logreg(x, y, masks, seed)
                    writer.writerow([dataset_name, "LogReg", seed, lr, ab, noise_cell, acc, ep, t])

                # Label propagation ignores node features, so it runs once per
                # (seed, label rate) rather than once per ablation.
                acc, ep, t = run_label_prop(data, masks)
                writer.writerow([dataset_name, "LabelProp", seed, lr, "Structure", "NA", acc, ep, t])


In [None]:
# Run the baseline sweep; results are written to nongnn_cora_results.csv.
run_non_gnn_baselines("nongnn_cora_results.csv")




In [None]:
# List the contents of the working directory.
!ls



data  nongnn_cora_results.csv  sample_data


In [None]:
def load_citeseer():
    """Return the CiteSeer Planetoid graph with the NormalizeFeatures transform applied.

    The dataset is stored under ``./data``.
    """
    citeseer = Planetoid(
        root="./data", name="CiteSeer", transform=NormalizeFeatures()
    )
    # Only the first (index 0) graph of the dataset is used.
    graph = citeseer[0]
    return graph


In [None]:
def run_non_gnn_baselines(output_csv):
    """Run the non-GNN baselines on CiteSeer and write one CSV row per run.

    For every seed and label rate, an MLP and a logistic regression are
    evaluated under each feature ablation, and label propagation is evaluated
    once on the graph structure. ``output_csv`` is overwritten.

    Args:
        output_csv: Path of the CSV file to create.
    """
    graph = load_citeseer()
    labels, base_features = graph.y, graph.x

    seed_values = [0, 1, 2, 3, 4]
    rates = [0.01, 0.03, 0.05, 0.10]

    # (ablation name, Gaussian noise level or None)
    feature_ablations = [
        ("Vanilla", None),
        ("Identity", None),
        ("Shuffled", None),
        ("Gaussian", 0.1),
        ("Gaussian", 0.3),
        ("Gaussian", 0.5)
    ]

    header = [
        "dataset", "model", "seed", "label_rate",
        "ablation", "noise_level",
        "accuracy", "best_epoch", "train_time"
    ]

    # Feature-based models, in the order their rows are written.
    feature_models = (("MLP", run_mlp), ("LogReg", run_logreg))

    with open(output_csv, "w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(header)

        for seed in seed_values:
            for rate in rates:
                masks = create_label_splits(graph, rate, seed)

                for ablation_name, noise in feature_ablations:
                    features = apply_feature_ablation(
                        base_features, ablation_name, seed, noise
                    )
                    # Ablations without a noise level are recorded as "NA".
                    noise_cell = noise or "NA"

                    for model_name, runner in feature_models:
                        accuracy, best_epoch, elapsed = runner(
                            features, labels, masks, seed
                        )
                        out.writerow([
                            "Citeseer", model_name, seed, rate,
                            ablation_name, noise_cell,
                            accuracy, best_epoch, elapsed
                        ])

                # Label propagation does not take the ablated features, so it
                # runs once per (seed, label rate).
                accuracy, best_epoch, elapsed = run_label_prop(graph, masks)
                out.writerow([
                    "Citeseer", "LabelProp", seed, rate,
                    "Structure", "NA",
                    accuracy, best_epoch, elapsed
                ])


In [None]:
# Run the CiteSeer baseline sweep; results are written to nongnn_citeseer_results.csv.
run_non_gnn_baselines("nongnn_citeseer_results.csv")


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


In [None]:
def run_non_gnn_baselines(output_csv):
    """Run the non-GNN baselines on synthetic graphs and write one CSV row per run.

    A fresh synthetic graph is generated for each seed. For every label rate,
    an MLP and a logistic regression are evaluated under each feature
    ablation, and label propagation is evaluated once on the graph structure.
    ``output_csv`` is overwritten.

    Args:
        output_csv: Path of the CSV file to create.
    """
    dataset_tag = "Synthetic"
    run_seeds = [0, 1, 2, 3, 4]
    rates = [0.01, 0.03, 0.05, 0.10]

    # (ablation name, Gaussian noise level or None)
    ablation_grid = [
        ("Vanilla", None),
        ("Identity", None),
        ("Shuffled", None),
        ("Gaussian", 0.1),
        ("Gaussian", 0.3),
        ("Gaussian", 0.5)
    ]

    columns = [
        "dataset", "model", "seed", "label_rate",
        "ablation", "noise_level",
        "accuracy", "best_epoch", "train_time"
    ]

    with open(output_csv, "w", newline="") as csv_file:
        rows = csv.writer(csv_file)
        rows.writerow(columns)

        for seed in run_seeds:
            # The synthetic graph depends on the seed, so it is rebuilt here
            # instead of being created once outside the loop.
            graph = load_synthetic(seed)
            targets = graph.y
            clean_features = graph.x

            for rate in rates:
                masks = create_label_splits(graph, rate, seed)

                for kind, noise in ablation_grid:
                    feats = apply_feature_ablation(clean_features, kind, seed, noise)
                    # Ablations without a noise level are recorded as "NA".
                    noise_cell = noise or "NA"
                    shared_cols = [seed, rate, kind, noise_cell]

                    mlp_result = run_mlp(feats, targets, masks, seed)
                    rows.writerow([dataset_tag, "MLP", *shared_cols, *mlp_result])

                    logreg_result = run_logreg(feats, targets, masks, seed)
                    rows.writerow([dataset_tag, "LogReg", *shared_cols, *logreg_result])

                # Label propagation does not take the ablated features, so it
                # runs once per (seed, label rate).
                lp_result = run_label_prop(graph, masks)
                rows.writerow([
                    dataset_tag, "LabelProp", seed, rate,
                    "Structure", "NA", *lp_result
                ])


In [None]:
# Run the synthetic-graph baseline sweep; results are written to nongnn_synthetic_results.csv.
run_non_gnn_baselines("nongnn_synthetic_results.csv")


