In [1]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.7.0


In [2]:
import time
import random
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv
from torch_geometric.utils import dropout_edge


In [6]:
'''# ====== FIXED BY CONTRACT ======
DATASET_NAME = "CiteSeer"
MODEL_NAME = "GCN"

SEEDS = [0, 1, 2, 3, 4]
LABEL_RATES = [0.01, 0.03, 0.05, 0.10]

NOISE_LEVELS = [0.1, 0.3, 0.5]

ABLATIONS = [
    "identity",
    "shuffled_features",
    "gaussian_features",
    "edge_drop"
]

EDGE_DROP_RATE = 0.2
MAX_EPOCHS = 200
PATIENCE = 20'''




In [7]:
DATASET_NAME = "Synthetic-Heterophilous"
MODEL_NAME = "GCN"

SEEDS = [0, 1, 2, 3, 4]
LABEL_RATES = [0.01, 0.03, 0.05, 0.10]

NOISE_LEVELS = [0.1, 0.3, 0.5]

ABLATIONS = [
    "identity",
    "shuffled_features",
    "gaussian_features",
    "edge_drop"
]

EDGE_DROP_RATE = 0.2
MAX_EPOCHS = 200
PATIENCE = 20


In [8]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [None]:
#do not run this for synthetic
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root="./data", name=DATASET_NAME)
data = dataset[0]

NUM_CLASSES = dataset.num_classes
NUM_FEATURES = dataset.num_features


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.citeseer.test.index
Processing...
Done!


In [9]:
def generate_synthetic_heterophilous_graph(
    num_nodes=2000,
    num_classes=5,
    feature_dim=128,
    p_in=0.01,
    p_out=0.05,
    seed=0
):
    set_seed(seed)

    y = torch.randint(0, num_classes, (num_nodes,))

    x = torch.randn(num_nodes, feature_dim)
    x[y == 0] += 0.2
    x[y == 1] -= 0.2

    edge_list = []
    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            if y[i] == y[j]:
                if torch.rand(1).item() < p_in:
                    edge_list.append([i, j])
                    edge_list.append([j, i])
            else:
                if torch.rand(1).item() < p_out:
                    edge_list.append([i, j])
                    edge_list.append([j, i])

    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

    data = Data(x=x, edge_index=edge_index, y=y)
    return data


def load_synthetic(seed):
    return generate_synthetic_heterophilous_graph(seed=seed)


In [10]:
_tmp = load_synthetic(seed=0)
data = _tmp

NUM_CLASSES = int(data.y.max().item() + 1)
NUM_FEATURES = data.x.size(1)


In [11]:
def stratified_label_split(data, label_rate, seed):
    set_seed(seed)

    y = data.y.cpu().numpy()
    num_nodes = data.num_nodes

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    for c in range(NUM_CLASSES):
        idx = np.where(y == c)[0]
        np.random.shuffle(idx)

        n_test = max(1, int(0.2 * len(idx)))
        n_train = max(1, int(label_rate * len(idx)))
        n_val = max(1, int(0.1 * len(idx)))

        test_idx = idx[:n_test]
        train_idx = idx[n_test:n_test + n_train]
        val_idx = idx[n_test + n_train:n_test + n_train + n_val]

        test_mask[test_idx] = True
        train_mask[train_idx] = True
        val_mask[val_idx] = True

    return train_mask, val_mask, test_mask


In [12]:
def apply_feature_ablation(x, ablation, noise_level=None, seed=None):
    if seed is not None:
        set_seed(seed)

    if ablation == "identity":
        return x

    elif ablation == "shuffled_features":
        perm = torch.randperm(x.size(0))
        return x[perm]

    elif ablation == "gaussian_features":
        noise = torch.randn_like(x) * noise_level
        return x + noise

    return x


def apply_structure_ablation(edge_index, ablation, seed=None):
    if seed is not None:
        set_seed(seed)

    if ablation == "edge_drop":
        edge_index, _ = dropout_edge(edge_index, p=EDGE_DROP_RATE)

    return edge_index


In [13]:
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index)
        return x


In [14]:
def train_and_eval(data, train_mask, val_mask, test_mask,
                   ablation, noise_level, seed):

    set_seed(seed)

    x = apply_feature_ablation(
        data.x.clone(), ablation, noise_level, seed
    )
    edge_index = apply_structure_ablation(
        data.edge_index.clone(), ablation, seed
    )

    model = GCN(NUM_FEATURES, 64, NUM_CLASSES)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.01, weight_decay=5e-4
    )

    best_val = 0
    best_epoch = 0
    patience_counter = 0

    start_time = time.time()

    for epoch in range(MAX_EPOCHS):
        model.train()
        optimizer.zero_grad()
        out = model(x, edge_index)
        loss = F.cross_entropy(out[train_mask], data.y[train_mask])
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            logits = model(x, edge_index)
            val_acc = (
                logits[val_mask].argmax(dim=1)
                == data.y[val_mask]
            ).float().mean().item()

        if val_acc > best_val:
            best_val = val_acc
            best_epoch = epoch
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= PATIENCE:
            break

    train_time = time.time() - start_time

    model.eval()
    with torch.no_grad():
        test_acc = (
            logits[test_mask].argmax(dim=1)
            == data.y[test_mask]
        ).float().mean().item()

    return test_acc, best_epoch, train_time


In [15]:
rows = []

for seed in SEEDS:
    data = load_synthetic(seed)

    for label_rate in LABEL_RATES:
        train_mask, val_mask, test_mask = stratified_label_split(
            data, label_rate, seed
        )

        for ablation in ABLATIONS:
            if ablation == "gaussian_features":
                for sigma in NOISE_LEVELS:
                    acc, best_ep, t = train_and_eval(
                        data, train_mask, val_mask, test_mask,
                        ablation, sigma, seed
                    )
                    rows.append([
                        DATASET_NAME, MODEL_NAME, seed,
                        label_rate, ablation, sigma,
                        acc, best_ep, t
                    ])
            else:
                acc, best_ep, t = train_and_eval(
                    data, train_mask, val_mask, test_mask,
                    ablation, None, seed
                )
                rows.append([
                    DATASET_NAME, MODEL_NAME, seed,
                    label_rate, ablation, None,
                    acc, best_ep, t
                ])


In [16]:
df = pd.DataFrame(rows, columns=[
    "dataset",
    "model",
    "seed",
    "label_rate",
    "ablation",
    "noise_level",
    "accuracy",
    "best_epoch",
    "train_time"
])

csv_path = "GCN_Synthetic_results.csv"
df.to_csv(csv_path, index=False)

print(f"Saved results to {csv_path}")
print(df.head())


Saved results to GCN_Synthetic_results.csv
                   dataset model  seed  label_rate           ablation  \
0  Synthetic-Heterophilous   GCN     0        0.01           identity   
1  Synthetic-Heterophilous   GCN     0        0.01  shuffled_features   
2  Synthetic-Heterophilous   GCN     0        0.01  gaussian_features   
3  Synthetic-Heterophilous   GCN     0        0.01  gaussian_features   
4  Synthetic-Heterophilous   GCN     0        0.01  gaussian_features   

   noise_level  accuracy  best_epoch  train_time  
0          NaN  0.502513          46   16.442842  
1          NaN  0.329146          45   15.300215  
2          0.1  0.520100          46   16.295225  
3          0.3  0.505025          48   16.994443  
4          0.5  0.500000          46   15.876031  


In [3]:
from torch_geometric.data import Data

def generate_synthetic_heterophilous_graph(
    num_nodes=2000,
    num_classes=5,
    feature_dim=128,
    p_in=0.01,
    p_out=0.05,
    seed=0
):
    set_seed(seed)

    # ---- labels ----
    y = torch.randint(0, num_classes, (num_nodes,))

    # ---- features (weakly correlated with labels) ----
    class_means = torch.randn(num_classes, feature_dim)
    x = torch.randn(num_nodes, feature_dim)
    x[y == 0] += 0.2
    x[y == 1] -= 0.2


    # ---- edges (heterophily: more inter-class edges) ----
    edge_list = []

    for i in range(num_nodes):
        for j in range(i + 1, num_nodes):
            if y[i] == y[j]:
                if torch.rand(1).item() < p_in:
                    edge_list.append([i, j])
                    edge_list.append([j, i])
            else:
                if torch.rand(1).item() < p_out:
                    edge_list.append([i, j])
                    edge_list.append([j, i])

    edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

    data = Data(
        x=x,
        edge_index=edge_index,
        y=y
    )

    return data

In [4]:
def load_synthetic(seed):
    return generate_synthetic_heterophilous_graph(seed=seed)