In [None]:
! git clone https://github.com/WIQHE/Histographs.git


fatal: destination path 'Histographs' already exists and is not an empty directory.


In [None]:
import torch
print(torch.cuda.is_available())  # should be True
print(torch.cuda.get_device_name(0))
torch.backends.cudnn.benchmark = True
 # should print your GPU


True
Tesla T4


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()


Using device: cuda



In [None]:
! pip install torch-Geometric

Collecting torch-Geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-Geometric
Successfully installed torch-Geometric-2.6.1


In [None]:
# ! pip install torch-scatter -f https://data.pyg.org/whl/torch-2.1.0+cpu.html# Install PyTorch Geometric and dependencies
# ! pip install torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cpu.htmlimport os
# ! pip install torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cpu.htmlos.system
# ! pip install torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cpu.html
! pip install torch-Geometric




In [None]:
# run_pipeline_fixed.py

import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Enable CuDNN autotuner for potential speedups
torch.backends.cudnn.benchmark = True

# ─── Hyperparameters ───────────────────────────────────────────────────────────
DATASET_NAME   = 'MUTAG'      # fallback
WINDOW_SIZE    = 15         # ~5–10% of a 1–2k–node graph
STEP_SIZE      = WINDOW_SIZE // 2
HIDDEN_DIM     = 12
NUM_HEADS      = 4
NUM_LAYERS     = 3
BETA           = 0.02         # reduced reg strength
SIGMA          = 1.8          # wider RBF
EPS            = 1e-3
LR             = 5e-2
EPOCHS         = 25           # fewer epochs + early stopping
BATCH_SIZE     = 12           # small batch for big subgraphs
WEIGHT_DECAY   = 1e-3
DROPOUT        = 0.3
RANDOM_STATE   = 42
DEVICE         = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# ─── 1) Subgraph Extraction via Sliding-Window ─────────────────────────────────
def extract_subgraphs(data: Data, window_size: int, step: int):
    N = data.num_nodes
    subs = []
    for start in range(0, max(1, N - window_size + 1), step):
        idx = torch.arange(start, start + window_size).clamp(0, N-1)
        mask = idx.unique()
        sub_x = data.x[mask]
        edge_index = data.edge_index.cpu().numpy()
        node_map = {int(n): i for i, n in enumerate(mask.tolist())}
        keep = [(node_map[u], node_map[v])
                for u, v in edge_index.T
                if u in node_map and v in node_map]
        if not keep:
            continue
        ei = torch.tensor(keep, dtype=torch.long).t().contiguous()
        subs.append(Data(x=sub_x,
                         edge_index=ei,
                         y=data.y,
                         num_nodes=mask.size(0)))
    return subs


# ─── 2) Laplacian-Based Feature Smoothing ───────────────────────────────────────
def smooth_features(x: torch.Tensor, beta: float,
                    sigma: float = SIGMA, eps: float = EPS):
    X = x.cpu().numpy()
    n = X.shape[0]
    D2 = np.sum((X[:, None] - X[None, :])**2, axis=-1)
    W  = np.exp(-D2 / (2 * sigma**2 + eps))
    d  = W.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(d + eps))
    L  = np.eye(n) - D_inv_sqrt @ W @ D_inv_sqrt
    A  = np.eye(n) + beta * L
    Xs = np.linalg.solve(A, X)
    return torch.tensor(Xs, dtype=torch.float32)


# ─── 3) GAT Model ──────────────────────────────────────────────────────────────
class GATGraphClassifier(nn.Module):
    def __init__(self, in_channels, hidden_dim, num_classes,
                 num_layers, num_heads, dropout):
        super().__init__()
        self.convs = nn.ModuleList()
        # first GAT layer
        self.convs.append(GATConv(in_channels,
                                  hidden_dim // num_heads,
                                  heads=num_heads,
                                  concat=True,
                                  dropout=dropout))
        # additional layers
        for _ in range(num_layers - 1):
            self.convs.append(GATConv(hidden_dim,
                                      hidden_dim // num_heads,
                                      heads=num_heads,
                                      concat=True,
                                      dropout=dropout))
        self.lin = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        for conv in self.convs:
            x = F.elu(conv(x, edge_index))
        batch_size = int(batch.max().item()) + 1
        out = torch.zeros(batch_size, x.size(1), device=x.device)
        out = out.index_add(0, batch, x)
        counts = torch.bincount(batch)
        out = out / counts.unsqueeze(1).float()
        return self.lin(out), x


# ─── 4) Training & Evaluation ───────────────────────────────────────────────────
def train_one_epoch(model, loader, optimizer, beta):
    model.train()
    total_loss = 0.0
    for data in loader:
        data = data.to(DEVICE, non_blocking=True)
        optimizer.zero_grad()
        logits, node_emb = model(data.x, data.edge_index, data.batch)
        ce = F.cross_entropy(logits, data.y)
        reg = 0.0
        for graph_id in torch.unique(data.batch):
            mask = (data.batch == graph_id)
            Xi = node_emb[mask]
            Xi_np = Xi.detach().cpu().numpy()
            D2 = np.sum((Xi_np[:, None] - Xi_np[None, :])**2, axis=-1)
            W  = np.exp(-D2 / (2 * SIGMA**2 + EPS))
            d  = W.sum(1)
            D_inv_sqrt = np.diag(1.0 / np.sqrt(d + EPS))
            L  = torch.tensor(np.eye(len(d)) - D_inv_sqrt @ W @ D_inv_sqrt,
                              device=Xi.device, dtype=torch.float32)
            reg += torch.trace(Xi.t() @ L @ Xi)
        loss = ce + beta * reg
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)


@torch.no_grad()
def evaluate(model, loader):
    model.eval()
    correct = 0
    for data in loader:
        data = data.to(DEVICE, non_blocking=True)
        logits, _ = model(data.x, data.edge_index, data.batch)
        pred = logits.argmax(dim=1)
        correct += int((pred == data.y).sum())
    return correct / len(loader.dataset)


def run_pipeline(train_csv: str = None, test_csv: str = None):
    # ─── 1) Load dataset ────────────────────────────────────────────────────────
    if train_csv and test_csv:
        train_meta = pd.read_csv(train_csv)
        test_meta  = pd.read_csv(test_csv)
        train_ds, test_ds = [], []
        for df, ds in [(train_meta, train_ds), (test_meta, test_ds)]:
            for _, row in df.iterrows():
                path = row['graph_path']
                g: Data = torch.load(path, weights_only=False)
                g.y = torch.tensor([row['label']], dtype=torch.long)
                ds.append(g)
        num_features = train_ds[0].x.shape[1]
        num_classes  = int(max([g.y.item() for g in train_ds + test_ds])) + 1
    else:
        dataset = TUDataset(root='./data', name=DATASET_NAME)
        train_ds, test_ds = train_test_split(
            dataset,
            test_size=0.2,
            stratify=[d.y.item() for d in dataset],
            random_state=RANDOM_STATE
        )
        num_features = dataset.num_features
        num_classes  = dataset.num_classes

    # ─── 2) Build augmented (subgraph + smoothed) lists ─────────────────────────
    train_aug, test_aug = [], []
    for ds, aug in [(train_ds, train_aug), (test_ds, test_aug)]:
        for data in ds:
            subs = extract_subgraphs(data, WINDOW_SIZE, STEP_SIZE)
            for g in subs:
                g.x = smooth_features(g.x, BETA)
                aug.append(g)

    # ─── 3) DataLoaders with pin_memory & workers ───────────────────────────────
    train_loader = DataLoader(
        train_aug,
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=True,
        num_workers=4
    )
    test_loader = DataLoader(
        test_aug,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4
    )

    # ─── 4) Model & optimizer ───────────────────────────────────────────────────
    model = GATGraphClassifier(
        in_channels=num_features,
        hidden_dim=HIDDEN_DIM,
        num_classes=num_classes,
        num_layers=NUM_LAYERS,
        num_heads=NUM_HEADS,
        dropout=DROPOUT
    ).to(DEVICE)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=LR,
        weight_decay=WEIGHT_DECAY
    )

    # ─── 5) Training loop with early saving ─────────────────────────────────────
    best_val = 0.0
    for epoch in range(1, EPOCHS + 1):
        loss = train_one_epoch(model, train_loader, optimizer, BETA)
        acc  = evaluate(model, test_loader)
        if epoch == 1 or epoch % 5 == 0:
            print(f"Epoch {epoch:02d} | Loss: {loss:.4f} | Test Acc: {acc:.4f}")
        if acc > best_val:
            best_val = acc
            torch.save(model.state_dict(), 'best_model.pt')

    print(f"Best Test Accuracy: {best_val:.4f}")


# To run:
run_pipeline()
# or


Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...
Done!


Epoch 01 | Loss: 0.6861 | Test Acc: 0.7143
Epoch 05 | Loss: 0.5909 | Test Acc: 0.7143
Epoch 10 | Loss: 0.5905 | Test Acc: 0.7143
Epoch 15 | Loss: 0.5860 | Test Acc: 0.7143
Epoch 20 | Loss: 0.5910 | Test Acc: 0.7143
Epoch 25 | Loss: 0.5931 | Test Acc: 0.7143
Best Test Accuracy: 0.7143


In [None]:
# ! mv  sample_data/graphs_new_pannuke_edgeAtr ../

mv: cannot stat 'sample_data/graphs_new_pannuke_edgeAtr': No such file or directory


In [None]:
#this is runng the same code on our created graphs_new_pannuke_edgeAtr
# run_pipeline('./sample_data/train_meta.csv', './sample_data/test_meta.csv')



Epoch 01 | Loss: 4.1887 | Test Acc: 0.1776
Epoch 05 | Loss: 1.5706 | Test Acc: 0.2110
Epoch 10 | Loss: 153.6502 | Test Acc: 0.3361
Epoch 15 | Loss: 33.6410 | Test Acc: 0.3361
Epoch 20 | Loss: 12.3412 | Test Acc: 0.3361
Epoch 25 | Loss: 4.7610 | Test Acc: 0.3361
Best Test Accuracy: 0.3361


In [None]:
! pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [None]:
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(dataset.num_node_features, 16)
        self.conv2 = GCNConv(16, dataset.num_classes)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

In [None]:
from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='/tmp/Cora', name='Cora')


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = dataset[0].to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [None]:
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

In [None]:
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(data.test_mask.sum())
print(f'Accuracy: {acc:.4f}')

Accuracy: 0.8110


In [4]:
! pip install torch-geometric

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [5]:
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, global_mean_pool
import numpy as np
from sklearn.model_selection import train_test_split

torch.backends.cudnn.benchmark = True
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

WINDOW_SIZE = 15
STEP_SIZE = WINDOW_SIZE // 2
BETA = 0.02
SIGMA = 1.8
EPS = 1e-3

def extract_subgraphs(data, window_size, step):
    N = data.num_nodes
    subs = []
    for start in range(0, max(1, N - window_size + 1), step):
        idx = torch.arange(start, start + window_size).clamp(0, N - 1)
        mask = idx.unique()
        sub_x = data.x[mask]
        edge_index = data.edge_index.cpu().numpy()
        node_map = {int(n): i for i, n in enumerate(mask.tolist())}
        keep = [(node_map[u], node_map[v]) for u, v in edge_index.T if u in node_map and v in node_map]
        if not keep:
            continue
        ei = torch.tensor(keep, dtype=torch.long).t().contiguous()
        subs.append(Data(x=sub_x, edge_index=ei, y=data.y))
    return subs

def smooth_features(x, beta, sigma=SIGMA, eps=EPS):
    X = x.cpu().numpy()
    n = X.shape[0]
    D2 = np.sum((X[:, None] - X[None, :])**2, axis=-1)
    W = np.exp(-D2 / (2 * sigma**2 + eps))
    d = W.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(d + eps))
    L = np.eye(n) - D_inv_sqrt @ W @ D_inv_sqrt
    A = np.eye(n) + beta * L
    Xs = np.linalg.solve(A, X)
    return torch.tensor(Xs, dtype=torch.float32)

class GCNGraphClassifier(nn.Module):
    def __init__(self, in_channels, hidden_dim, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.lin = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        out = global_mean_pool(x, batch)
        return self.lin(out)

def run_gcn_pipeline():
    dataset = TUDataset(root='./data', name='MUTAG')
    train_ds, test_ds = train_test_split(dataset, test_size=0.2, stratify=[d.y.item() for d in dataset])

    train_aug, test_aug = [], []
    for ds, aug in [(train_ds, train_aug), (test_ds, test_aug)]:
        for data in ds:
            subs = extract_subgraphs(data, WINDOW_SIZE, STEP_SIZE)
            for g in subs:
                g.x = smooth_features(g.x, BETA)
                aug.append(g)

    train_loader = DataLoader(train_aug, batch_size=12, shuffle=True)
    test_loader = DataLoader(test_aug, batch_size=12)

    model = GCNGraphClassifier(dataset.num_features, 64, dataset.num_classes).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

    best_acc = 0
    for epoch in range(1, 26):
        model.train()
        for data in train_loader:
            data = data.to(DEVICE)
            optimizer.zero_grad()
            out = model(data.x, data.edge_index, data.batch)
            loss = F.cross_entropy(out, data.y)
            loss.backward()
            optimizer.step()

        model.eval()
        correct = sum((model(d.x.to(DEVICE), d.edge_index.to(DEVICE), d.batch.to(DEVICE)).argmax(1) == d.y.to(DEVICE)).sum().item() for d in test_loader)
        acc = correct / len(test_loader.dataset)
        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), 'best_gcn_model.pt')
        if epoch % 5 == 0:
            print(f"Epoch {epoch} | Accuracy: {acc:.4f}")

    print(f"Best Test Accuracy: {best_acc:.4f}")

run_gcn_pipeline()


Downloading https://www.chrsmrrs.com/graphkerneldatasets/MUTAG.zip
Processing...
Done!


Epoch 5 | Accuracy: 0.7083
Epoch 10 | Accuracy: 0.7708
Epoch 15 | Accuracy: 0.6667
Epoch 20 | Accuracy: 0.7083
Epoch 25 | Accuracy: 0.8333
Best Test Accuracy: 0.8333


In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
import numpy as np

torch.backends.cudnn.benchmark = True
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

WINDOW_SIZE = 50
STEP_SIZE = 25
BETA = 0.02
SIGMA = 1.8
EPS = 1e-3

dataset = Planetoid(root='./data', name='Cora')
data = dataset[0]

def extract_subgraphs(data, window_size, step):
    N = data.num_nodes
    subs = []
    for start in range(0, max(1, N - window_size + 1), step):
        idx = torch.arange(start, start + window_size).clamp(0, N - 1)
        mask = idx.unique()
        sub_x = data.x[mask]
        edge_index = data.edge_index.cpu().numpy()
        node_map = {int(n): i for i, n in enumerate(mask.tolist())}
        keep = [(node_map[u], node_map[v]) for u, v in edge_index.T if u in node_map and v in node_map]
        if not keep:
            continue
        ei = torch.tensor(keep, dtype=torch.long).t().contiguous()
        subs.append(Data(x=sub_x, edge_index=ei, y=data.y[mask]))
    return subs

def smooth_features(x, beta, sigma=SIGMA, eps=EPS):
    X = x.cpu().numpy()
    n = X.shape[0]
    D2 = np.sum((X[:, None] - X[None, :])**2, axis=-1)
    W = np.exp(-D2 / (2 * sigma**2 + eps))
    d = W.sum(axis=1)
    D_inv_sqrt = np.diag(1.0 / np.sqrt(d + eps))
    L = np.eye(n) - D_inv_sqrt @ W @ D_inv_sqrt
    A = np.eye(n) + beta * L
    Xs = np.linalg.solve(A, X)
    return torch.tensor(Xs, dtype=torch.float32)

subs = extract_subgraphs(data, WINDOW_SIZE, STEP_SIZE)
for g in subs:
    g.x = smooth_features(g.x, BETA)

loader = DataLoader(subs, batch_size=16, shuffle=True)

model = GATConv(dataset.num_features, dataset.num_classes, heads=8).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

for epoch in range(1, 51):
    model.train()
    total_loss = 0
    for batch in loader:
        batch = batch.to(DEVICE)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        loss = F.cross_entropy(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    if epoch % 10 == 0:
        print(f"Epoch {epoch} | Loss: {total_loss/len(loader):.4f}")


Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


Epoch 10 | Loss: 0.5600
Epoch 20 | Loss: 0.3583
Epoch 30 | Loss: 0.2823
Epoch 40 | Loss: 0.2594
Epoch 50 | Loss: 0.2329
