<a href="https://colab.research.google.com/github/WIQHE/graphFeatureRefinement/blob/main/Atulya_WSSG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! git clone https://github.com/WIQHE/Histographs.git


fatal: destination path 'Histographs' already exists and is not an empty directory.


In [None]:
import torch

# Report whether a CUDA device is visible (expected to be True on a GPU runtime).
cuda_available = torch.cuda.is_available()
print(cuda_available)

# get_device_name() raises when no CUDA device exists, so only query it on GPU runtimes.
if cuda_available:
    print(torch.cuda.get_device_name(0))  # should print your GPU
else:
    print('No CUDA device found; running on CPU')

# Enable the cuDNN autotuner for potential speedups.
torch.backends.cudnn.benchmark = True


True
Tesla T4


In [None]:
# Prefer the GPU when CUDA is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print()  # blank line after the report


Using device: cuda



In [None]:
! pip install torch-Geometric



In [None]:
# ! pip install torch-scatter -f https://data.pyg.org/whl/torch-2.1.0+cpu.html# Install PyTorch Geometric and dependencies
# ! pip install torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cpu.htmlimport os
# ! pip install torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cpu.htmlos.system
# ! pip install torch-sparse -f https://data.pyg.org/whl/torch-2.1.0+cpu.html
! pip install torch-Geometric




In [None]:
# run_pipeline_fixed.py

import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
import numpy as np
from sklearn.model_selection import train_test_split
import os

# Enable the cuDNN autotuner for potential speedups.
torch.backends.cudnn.benchmark = True

# ─── Hyperparameters ───────────────────────────────────────────────────────────
# Data
DATASET_NAME = 'MUTAG'            # TUDataset loaded when no CSV metadata is given
RANDOM_STATE = 42                 # seed for the stratified train/test split

# Sliding-window subgraph extraction
WINDOW_SIZE = 150                 # nodes per window (~5–10% of a 1–2k–node graph)
STEP_SIZE = WINDOW_SIZE // 2      # stride: consecutive windows overlap by half

# Model
HIDDEN_DIM = 128                  # total width of each GAT layer (all heads concatenated)
NUM_HEADS = 8
NUM_LAYERS = 3
DROPOUT = 0.4                     # forwarded to GATConv

# Laplacian smoothing / regularisation
BETA = 0.02                       # reduced regularisation strength
SIGMA = 1.8                       # wider RBF kernel
EPS = 1e-8                        # added to denominators for numerical safety

# Optimisation
LR = 5e-4
WEIGHT_DECAY = 1e-4
EPOCHS = 25
BATCH_SIZE = 12                   # small batch for big subgraphs

# Compute device shared by the training and evaluation helpers
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# ─── 1) Subgraph Extraction via Sliding-Window ─────────────────────────────────
def extract_subgraphs(data: Data, window_size: int, step: int):
    """Cut a graph into overlapping subgraphs of consecutive node indices.

    A window of ``window_size`` consecutive node ids slides over ``0..N-1``
    with stride ``step``.  For each window the induced subgraph (edges whose
    two endpoints both lie inside the window) is built, with node ids
    relabelled to ``0..k-1``.  Windows that contain no edge are skipped.

    If the stride does not land exactly on the last possible window, one extra
    window anchored at ``N - window_size`` is appended so that the trailing
    nodes are not silently dropped.  Graphs smaller than ``window_size`` yield
    a single window covering every node.

    Args:
        data: Graph providing ``x``, ``edge_index``, ``y`` and ``num_nodes``.
        window_size: Number of consecutive nodes per window (must be > 0).
        step: Stride between window starts (must be > 0).

    Returns:
        List of ``Data`` objects, one per non-empty window.  Every subgraph
        shares the parent graph's label ``y``.

    Raises:
        ValueError: If ``window_size`` or ``step`` is not positive.
    """
    if window_size <= 0 or step <= 0:
        raise ValueError(
            f"window_size and step must be positive, got {window_size} and {step}"
        )

    num_nodes = data.num_nodes
    if not num_nodes:
        # Empty graph: there is nothing to slice.
        return []

    # Hoisted out of the loop: the edge list is the same for every window.
    edge_index = data.edge_index.to(torch.long)
    src, dst = edge_index[0], edge_index[1]

    last_start = max(0, num_nodes - window_size)
    starts = list(range(0, last_start + 1, step))
    if starts[-1] != last_start:
        # Tail window so nodes after the last strided window are still covered.
        starts.append(last_start)

    subs = []
    for start in starts:
        stop = min(start + window_size, num_nodes)
        # The window is the contiguous id range [start, stop), so membership is
        # a pair of range checks and relabelling is a shift by `start`.
        inside = (src >= start) & (src < stop) & (dst >= start) & (dst < stop)
        if not bool(inside.any()):
            continue
        sub_edge_index = (edge_index[:, inside] - start).contiguous()
        subs.append(Data(x=data.x[start:stop].clone(),
                         edge_index=sub_edge_index,
                         y=data.y,
                         num_nodes=stop - start))
    return subs


# ─── 2) Laplacian-Based Feature Smoothing ───────────────────────────────────────
def smooth_features(x: torch.Tensor, beta: float,
                    sigma: float = SIGMA, eps: float = EPS):
    """Smooth node features with a Laplacian built from the features themselves.

    A dense RBF affinity is computed between every pair of feature rows, turned
    into a symmetrically normalised Laplacian ``L``, and the linear system
    ``(I + beta * L) Xs = X`` is solved for the smoothed features ``Xs``.

    Args:
        x: Node feature matrix with one row per node.
        beta: Weight of the Laplacian term.
        sigma: RBF kernel width.
        eps: Small constant added to denominators.

    Returns:
        A new float32 CPU tensor with the same shape as ``x``.
    """
    feats = x.cpu().numpy()
    num_nodes = feats.shape[0]

    # Squared Euclidean distance between every pair of rows.
    pairwise_diff = feats[:, None] - feats[None, :]
    sq_dist = np.sum(pairwise_diff**2, axis=-1)

    # RBF affinity and its degree vector.
    affinity = np.exp(-sq_dist / (2 * sigma**2 + eps))
    degree = affinity.sum(axis=1)

    # Symmetrically normalised Laplacian: I - D^{-1/2} W D^{-1/2}.
    inv_sqrt_degree = np.diag(1.0 / np.sqrt(degree + eps))
    laplacian = np.eye(num_nodes) - inv_sqrt_degree @ affinity @ inv_sqrt_degree

    system = np.eye(num_nodes) + beta * laplacian
    smoothed = np.linalg.solve(system, feats)
    return torch.tensor(smoothed, dtype=torch.float32)


# ─── 3) GAT Model ──────────────────────────────────────────────────────────────
class GATGraphClassifier(nn.Module):
    """Stack of GAT layers followed by mean pooling and a linear classifier.

    Every GAT layer uses ``num_heads`` heads of width ``hidden_dim // num_heads``
    whose outputs are concatenated, so each layer emits ``hidden_dim`` features.

    Args:
        in_channels: Size of the input node features.
        hidden_dim: Output width of every GAT layer; must be divisible by
            ``num_heads``.
        num_classes: Number of output classes.
        num_layers: Number of GAT layers (at least one layer is always built).
        num_heads: Attention heads per layer.
        dropout: Dropout value forwarded to every ``GATConv``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``hidden_dim``.
    """

    def __init__(self, in_channels, hidden_dim, num_classes,
                 num_layers, num_heads, dropout):
        super().__init__()
        # With a non-divisible width the concatenated heads would emit fewer than
        # hidden_dim features and the mismatch would only surface later as a
        # shape error inside the next layer or `self.lin`.
        if num_heads <= 0 or hidden_dim % num_heads != 0:
            raise ValueError(
                f"hidden_dim ({hidden_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        head_dim = hidden_dim // num_heads

        # First layer consumes the raw features, the remaining ones hidden_dim.
        layer_inputs = [in_channels] + [hidden_dim] * (num_layers - 1)
        self.convs = nn.ModuleList([
            GATConv(width,
                    head_dim,
                    heads=num_heads,
                    concat=True,
                    dropout=dropout)
            for width in layer_inputs
        ])
        self.lin = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, edge_index, batch):
        """Return ``(graph_logits, node_embeddings)`` for a batch of graphs.

        Args:
            x: Node features of all graphs in the batch, stacked row-wise.
            edge_index: Edge list of the batched graph.
            batch: Graph id of every node (one entry per row of ``x``).
        """
        for conv in self.convs:
            x = F.elu(conv(x, edge_index))

        # Mean-pool node embeddings per graph.
        num_graphs = int(batch.max().item()) + 1
        sums = x.new_zeros(num_graphs, x.size(1)).index_add(0, batch, x)
        # clamp(min=1) keeps graph ids that own no node at zero instead of 0/0 = NaN.
        counts = torch.bincount(batch, minlength=num_graphs).clamp(min=1)
        pooled = sums / counts.unsqueeze(1).to(x.dtype)
        return self.lin(pooled), x


# ─── 4) Training & Evaluation ───────────────────────────────────────────────────
def train_one_epoch(model, loader, optimizer, beta):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    The loss of a batch is the cross-entropy of the graph logits plus ``beta``
    times a Laplacian smoothness penalty ``trace(X^T L X)`` on the node
    embeddings of every graph in the batch.  ``L`` is the symmetrically
    normalised Laplacian of an RBF affinity built from the (detached)
    embeddings, so it acts as a constant with respect to the gradient.

    Args:
        model: Module returning ``(logits, node_embeddings)``.
        loader: Iterable of batched graphs; ``loader.dataset`` must have a length.
        optimizer: Optimiser updating ``model``'s parameters.
        beta: Weight of the smoothness penalty.

    Returns:
        Sum of ``batch_loss * num_graphs`` over all batches divided by
        ``len(loader.dataset)``.
    """
    model.train()
    total_loss = 0.0
    for data in loader:
        data = data.to(DEVICE, non_blocking=True)
        optimizer.zero_grad()
        logits, node_emb = model(data.x, data.edge_index, data.batch)
        ce = F.cross_entropy(logits, data.y)

        # NOTE(review): the penalty is summed over graphs while `ce` is a batch
        # mean, so its relative weight grows with the batch size — confirm intended.
        reg = node_emb.new_zeros(())
        for graph_id in torch.unique(data.batch):
            Xi = node_emb[data.batch == graph_id]
            # Build the Laplacian on the embeddings' own device; the previous
            # NumPy round trip forced a device-to-host copy for every graph.
            with torch.no_grad():
                Xd = Xi.detach()
                D2 = (Xd[:, None] - Xd[None, :]).pow(2).sum(dim=-1)
                W = torch.exp(-D2 / (2 * SIGMA**2 + EPS))
                inv_sqrt_d = 1.0 / torch.sqrt(W.sum(dim=1) + EPS)
                # D^{-1/2} W D^{-1/2} written as row/column scaling.
                W_norm = inv_sqrt_d[:, None] * W * inv_sqrt_d[None, :]
                L = torch.eye(W.size(0), device=W.device, dtype=W.dtype) - W_norm
            reg = reg + torch.trace(Xi.t() @ L @ Xi)

        loss = ce + beta * reg
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_graphs
    return total_loss / len(loader.dataset)


@torch.no_grad()
def evaluate(model, loader):
    """Return the classification accuracy of ``model`` over ``loader``.

    The model is switched to eval mode, each batch is moved to ``DEVICE``, and
    the arg-max of the graph logits is compared with the labels.  The number of
    correct predictions is divided by ``len(loader.dataset)``.
    """
    model.eval()
    num_correct = 0
    for batch in loader:
        batch = batch.to(DEVICE, non_blocking=True)
        graph_logits = model(batch.x, batch.edge_index, batch.batch)[0]
        predicted = graph_logits.argmax(dim=1)
        hits = (predicted == batch.y).sum()
        num_correct += int(hits)
    return num_correct / len(loader.dataset)


def run_pipeline(train_csv: str = None, test_csv: str = None):
    """Train and evaluate the GAT classifier on smoothed sliding-window subgraphs.

    Steps: load the graphs, cut every graph into overlapping subgraphs, smooth
    their node features, train for ``EPOCHS`` epochs, and save the weights of
    the best-scoring epoch to ``best_model.pt``.

    Args:
        train_csv: CSV with ``graph_path`` and ``label`` columns describing the
            training graphs.
        test_csv: CSV with the same columns describing the test graphs.
            When either path is missing, the ``DATASET_NAME`` TUDataset is used
            instead with a stratified 80/20 split.

    Returns:
        The best test accuracy observed over all epochs.

    Raises:
        ValueError: If subgraph extraction leaves the train or test set empty.

    Note:
        The checkpoint is selected on the test set, so the reported "best"
        accuracy is an optimistic estimate.
    """
    # Seed the RNGs so weight initialisation and shuffling are repeatable.
    torch.manual_seed(RANDOM_STATE)
    np.random.seed(RANDOM_STATE)

    # ─── 1) Load dataset ────────────────────────────────────────────────────────
    if train_csv and test_csv:
        train_meta = pd.read_csv(train_csv)
        test_meta  = pd.read_csv(test_csv)
        train_ds, test_ds = [], []
        for df, ds in [(train_meta, train_ds), (test_meta, test_ds)]:
            for path, label in zip(df['graph_path'], df['label']):
                # SECURITY: weights_only=False unpickles arbitrary Python objects;
                # only load graph files that come from a trusted source.
                g: Data = torch.load(path, weights_only=False)
                g.y = torch.tensor([label], dtype=torch.long)
                ds.append(g)
        num_features = train_ds[0].x.shape[1]
        num_classes  = int(max(g.y.item() for g in train_ds + test_ds)) + 1
    else:
        dataset = TUDataset(root='./data', name=DATASET_NAME)
        train_ds, test_ds = train_test_split(
            dataset,
            test_size=0.2,
            stratify=[d.y.item() for d in dataset],
            random_state=RANDOM_STATE
        )
        num_features = dataset.num_features
        num_classes  = dataset.num_classes

    # ─── 2) Build augmented (subgraph + smoothed) lists ─────────────────────────
    train_aug, test_aug = [], []
    for ds, aug in [(train_ds, train_aug), (test_ds, test_aug)]:
        for data in ds:
            for g in extract_subgraphs(data, WINDOW_SIZE, STEP_SIZE):
                g.x = smooth_features(g.x, BETA)
                aug.append(g)

    # Fail early with a clear message instead of a ZeroDivisionError later on.
    if not train_aug or not test_aug:
        raise ValueError(
            f"Subgraph extraction produced {len(train_aug)} training and "
            f"{len(test_aug)} test subgraphs; both sets must be non-empty"
        )

    # ─── 3) DataLoaders with pin_memory & workers ───────────────────────────────
    use_pinned = DEVICE.type == 'cuda'   # pinning only helps host-to-GPU copies
    train_loader = DataLoader(
        train_aug,
        batch_size=BATCH_SIZE,
        shuffle=True,
        pin_memory=use_pinned,
        num_workers=4
    )
    test_loader = DataLoader(
        test_aug,
        batch_size=BATCH_SIZE,
        pin_memory=use_pinned,
        num_workers=4
    )

    # ─── 4) Model & optimizer ───────────────────────────────────────────────────
    model = GATGraphClassifier(
        in_channels=num_features,
        hidden_dim=HIDDEN_DIM,
        num_classes=num_classes,
        num_layers=NUM_LAYERS,
        num_heads=NUM_HEADS,
        dropout=DROPOUT
    ).to(DEVICE)
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=LR,
        weight_decay=WEIGHT_DECAY
    )

    # ─── 5) Training loop with early saving ─────────────────────────────────────
    best_val = 0.0
    for epoch in range(1, EPOCHS + 1):
        loss = train_one_epoch(model, train_loader, optimizer, BETA)
        acc  = evaluate(model, test_loader)
        if epoch == 1 or epoch % 5 == 0:
            print(f"Epoch {epoch:02d} | Loss: {loss:.4f} | Test Acc: {acc:.4f}")
        if acc > best_val:
            # Checkpoint only when the test accuracy improves.
            best_val = acc
            torch.save(model.state_dict(), 'best_model.pt')

    print(f"Best Test Accuracy: {best_val:.4f}")
    return best_val


# To run:
# run_pipeline()
# or
# run_pipeline('/mnt/data/train_meta.csv', '/mnt/data/test_meta.csv')


In [None]:
! mv  sample_data/graphs_new_pannuke_edgeAtr ../

mv: cannot stat 'sample_data/graphs_new_pannuke_edgeAtr': No such file or directory


In [None]:
# Run the same pipeline on the graphs we created (graphs_new_pannuke_edgeAtr).
run_pipeline(
    train_csv='./sample_data/train_meta.csv',
    test_csv='./sample_data/test_meta.csv',
)



Epoch 01 | Loss: 4.1887 | Test Acc: 0.1776
Epoch 05 | Loss: 1.5706 | Test Acc: 0.2110
Epoch 10 | Loss: 153.6502 | Test Acc: 0.3361
Epoch 15 | Loss: 33.6410 | Test Acc: 0.3361
Epoch 20 | Loss: 12.3412 | Test Acc: 0.3361
Epoch 25 | Loss: 4.7610 | Test Acc: 0.3361
Best Test Accuracy: 0.3361
