In [73]:
# Importing Libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [74]:
# Load the data
adj = sp.load_npz("../data/data/adj.npz")
features = np.load("../data/data/features.npy")
labels = np.load("../data/data/labels.npy")
with open("../data/data/splits.json") as f:
    splits = json.load(f)

In [75]:
print(f"Adjacency matrix shape: {adj.shape}")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Train set size: {len(splits['idx_train'])}, Test set size: {len(splits['idx_test'])}")

Adjacency matrix shape: (2480, 2480)
Features shape: (2480, 1390)
Labels shape: (496,)
Train set size: 496, Test set size: 1984


In [76]:
# Convert to NumPy array (in case it's a list)
idx_train = np.array(splits['idx_train'])
idx_test_a = np.array(splits['idx_test'])

# Extract training features
X_train_features = pd.DataFrame(features[idx_train]).reset_index(drop=True)
X_test_features = pd.DataFrame(features[idx_test]).reset_index(drop=True)

X_train_adj = pd.DataFrame(adj[idx_train]).reset_index(drop=True)
X_test_adj = pd.DataFrame(adj[idx_test]).reset_index(drop=True)

## GCN

In [88]:
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import math

In [89]:
# Load the adjacency matrix (assumed stored as a sparse matrix)
adj = sp.load_npz('../data/data/adj.npz')[idx_train, :][:, idx_train]
adj_test = sp.load_npz('../data/data/adj.npz')[idx_test_a, :][:, idx_test_a]

# Load node features and labels (assumed to be in numpy array format)
features = np.load('../data/data/features.npy')[idx_train]
#features_test = np.load('../data/data/features.npy')[idx_test_a]
labels = np.load('../data/data/labels.npy')

# Load training and testing splits from splits.json
with open('../data/data/splits.json') as f:
    splits = json.load(f)
idx_train = splits['idx_train']
idx_test_a = splits['idx_test']
print(idx_test_a)

[2051, 1788, 1233, 926, 2053, 2083, 2370, 1306, 1354, 603, 571, 1132, 766, 789, 108, 1196, 2462, 2043, 293, 1626, 2376, 1619, 2399, 1887, 467, 1449, 2200, 2425, 2128, 1304, 1613, 1596, 1003, 1645, 1537, 2476, 2358, 33, 73, 1010, 586, 1825, 1880, 1666, 612, 1521, 10, 2254, 1539, 311, 1640, 860, 187, 2417, 878, 1791, 523, 90, 1663, 2084, 1225, 246, 1223, 405, 1042, 864, 297, 1827, 2352, 729, 1032, 1846, 1953, 2409, 1309, 442, 1276, 1702, 906, 2336, 327, 549, 354, 740, 1792, 1715, 370, 1013, 216, 56, 1633, 1116, 2205, 745, 854, 1469, 944, 1379, 493, 1093, 402, 1385, 2294, 748, 1898, 313, 1155, 2247, 2070, 2235, 2363, 1401, 845, 1427, 364, 1858, 1103, 1870, 298, 384, 1681, 1957, 1495, 805, 2314, 68, 96, 2119, 1012, 1584, 1722, 2320, 1453, 1302, 1538, 1481, 1100, 440, 865, 528, 2427, 2478, 489, 1027, 704, 2010, 1817, 1074, 2284, 771, 797, 1377, 1487, 151, 65, 1305, 2242, 366, 798, 817, 1275, 1946, 2178, 682, 23, 1774, 1959, 2007, 791, 2199, 2177, 2069, 91, 962, 2145, 1972, 1217, 2123, 1263,

In [90]:
# Preprocessing: Add self-loops to the adjacency matrix and normalize it.
def normalize_adj(adj):
    """Symmetrically normalize the adjacency matrix."""
    adj = sp.coo_matrix(adj)
    # Add self-loops
    adj = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    # Handle inf entries (in case of isolated nodes)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

adj_normalized = normalize_adj(adj)
adj_normalized_test = normalize_adj(adj_test)
print("Done")

Done


In [91]:
# Convert the normalized sparse matrix to a torch sparse tensor.
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
    )
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

adj_normalized = sparse_mx_to_torch_sparse_tensor(adj_normalized)
print("Done")

Done


In [92]:
# Convert features and labels to torch tensors
features = torch.FloatTensor(features)
labels = torch.LongTensor(labels)
idx_train = torch.LongTensor(idx_train)
idx_test = torch.LongTensor(idx_test)
#print("Done")

### GCN Model

In [93]:
# Graph Convolution layer as described in Kipf & Welling (2017)
class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    
    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)
    
    def forward(self, input, adj):
        # Multiply input features by the weight matrix
        support = torch.mm(input, self.weight)
        # Perform sparse matrix multiplication with the normalized adjacency matrix
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

In [94]:
# Two-layer GCN model
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dropout = dropout
    
    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = F.relu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)


In [95]:
# Initialize the model and optimizer
model = GCN(nfeat=features.shape[1],
            nhid=16,
            nclass=labels.max().item() + 1,
            dropout=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


In [96]:
# Training function
def train(epoch):
    model.train()
    optimizer.zero_grad()
    output = model(features, adj_normalized)
    loss_train = F.nll_loss(output, labels)
    loss_train.backward()
    optimizer.step()
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()))

In [97]:
# Testing function
def test():
    model.eval()
    output = model(features, adj_normalized)
    loss_test = F.nll_loss(output, labels)
    preds = output.max(1)[1]
    correct = preds.eq(labels).sum().item()
    acc_test = correct / len(idx_train)
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))

In [98]:
# Train the model for 200 epochs and evaluate on the test set
for epoch in range(200):
    train(epoch)
test()

Epoch: 0001 loss_train: 2.0859
Epoch: 0002 loss_train: 1.9267
Epoch: 0003 loss_train: 1.8366
Epoch: 0004 loss_train: 1.7656
Epoch: 0005 loss_train: 1.6795
Epoch: 0006 loss_train: 1.6140
Epoch: 0007 loss_train: 1.5581
Epoch: 0008 loss_train: 1.4623
Epoch: 0009 loss_train: 1.4009
Epoch: 0010 loss_train: 1.3168
Epoch: 0011 loss_train: 1.2766
Epoch: 0012 loss_train: 1.2142
Epoch: 0013 loss_train: 1.0920
Epoch: 0014 loss_train: 1.0226
Epoch: 0015 loss_train: 0.9759
Epoch: 0016 loss_train: 0.9060
Epoch: 0017 loss_train: 0.8802
Epoch: 0018 loss_train: 0.7993
Epoch: 0019 loss_train: 0.7937
Epoch: 0020 loss_train: 0.7567
Epoch: 0021 loss_train: 0.6863
Epoch: 0022 loss_train: 0.6361
Epoch: 0023 loss_train: 0.6404
Epoch: 0024 loss_train: 0.5875
Epoch: 0025 loss_train: 0.5973
Epoch: 0026 loss_train: 0.5476
Epoch: 0027 loss_train: 0.5261
Epoch: 0028 loss_train: 0.4982
Epoch: 0029 loss_train: 0.4758
Epoch: 0030 loss_train: 0.4846
Epoch: 0031 loss_train: 0.4208
Epoch: 0032 loss_train: 0.3763
Epoch: 0

### Doing Train Test Split

In [102]:
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# 1. Create a list of indices for all nodes
num_nodes = features.shape[0]  # total number of nodes (e.g., 496)
all_indices = np.arange(num_nodes)

# 2. Perform an 80/20 train-test split using scikit-learn
idx_train, idx_test = train_test_split(all_indices, test_size=0.2, random_state=42)

# Optionally convert indices to a PyTorch tensor if needed:
idx_train = torch.tensor(idx_train, dtype=torch.long)
idx_test = torch.tensor(idx_test, dtype=torch.long)

# 3. Training function (uses global idx_train)
def train(epoch):
    model.train()                       # Set model to training mode
    optimizer.zero_grad()               # Clear gradients
    output = model(features, adj_normalized)  # Forward pass
    # Compute training loss only on the training indices
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    loss_train.backward()               # Backward pass: compute gradients
    optimizer.step()                    # Update model parameters

    print('Epoch: {:04d}'.format(epoch + 1),
          'loss_train: {:.4f}'.format(loss_train.item()))

# 4. Testing/evaluation function (uses global idx_test)
def test():
    model.eval()                        # Set model to evaluation mode
    with torch.no_grad():
        output = model(features, adj_normalized)
        loss_test = F.nll_loss(output[idx_test], labels[idx_test])
        # Compute predictions for test nodes
        pred = output[idx_test].max(1)[1]
        pred_train= output[idx_train].max(1)[1]
        # Calculate accuracy on test set
        correct = pred.eq(labels[idx_test]).sum().item()
        accuracy = correct / len(idx_test)
        correct_train= pred_train.eq(labels[idx_train]).sum().item()
        accuracy_train = correct_train / len(idx_train)
    
    print("Test set results:",
          "Loss: {:.4f}".format(loss_test.item()),
          "Accuracy: {:.4f}".format(accuracy),
         "Accuracy_train: {:.4f}".format(accuracy_train))

def test_overall():
    model.eval()                        # Set model to evaluation mode
    with torch.no_grad():
        output = model(features_test, adj_normalized_test)
        #print(output)
        loss_test = F.nll_loss(output[idx_test_a], labels[idx_test_a])
        # Compute predictions for test nodes
        pred = output[idx_test_a].max(1)[1]
        # Calculate accuracy on test set
        #correct = pred.eq(labels[idx_test]).sum().item()
        #accuracy = correct / len(idx_test)
        #correct_train= pred_train.eq(labels[idx_train]).sum().item()
        #accuracy_train = correct_train / len(idx_train)
        print(pred)
    
    '''print("Test set results:",
          "Loss: {:.4f}".format(loss_test.item()),
          "Accuracy: {:.4f}".format(accuracy),
         "Accuracy_train: {:.4f}".format(accuracy_train))'''

# 5. Training loop
num_epochs = 200  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    train(epoch)

# 6. Evaluate on the test set after training
test()
test_overall()


Epoch: 0001 loss_train: 0.0889
Epoch: 0002 loss_train: 0.1012
Epoch: 0003 loss_train: 0.1152
Epoch: 0004 loss_train: 0.1336
Epoch: 0005 loss_train: 0.0936
Epoch: 0006 loss_train: 0.1223
Epoch: 0007 loss_train: 0.0959
Epoch: 0008 loss_train: 0.1289
Epoch: 0009 loss_train: 0.1428
Epoch: 0010 loss_train: 0.0942
Epoch: 0011 loss_train: 0.1175
Epoch: 0012 loss_train: 0.1052
Epoch: 0013 loss_train: 0.0932
Epoch: 0014 loss_train: 0.1169
Epoch: 0015 loss_train: 0.1014
Epoch: 0016 loss_train: 0.1163
Epoch: 0017 loss_train: 0.0914
Epoch: 0018 loss_train: 0.1263
Epoch: 0019 loss_train: 0.1339
Epoch: 0020 loss_train: 0.1302
Epoch: 0021 loss_train: 0.1005
Epoch: 0022 loss_train: 0.0974
Epoch: 0023 loss_train: 0.1077
Epoch: 0024 loss_train: 0.1002
Epoch: 0025 loss_train: 0.1143
Epoch: 0026 loss_train: 0.1111
Epoch: 0027 loss_train: 0.1187
Epoch: 0028 loss_train: 0.1306
Epoch: 0029 loss_train: 0.1466
Epoch: 0030 loss_train: 0.1085
Epoch: 0031 loss_train: 0.1095
Epoch: 0032 loss_train: 0.1136
Epoch: 0

TypeError: must be real number, not coo_matrix

In [100]:
def evaluate(features, adj_normalized_test, labels, idx_test_a):
    model.eval()
    with torch.no_grad():
        logits = model(features, adj_normalized_test)
        loss = F.nll_loss(logits[idx_test_a], labels[idx_test_a]).item()
        preds = logits[idx_test_a].max(dim=1)[1]
        acc  = preds.eq(labels[idx_test_a]).sum().item() / idx_test_a.size(0)
    return loss, acc

In [101]:
evaluate(features, adj_normalized_test, labels, idx_test_a)

TypeError: mm(): argument 'input' (position 1) must be Tensor, not coo_matrix

In [54]:
test_overall()

TypeError: mm(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [104]:
"""
GCN training + prediction script
================================
Put this file in the same directory as:

  adj.npz
  features.npy
  labels.npy
  splits.json

Run with:  python run_gcn.py
"""

import json, numpy as np, scipy.sparse as sp, torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# ----------------------------------------------------------------------
# 0.  Helper: scipy sparse COO  ->  torch.sparse_coo_tensor
# ----------------------------------------------------------------------
def coo_to_torch(coo, device=None, dtype=torch.float32):
    """Convert a scipy.sparse.coo_matrix to a torch sparse tensor."""
    coo = coo.tocoo()           # make sure it *is* COO
    idx = torch.tensor([coo.row, coo.col], dtype=torch.long, device=device)
    val = torch.tensor(coo.data, dtype=dtype, device=device)
    return torch.sparse_coo_tensor(idx, val, coo.shape, device=device)

# ----------------------------------------------------------------------
# 1.  Load data
# ----------------------------------------------------------------------
with open("../data/data/splits.json") as fp:
    spl = json.load(fp)

idx_train_full = np.array(spl["idx_train"])   # 496 labelled nodes&#8203;:contentReference[oaicite:0]{index=0}&#8203;:contentReference[oaicite:1]{index=1}
idx_test       = np.array(spl["idx_test"])    # 1 984 unlabelled nodes&#8203;:contentReference[oaicite:2]{index=2}&#8203;:contentReference[oaicite:3]{index=3}

features = np.load("../data/data/features.npy")            # (2480, 1390)
labels   = np.load("../data/data/labels.npy")              # (496,)
adj_coo  = sp.load_npz("../data/data/adj.npz")             # sparse adjacency

# ----------------------------------------------------------------------
# 2.  Tensors + device
# ----------------------------------------------------------------------
device   = torch.device("cuda" if torch.cuda.is_available() else "cpu")

features = torch.as_tensor(features, dtype=torch.float32, device=device)
labels   = torch.as_tensor(labels  , dtype=torch.long   , device=device)
adj      = coo_to_torch(adj_coo, device)

idx_test = torch.tensor(idx_test, dtype=torch.long, device=device)

# ----------------------------------------------------------------------
# 3.  Internal train/validation split (10 % of labelled nodes)
# ----------------------------------------------------------------------
idx_tr, idx_val = train_test_split(
    idx_train_full,
    test_size=0.10,
    stratify=labels[idx_train_full],
    random_state=42,
)
idx_tr  = torch.tensor(idx_tr , dtype=torch.long, device=device)
idx_val = torch.tensor(idx_val, dtype=torch.long, device=device)

# ----------------------------------------------------------------------
# 4.  Define the model  (same API you already had)
# ----------------------------------------------------------------------
class GraphConvolution(torch.nn.Module):
    def __init__(self, in_feats, out_feats, bias=True):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(in_feats, out_feats) * 0.01)
        if bias:
            self.bias = torch.nn.Parameter(torch.zeros(out_feats))
        else:
            self.register_parameter('bias', None)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight)
        out = torch.sparse.mm(adj, support)
        return out + (self.bias if self.bias is not None else 0)

class GCN(torch.nn.Module):
    def __init__(self, n_feat, n_hid, n_class, dropout=0.5):
        super().__init__()
        self.gc1 = GraphConvolution(n_feat , n_hid )
        self.gc2 = GraphConvolution(n_hid  , n_class)
        self.dropout = dropout

    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = F.relu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)

model = GCN(features.size(1), 64, labels.max().item() + 1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# ----------------------------------------------------------------------
# 5.  Training loop
# ----------------------------------------------------------------------
for epoch in range(1, 201):
    model.train()
    optimizer.zero_grad()
    out = model(features, adj)
    loss_tr = F.nll_loss(out[idx_tr], labels[idx_tr])
    loss_tr.backward()
    optimizer.step()

    if epoch % 10 == 0 or epoch == 1:
        model.eval()
        with torch.no_grad():
            val_out  = model(features, adj)[idx_val]
            val_loss = F.nll_loss(val_out, labels[idx_val]).item()
            val_acc  = (val_out.argmax(1) == labels[idx_val]).float().mean().item()
        print(f"Epoch {epoch:03d} | train_loss {loss_tr.item():.4f} "
              f"| val_loss {val_loss:.4f} | val_acc {val_acc:.4f}")

# ----------------------------------------------------------------------
# 6.  Predict on *unlabelled* test set and save
# ----------------------------------------------------------------------
model.eval()
with torch.no_grad():
    preds_test = model(features, adj)[idx_test].argmax(1).cpu().numpy()

np.save("gcn_test_predictions.npy", preds_test)
print(f"\n✓ Saved predictions for {len(idx_test)} test nodes to 'gcn_test_predictions.npy'")


  idx = torch.tensor([coo.row, coo.col], dtype=torch.long, device=device)


IndexError: index 2252 is out of bounds for dimension 0 with size 496

In [105]:
"""
run_gcn.py
==========

Train a 2‑layer GCN on the labelled nodes (496),
monitor a 10 % validation split, and predict the classes
of the 1 984 un‑labelled test nodes.

Outputs
-------
gcn_test_predictions.npy   (array of shape (1984,))
"""

import json, numpy as np, scipy.sparse as sp, torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split


# ----------------------------------------------------------------------
# 0.  Small helper: SciPy COO  ->  torch sparse tensor
# ----------------------------------------------------------------------
def coo_to_torch(coo, device=None, dtype=torch.float32):
    """Convert scipy.sparse.coo_matrix -> torch.sparse_coo_tensor"""
    coo = coo.tocoo()
    idx = torch.tensor([coo.row, coo.col], dtype=torch.long, device=device)
    val = torch.tensor(coo.data, dtype=dtype, device=device)
    return torch.sparse_coo_tensor(idx, val, coo.shape, device=device)


# ----------------------------------------------------------------------
# 1.  Load data
# ----------------------------------------------------------------------
with open("../data/data/splits.json") as fp:
    splits = json.load(fp)

idx_labelled = np.array(splits["idx_train"])   # 496 labelled node‑IDs
idx_test     = np.array(splits["idx_test"])    # 1 984 un‑labelled node‑IDs

features = np.load("../data/data/features.npy")             # (2480, 1390)
raw_labels = np.load("../data/data/labels.npy")             # (496,) – aligned with idx_labelled
adj_coo   = sp.load_npz("../data/data/adj.npz")             # sparse adjacency

# ----------------------------------------------------------------------
# 2.  Build full‑length label vector  (‑1 = unknown)
# ----------------------------------------------------------------------
num_nodes  = features.shape[0]
labels_all = -torch.ones(num_nodes, dtype=torch.long)  # starts with -1
labels_all[idx_labelled] = torch.as_tensor(raw_labels, dtype=torch.long)

# ----------------------------------------------------------------------
# 3.  Device + tensor conversions
# ----------------------------------------------------------------------
device   = torch.device("cuda" if torch.cuda.is_available() else "cpu")

features = torch.as_tensor(features, dtype=torch.float32, device=device)
labels_all = labels_all.to(device)
adj = coo_to_torch(adj_coo, device)

idx_test = torch.tensor(idx_test, dtype=torch.long, device=device)

# ----------------------------------------------------------------------
# 4.  Internal 90 %/10 % split of the *labelled* nodes
# ----------------------------------------------------------------------
pos = np.arange(len(idx_labelled))                     # 0 … 495
pos_tr, pos_val = train_test_split(
        pos,
        test_size=0.10,
        stratify=raw_labels,
        random_state=42,
)

idx_tr  = torch.as_tensor(idx_labelled[pos_tr] , dtype=torch.long, device=device)
idx_val = torch.as_tensor(idx_labelled[pos_val], dtype=torch.long, device=device)

# ----------------------------------------------------------------------
# 5.  GCN definition
# ----------------------------------------------------------------------
class GraphConvolution(torch.nn.Module):
    def __init__(self, in_feats, out_feats, bias=True):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(in_feats, out_feats) * 0.01)
        if bias:
            self.bias = torch.nn.Parameter(torch.zeros(out_feats))
        else:
            self.register_parameter("bias", None)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight)
        out = torch.sparse.mm(adj, support)
        if self.bias is not None:
            out = out + self.bias
        return out


class GCN(torch.nn.Module):
    def __init__(self, n_feat, n_hid, n_class, dropout=0.5):
        super().__init__()
        self.gc1 = GraphConvolution(n_feat, n_hid)
        self.gc2 = GraphConvolution(n_hid, n_class)
        self.dropout = dropout

    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = F.relu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)


model = GCN(features.size(1), 64, int(raw_labels.max()) + 1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# ----------------------------------------------------------------------
# 6.  Training loop
# ----------------------------------------------------------------------
for epoch in range(1, 201):
    # --- training step ---
    model.train()
    optimizer.zero_grad()
    logits = model(features, adj)
    loss_tr = F.nll_loss(logits[idx_tr], labels_all[idx_tr])
    loss_tr.backward()
    optimizer.step()

    # --- validation step ---
    if epoch % 10 == 0 or epoch == 1:
        model.eval()
        with torch.no_grad():
            val_logits = model(features, adj)[idx_val]
            val_loss = F.nll_loss(val_logits, labels_all[idx_val]).item()
            val_acc = (val_logits.argmax(1) == labels_all[idx_val]).float().mean().item()
        print(f"Epoch {epoch:03d} | train_loss {loss_tr.item():.4f} "
              f"| val_loss {val_loss:.4f} | val_acc {val_acc:.4f}")

# ----------------------------------------------------------------------
# 7.  Predict on the un‑labelled 1 984 test nodes
# ----------------------------------------------------------------------
model.eval()
with torch.no_grad():
    preds_test = model(features, adj)[idx_test].argmax(1).cpu().numpy()

np.save("gcn_test_predictions.npy", preds_test)
print(f"\n✓ Predictions saved to 'gcn_test_predictions.npy'  "
      f"(shape = {preds_test.shape})")


Epoch 001 | train_loss 1.9627 | val_loss 2.1397 | val_acc 0.2800
Epoch 010 | train_loss 0.8264 | val_loss 1.0054 | val_acc 0.7800
Epoch 020 | train_loss 0.2814 | val_loss 1.0291 | val_acc 0.8000
Epoch 030 | train_loss 0.2565 | val_loss 1.0718 | val_acc 0.8200
Epoch 040 | train_loss 0.1394 | val_loss 0.8482 | val_acc 0.8200
Epoch 050 | train_loss 0.0959 | val_loss 0.9690 | val_acc 0.7800
Epoch 060 | train_loss 0.0912 | val_loss 1.0337 | val_acc 0.8200
Epoch 070 | train_loss 0.0725 | val_loss 1.1590 | val_acc 0.7600
Epoch 080 | train_loss 0.0533 | val_loss 1.1740 | val_acc 0.7600
Epoch 090 | train_loss 0.0538 | val_loss 1.1502 | val_acc 0.7600
Epoch 100 | train_loss 0.0396 | val_loss 1.1006 | val_acc 0.7800
Epoch 110 | train_loss 0.0340 | val_loss 1.2348 | val_acc 0.7800
Epoch 120 | train_loss 0.0470 | val_loss 1.3502 | val_acc 0.7600
Epoch 130 | train_loss 0.0444 | val_loss 1.2822 | val_acc 0.7800
Epoch 140 | train_loss 0.0343 | val_loss 1.4092 | val_acc 0.7800
Epoch 150 | train_loss 0.

In [107]:
import json, numpy as np, pandas as pd

# 1) read the test‑node IDs (1 984 integers) -----------------------------
idx_test = np.array(json.load(open("../data/data/splits.json"))["idx_test"])

# 2) read the predictions you just saved --------------------------------
preds = np.load("gcn_test_predictions.npy")     # shape = (1984,)

assert len(idx_test) == len(preds), "ID / prediction length mismatch"

# 3) make a table and save it -------------------------------------------
df = pd.DataFrame(
        {"node_id": idx_test,
         "predicted_label": preds.astype(int)}
)
# Sort by node_id so the CSV is in natural order (optional)
df = df.sort_values("node_id", ignore_index=True)

df.to_csv("test_predictions.csv", index=False)
print("✓ Wrote", len(df), "rows to 'test_predictions.csv'")
print(df.head())


✓ Wrote 1984 rows to 'test_predictions.csv'
   node_id  predicted_label
0        0                5
1        4                2
2        7                6
3       10                5
4       11                6


In [108]:
df

Unnamed: 0,node_id,predicted_label
0,0,5
1,4,2
2,7,6
3,10,5
4,11,6
...,...,...
1979,2475,6
1980,2476,1
1981,2477,1
1982,2478,2


In [109]:
"""
run_gcn_plus.py
===============

Improved GCN with self‑loops, feature normalisation, wider hidden layer,
lower dropout and early‑stopping.  Produces:

    test_predictions.csv   (node_id, predicted_label)
"""

import json, numpy as np, scipy.sparse as sp, torch, time
import torch.nn.functional as F
from sklearn.model_selection import train_test_split


# --------------------------------------------------------------------- #
# 0.  Helper functions
# --------------------------------------------------------------------- #
def coo_to_torch(coo, device=None, dtype=torch.float32):
    coo = coo.tocoo()
    idx = torch.tensor([coo.row, coo.col], dtype=torch.long, device=device)
    val = torch.tensor(coo.data, dtype=dtype, device=device)
    return torch.sparse_coo_tensor(idx, val, coo.shape, device=device)

def normalize_adj(coo):
    """ Ā = D^‑½ (A + I) D^‑½   (symmetric) """
    coo = (coo + sp.eye(coo.shape[0], dtype=coo.dtype)).tocoo()
    deg = np.asarray(coo.sum(1)).flatten()
    deg_inv_sqrt = np.power(deg, -0.5);  deg_inv_sqrt[np.isinf(deg_inv_sqrt)] = 0
    D_inv = sp.diags(deg_inv_sqrt)
    return D_inv @ coo @ D_inv      # still sparse

# --------------------------------------------------------------------- #
# 1.  Load data
# --------------------------------------------------------------------- #
with open("../data/data/splits.json") as fp:
    splits = json.load(fp)

idx_labelled = np.array(splits["idx_train"])          # 496 labelled nodes
idx_test     = np.array(splits["idx_test"])           # 1 984 un‑labelled

features = np.load("../data/data/features.npy")                    # (2480, 1390)
raw_labels = np.load("../data/data/labels.npy")                    # (496,)
adj_raw   = sp.load_npz("../data/data/adj.npz")

num_nodes = features.shape[0]

# --------------------------------------------------------------------- #
# 2.  Pre‑processing
# --------------------------------------------------------------------- #
# -- adjacency --
adj_norm = normalize_adj(adj_raw)                     # +I and symmetric‑norm

# -- feature row L2 norm --
row_norm = np.linalg.norm(features, axis=1, keepdims=True)
row_norm[row_norm == 0] = 1
features = features / row_norm

# -- full label vector (‑1 = unknown) --
labels_all = -torch.ones(num_nodes, dtype=torch.long)
labels_all[idx_labelled] = torch.as_tensor(raw_labels, dtype=torch.long)

# --------------------------------------------------------------------- #
# 3.  Torch tensors + device
# --------------------------------------------------------------------- #
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

features = torch.as_tensor(features, dtype=torch.float32, device=device)
labels_all = labels_all.to(device)
adj = coo_to_torch(adj_norm, device)

idx_test = torch.tensor(idx_test, dtype=torch.long, device=device)

# train/val split inside the 496 nodes
pos = np.arange(len(idx_labelled))
pos_tr, pos_val = train_test_split(
        pos, test_size=0.10, stratify=raw_labels, random_state=42)
idx_tr  = torch.as_tensor(idx_labelled[pos_tr] , dtype=torch.long, device=device)
idx_val = torch.as_tensor(idx_labelled[pos_val], dtype=torch.long, device=device)

# --------------------------------------------------------------------- #
# 4.  Model
# --------------------------------------------------------------------- #
class GraphConvolution(torch.nn.Module):
    def __init__(self, in_feats, out_feats, bias=True):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.randn(in_feats, out_feats) * 0.01)
        self.bias   = torch.nn.Parameter(torch.zeros(out_feats)) if bias else None

    def forward(self, x, adj):
        support = torch.mm(x, self.weight)
        out = torch.sparse.mm(adj, support)
        return out + (self.bias if self.bias is not None else 0)

class GCN(torch.nn.Module):
    def __init__(self, n_feat, n_hid, n_class, dropout=0.4):
        super().__init__()
        self.gc1 = GraphConvolution(n_feat, n_hid)
        self.gc2 = GraphConvolution(n_hid, n_class)
        self.dropout = dropout

    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = F.relu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)

model = GCN(features.size(1), 128, int(raw_labels.max())+1).to(device)
optim = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# --------------------------------------------------------------------- #
# 5.  Training with early‑stopping
# --------------------------------------------------------------------- #
best_val   = float("inf")
best_epoch = 0
patience   = 20
start      = time.time()

for epoch in range(1, 501):                      # up to 500 epochs
    model.train()
    optim.zero_grad()
    out = model(features, adj)
    loss_tr = F.nll_loss(out[idx_tr], labels_all[idx_tr])
    loss_tr.backward();  optim.step()

    # ---- validation ----
    model.eval()
    with torch.no_grad():
        out_val  = model(features, adj)[idx_val]
        loss_val = F.nll_loss(out_val, labels_all[idx_val]).item()
        acc_val  = (out_val.argmax(1) == labels_all[idx_val]).float().mean().item()

    if loss_val < best_val - 1e-4:               # improvement margin
        best_val, best_epoch = loss_val, epoch
        best_state = {k: v.clone() for k, v in model.state_dict().items()}

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | tr_loss {loss_tr.item():.4f} "
              f"| val_loss {loss_val:.4f} | val_acc {acc_val:.4f}")

    # early stopping
    if epoch - best_epoch >= patience:
        print(f"> Early‑stopped at epoch {epoch}  (best epoch = {best_epoch})")
        break

print(f"Training time: {time.time()-start:.1f}s | best_val_loss = {best_val:.4f}")

# restore best weights
model.load_state_dict(best_state)

# --------------------------------------------------------------------- #
# 6.  Predict on the test nodes & save CSV
# --------------------------------------------------------------------- #
model.eval()
with torch.no_grad():
    preds = model(features, adj)[idx_test].argmax(1).cpu().numpy()

import pandas as pd
pd.DataFrame({"node_id": idx_test.cpu().numpy(),
              "predicted_label": preds}) \
  .sort_values("node_id") \
  .to_csv("test_predictions.csv", index=False)

print(f"\n✓ Saved predictions for {len(preds)} nodes to 'test_predictions.csv'")


Epoch 001 | tr_loss 1.9459 | val_loss 1.9322 | val_acc 0.2800
Epoch 010 | tr_loss 1.4794 | val_loss 1.4781 | val_acc 0.3400
Epoch 020 | tr_loss 0.7835 | val_loss 0.9474 | val_acc 0.7400
Epoch 030 | tr_loss 0.3994 | val_loss 0.6705 | val_acc 0.7600
Epoch 040 | tr_loss 0.2413 | val_loss 0.5655 | val_acc 0.8000
Epoch 050 | tr_loss 0.1678 | val_loss 0.5126 | val_acc 0.8200
Epoch 060 | tr_loss 0.1371 | val_loss 0.5155 | val_acc 0.8000
Epoch 070 | tr_loss 0.1305 | val_loss 0.5130 | val_acc 0.8200
Epoch 080 | tr_loss 0.1124 | val_loss 0.5297 | val_acc 0.8200
Epoch 090 | tr_loss 0.1069 | val_loss 0.5214 | val_acc 0.8400
> Early‑stopped at epoch 94  (best epoch = 74)
Training time: 8.6s | best_val_loss = 0.5012

✓ Saved predictions for 1984 nodes to 'test_predictions.csv'


In [118]:
import json, numpy as np, pandas as pd

# 1 ── read the 1 984 test‑node IDs in their original order
idx_test = np.array(json.load(open("../data/data/splits.json"))["idx_test"])

# 2 ── read the (sorted) predictions we just produced
pred_df = pd.read_csv("test_predictions.csv")          # 2 columns

# 3 ── map node_id -> label and reorder
pred_dict = dict(zip(pred_df.node_id, pred_df.predicted_label))
preds_ordered = np.array([pred_dict[node] for node in idx_test], dtype=int)

# 4 ── save BOTH formats
pd.DataFrame({"node_id": idx_test,
              "predicted_label": preds_ordered}).to_csv(
                  "test_predictions_original_order.csv", index=False)

np.save("gcn_test_predictions.npy", preds_ordered)

print("✓ wrote test_predictions_original_order.csv  (rows match idx_test order)")
print("✓ wrote gcn_test_predictions.npy             (shape =", preds_ordered.shape, ")")


✓ wrote test_predictions_original_order.csv  (rows match idx_test order)
✓ wrote gcn_test_predictions.npy             (shape = (1984,) )


In [121]:
pred_df

Unnamed: 0,node_id,predicted_label
0,0,5
1,4,2
2,7,6
3,10,5
4,11,6
...,...,...
1979,2475,6
1980,2476,1
1981,2477,1
1982,2478,2


In [123]:
import pandas as pd

pred_df = pd.read_csv("test_predictions.csv")   # the file you already have

# ── 1. tab‑separated text  (common choice) ────────────────────────────
pred_df['predicted_label'].to_csv("test_predictions.txt",
               sep="\t",        # tab character
               index=False,     # no row numbers
               header=True)     # keep column names
print("✓ wrote test_predictions.txt  (tab‑separated)")

# ── 2. space‑separated text (if you prefer) ───────────────────────────
# pred_df.to_csv("test_predictions.txt", sep=" ", index=False, header=False)

# ── 3. plain fixed‑width table (no delimiter, human‑readable) ─────────
# with open("test_predictions.txt", "w") as f:
#     f.write(pred_df.to_string(index=False))


✓ wrote test_predictions.txt  (tab‑separated)
