In [22]:
# Importing Libraries
import numpy as np
import pandas as pd
import scipy.sparse as sp
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [18]:
# Load the data
adj = sp.load_npz("../data/data/adj.npz")
features = np.load("../data/data/features.npy")
labels = np.load("../data/data/labels.npy")
with open("../data/data/splits.json") as f:
    splits = json.load(f)

In [19]:
print(f"Adjacency matrix shape: {adj.shape}")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Train set size: {len(splits['idx_train'])}, Test set size: {len(splits['idx_test'])}")

Adjacency matrix shape: (2480, 2480)
Features shape: (2480, 1390)
Labels shape: (496,)
Train set size: 496, Test set size: 1984


In [23]:
# Convert to NumPy array (in case it's a list)
idx_train = np.array(splits['idx_train'])
idx_test = np.array(splits['idx_test'])

# Extract training features
X_train_features = pd.DataFrame(features[idx_train]).reset_index(drop=True)
X_test_features = pd.DataFrame(features[idx_test]).reset_index(drop=True)

X_train_adj = pd.DataFrame(adj[idx_train]).reset_index(drop=True)
X_test_adj = pd.DataFrame(adj[idx_test]).reset_index(drop=True)

## GCN

In [5]:
import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import math

In [24]:
# Load the adjacency matrix (assumed stored as a sparse matrix)
adj = sp.load_npz('../data/data/adj.npz')[idx_train, :][:, idx_train]


# Load node features and labels (assumed to be in numpy array format)
features = np.load('../data/data/features.npy')[idx_train]
labels = np.load('../data/data/labels.npy')

# Load training and testing splits from splits.json
with open('../data/data/splits.json') as f:
    splits = json.load(f)
idx_train = splits['idx_train']
idx_test = splits['idx_test']

In [25]:
# Preprocessing: Add self-loops to the adjacency matrix and normalize it.
def normalize_adj(adj):
    """Symmetrically normalize the adjacency matrix."""
    adj = sp.coo_matrix(adj)
    # Add self-loops
    adj = adj + sp.eye(adj.shape[0])
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    # Handle inf entries (in case of isolated nodes)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()

adj_normalized = normalize_adj(adj)

In [26]:
# Convert the normalized sparse matrix to a torch sparse tensor.
def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)
    )
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape)

adj_normalized = sparse_mx_to_torch_sparse_tensor(adj_normalized)

In [27]:
# Convert features and labels to torch tensors
features = torch.FloatTensor(features)
labels = torch.LongTensor(labels)
idx_train = torch.LongTensor(idx_train)
idx_test = torch.LongTensor(idx_test)

### GCN Model

In [28]:
# Graph Convolution layer as described in Kipf & Welling (2017)
class GraphConvolution(nn.Module):
    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()
    
    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)
    
    def forward(self, input, adj):
        # Multiply input features by the weight matrix
        support = torch.mm(input, self.weight)
        # Perform sparse matrix multiplication with the normalized adjacency matrix
        output = torch.spmm(adj, support)
        if self.bias is not None:
            return output + self.bias
        else:
            return output

In [29]:
# Two-layer GCN model
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout):
        super(GCN, self).__init__()
        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dropout = dropout
    
    def forward(self, x, adj):
        x = self.gc1(x, adj)
        x = F.relu(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.gc2(x, adj)
        return F.log_softmax(x, dim=1)


In [30]:
# Initialize the model and optimizer
model = GCN(nfeat=features.shape[1],
            nhid=16,
            nclass=labels.max().item() + 1,
            dropout=0.5)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


In [46]:
# Training function
def train(epoch):
    model.train()
    optimizer.zero_grad()
    output = model(features, adj_normalized)
    loss_train = F.nll_loss(output, labels)
    loss_train.backward()
    optimizer.step()
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()))

In [49]:
# Testing function
def test():
    model.eval()
    output = model(features, adj_normalized)
    loss_test = F.nll_loss(output, labels)
    preds = output.max(1)[1]
    correct = preds.eq(labels).sum().item()
    acc_test = correct / len(idx_train)
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test))

In [50]:
# Train the model for 200 epochs and evaluate on the test set
for epoch in range(200):
    train(epoch)
test()

Epoch: 0001 loss_train: 0.1231
Epoch: 0002 loss_train: 0.1576
Epoch: 0003 loss_train: 0.1181
Epoch: 0004 loss_train: 0.1263
Epoch: 0005 loss_train: 0.1335
Epoch: 0006 loss_train: 0.1238
Epoch: 0007 loss_train: 0.1285
Epoch: 0008 loss_train: 0.1308
Epoch: 0009 loss_train: 0.1309
Epoch: 0010 loss_train: 0.1237
Epoch: 0011 loss_train: 0.1307
Epoch: 0012 loss_train: 0.1089
Epoch: 0013 loss_train: 0.1342
Epoch: 0014 loss_train: 0.1642
Epoch: 0015 loss_train: 0.1455
Epoch: 0016 loss_train: 0.1257
Epoch: 0017 loss_train: 0.1231
Epoch: 0018 loss_train: 0.1224
Epoch: 0019 loss_train: 0.1307
Epoch: 0020 loss_train: 0.1293
Epoch: 0021 loss_train: 0.1397
Epoch: 0022 loss_train: 0.1266
Epoch: 0023 loss_train: 0.1280
Epoch: 0024 loss_train: 0.1167
Epoch: 0025 loss_train: 0.1083
Epoch: 0026 loss_train: 0.1231
Epoch: 0027 loss_train: 0.1410
Epoch: 0028 loss_train: 0.1502
Epoch: 0029 loss_train: 0.1332
Epoch: 0030 loss_train: 0.1286
Epoch: 0031 loss_train: 0.1380
Epoch: 0032 loss_train: 0.1470
Epoch: 0

In [53]:
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

# 1. Create a list of indices for all nodes
num_nodes = features.shape[0]  # total number of nodes (e.g., 496)
all_indices = np.arange(num_nodes)

# 2. Perform an 80/20 train-test split using scikit-learn
idx_train, idx_test = train_test_split(all_indices, test_size=0.2, random_state=42)

# Optionally convert indices to a PyTorch tensor if needed:
idx_train = torch.tensor(idx_train, dtype=torch.long)
idx_test = torch.tensor(idx_test, dtype=torch.long)

# 3. Training function (uses global idx_train)
def train(epoch):
    model.train()                       # Set model to training mode
    optimizer.zero_grad()               # Clear gradients
    output = model(features, adj_normalized)  # Forward pass
    # Compute training loss only on the training indices
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    loss_train.backward()               # Backward pass: compute gradients
    optimizer.step()                    # Update model parameters

    print('Epoch: {:04d}'.format(epoch + 1),
          'loss_train: {:.4f}'.format(loss_train.item()))

# 4. Testing/evaluation function (uses global idx_test)
def test():
    model.eval()                        # Set model to evaluation mode
    with torch.no_grad():
        output = model(features, adj_normalized)
        loss_test = F.nll_loss(output[idx_test], labels[idx_test])
        # Compute predictions for test nodes
        pred = output[idx_test].max(1)[1]
        pred_train= output[idx_train].max(1)[1]
        # Calculate accuracy on test set
        correct = pred.eq(labels[idx_test]).sum().item()
        accuracy = correct / len(idx_test)
        correct_train= pred_train.eq(labels[idx_train]).sum().item()
        accuracy_train = correct_train / len(idx_train)
    
    print("Test set results:",
          "Loss: {:.4f}".format(loss_test.item()),
          "Accuracy: {:.4f}".format(accuracy),
         "Accuracy_train: {:.4f}".format(accuracy_train))

# 5. Training loop
num_epochs = 200  # Adjust the number of epochs as needed
for epoch in range(num_epochs):
    train(epoch)

# 6. Evaluate on the test set after training
test()


Epoch: 0001 loss_train: 0.0931
Epoch: 0002 loss_train: 0.1124
Epoch: 0003 loss_train: 0.1205
Epoch: 0004 loss_train: 0.1111
Epoch: 0005 loss_train: 0.1252
Epoch: 0006 loss_train: 0.1277
Epoch: 0007 loss_train: 0.1142
Epoch: 0008 loss_train: 0.1261
Epoch: 0009 loss_train: 0.0958
Epoch: 0010 loss_train: 0.1056
Epoch: 0011 loss_train: 0.1379
Epoch: 0012 loss_train: 0.1020
Epoch: 0013 loss_train: 0.0941
Epoch: 0014 loss_train: 0.0985
Epoch: 0015 loss_train: 0.1196
Epoch: 0016 loss_train: 0.1026
Epoch: 0017 loss_train: 0.0860
Epoch: 0018 loss_train: 0.1006
Epoch: 0019 loss_train: 0.1201
Epoch: 0020 loss_train: 0.1183
Epoch: 0021 loss_train: 0.1141
Epoch: 0022 loss_train: 0.1028
Epoch: 0023 loss_train: 0.1026
Epoch: 0024 loss_train: 0.0927
Epoch: 0025 loss_train: 0.1169
Epoch: 0026 loss_train: 0.0990
Epoch: 0027 loss_train: 0.1384
Epoch: 0028 loss_train: 0.1252
Epoch: 0029 loss_train: 0.1205
Epoch: 0030 loss_train: 0.1143
Epoch: 0031 loss_train: 0.1167
Epoch: 0032 loss_train: 0.1006
Epoch: 0