In [1]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv, GINConv, SAGEConv, SGConv
from torch import  nn

from torch_geometric.loader import NeighborSampler
from tqdm import tqdm

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score

import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import pickle
import matplotlib.pyplot as plt
from torch import Tensor
from torch_sparse import SparseTensor
from typing import Union
import torch_geometric.transforms as T

In [2]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load "extended_graphs_data.pkl" when it comes from a trusted source.
with open("extended_graphs_data.pkl", "rb") as file:
    # The context manager closes the handle on exit, so no explicit close() is needed.
    data = pickle.load(file)

In [3]:
# Print the loaded object's summary (attribute names together with their sizes).
print(data)

Data(x=[579157, 51], edge_index=[2, 335118], edge_attr=[335118], y=[579157], train_mask=[231662], valid_mask=[154442], test_mask=[193053], edge_direct=[335118])


In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [5]:
# Collect the per-split node selectors under their split names. This happens
# before `data` is moved, so `split_idx` references the tensors as loaded.
# NOTE(review): despite the `_mask` suffix these look like index tensors rather
# than boolean masks (their sizes differ from the node count) - confirm.
split_idx = dict(
    train=data.train_mask,
    valid=data.valid_mask,
    test=data.test_mask,
)
data = data.to(device)
# Training selector placed on the compute device for use by the training step.
train_idx = split_idx['train'].to(device)

In [6]:
# Count the nodes labelled 1: over the whole graph, then within the training
# and validation splits.
print(np.count_nonzero(data.y.cpu().numpy() == 1))
print(np.count_nonzero(data.y[data.train_mask].cpu().numpy() == 1))
print(np.count_nonzero(data.y[data.valid_mask].cpu().numpy() == 1))

4492
2671
1821


In [7]:
class GNN(torch.nn.Module):
    """SAGEConv encoder followed by a small MLP classification head.

    The encoder holds one input convolution (``in_channels -> hidden_channels``)
    plus ``num_layers - 2`` hidden convolutions
    (``hidden_channels -> hidden_channels``). Every convolution is followed by
    optional batch normalisation, ReLU and dropout. The head
    (``self.regression``) maps the hidden representation to ``out_channels``
    scores, which are returned as log-probabilities.

    Args:
        in_channels: Number of input features per node.
        hidden_channels: Width of every hidden representation.
        out_channels: Number of output classes.
        num_layers: Nominal depth; the final layer slot is taken by the MLP
            head, so ``num_layers - 2`` hidden convolutions are created after
            the input convolution.
        dropout: Dropout probability applied after every convolution.
        batchnorm: Whether a ``BatchNorm1d`` follows every convolution.
    """

    def __init__(self
                 , in_channels
                 , hidden_channels
                 , out_channels
                 , num_layers
                 , dropout
                 , batchnorm=True):
        super().__init__()

        myConv = SAGEConv

        # NOTE(review): `cached=True` is forwarded to the convolution unchanged;
        # confirm that the installed SAGEConv accepts this keyword.
        self.convs = torch.nn.ModuleList()
        self.convs.append(myConv(in_channels, hidden_channels, cached=True))
        self.batchnorm = batchnorm
        if self.batchnorm:
            self.bns = torch.nn.ModuleList()
            self.bns.append(torch.nn.BatchNorm1d(hidden_channels))
        for _ in range(num_layers - 2):
            self.convs.append(
                myConv(hidden_channels, hidden_channels, cached=True))
            if self.batchnorm:
                self.bns.append(torch.nn.BatchNorm1d(hidden_channels))

        # MLP head used in place of a final output convolution.
        self.regression = nn.Sequential(
            nn.Linear(hidden_channels, 32),
            nn.Dropout(0.1),
            nn.ELU(),
            nn.Linear(32, out_channels),
        )

        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialise every learnable sub-module, including the MLP head."""
        for conv in self.convs:
            conv.reset_parameters()
        if self.batchnorm:
            for bn in self.bns:
                bn.reset_parameters()
        # Only the Linear layers of the head carry parameters; Dropout and ELU
        # expose no reset_parameters and are skipped.
        for layer in self.regression:
            if hasattr(layer, 'reset_parameters'):
                layer.reset_parameters()

    def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
        """Return per-node log-probabilities.

        Args:
            x: Node feature matrix whose last dimension is ``in_channels``.
            edge_index: Graph connectivity handed to every convolution.

        Returns:
            Tensor whose last dimension is ``out_channels``, normalised with
            ``log_softmax``.
        """
        # Every convolution is a hidden layer now that the head produces the
        # output, so none of them may be skipped: iterating over `convs[:-1]`
        # would leave the last convolution (and its batch norm) unused and,
        # with a single convolution, would feed raw features into the head.
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if self.batchnorm:
                x = self.bns[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.regression(x)

        return x.log_softmax(dim=-1)

In [8]:
# Public import path; the `torch_geometric.utils.subgraph` module path is an
# implementation detail that is not importable in every release.
from torch_geometric.utils import k_hop_subgraph


def train(model, data, train_idx, optimizer, num_hops=5):
    """Run one optimisation step on the training nodes.

    The model is evaluated on the ``num_hops``-hop neighbourhood of the
    training nodes only, then the NLL loss over the training nodes is
    back-propagated.

    Args:
        model: Module called as ``model(x, edge_index)`` that returns per-node
            log-probabilities.
        data: Graph object exposing ``x``, ``edge_index`` and ``y`` (labels of
            shape ``(N,)``).
        train_idx: Node indices of the training nodes.
        optimizer: Optimizer holding the model parameters.
        num_hops: Neighbourhood radius used to extract the training subgraph.

    Returns:
        The training loss of this step as a Python float.
    """
    model.train()

    optimizer.zero_grad()

    # `subset` lists the nodes kept in the subgraph, `sub_edge_index` is the
    # connectivity re-indexed into that subset, and `mapping` gives the row of
    # each entry of `train_idx` inside the subset.
    subset, sub_edge_index, mapping, _ = k_hop_subgraph(
        train_idx, num_hops, data.edge_index,
        relabel_nodes=True, num_nodes=data.x.size(0),
    )

    # Features and edges must come from the same (relabelled) subgraph, and the
    # training rows are selected through `mapping`; mixing subset features with
    # the full-graph edge index or global node ids indexes out of bounds.
    out = model(data.x[subset], sub_edge_index)[mapping]

    loss = F.nll_loss(out, data.y[train_idx])
    loss.backward()
    optimizer.step()

    return loss.item()

In [9]:
@torch.no_grad()
def test(model, data, split_idx):
    # data.y is labels of shape (N, )
    model.eval()
    
    out = model(data.x, data.edge_index)
        
    y_pred = out.exp()  # (N,num_classes)
    
    losses = dict()
    for key in ['train', 'valid']:
        node_id = split_idx[key]
        losses[key] = F.nll_loss(out[node_id], data.y[node_id]).item()
            
    return losses, y_pred

In [10]:
# Hyperparameters: 'lr' and 'l2' configure the optimizer, the remaining entries
# are forwarded to the GNN constructor.
parameters = {
    'lr': 0.001,
    'num_layers': 3,
    'hidden_channels': 256,
    'dropout': 0.1,
    'batchnorm': True,
    'l2': 1e-7,
}
# Number of iterations of the training loop.
epochs = 50

In [11]:
# `para_dict` aliases the full hyperparameter dict; `model_para` is the subset
# forwarded to the GNN constructor (everything except the optimizer settings).
para_dict = parameters
model_para = {key: value for key, value in parameters.items() if key not in ('lr', 'l2')}
model = GNN(in_channels=data.x.size(-1), out_channels=2, **model_para).to(device)

model.reset_parameters()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=para_dict['lr'],
    weight_decay=para_dict['l2'],
)
# Lowest validation loss seen so far and the file that receives its weights.
min_valid_loss = 1000000.0
best_model_path = 'model.pth'

# Per-epoch loss history.
train_losses, valid_losses = [], []

print(f'Model  initialized')

for epoch in range(1, epochs+1):
    # One optimisation step, then a gradient-free evaluation of both splits.
    loss = train(model, data, train_idx, optimizer)
    losses, out = test(model, data, split_idx)
    train_loss = losses['train']
    valid_loss = losses['valid']

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    # Checkpoint whenever the validation loss reaches a new minimum.
    if min_valid_loss > valid_loss:
        min_valid_loss = valid_loss
        torch.save(model.state_dict(), best_model_path)

    print(f'Epoch: {epoch:}, '
        f'Loss: {loss:.6f}, '
        f'Train: {train_loss:.6f}, '
        f'Valid: {valid_loss:.6f}')

Model  initialized


RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
# Build the x-axis from the recorded history rather than from `epochs`, so the
# curves still plot when training stopped before completing every epoch
# (plotting `epochs` x-values against a shorter history raises a length error).
epochs_run = range(1, len(train_losses) + 1)
plt.plot(epochs_run, train_losses, label='Train Loss')
plt.plot(epochs_run, valid_losses, label='Valid Loss')


plt.title('Loss Variation Over Epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')

plt.legend()
plt.show()

In [None]:
@torch.no_grad()
def test2(model, data):
    # data.y is labels of shape (N, )
    model.eval()
    
    out = model(data.x, data.edge_index)
        
    y_pred = out.exp()  # (N,num_classes)
            
    return y_pred

In [None]:
# Rebuild the architecture and restore the weights stored at `best_model_path`.
model = GNN(in_channels = data.x.size(-1), out_channels = 2, **model_para)
# `map_location` remaps the stored tensors onto the current device, so a
# checkpoint written from a GPU session still loads when that GPU is absent.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model = model.to(device)

out = test2(model, data)

# Extract predictions for training and validation sets
preds_train, preds_valid = out[data.train_mask].cpu().numpy(), out[data.valid_mask].cpu().numpy()
y_train, y_valid = data.y[data.train_mask].cpu().numpy(), data.y[data.valid_mask].cpu().numpy()

# Threshold the class-1 probability to obtain binary values (0 or 1)
threshold = 0.5
binary_preds_train = (preds_train[:, 1] > threshold).astype(int)
binary_preds_valid = (preds_valid[:, 1] > threshold).astype(int)

# Compute accuracy
accuracy_train = accuracy_score(y_train, binary_preds_train)
accuracy_valid = accuracy_score(y_valid, binary_preds_valid)

# Compute ROC AUC from the class-1 probability
train_auc = roc_auc_score(y_train, preds_train[:, 1])
valid_auc = roc_auc_score(y_valid, preds_valid[:, 1])

# Print the results
print('Train Accuracy:', accuracy_train)
print('Valid Accuracy:', accuracy_valid)
print('Train ROC AUC:', train_auc)
print('Valid ROC AUC:', valid_auc)
# Reference counts - these appear to be (split size, nodes not labelled 1):
# train: 231662 228991
# valid: 154442 152621