In [1]:
# Import torch & Check CUDA availability
import torch

# Query availability once; the last call below depends on it.
cuda_available = torch.cuda.is_available()
print(cuda_available)
print(torch.cuda.device_count())
# current_device() initializes CUDA and raises on a machine without a usable GPU,
# so only ask for it when CUDA is available (prints None otherwise).
print(torch.cuda.current_device() if cuda_available else None)

True
1
0


In [2]:
# Get CUDA device name
# NOTE: printing torch.cuda.device(0) only shows an object repr
# (`<torch.cuda.device object at 0x...>`), not a description of the GPU.
print(torch.cuda.device(0))
# Human-readable name of GPU index 0.
print(torch.cuda.get_device_name(0))

<torch.cuda.device object at 0x7f3521366ed0>
NVIDIA A30


#### Import Reddit

In [3]:
from torch_geometric.datasets import Reddit
import torch_geometric.transforms as T

# Import dataset from PyTorch Geometric
# NOTE(review): the root is a hard-coded absolute path; it must exist (or be writable) on the machine running this cell.
# The ToSparseTensor transform is applied when the graph is accessed; the graph printed later carries an `adj_t` attribute.
dataset = Reddit(root="/dfs6/pub/seminl1/Reddit", transform=T.ToSparseTensor())
# The dataset holds a single graph, so index 0 is the whole Reddit graph.
data = dataset[0]

# Store the dataset to GPU
# Pin host memory first, then request a non-blocking copy to GPU 0.
# NOTE(review): 'cuda:0' is hard-coded here, so this cell requires a CUDA device.
data = data.pin_memory()
data = data.to('cuda:0', non_blocking=True)

# Print information about the dataset
print(f'Dataset: {dataset}')
print('-------------------')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of nodes: {data.x.shape[0]}')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# Print information about the graph
print(f'\nGraph:')
print('------')
print(f'Edges are directed: {data.is_directed()}')
print(f'Graph has isolated nodes: {data.has_isolated_nodes()}')
print(f'Graph has loops: {data.has_self_loops()}')

Dataset: Reddit()
-------------------
Number of graphs: 1
Number of nodes: 232965
Number of features: 602
Number of classes: 41

Graph:
------
Edges are directed: False
Graph has isolated nodes: False
Graph has loops: False


#### Graph Information

In [4]:
# Check whether the graph object reports itself as stored on a CUDA device
# (it was moved with `.to('cuda:0', ...)` when the dataset was loaded).
print(f'Graph is stored on the GPU: {data.is_cuda}')

Graph is stored on the GPU: True


In [5]:
# Print a summary of the graph object (`dataset[0]`): its attribute names and their shapes
print(f'Graph: {data}')

Graph: Data(x=[232965, 602], y=[232965], train_mask=[232965], val_mask=[232965], test_mask=[232965], adj_t=[232965, 232965, nnz=114615892])


In [6]:
# Node feature matrix information: one row per node, one column per feature
print(f'x = {data.x.shape}')
# Printing the tensor shows an abbreviated view plus the device it lives on.
print(data.x)

x = torch.Size([232965, 602])
tensor([[ 1.2334,  9.0430, -0.9233,  ..., -0.2579,  0.3112, -0.3772],
        [-0.1386, -0.2022,  0.1277,  ...,  0.1563,  0.1048, -0.6534],
        [-0.1330, -0.1962, -0.0296,  ...,  0.0358,  0.2864,  0.2744],
        ...,
        [-0.0614, -0.2022,  0.9698,  ...,  1.1064, -1.4323, -0.2398],
        [-0.1606, -0.2022, -0.0892,  ...,  0.7440, -0.5046, -2.2288],
        [ 0.0929,  0.2822,  0.1768,  ...,  0.2196,  0.5967,  0.5588]],
       device='cuda:0')


In [7]:
# Sparse adjacency matrix (`adj_t`) holding the graph's edges; this is the structure passed to the GCN layer
print(data.adj_t)

SparseTensor(row=tensor([     0,      0,      0,  ..., 232964, 232964, 232964], device='cuda:0'),
             col=tensor([   242,    249,    524,  ..., 231806, 232594, 232634], device='cuda:0'),
             size=(232965, 232965), nnz=114615892, density=0.21%)


In [8]:
# Ground-truth labels: one integer label per node
print(f'y = {data.y.shape}')
print(data.y)

y = torch.Size([232965])
tensor([30, 17, 18,  ...,  3, 13, 13], device='cuda:0')


In [9]:
# Train mask: one boolean per node; used later to select the nodes that contribute to the training loss
print(f'train_mask = {data.train_mask.shape}')
print(data.train_mask)

train_mask = torch.Size([232965])
tensor([False,  True, False,  ...,  True,  True, False], device='cuda:0')


#### Single-layer GCN

In [10]:
# Create a simple GCN with only one GCN layer
import torch.nn.functional as F

from torch.nn import Linear
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Single-layer GCN classifier that carries its own Adam optimizer.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            ``dataset.num_features`` of the notebook-level ``dataset``.
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
        lr: Learning rate for the Adam optimizer.
        weight_decay: Weight decay for the Adam optimizer.

    Attributes are set in ``__init__``: ``gcn1`` is the only layer and
    ``optimizer`` is an Adam optimizer over this module's parameters.
    """

    def __init__(self, in_channels=None, out_channels=None, lr=0.01, weight_decay=5e-4):
        super().__init__()
        # Fall back to the notebook-level `dataset` so existing `GCN()` calls keep working.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.gcn1 = GCNConv(in_channels, out_channels)
        # Built after the layer is assigned so self.parameters() already includes its weights.
        self.optimizer = torch.optim.Adam(self.parameters(),
                                          lr=lr,
                                          weight_decay=weight_decay)

    def forward(self, x, adj_t):
        """Run the GCN layer and return both its raw output and log-probabilities.

        Args:
            x: Node feature matrix.
            adj_t: Adjacency structure passed straight to the GCN layer.

        Returns:
            A tuple ``(x, z)`` where ``x`` is the layer output and ``z`` is
            ``log_softmax(x)`` taken over dimension 1.
        """
        x = self.gcn1(x, adj_t)
        z = F.log_softmax(x, dim=1)
        return x, z

#### Training with mini-batches (neighbor sampling)

In [21]:
from pynvml import *

def accuracy(pred_y, y):
    """Return the fraction of entries in `pred_y` that equal the matching entry in `y`."""
    hits = (pred_y == y).sum()
    total = len(y)
    return (hits / total).item()

# PCIe Throughput Check
# Initialize NVML and fetch a handle for GPU index 0; train() reads the
# module-level `handle` when it samples PCIe throughput.
# NOTE(review): nvmlShutdown() is never called in the visible cells.
nvmlInit()
gpu=0
handle=nvmlDeviceGetHandleByIndex(gpu)
# Per-epoch timings appended by train(); a later cell multiplies their sum by 0.001 to report seconds.
elapsed_time = []
def _print_pcie_throughput(nvml_handle):
    """Print one TX and one RX PCIe throughput sample for the given NVML device handle."""
    tx_byte = nvmlDeviceGetPcieThroughput(nvml_handle, NVML_PCIE_UTIL_TX_BYTES)
    print('TX:', tx_byte, 'KB/s')
    rx_byte = nvmlDeviceGetPcieThroughput(nvml_handle, NVML_PCIE_UTIL_RX_BYTES)
    print('RX:', rx_byte, 'KB/s')


def train(model, data, train_loader, device, epochs=100, log_pcie=True):
    """Time mini-batch forward passes of a GNN model, epoch by epoch.

    NOTE: despite its name, this loop computes no loss and runs no backward
    pass, so ``optimizer.step()`` has no gradients to apply and the model
    weights are not trained here. It measures loading, transfer and the
    forward pass only.

    Args:
        model: Module exposing an ``optimizer`` attribute; called as
            ``model(batch.x, batch.adj_t)`` and expected to return two values.
        data: Unused; batches come from ``train_loader``. Kept so existing
            callers do not break.
        train_loader: Iterable yielding batches with ``to(device)``, ``x`` and
            ``adj_t``.
        device: Device each batch is moved to.
        epochs: The loop runs ``epochs + 1`` passes (0..epochs inclusive).
        log_pcie: When True, print PCIe TX/RX throughput before the transfer,
            after the transfer and after the forward pass of every batch.
            These NVML queries and prints happen inside the timed region, so
            they inflate the recorded epoch time; pass False for clean timings.

    Returns:
        ``(model, h, out)`` where ``h`` and ``out`` are the two outputs of the
        last forward pass, or ``None`` for both if the loader yielded no batch.

    Side effects:
        Appends one elapsed time per epoch to the module-level ``elapsed_time``
        list and reads the module-level NVML ``handle`` when ``log_pcie`` is True.
    """
    optimizer = model.optimizer

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Bound up front so the return below cannot hit an unbound name when the
    # loader yields no batches.
    h, out = None, None

    model.train()
    for epoch in range(epochs + 1):
        start.record()
        # Run every batch of the loader once per epoch
        for batch in train_loader:
            if log_pcie:
                _print_pcie_throughput(handle)
            batch = batch.to(device)
            if log_pcie:
                _print_pcie_throughput(handle)
            optimizer.zero_grad()
            h, out = model(batch.x, batch.adj_t)
            if log_pcie:
                _print_pcie_throughput(handle)
            # No backward() is called in this loop, so no gradients are
            # produced; the step is kept to preserve the per-batch work.
            optimizer.step()
        end.record()
        # Wait for the GPU to reach the end event before reading the timer.
        torch.cuda.synchronize()
        elapsed_time.append(start.elapsed_time(end))

        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print('Epoch:', epoch)

    return model, h, out

In [22]:
from torch_geometric.loader import NeighborLoader

# Create GCN model
gcn = GCN()
print(gcn)
print()

# Set device as GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create batches with neighbor sampling
# num_neighbors=[-1]: one sampling hop; -1 presumably means "take all neighbors" — TODO confirm against the NeighborLoader docs.
# NOTE(review): `data` was moved to the GPU in an earlier cell while the loader uses 8 worker processes;
# confirm that sampling from GPU-resident data with workers is supported and intended.
# NOTE(review): no input_nodes/shuffle arguments are passed, so the loader's defaults decide the seed nodes and their order.
train_loader = NeighborLoader(
    data,
    num_neighbors=[-1],
    batch_size=2048,
    num_workers=8,
    prefetch_factor=1,
)

# Train
# train() takes its batches from train_loader; the `data.to(device)` argument is not read inside train().
gcn_model, gcn_output, final_output = train(gcn.to(device), data.to(device), train_loader, device)

GCN(
  (gcn1): GCNConv(602, 41)
)

TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB/s
TX: 0 KB/s
RX: 0 KB

KeyboardInterrupt: 

In [13]:
# Total the per-epoch timings, skipping entry 0 (presumably treated as a warm-up epoch).
# Slicing instead of indexing 1..100 avoids an IndexError when fewer than 101
# epochs were recorded, e.g. after training was interrupted.
timed_epochs = elapsed_time[1:101]
final_elapsed_time = 0.00
for epoch_ms in timed_epochs:
    final_elapsed_time = final_elapsed_time + epoch_ms
# Timings are in milliseconds; report seconds and the number of epochs actually summed.
print(f'Elapsed Time ({len(timed_epochs)} Epochs):', final_elapsed_time*0.001, 'seconds')

Elapsed Time (100 Epochs): 192.92786071777346 seconds


In [14]:
# Show which device the training cell selected
print(device)

cuda


#### Training without mini-batches (full-batch)

In [11]:
def accuracy(pred_y, y):
    """Calculate accuracy: matching predictions divided by len(y), as a Python number."""
    num_correct = (pred_y == y).sum()
    ratio = num_correct / len(y)
    return ratio.item()

# CUDA timing events shared with train(), which records them around each forward pass.
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
# Per-epoch forward-pass durations appended by train(); rebinding the name here starts from an empty list.
elapsed_time = []
def train(model, data):
    """Run full-batch training of a GNN model and return it with its last outputs.

    Every epoch does one forward pass over the whole graph, computes the
    cross-entropy loss on the training-mask nodes, backpropagates and steps
    the model's own optimizer. Only the forward pass is timed; each duration
    is appended to the module-level ``elapsed_time`` list using the
    module-level ``start``/``end`` CUDA events.

    Returns:
        ``(model, h, out)`` where ``h`` and ``out`` are the two outputs of the
        final forward pass.
    """
    loss_fn = torch.nn.CrossEntropyLoss()
    opt = model.optimizer
    num_epochs = 100

    model.train()
    # num_epochs + 1 passes: epochs 0..num_epochs inclusive
    for epoch in range(num_epochs + 1):
        opt.zero_grad()

        # Timed region: forward pass only
        start.record()
        h, out = model(data.x, data.adj_t)
        end.record()
        torch.cuda.synchronize()
        elapsed_time.append(start.elapsed_time(end))

        # Loss and accuracy on the training nodes
        mask = data.train_mask
        train_out = out[mask]
        train_y = data.y[mask]
        loss = loss_fn(train_out, train_y)
        acc = accuracy(train_out.argmax(dim=1), train_y)
        loss.backward()
        opt.step()

        # Report metrics on every 10th epoch
        if epoch % 10 == 0:
            print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: {acc*100:>6.2f}%')

    return model, h, out

In [12]:
# Create GCN model
gcn = GCN()
print(gcn)
print()

# Train on the full graph (no separate test/evaluation step runs in this cell)
# Use the GPU when available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# train() returns the model plus the two outputs of its final forward pass
gcn_model, gcn_output, final_output = train(gcn.to(device), data.to(device))

GCN(
  (gcn1): GCNConv(602, 41)
)

Epoch   0 | Train Loss: 3.883 | Train Acc:   2.07%
Epoch  10 | Train Loss: 0.901 | Train Acc:  85.97%
Epoch  20 | Train Loss: 0.560 | Train Acc:  90.98%
Epoch  30 | Train Loss: 0.460 | Train Acc:  92.12%
Epoch  40 | Train Loss: 0.417 | Train Acc:  92.69%
Epoch  50 | Train Loss: 0.396 | Train Acc:  93.08%
Epoch  60 | Train Loss: 0.385 | Train Acc:  93.31%
Epoch  70 | Train Loss: 0.378 | Train Acc:  93.44%
Epoch  80 | Train Loss: 0.374 | Train Acc:  93.52%
Epoch  90 | Train Loss: 0.370 | Train Acc:  93.60%
Epoch 100 | Train Loss: 0.367 | Train Acc:  93.64%


In [13]:
# Total the per-epoch timings, skipping entry 0 (presumably treated as a warm-up epoch).
# Slicing instead of indexing 1..100 avoids an IndexError when fewer than 101
# epochs were recorded, e.g. after training was interrupted.
timed_epochs = elapsed_time[1:101]
final_elapsed_time = 0.00
for epoch_ms in timed_epochs:
    final_elapsed_time = final_elapsed_time + epoch_ms
# Timings are in milliseconds; report seconds and the number of epochs actually summed.
print(f'Elapsed Time ({len(timed_epochs)} Epochs):', final_elapsed_time*0.001, 'seconds')

Elapsed Time (100 Epochs): 26.311081634521486 seconds


In [14]:
# Show which device the training cell selected
print(device)

cuda
