In [1]:
import sys
import torch
from pathlib import Path
from platform import system
sys.path.append(str(Path.home() / "orguel_ml"))

system = system()

if system == 'Windows': dataset_path = "D:\\ml\\graph_dataset.pt"
elif system == 'Linux': dataset_path = "/media/rafael/HD/ml/graph_dataset.pt"

dataset = torch.load(dataset_path)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Graphs loaded: {len(dataset)} / OS: {system} / device: {device}")

  dataset = torch.load(dataset_path)


Graphs loaded: 1 / OS: Linux / device: cuda


In [None]:
import orguel_ml
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader
from torch.utils.tensorboard import SummaryWriter

# Setup
test_size = 0.1
batch_size = 1
learning_rate = 0.0025
weight_decay = 1e-4
n_targets = 7
smoothing_exp = 0.2
label_smoothing = 0.1
epochs = 15

# split the dataset:
training_dataset, validation_dataset = train_test_split(dataset, test_size=test_size, shuffle=True, random_state=42)

# DataLoaders
training_loader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)

# degree histogram from *training* graphs only
pna_degree = orguel_ml.compute_pna_degree(training_dataset)
model = orguel_ml.GraphNeuralNetwork(pna_degree).to(device)

# Optimizer, scheduler, loss
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-5, threshold=1e-5, verbose=True)

class_weights, label_counts = orguel_ml.balance_class_weights(training_dataset, n_targets, smoothing_exp, device)
criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=label_smoothing)

# TensorBoard writer
writer = SummaryWriter(log_dir="TensorBoard")

# Train/Eval loops
_is_device_cuda = True if device.type=='cuda' else False
scaler = torch.amp.GradScaler(device.type, enabled=_is_device_cuda) # mixed precision for speed

def run_epoch(loader, training=False):
    
    model.train() if training else model.eval()
    
    n_batches = len(loader)
    total_loss, correct, total_nodes = 0., 0, 0
    
    with torch.enable_grad() if training else torch.inference_mode():
        for batch in loader:
            batch = batch.to(device)
            
            with torch.amp.autocast(device.type, enabled=_is_device_cuda):
                logits = model(batch); loss = criterion(logits, batch.y)
            
            if training:
                optimizer.zero_grad(set_to_none=True)
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
            
            total_loss += loss.item()
            predicted = logits.argmax(dim=1)
            correct += (predicted==batch.y).sum().item()
            total_nodes += batch.num_nodes
    
    average_loss = total_loss / n_batches
    accuracy = correct / total_nodes
    
    return average_loss, accuracy

print(f"label counts: {dict(sorted(label_counts.items()))}")

In [None]:
# Training/Validation loop
for epoch in range(1, epochs):
    training_loss, training_accuracy = run_epoch(training_loader, training=True)
    validation_loss, validation_accuracy = run_epoch(validation_loader)
    
    scheduler.step(validation_loss)  # ReduceLROnPlateau on validation loss
    learning_rate = optimizer.param_groups[0]['lr']
    
    # Logs
    writer.add_scalar("Loss/train", training_loss,       epoch)
    writer.add_scalar("Loss/val",   validation_loss,     epoch)
    writer.add_scalar("Acc/train",  training_accuracy,   epoch)
    writer.add_scalar("Acc/val",    validation_accuracy, epoch)
    writer.add_scalar("LR",         learning_rate,       epoch)
    
    print(f"Epoch {epoch:03d} | "
          f"TrainLoss {training_loss:.4f} Acc {training_accuracy:.3f} | "
          f"ValLoss {validation_loss:.4f} Acc {validation_accuracy:.3f} | "
          f"LR {learning_rate:.6f}")
    
writer.close()
print("\nDone. Launch TensorBoard with: tensorboard --logdir TensorBoard")

Epoch 001 | TrainLoss nan Acc 0.195 | ValLoss nan Acc 0.179 | LR 0.002500
Epoch 002 | TrainLoss nan Acc 0.187 | ValLoss nan Acc 0.179 | LR 0.002500
Epoch 003 | TrainLoss nan Acc 0.187 | ValLoss nan Acc 0.179 | LR 0.002500
Epoch 004 | TrainLoss nan Acc 0.187 | ValLoss nan Acc 0.179 | LR 0.001250
Epoch 005 | TrainLoss nan Acc 0.187 | ValLoss nan Acc 0.179 | LR 0.001250
