Je me suis basé sur le tutoriel de Titouan Parcollet (https://github.com/TParcollet/Tutoriel-Graph-Neural-Networks) pour le modèle, l'entraînement (train) et le test, que j'ai adaptés à la tâche demandée.

Le traitement des données (Dataloader, AtomEncoder), ainsi que l'évaluation (Evaluator) sont donnés par le sujet du défi : https://ogb.stanford.edu/docs/graphprop/#ogbg-mol

La gestion des données (c.-à-d. les masques) est inspirée du tutoriel d'Antonio Longa (https://github.com/AntonioLonga/AdvancePyTorchGeometricTutorials/blob/main/Tutorial1/Tutorial_1.ipynb).

In [None]:
!pip install ogb
!pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cu113.html

In [2]:
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch_geometric.transforms as T
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, GATv2Conv
import torch_geometric.nn as pyg_nn

In [3]:
class GNN(nn.Module):
    """Stack of graph convolutions followed by a two-layer linear head.

    The network applies ``num_layer`` convolutions (each followed by ReLU and
    dropout), a single batch normalisation, then the linear head, and returns
    log-softmax scores over ``output_dim`` outputs.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of every convolution and of the head's first layer.
        output_dim: Number of output scores per node.
        criterion: Callable ``criterion(pred, label)`` used by :meth:`loss`.
        conv_type: ``"SAGE"`` or ``"GIN"``; any other value selects GCN.
        num_layer: Number of convolution layers applied in :meth:`forward`.
        dropout: Dropout probability used after each convolution and in the head.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, criterion, conv_type="GCN", num_layer=3, dropout=0.25):
        super().__init__()
        self.dropout = dropout
        self.num_layers = num_layer
        self.conv_type = conv_type
        self.criterion = criterion
        self.conv_bn = pyg_nn.BatchNorm(hidden_dim)

        # The first convolution maps input_dim -> hidden_dim; every following
        # one maps hidden_dim -> hidden_dim.
        in_dims = [input_dim] + [hidden_dim] * (num_layer - 1)
        self.convs = nn.ModuleList(
            [self.build_conv_model(in_dim, hidden_dim) for in_dim in in_dims]
        )

        head_layers = [
            nn.Linear(hidden_dim, hidden_dim),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.post_mp = nn.Sequential(*head_layers)

    def build_conv_model(self, input_dim, hidden_dim):
        """Return one convolution layer of the kind selected by ``self.conv_type``."""
        if self.conv_type == "SAGE":
            return pyg_nn.SAGEConv(input_dim, hidden_dim)

        if self.conv_type == "GIN":
            # GINConv wraps a small MLP that transforms the aggregated features.
            gin_mlp = nn.Sequential(
                nn.Linear(input_dim, hidden_dim),
                nn.ReLU(),
                nn.Linear(hidden_dim, hidden_dim),
            )
            return pyg_nn.GINConv(gin_mlp)

        # Any other value falls back to GCN.
        return pyg_nn.GCNConv(input_dim, hidden_dim)

    def forward(self, x, edge_index):
        """Run the convolutions, batch norm and head; return log-softmax over dim 1."""
        for depth in range(self.num_layers):
            layer = self.convs[depth]
            activated = F.relu(layer(x, edge_index))
            x = F.dropout(activated, p=self.dropout, training=self.training)

        # Batch normalisation is applied once, after the last convolution.
        normalised = self.conv_bn(x)
        scores = self.post_mp(normalised)
        return F.log_softmax(scores, dim=1)

    def loss(self, pred, label):
        """Evaluate the criterion given at construction on ``(pred, label)``."""
        return self.criterion(pred, label)

In [8]:
def train(model, dataset, data, epochs, print_steps, batch_size, optimizer, evaluator):
    """Train ``model`` on the training split and report the best checkpoint.

    After every epoch the model is scored on the validation split; whenever the
    validation accuracy improves, the model is saved to ``'model.pth'``. Once
    training ends, the best checkpoint is reloaded, scored on the test split
    and returned.

    Args:
        model: Network exposing ``model(x, edge_index)`` and ``model.loss(pred, label)``.
        dataset: Object whose ``get_idx_split()`` returns a mapping with the
            keys ``"train"``, ``"valid"`` and ``"test"``.
        data: Graph object providing ``x``, ``adj_t`` and ``y``.
        epochs: Number of passes over the training indices.
        print_steps: Progress is printed every ``print_steps`` epochs.
        batch_size: Number of node indices per optimisation step.
        optimizer: Optimizer already bound to ``model``'s parameters.
        evaluator: Evaluator forwarded to ``test``.

    Returns:
        The model with the best validation accuracy, or ``model`` itself when
        no checkpoint was written during this call.
    """
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]

    train_loader = DataLoader(train_idx, batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(valid_idx, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_idx, batch_size=batch_size, shuffle=False)

    # Training
    print("Learning...")
    best_valid_acc = 0
    best_epoch = 0
    best_model = model
    best_loss = 0
    # Tracks whether 'model.pth' was written during THIS call, so that a stale
    # file left by an earlier run is never reloaded by mistake.
    checkpoint_saved = False

    x, edge_index = data.x, data.adj_t
    num_batches = len(train_loader)

    for epoch in range(1, epochs + 1):
        total_loss = 0
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            # The forward pass covers the whole graph; the loss is restricted
            # to the node indices of the current mini-batch.
            out = model(x, edge_index)[batch]
            label = data.y[batch]
            loss = model.loss(out, label.squeeze(1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        if num_batches:
            total_loss /= num_batches

        valid_acc = test(valid_loader, data, valid_idx, model, evaluator)

        if valid_acc > best_valid_acc:
            best_valid_acc = valid_acc
            # NOTE(review): this pickles the whole module; newer PyTorch
            # releases may need extra arguments to load it back - confirm
            # against the installed version.
            torch.save(model, 'model.pth')
            checkpoint_saved = True
            best_epoch = epoch
            best_loss = total_loss

        if epoch % print_steps == 0:
            print("Itération {}. Loss: {:.4f}. Validation: {:.4f}".format(
                epoch, total_loss, valid_acc))

    print()
    print()
    print("Evaluating...")
    print()
    if checkpoint_saved:
        best_model = torch.load('model.pth')
    test_acc = test(test_loader, data, test_idx, best_model, evaluator)
    print(f'Best model at epoch: {best_epoch:02d}')
    print(f'Loss: {best_loss:.4f}, '
          f'Test: {100 * test_acc:.2f}%')

    return best_model

In [9]:
def test(loader, data, data_idx, model, evaluator):
    model.eval()

    preds = []
    labels = []

    with torch.no_grad():
      for batch in loader:
            x, edge_index = data.x, data.adj_t
            out = model(x, edge_index)
            pred = out.argmax(dim=-1, keepdim=False)
            label = data.y[data_idx]
            preds.append(pred[data_idx].detach().cpu())
            labels.append(label.view(label.shape).detach().cpu())

    preds = torch.cat(preds, dim=0).unsqueeze(1).numpy()
    labels = torch.cat(labels, dim=0).numpy()
    
    input_dict = {"y_true": labels, "y_pred": preds}
    return evaluator.eval(input_dict)['acc']

In [10]:
def run():
    """Load ogbn-arxiv, build a GNN and train it with fixed hyper-parameters.

    Reads the module-level ``device`` to place both the graph and the model.
    """
    d_name = "ogbn-arxiv"

    # Training schedule
    epochs, print_steps = 10, 1
    lr = 0.01
    # 2048 or 169343 // 10 are alternatives (faster for quick tests, but
    # possibly lower performance).
    batch_size = 256

    # Architecture
    hidden_dim = 256
    conv_type = "GCN"
    num_conv_layer = 3
    dropout = 0.25

    # Data: single graph, adjacency stored as a symmetrised sparse tensor.
    dataset = PygNodePropPredDataset(name=d_name, transform=T.ToSparseTensor())
    graph = dataset[0]
    graph.adj_t = graph.adj_t.to_symmetric()
    graph = graph.to(device)
    evaluator = Evaluator(name=d_name)

    network = GNN(
        dataset.num_node_features,
        hidden_dim,
        dataset.num_classes,
        nn.CrossEntropyLoss(),
        conv_type=conv_type,
        num_layer=num_conv_layer,
        dropout=dropout,
    )
    network = network.to(device)
    optimizer = torch.optim.Adam(network.parameters(), lr=lr)

    train(network, dataset, graph, epochs, print_steps, batch_size, optimizer, evaluator)

In [None]:
# Use the GPU when one is available; `run` reads this module-level `device`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device : ", device)

run()

Device :  cuda
Learning...
