In [None]:
!pip install -U ogb
!pip install torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu111.html
!pip install torch-geometric
!pip install -U scikit-learn

In [None]:
import pandas as pd
import numpy as np
import torch_geometric
import urllib3
import outdated
import torch

import torch.nn.functional as F
from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv

!python -c "import ogb; print(ogb.__version__)"
!python --version
print(pd.__version__)
print(np.__version__)
print(torch.__version__)
print(torch.version.cuda)
print(torch_geometric.__version__)
print(urllib3.__version__)
print(outdated.__version__)

!python3 -m pip show scikit-learn

Nous partons du corpus ogbg-molhiv dans le but d'effectuer une classification de graphes. La tâche consiste, à partir d'une molécule, à prédire si elle inhibe ou non la réplication du VIH.

In [None]:
from torch_geometric.data import DataLoader
import torch.optim as optim
from torch_geometric.nn import global_mean_pool
import torch
import torch.nn.functional as F
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import GCNConv
from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
from tqdm import tqdm

class GCN(torch.nn.Module):
    """Graph-level classifier: GCN message passing followed by an MLP head.

    Pipeline (see ``forward``): atom features are embedded by ``AtomEncoder``,
    refined by ``num_layer`` blocks of GCNConv -> BatchNorm -> ReLU -> Dropout,
    the embeddings of every depth are summed, mean-pooled per graph and fed to
    an MLP made of ``num_layer - 1`` hidden blocks plus one output layer.

    Args:
        num_classes_end: Size of the output layer (one logit per task).
        num_layer: Number of GCN layers. It also sets the depth of the MLP
            head (``num_layer - 1`` hidden layers plus the output layer).
        emb_dim: Width of every node / graph embedding.
        dropout: Dropout probability applied after each hidden activation.
    """

    def __init__(self, num_classes_end, num_layer=3, emb_dim=300, dropout=0.5):
        super().__init__()

        # Depth of the message-passing stack (and, minus one, of the MLP head).
        self.num_layer = num_layer
        # Dropout is a regulariser: units are randomly zeroed during training
        # only, which limits co-adaptation / overfitting.
        self.dropout = dropout
        # Produces the initial node embeddings from the raw atom features.
        self.atom_encoder = AtomEncoder(emb_dim)

        # Graph convolutions and their matching batch-norm layers.
        self.convs = torch.nn.ModuleList()
        self.batch_norms = torch.nn.ModuleList()
        for _ in range(num_layer):
            self.convs.append(GCNConv(emb_dim, emb_dim))
            self.batch_norms.append(torch.nn.BatchNorm1d(emb_dim))

        # MLP head: hidden linear layers and their batch-norm layers.
        self.lins = torch.nn.ModuleList()
        self.bns_lins = torch.nn.ModuleList()
        # "- 1" because the output layer is appended separately below.
        for _ in range(self.num_layer - 1):
            self.lins.append(torch.nn.Linear(emb_dim, emb_dim))
            self.bns_lins.append(torch.nn.BatchNorm1d(emb_dim))

        # Output layer producing the classification logits.
        self.lins.append(torch.nn.Linear(emb_dim, num_classes_end))

    def reset_parameters(self):
        """Re-initialise every learnable parameter, including atom embeddings.

        The atom encoder is reset as well so that successive runs start from
        independent weights instead of inheriting the embeddings learned by
        the previous run.
        """
        # NOTE(review): OGB's AtomEncoder appears to expose no
        # reset_parameters() and to initialise its embedding tables with
        # Xavier-uniform at construction; the same scheme is re-applied here.
        # Confirm against the installed ogb version.
        for module in self.atom_encoder.modules():
            if isinstance(module, torch.nn.Embedding):
                torch.nn.init.xavier_uniform_(module.weight)

        for group in (self.convs, self.batch_norms, self.lins, self.bns_lins):
            for layer in group:
                layer.reset_parameters()

    def forward(self, batched_data):
        """Compute one row of logits per graph in ``batched_data``.

        Args:
            batched_data: Mini-batch object exposing ``x`` (node features),
                ``edge_index`` (graph connectivity) and ``batch`` (per-node
                graph assignment used for pooling).

        Returns:
            Tensor of raw logits produced by the last linear layer
            (no sigmoid is applied here).
        """
        x, edge_index, batch = batched_data.x, batched_data.edge_index, batched_data.batch

        # h_list[k] holds the node embeddings after k message-passing layers;
        # h_list[0] is the initial atom embedding.
        h_list = [self.atom_encoder(x)]

        for i in range(self.num_layer):
            # Update each node from its neighbourhood (aggregation).
            h = self.convs[i](h_list[i], edge_index)
            h = self.batch_norms[i](h)
            h = F.relu(h)
            h = F.dropout(h, self.dropout, training=self.training)
            h_list.append(h)

        # Sum (not concatenate) the embeddings of every depth, so the final
        # node representation keeps the width emb_dim. Built out-of-place to
        # avoid mutating any tensor stored in h_list.
        node_representation = h_list[0]
        for h in h_list[1:]:
            node_representation = node_representation + h

        # Readout: average the node representations of each graph, giving one
        # vector per graph.
        x = global_mean_pool(node_representation, batch)

        # Hidden layers of the MLP head.
        for i in range(self.num_layer - 1):
            x = self.lins[i](x)
            x = self.bns_lins[i](x)
            x = F.relu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)

        # Output layer (raw logits).
        x = self.lins[-1](x)

        return x

# Binary classification loss computed directly on raw logits.
cls_criterion = torch.nn.BCEWithLogitsLoss()

def train(model, loader, optimizer):
    """Run one optimisation pass over ``loader``.

    Relies on the module-level ``device`` and ``cls_criterion``.

    Args:
        model: Network called as ``model(batch)``; it is put in training mode.
        loader: Iterable of mini-batches exposing ``.to(device)`` and ``.y``.
        optimizer: Optimiser stepped once per mini-batch.

    Returns:
        float: Loss of the *last* mini-batch processed (not an epoch average),
        or NaN when ``loader`` yields no batch at all.
    """
    model.train()

    loss = None
    for batch in tqdm(loader):
        batch = batch.to(device)
        optimizer.zero_grad()
        pred = model(batch)
        # Align the label shape with the logits, as eval() does, so the loss
        # never depends on how the labels happen to be laid out.
        target = batch.y.view(pred.shape)
        loss = cls_criterion(pred.to(torch.float32), target.to(torch.float32))
        loss.backward()
        optimizer.step()

    # An empty loader leaves `loss` unset; report NaN instead of raising.
    return loss.item() if loss is not None else float("nan")

def eval(model, loader, evaluator):
    """Score ``model`` on every batch of ``loader`` and return the ROC-AUC.

    Note: this function shadows Python's built-in ``eval`` in this notebook.
    Relies on the module-level ``device``.

    Args:
        model: Network called as ``model(batch)``; it is put in eval mode.
        loader: Iterable of mini-batches exposing ``.to(device)`` and ``.y``.
        evaluator: Object whose ``eval`` method takes a dict with ``"y_true"``
            and ``"y_pred"`` arrays and returns a dict containing ``"rocauc"``.

    Returns:
        The ``"rocauc"`` entry of the evaluator's result.
    """
    model.eval()

    labels, scores = [], []
    for batch in tqdm(loader):
        batch = batch.to(device)

        # Inference only: no gradient bookkeeping is needed.
        with torch.no_grad():
            logits = model(batch)

        # Labels are reshaped to the logits' shape before being collected.
        labels.append(batch.y.view(logits.shape).detach().cpu())
        scores.append(logits.detach().cpu())

    result = evaluator.eval({
        "y_true": torch.cat(labels, dim=0).numpy(),
        "y_pred": torch.cat(scores, dim=0).numpy(),
    })
    return result["rocauc"]

# Training settings
batch_size = 64
epochs = 50
drop_ratio = 0.5
emb_dim = 100
num_layer = 4
lr = 0.001
runs = 1

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

dataset = PygGraphPropPredDataset(name = "ogbg-molhiv")

split_idx = dataset.get_idx_split()

evaluator = Evaluator(name = "ogbg-molhiv")

train_loader = DataLoader(dataset[split_idx["train"]], batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=batch_size, shuffle=False)
test_loader = DataLoader(dataset[split_idx["test"]], batch_size=batch_size, shuffle=False)

# Pass the hyper-parameters configured above; otherwise GCN silently falls
# back to its own defaults and the settings have no effect.
model = GCN(
    num_classes_end=dataset.num_tasks,
    num_layer=num_layer,
    emb_dim=emb_dim,
    dropout=drop_ratio,
).to(device)

# Per-run results, averaged after the loop.
best_acc_valid = []
best_acc_test = []

for i in range(runs):
    print(f"[RUUUN {i}]")
    best_test = 0
    best_valid = 0
    # Every run starts from freshly initialised weights and optimiser state.
    model.reset_parameters()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(1, 1 + epochs):
        print(f"[Epoch {epoch}]")
        # train model
        loss = train(model, train_loader, optimizer)

        # print loss last iter
        print(f'Loss : {loss:.2f}')
        # NOTE: the values printed as "acc" below are ROC-AUC scores.
        print('Evaluating model with valid loader')
        valid_acc = eval(model, valid_loader, evaluator)
        print(f'Valid acc : {valid_acc:2f}')

        print('Evaluating model with test loader : ')
        test_acc = eval(model, test_loader, evaluator)
        print(f'Test acc : {test_acc:2f}')

        # Model selection uses the validation split only: keep the test score
        # obtained at the best validation epoch. Taking the maximum over the
        # test scores would amount to tuning on the test set.
        if valid_acc > best_valid:
            best_valid = valid_acc
            best_test = test_acc

    print(best_test, best_valid)
    best_acc_valid.append(best_valid)
    best_acc_test.append(best_test)

# Guard the averages so that runs == 0 does not divide by zero.
moy_valid = sum(best_acc_valid) / len(best_acc_valid) if best_acc_valid else float("nan")
moy_test = sum(best_acc_test) / len(best_acc_test) if best_acc_test else float("nan")

print(f'Validation {moy_valid} Test {moy_test}')



[RUUUN 0]
[Epoch 1]


 12%|█▏        | 62/515 [00:10<01:23,  5.44it/s]

In [None]:
!nvidia-smi