### Notebook de test des GNNs sur des Toys Examples : 

In [1]:
import sys
import numpy as np
sys.path.append('../datasets')
from datasets.manager import IMDBBinary, DD
import torch 
import itertools
from tqdm import tqdm

#from utils.utils import visualise_graph, get_adjacency_and_features
from utils.utils import get_adjacency_and_features
#from src.gnn import GNNClassifier

from datasets.dataset import *
from train import Training

import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
IMDB = IMDBBinary()
DD = DD()

#### **Visualisation du dataset**

In [3]:
x_IMBD, y_IMDB = IMDB.dataset.get_data(), IMDB.dataset.get_targets()

In [4]:
node8 = x_IMBD[8]
node8

GraphData(x=torch.Size([18, 1]), edge_index=torch.Size([2, 153]))

`GraphData(x=torch.Size([18, 1]), edge_index=torch.Size([2, 153]))` refers to a graph that has: 
- 18 nodes, each having features of dimension 1
- 153 edges (each between 2 nodes)

In [5]:
visualise_graph(8, x_IMBD)

NameError: name 'visualise_graph' is not defined

The adjency matrix of such a graph is computed by `get_adjacency_and_features` and given by:

In [3]:
adjency_mat, feature_vect = get_adjacency_and_features(node8)
print("Adjency matrix:\n")
print(adjency_mat)
print("Feature vector: \n")
print(feature_vect)

NameError: name 'node8' is not defined

Implémentation de la cross-validation ; 

Les hyperparamètres: 
- Learning Rate
- Nombre de Couche de Convolution
- Dimension de l'espace d'embedding
- Critère d'Early Stopping
- dropout / Weight Decay
- Batch Size
- Nombre d'Epochs

Question de mette ou non la feature du degré des noeuds pour voir l'impact sur la performance

In [3]:
class grid: 
    
    def __init__(self, params_dict): 
        self.params_dict = params_dict

    def get_combinations(self):
        keys, values = zip(*self.param_dict.items())
        return [dict(zip(keys, v)) for v in itertools.product(*values)]

    def __repr__(self):
        return f"HyperparameterGrid({self.param_dict})"

In [4]:
class ModelSelection(): 
    
    def __init__(self, DATASET, data_split, model_class, grid): 
        self.model_class = model_class
        self.grid = grid
        self.DATASET = DATASET
        self.train_split = GraphDatasetSubset(self.DATASET.dataset.get_data(), data_split["train"])
        self.validation_split = GraphDatasetSubset(self.DATASET.dataset.get_data(), data_split["validation"])
        self.best_config = None

    def fit(self):
        for config in self.grid.get_combinations():
            print(f"Training with configuration: {config}")
            
            model = self.model_class(**config)
            model.fit(self.train_split)
            accuracy = model.evaluate(self.validation_split)

            print(f"Validation accuracy: {accuracy}")

            if self.best_config is None or accuracy > self.best_config["accuracy"]:
                self.best_config = {"config": config, "accuracy": accuracy}
        
    def get_best_config(self):
        if self.best_config is None:
            raise ValueError("No configurations have been evaluated.")
        return self.best_config["config"], self.best_config["accuracy"]
    

In [5]:
class Holdout(): 
    def __init__(self, DATASET, train_split, train_size=0.9): 
        self.DATASET = DATASET
        self.train_split = train_split
        self.train_size = train_size
        self.num_samples = len(self.train_split)
        self.train_indices = None

    def split(self):

        indices = np.arange(self.num_samples)
        np.random.shuffle(indices)
        self.train_indices = indices[:int(self.num_samples * self.train_size)]
       

    def get_splits(self):
        if self.train_indices is None :
            raise ValueError("Data has not been split yet.")
        return self.train_indices

In [6]:
class ModelAssessment(): 
    
    def __init__(self, model_class, DATASET, grid, Kfold=10, holdout=3): 
        self.model_class = model_class
        self.DATASET = DATASET
        self.grid = grid
        self.Kfold = Kfold
        self.best_configs = []
        self.splits = DATASET.splits
        self.holdout = holdout
        self.fold_accuracy = []

    def assess(self):

        for i in tqdm(range(self.Kfold)):
            train_data_idxs = self.splits[i]["model_selection"][0]
            test_data_idxs = self.splits[i]["test"]
            test_data = GraphDatasetSubset(self.DATASET.dataset.get_data(), test_data_idxs)

            model_selection = ModelSelection(self.DATASET, train_data_idxs, self.model_class, self.grid)
            model_selection.fit()
            best_config, _ = model_selection.get_best_config()
            self.best_configs.append(best_config)
            inner_accuracy = []

            for j in range(self.holdout): 
                
                train_holdout = Holdout(self.DATASET, train_data_idxs)
                train_holdout.split()
                train_indices = train_holdout.get_splits()
                
                train_data = GraphDatasetSubset(self.DATASET.dataset.get_data(), train_indices)
                model = self.model_class(**best_config)
                model.fit(train_data)
                accuracy = model.evaluate(test_data)
                inner_accuracy.append(accuracy)
            
            self.fold_accuracy.append(np.mean(inner_accuracy))
        
        return np.mean(self.fold_accuracy), np.std(self.fold_accuracy)

In [7]:
idxs = IMDB.splits[0]["model_selection"][0]
train_data = GraphDatasetSubset(IMDB.dataset.get_data(), idxs["train"])
val_data = GraphDatasetSubset(IMDB.dataset.get_data(), idxs["validation"])

On tente avec une configuration random pour voir si l'entraînement se lance

In [8]:
params_list = {"model_type": "GCN",  
               "n_graph_subsampling": 0, # the number of running graph subsampling each train graph data run subsampling 5 times: increasing graph data 5 times
               "graph_node_subsampling": True, # TRUE: removing node randomly to subsampling and augmentation of graph dataset \n'+
                # FALSE: removing edge randomly to subsampling and augmentation of graph dataset
               "graph_subsampling_rate": 0.2, # graph subsampling rate
               "dataset": "IMDB", 
               "pooling_type": "mean", 
               "seed": 42,
               "n_folds": 10, 
               "cuda": True, 
               "lr": 0.001, 
               "epochs": 50, 
               "weight_decay":5e-4,
               "batch_size": 32, 
               "dropout": 0, # dropout rate of layer
               "num_lay": 5, 
               "num_agg_layer": 2, # the number of graph aggregation layers
               "hidden_agg_lay_size": 64, # size of hidden graph aggregation layer
               "fc_hidden_size": 128, # size of fully-connected layer after readout
               "threads":10, # how many subprocesses to use for data loading
               "random_walk":True,
               "walk_length": 20, # walk length of random walk, 
               "num_walk": 10, # num of random walk
               "p": 0.65, # Possibility to return to the previous vertex, how well you navigate around
               "q": 0.35, # Possibility of moving away from the previous vertex, how well you are exploring new places
               "print_logger": 10  # printing rate
               }


In [9]:
dataaaa = GraphDatasetSubset(DD.dataset.get_data(), idxs["train"])
adj, features = get_adjacency_and_features(dataaaa[0])
print("Adjency matrix:\n")
print(adj)
print("Feature vector: \n")
print(features)

Adjency matrix:

tensor([[0., 1., 1.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 1., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
Feature vector: 

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [1., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


In [10]:
import numpy as np
import time
import statistics
from tqdm import tqdm

import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

from src.models import GCN, GAT, GraphDenseNet
from datasets.dataloader import DataLoader


class Train:
    def __init__(self, params, dataset):
      self.params = params 
      self.dataset = dataset

      if self.params["cuda"] and torch.cuda.is_available():
          self.device = "cuda:0"
      else: 
          self.device = "cpu"

      self.model = self.get_model()
      self.loss_fn = F.cross_entropy 

    
    def get_model(self):
        
        adjacency, features = get_adjacency_and_features(self.dataset[0])
        print(len(features))
      
        if self.params["model_type"] == 'GCN':
          
            model = GCN(n_feat=89,
                    n_class=2,
                    n_layer=self.params['num_agg_layer'],
                    agg_hidden=self.params['hidden_agg_lay_size'],
                    fc_hidden=self.params['fc_hidden_size'],
                    dropout=self.params['dropout'],
                    pool_type=self.params['pooling_type'],
                    device=self.device).to(self.device)
            
        elif self.params["model_type"] == 'GAT':
            
            model = GAT(n_feat=89,
                    n_class=2,
                    n_layer=self.params['num_agg_layer'],
                    agg_hidden=self.params['hidden_agg_lay_size'],
                    fc_hidden=self.params['fc_hidden_size'],
                    dropout=self.params['dropout'],
                    pool_type=self.params['pooling_type'],
                    device=self.device).to(self.device)
            
        elif self.params["model_type"] == 'GraphDenseNet':
            
            model = GraphDenseNet(n_feat=89,
                    n_class=2,
                    n_layer=self.params['num_agg_layer'],
                    agg_hidden=self.params['hidden_agg_lay_size'],
                    fc_hidden=self.params['fc_hidden_size'],
                    dropout=self.params['dropout'],
                    pool_type=self.params["pooling_type"],
                    device=self.device).to(self.device)
            
        return model


    def loaders_train_test_setup(self):

        loader = DataLoader(self.dataset,
                            batch_size=self.params["batch_size"],
                            shuffle=True,
                            num_workers=0,#self.params["threads"],
                            pin_memory=True,
                            drop_last=False)
      
      # Total trainable param
        c = 0
        for p in filter(lambda p: p.requires_grad, self.model.parameters()):
            c += p.numel()
        print('N trainable parameters:', c)

        optimizer = optim.Adam(
                        filter(lambda p: p.requires_grad, self.model.parameters()),
                        lr=self.params["lr"],
                        weight_decay=self.params["weight_decay"],
                        betas=(0.5, 0.999))
    
        scheduler = lr_scheduler.MultiStepLR(optimizer, [20, 30], gamma=0.1)
            
        return loader, optimizer, scheduler

    def train(self, train_loader, optimizer, scheduler, epoch):

      total_time_iter = 0
      self.model.train()
      start = time.time()
      train_loss, n_samples = 0, 0
      for batch_idx, data in enumerate(train_loader):
          optimizer.zero_grad()
          output = self.model(data)
          loss = self.loss_fn(output, data[4])
          loss.backward()
          optimizer.step()
          time_iter = time.time() - start
          total_time_iter += time_iter
          train_loss += loss.item() * len(output)
          n_samples += len(output)
          if batch_idx % self.params["print_logger"] == 0 or batch_idx == len(train_loader) - 1:
              print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f} (avg: {:.6f}) \tsec/iter: {:.4f}'.format(
                  epoch, n_samples, len(train_loader.dataset),
                  100. * (batch_idx + 1) / len(train_loader), loss.item(), train_loss / n_samples, time_iter / (batch_idx + 1) ))
      scheduler.step()
      return total_time_iter / (len(train_loader) + 1)
      

    def test(self, test_loader, epoch):

      print('Test model ...')

      self.model.eval()
      test_loss, correct, n_samples = 0, 0, 0

      for batch_idx, data in enumerate(test_loader):
          for i in range(len(data)):
            data[i] = data[i].to(self.device)
          
          output = self.model(data)
          loss = self.loss_fn(output, data[4], reduction='sum')
          test_loss += loss.item()
          n_samples += len(output)
          pred = output.detach().cpu().max(1, keepdim=True)[1]

          correct += pred.eq(data[4].detach().cpu().view_as(pred)).sum().item()

      test_loss /= n_samples

      acc = 100. * correct / n_samples

      print('Test set (epoch {}): Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(epoch, 
                                                                                            test_loss, 
                                                                                            correct, 
                                                                                            n_samples, acc))
      
      return acc
            
              
    def fit(self): 
        for fold_id in tqdm(range(self.params["n_folds"]), desc="Folds", position=0, leave=True):
            loaders, optimizer, scheduler = self.loaders_train_test_setup()
            total_time = 0

            for epoch in tqdm(range(self.params["epochs"]), desc=f"Epochs (Fold {fold_id})", position=1, leave=False):
                total_time_iter = self.train(loaders, optimizer, scheduler, epoch)
                total_time += total_time_iter
                acc = self.test(loaders, epoch)

            self.acc_folds.append(round(acc, 2))
            self.time_folds.append(round(total_time / self.params["epochs"], 2))
          
        print(self.acc_folds)
        print('{}-fold cross validation avg acc (+- std): {} ({})'.format(
            self.params["n_folds"], statistics.mean(self.acc_folds), statistics.stdev(self.acc_folds)))
        
        result_list = [self.params["dataset"], self.params["dataset"]]
        result_list.extend(str(acc_fold) for acc_fold in self.acc_folds)
        result_list.extend([
            statistics.mean(self.acc_folds),
            statistics.stdev(self.acc_folds),
            statistics.mean(self.time_folds)
        ])

In [11]:
trainer = Train(params_list, dataaaa)

294
89
89


In [12]:
trainer.fit()

Folds:   0%|          | 0/10 [00:00<?, ?it/s]

N trainable parameters: 8578


Folds:   0%|          | 0/10 [00:01<?, ?it/s]


torch.Size([9515, 89])


RuntimeError: shape '[2, 23323, 64]' is invalid for input of size 608960