In [1]:
import json
import os
import numpy as np
from scipy.spatial.distance import cdist
import matplotlib.pyplot as plt

import argparse
import torch
from torch_geometric.datasets import TUDataset
from torch_geometric.loader import DataLoader


from Utilities import score
from Utilities import plot_matrix_runs, plot_results, save_plot_losses
from Utilities import Add_ID_Count_Neighbours, PairData, prepare_dataloader_distance_scale

from training import training_loop

from models import GCN_k_m

In [2]:
# Pick the compute device: use the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Load the ENZYMES graphs, applying the Add_ID_Count_Neighbours pre-transform once
# when the dataset is first processed under the given root.
# NOTE(review): `dataset` is rebound to MUTAG by the homomorphism-count cell further
# down, so this object is not the one the rest of the notebook trains on.
dataset = TUDataset(
    root='/tmp/ENZYMES_transformed',
    name='ENZYMES',
    pre_transform=Add_ID_Count_Neighbours(),
    use_node_attr=True,
)

In [4]:
# Load the homomorphism counts vectors.
# `name_dataset` is the single source of truth for both the counts file and the
# graph dataset, so the two cannot silently get out of sync when it is changed.
name_dataset = 'MUTAG'
n_homs = 50
hom_counts_path = 'data/homomorphism_counts/' + name_dataset + "_" + str(n_homs) + ".homson"
# Fail early with a clear message instead of deep inside the dataloader preparation.
if not os.path.exists(hom_counts_path):
    raise FileNotFoundError(f"The file '{hom_counts_path}' was not found.")

# Load the graph dataset matching the homomorphism counts selected above.
dataset = TUDataset(
    root='/tmp/' + name_dataset + '_transformed',
    name=name_dataset,
    pre_transform=Add_ID_Count_Neighbours(),
    use_node_attr=True,
)

In [5]:
# Seed torch's global RNG so that the parameter initialisation of the torch layers
# created later is repeatable after Restart & Run All. The previous commented-out
# call referenced `args.seed`, but no `args` object exists in this notebook.
# NOTE(review): whether prepare_dataloader_distance_scale draws from torch's RNG
# (e.g. for splitting/shuffling) is not visible here — confirm in Utilities.
seed = 42
torch.manual_seed(seed)

batch_size = 32
distance = 'cosine'           # distance passed to the loader helper as `dist`
distance_scaling = 'counts'   # scaling mode passed to the loader helper as `scaling`
train_loader, val_loader, test_loader = prepare_dataloader_distance_scale(
    hom_counts_path,
    dataset,
    batch_size=batch_size,
    dist=distance,
    device=device,
    scaling=distance_scaling,
)

In [None]:
# Number of GCN layers: 2
# Number of linear layers: 1
# ReLU after: True
# MLP distance: True

In [17]:
"""
This file contains the implementation of the different classes considered.
"""
import torch
from torch.nn import Linear, Parameter, PairwiseDistance, CosineSimilarity
from torch_geometric.nn import MessagePassing, global_mean_pool, GCNConv

class GCN_k_m(torch.nn.Module):
    """
    Siamese model: a pair of graphs is fed through the same k graph convolutional
    layers and m linear layers, and a scalar distance between the two resulting
    graph embeddings is returned.

    Without linear layers (m == 0):
    conv1 -> dropout -> ...         -> convk -> meanpool -> relu -> compute dist(x1, x2)
    If apply_relu_conv is True:
    conv1 -> dropout -> relu -> ... -> convk -> meanpool -> relu -> compute dist(x1, x2)
    (the last convolution, convk, is not followed by dropout/relu in this case).

    If a number of linear layers m >= 1 is specified, every one of the k convolutions
    is followed by dropout (and relu if apply_relu_conv is True):
    conv1 -> dropout -> ... -> convk -> dropout -> meanpool -> relu -> linear1 -> relu -> ... -> linearm -> compute dist(x1, x2)

    If mlp_dist is set to True:
    The distance between the two embedding vectors is obtained by applying a learned
    linear map (output size 1) to the element-wise absolute difference |x1 - x2|;
    `dist` is then not used in the forward pass.

    Parameters
    ----------
    input_features : number of input node features.
    hidden_channels : width of the hidden convolutional / linear layers.
    output_embeddings : dimension of the final graph embedding.
    n_conv_layers : number of graph convolutional layers (>= 1).
    n_linear_layers : number of linear layers (>= 0).
    p : dropout probability.
    name : model name, used by `save` to build the file name.
    apply_relu_conv : if True, apply relu after each (dropout-ed) convolution.
    dist : one of 'cosine', 'L1', 'L2'.
    mlp_dist : if True, use the learned linear distance described above.

    Raises
    ------
    ValueError : if n_conv_layers < 1, n_linear_layers < 0, or dist is not supported.
    """
    def __init__(self, input_features, hidden_channels, output_embeddings, n_conv_layers, n_linear_layers, p, name, apply_relu_conv = False, dist = 'L1', mlp_dist = False):
        super(GCN_k_m, self).__init__()

        if n_conv_layers < 1:
            raise ValueError("Invalid value for n_conv_layers. n_conv_layers should be an integer larger than 0")
        # A negative value would otherwise build a final linear layer that forward() never applies.
        if n_linear_layers < 0:
            raise ValueError("Invalid value for n_linear_layers. n_linear_layers should be a non-negative integer")
        if dist not in ['cosine', 'L1', 'L2']:
            raise ValueError("Invalid value for 'dist'. Expected 'cosine', 'L1', or 'L2'.")

        # Details for the architecture
        self.dist = dist
        self.mlp_dist = mlp_dist
        self.input_features = input_features
        self.output_embeddings = output_embeddings
        self.name = name

        # Hyper-parameters.
        self.apply_relu_conv = apply_relu_conv # If True applies relu after each convolutional layer
        self.p = p                             # Sets dropout probability. If 0, no dropout is applied
        self.hidden_channels = hidden_channels # Sets dimension of hidden channels
        self.n_conv_layers = n_conv_layers     # Sets number of convolutional layers.
        self.n_linear_layers = n_linear_layers # Sets number of linear layers.

        # GCN and Linear layers employed by the model.
        self.GCN_layers = torch.nn.ModuleList()
        self.Linear_layers = torch.nn.ModuleList()

        # Input width of the next convolution: the raw feature size for the first
        # layer, hidden_channels for every layer after it.
        in_channels = input_features

        if self.n_linear_layers == 0:
            # No linear layers: the last convolution maps directly to the embedding size,
            # so only the first k-1 convolutions live in GCN_layers.
            for _ in range(self.n_conv_layers - 1):
                self.GCN_layers.append(GCNConv(in_channels, hidden_channels))
                in_channels = hidden_channels
            # Final GCN layer (in_channels is still input_features when k == 1).
            self.final_GCN = GCNConv(in_channels, output_embeddings)
        else:
            # With linear layers: all k convolutions are hidden-width, and the last
            # linear layer produces the embedding.
            for _ in range(self.n_conv_layers):
                self.GCN_layers.append(GCNConv(in_channels, hidden_channels))
                in_channels = hidden_channels
            # Linear layers
            for _ in range(self.n_linear_layers - 1):
                self.Linear_layers.append(Linear(hidden_channels, hidden_channels))
            # Final linear layer
            self.final_linear = Linear(hidden_channels, output_embeddings)

        # Additional layers required later
        self.dropout = torch.nn.Dropout(self.p)
        self.relu = torch.nn.ReLU()

        if self.mlp_dist:
            self.linear_dist = Linear(output_embeddings, 1)

        # Define the operation to be performed on the embeddings based on the specification.
        if self.dist == 'L1':
            self.pdist = PairwiseDistance(p=1)
        elif self.dist == 'L2':
            self.pdist = PairwiseDistance(p=2)
        elif self.dist == 'cosine':
            # Similarity, not distance: forward() converts it with 1 - similarity.
            self.pdist = CosineSimilarity()

    def forward(self, x1, edge_index1, batch1, x2, edge_index2, batch2): # Need a way to extract these from dataloader
        """
        Embed both graph batches with the shared weights and return their distance.

        x1/x2 are passed to the convolutions together with edge_index1/edge_index2,
        and batch1/batch2 are passed to global_mean_pool to pool nodes per graph.
        Returns one value per pooled pair of embeddings (flattened to 1-D when
        mlp_dist is True).
        """
        # 1. Obtain node embeddings for graph 1 and 2.
        # The two branches are processed layer by layer (not branch by branch) so the
        # order in which dropout masks are drawn stays x1, x2, x1, x2, ...
        for layer in self.GCN_layers:
            x1 = layer(x1, edge_index1)
            x1 = self.dropout(x1)
            x2 = layer(x2, edge_index2)
            x2 = self.dropout(x2)
            if self.apply_relu_conv:
                x1 = self.relu(x1)
                x2 = self.relu(x2)

        if self.n_linear_layers == 0:
            x1 = self.final_GCN(x1, edge_index1)
            x2 = self.final_GCN(x2, edge_index2)

        # 2. Readout layer followed by RELU (and Linear layers).
        x1 = global_mean_pool(x1, batch1)
        x1 = self.relu(x1)
        x2 = global_mean_pool(x2, batch2)
        x2 = self.relu(x2)

        if self.n_linear_layers > 0:
            for layer in self.Linear_layers:
                x1 = layer(x1)
                x1 = self.relu(x1)
                x2 = layer(x2)
                x2 = self.relu(x2)

            # No relu after the last linear layer: embeddings may be negative.
            x1 = self.final_linear(x1)
            x2 = self.final_linear(x2)

        if self.mlp_dist:
            # Learned distance: linear map of the element-wise absolute difference.
            # Maybe try Euclidean distance
            x = torch.abs(x1 - x2)
            vdist = self.linear_dist(x).reshape(-1)
            return vdist

        # Compute the corresponding distance between the embeddings.
        if self.dist == 'cosine':
            vdist = 1 - self.pdist(x1, x2)
        else:
            vdist = self.pdist(x1, x2)
        return vdist

    def save(self):
        """
        Saves the model state dictionary in models folder
        (as 'models/<name>.pt'), creating the folder first if it does not exist.
        """
        import os  # local import keeps this class self-contained within its cell

        path = 'models/' + self.name + '.pt'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save(self.state_dict(), path)

In [18]:
# Build the model, its optimizer and the loss.
name = 'GCN_k_m_trial'
hid_size = 32       # width of the hidden layers
emb_size = n_homs   # embedding width is set equal to the number of homomorphism counts
lr = 0.01           # Adam learning rate

model = GCN_k_m(
    input_features=dataset.num_node_features,
    hidden_channels=hid_size,
    output_embeddings=emb_size,
    n_conv_layers=2,
    n_linear_layers=1,
    p=0.2,
    name=name,
    apply_relu_conv=False,
    dist='cosine',
    mlp_dist=True,
)
model = model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.MSELoss().to(device)

In [19]:
# Print the module tree to check the layers and sizes that were actually built.
print(model)

GCN_k_m(
  (GCN_layers): ModuleList(
    (0): GCNConv(1, 32)
    (1): GCNConv(32, 32)
  )
  (Linear_layers): ModuleList()
  (final_linear): Linear(in_features=32, out_features=50, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
  (relu): ReLU()
  (linear_dist): Linear(in_features=50, out_features=1, bias=True)
  (pdist): CosineSimilarity()
)


In [None]:
# Train the model and keep the per-epoch loss histories returned by training_loop
# (requested through return_losses=True).
train_losses, validation_losses = training_loop(
    model,
    train_loader,
    optimizer,
    criterion,
    val_loader,
    epoch_number=10,
    patience=10,
    return_losses=True,
)

In [10]:
# Evaluate the model on the test loader.
# NOTE(review): presumably `y` holds the target distances and `predictions` the
# model outputs — confirm against Utilities.score.
y, predictions = score(model, test_loader, device)

MSE Loss: 0.0004790598468389362
