In [2]:
import spektral
import networkx as nx
import numpy as np
import tensorflow as tf
from spektral.data import DisjointLoader
from spektral.data.loaders import tf_loader_available
import scipy.sparse as sp
from spektral.data.utils import (
    prepend_none,
    sp_matrices_to_sp_tensors,
    to_disjoint,
    collate_labels_disjoint
)
import numpy as np
from ogb.graphproppred import GraphPropPredDataset
from spektral.data import Dataset, Graph
from spektral.datasets import TUDataset, QM9


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
config = {
    'seed': 1,
    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 0.001,
    'dataset': 'ogbg-molesol', #JA: QM9, ogbg-molesol, ogbg-molfreesolv, ogbg-mollipo, ZINC| NEIN: aspirin
    'train_test_split': 0.8
}

np.random.seed(config['seed'])
tf.random.set_seed(config['seed'])

In [4]:
class OGBDataset(Dataset):
    '''
    (spektral) Dataset class wrapper for Open Graph Benchmark datasets.
    '''
    def __init__(self, name, **kwargs):
        self.name = name
        super().__init__(**kwargs)

    def read(self):
        dataset = GraphPropPredDataset(name=self.name)
        graphs = []
        for data in dataset:
            edge_index = data[0]['edge_index']
            edge_feat = data[0]['edge_feat']
            node_feat = data[0]['node_feat']
            label = data[1]

            # Create adjacency matrix
            num_nodes = node_feat.shape[0]
            adj = np.zeros((num_nodes, num_nodes))
            for edge in edge_index.T:
                adj[edge[0], edge[1]] = 1

            # Create spektral Graph object
            graphs.append(Graph(x=node_feat, a=adj, e=edge_feat, y=label))

        return graphs

def ogb_available_datasets():
    #These regression datasets have size % 2 == 0 number of graphs
    return ['ogbg-molesol', 'ogbg-molfreesolv', 'ogbg-mollipo']

In [5]:
def _load_data(name: str):
    '''
    Loads a dataset from [TUDataset, OGB]
    '''
    if name == 'QM9':
        dataset = QM9(amount=1000)# 1000 and 100000 ok
    elif name in TUDataset.available_datasets():
        dataset = TUDataset(name)
    elif name in ogb_available_datasets():
        dataset= OGBDataset(name)
    else:
        raise ValueError(f'Dataset {name} unknown')

    return dataset, dataset.n_labels

In [6]:
def _split_data(data, train_test_split, seed):
    '''
    Split the data into train and test sets
    '''
    np.random.seed(seed)
    idxs = np.random.permutation(len(data))
    split = int(train_test_split * len(data))
    idx_train, idx_test = np.split(idxs, [split])
    train, test = data[idx_train], data[idx_test]
    return train, test

In [7]:
def get_data(config):
    seed = config['seed']
    train_test_split = config['train_test_split']
    name = config['dataset']

    # Load data
    data, config['n_out'] = _load_data(name)
    # Split data
    train_data, test_data = _split_data(data, train_test_split, seed)

    return train_data, test_data

In [8]:
def to_tf_signature(signature):
    """
    Converts a Dataset signature to a TensorFlow signature. Extended keys (idx_a, idx_b) for MyDisjointLoader.
    :param signature: a Dataset signature.
    :return: a TensorFlow signature.
    """
    output = []
    keys = ["x", "a", "e", "i", "idx_a", "idx_b"]
    for k in keys:
        if k in signature:
            shape = signature[k]["shape"]
            dtype = signature[k]["dtype"]
            spec = signature[k]["spec"]
            output.append(spec(shape, dtype))
    output = tuple(output)
    if "y" in signature:
        shape = signature["y"]["shape"]
        dtype = signature["y"]["dtype"]
        spec = signature["y"]["spec"]
        output = (output, spec(shape, dtype))

    return output

In [9]:
class MyDisjointLoader(DisjointLoader):
    """
    Extension of DisjointLoader class from spektral library. Additionally to data and targets, it also returns ranking pair indices.
    A Loader for [disjoint mode](https://graphneural.network/data-modes/#disjoint-mode).

    This loader represents a batch of graphs via their disjoint union.

    The loader automatically computes a batch index tensor, containing integer
    indices that map each node to its corresponding graph in the batch.

    The adjacency matrix os returned as a SparseTensor, regardless of the input.

    If `node_level=False`, the labels are interpreted as graph-level labels and
    are stacked along an additional dimension.
    If `node_level=True`, then the labels are stacked vertically.

    **Note:** TensorFlow 2.4 or above is required to use this Loader's `load()`
    method in a Keras training loop.

    **Arguments**

    - `dataset`: a graph Dataset;
    - `node_level`: bool, if `True` stack the labels vertically for node-level
    prediction;
    - `batch_size`: size of the mini-batches;
    - `epochs`: number of epochs to iterate over the dataset. By default (`None`)
    iterates indefinitely;
    - `shuffle`: whether to shuffle the data at the start of each epoch.

    **Output**

    For each batch, returns a tuple `(inputs, labels)`.

    `inputs` is a tuple containing:

    - `x`: node attributes of shape `[n_nodes, n_node_features]`;
    - `a`: adjacency matrices of shape `[n_nodes, n_nodes]`;
    - `e`: edge attributes of shape `[n_edges, n_edge_features]`;
    - `i`: batch index of shape `[n_nodes]`.

    `labels` have shape `[batch, n_labels]` if `node_level=False` or
    `[n_nodes, n_labels]` otherwise.

    """

    def __init__(
        self, dataset, node_level=False, batch_size=1, epochs=None, shuffle=True, seed=42, radius=4, sampling_ratio=100
    ):
        self.node_level = node_level
        super().__init__(dataset, batch_size=batch_size, epochs=epochs, shuffle=shuffle)
        self.seed = seed
        self.radius = radius
        self.sampling_ratio = sampling_ratio

    def collate(self, batch):
        idx_a, idx_b, target = self.sample_preference_pairs(batch, seed=self.seed, radius=self.radius, sampling_ratio=self.sampling_ratio)
        packed = self.pack(batch)

        y = packed.pop("y_list", None)
        if y is not None:
            y = collate_labels_disjoint(y, node_level=self.node_level)

        output = to_disjoint(**packed)
        output = sp_matrices_to_sp_tensors(output)

        return output + (idx_a, idx_b), target

    def load(self):
        print("load")
        if not tf_loader_available:
            raise RuntimeError(
                "Calling DisjointLoader.load() requires " "TensorFlow 2.4 or greater."
            )
        return tf.data.Dataset.from_generator(
            lambda: self, output_signature=self.tf_signature()
        )

    def tf_signature(self):
        """
        Adjacency matrix has shape [n_nodes, n_nodes]
        Node features have shape [n_nodes, n_node_features]
        Edge features have shape [n_edges, n_edge_features]
        Targets have shape [*, n_labels]
        Pairs have shape [*, 2]
        """
        signature = self.dataset.signature
        if "y" in signature:
            signature["y"]["shape"] = prepend_none(signature["y"]["shape"]) #(12800,) #(None, 1)
        if "a" in signature:
            signature["a"]["spec"] = tf.SparseTensorSpec

        signature["i"] = dict()
        signature["i"]["spec"] = tf.TensorSpec
        signature["i"]["shape"] = (None,)
        signature["i"]["dtype"] = tf.as_dtype(tf.int64)

        signature["idx_a"] = dict()
        signature["idx_a"]["spec"] = tf.TensorSpec
        signature["idx_a"]["shape"] = (None,)
        signature["idx_a"]["dtype"] = tf.as_dtype(tf.int64)
        signature["idx_b"] = dict()
        signature["idx_b"]["spec"] = tf.TensorSpec
        signature["idx_b"]["shape"] = (None,)
        signature["idx_b"]["dtype"] = tf.as_dtype(tf.int64)

        return to_tf_signature(signature)

    def sample_preference_pairs(self, graphs, radius=4, sampling_ratio=100, seed=42):
        seed = self.seed
        size = len(graphs)
        sample_size = size * radius * sampling_ratio
        r = np.arange(size)
        S = sp.csr_matrix((r, (r, r)), shape=(size, size))
        parts = np.split(S.data, S.indptr[1:-1])
        rnd = np.random.default_rng(seed)
        for part in parts:
            rnd.shuffle(part)
        idx_a = np.empty((sample_size,), dtype=np.int64)
        idx_b = np.empty((sample_size,), dtype=np.int64)
        target = np.ones((sample_size,), dtype=np.float64)
        k = 0
        for i in range(size):
            part = parts[i]
            psize = len(part)
            for d in range(radius):
                ni = (i + d + 1) % size
                npart = parts[ni]
                npsize = len(npart)
                for j in range(sampling_ratio):
                    npart_offset = np.roll(npart, d * sampling_ratio + j)
                    idx_a[k:k + psize] = part
                    if npsize < psize:
                        idx_b[k:k + npsize] = npart_offset
                        idx_b[k + npsize:k + psize] = npart_offset[:psize - npsize]
                    else:
                        idx_b[k:k + psize] = npart_offset[:psize]
                    if ni < i:
                        target[k:k + psize] = 0
                    k += psize
        return idx_a, idx_b, target.reshape(-1, 1)

In [10]:
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [11]:
train_graphs, test_graphs = get_data(config)
loader_tr = MyDisjointLoader(train_graphs, batch_size=config['batch_size'], epochs=config['epochs'], seed=config['seed'], radius=4, sampling_ratio=100)
loader_te = MyDisjointLoader(test_graphs, batch_size=config['batch_size'], epochs=1, seed=config['seed'], radius=1, sampling_ratio=1)

In [12]:
for b in loader_tr:
  print(b[0][0].shape)
  print(b[0][1].shape)
  print(b[0][2].shape)
  print(b[0][3].shape)
  print(b[0][4].shape)
  print(b[0][5].shape)
  break

(414, 9)
(414, 414)
(858, 3)
(414,)
(12800,)
(12800,)


In [13]:
def pref_lookup(X, pref_a, pref_b):
  X_a = tf.gather(X, pref_a, axis=0)
  X_b = tf.gather(X, pref_b, axis=0)
  return X_a, X_b

In [22]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from spektral.layers import GlobalSumPool, ECCConv, GraphMasking


class PRGNN(tf.keras.Model):
    def __init__(self, config):
        super().__init__()
        self.masking = GraphMasking()
        self.conv1 = ECCConv(256, activation="relu")
        self.conv2 = ECCConv(128, activation="relu")
        self.conv3 = ECCConv(64, activation="relu")
        self.conv4 = ECCConv(32, activation="relu")
        self.conv5 = ECCConv(16, activation="relu")
        self.dropout = Dropout(0.5)
        self.batchnorm = BatchNormalization()
        self.global_pool = GlobalSumPool()
        self.dense = Dense(config['n_out'], activation='relu')

    def call(self, inputs):
        x, a, e, i, idx_a, idx_b = inputs
        x = tf.cast(x, tf.float32)
        a = tf.cast(a, tf.float32)#anpassen
        e = tf.cast(e, tf.float32)

        x = self.masking(x)
        X = self.conv1([x, a, e])
        # X = self.dropout(X)
        X = self.conv2([X, a, e])
        # X = self.dropout(X)
        X = self.conv3([X, a, e])
        # X = self.dropout(X)
        X = self.conv4([X, a, e])
        # X = self.dropout(X)
        X = self.conv5([X, a, e])
        # X = self.dropout(X)

        # X = self.global_pool([X, i])
        X_util = self.dense(X)
        X_a, X_b = self.pref_lookup(X_util, idx_a, idx_b)

        return X_b - X_a, X_util

    def pref_lookup(self, X, pref_a, pref_b):

        X_a = tf.gather(X, pref_a, axis=0)
        X_b = tf.gather(X, pref_b, axis=0)

        return X_a, X_b

In [23]:
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from spektral.layers import GlobalSumPool, ECCConv, GraphMasking, GCNConv

from keras.losses import BinaryCrossentropy, MeanSquaredError
from keras.metrics import BinaryAccuracy

channels = 256  # Number of channels for GCN layers
dropout = 0.5  # Dropout rate for the features

def createPairwiseModel(config):#als tf function
        X_input = tf.keras.Input(shape=(9,), dtype=tf.float32)
        a_input = tf.keras.Input(shape=(None,), sparse=True)
        e_input = tf.keras.Input(shape=(3,), dtype=tf.float32)
        i_input = tf.keras.Input(shape=(None,), dtype=tf.int32)
        pref_a = tf.keras.Input(shape=(None,), dtype=tf.int32)
        pref_b = tf.keras.Input(shape=(None,), dtype=tf.int32)

        _model = PRGNN(config)
        out, X_utils = _model([X_input, a_input, e_input, i_input, pref_a, pref_b])

        m = tf.keras.Model(inputs=[X_input, a_input, e_input, i_input, pref_a, pref_b], outputs=out, name="RankNet")
        m_infer = tf.keras.Model(inputs=[X_input, a_input, e_input, i_input], outputs=X_utils, name="RankNet_predictor")

        m.compile(
            optimizer=
                Adam(config['learning_rate']),
                loss=BinaryCrossentropy(from_logits=True),
                metrics=[BinaryAccuracy(threshold=.0)]

        )

        return m, m_infer

In [24]:
rns = []
hs = []
for i in range(3):
  print(f"Training repeat {i+1}/3...")
  rn, rn_inf = createPairwiseModel(config)
  h = rn.fit(loader_tr.load(), validation_data=loader_te.load(), epochs=500, verbose=2)
  rns.append(rn_inf)
  hs.append(h)

Training repeat 1/3...


ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, None), dtype=tf.int32, name='input_23'), name='input_23', description="created by layer 'input_23'") at layer "prgnn_3". The following previous layers were accessed without issue: []