In [1]:
import numpy as np
import tensorflow as tf
from ogb.graphproppred import GraphPropPredDataset
import numpy as np
from spektral.datasets import TUDataset, QM9
from spektral.data import Dataset, Graph
from collections import Counter
from scipy.sparse import hstack, vstack

2025-02-03 11:41:04.277350: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
class OGBDataset(Dataset):
    '''
    (spektral) Dataset class wrapper for Open Graph Benchmark datasets.
    '''
    def __init__(self, name, **kwargs):
        self.name = name
        super().__init__(**kwargs)

    def read(self):
        dataset = GraphPropPredDataset(name=self.name)
        graphs = []
        for data in dataset:
            edge_index = data[0]['edge_index']
            edge_feat = data[0]['edge_feat']
            node_feat = data[0]['node_feat']
            label = data[1]

            # Create adjacency matrix
            num_nodes = node_feat.shape[0]
            adj = np.zeros((num_nodes, num_nodes))
            for edge in edge_index.T:
                adj[edge[0], edge[1]] = 1

            # Create spektral Graph object
            graphs.append(Graph(x=node_feat, a=adj, e=edge_feat, y=label))
            
        self.size = len(graphs)

        return graphs

def ogb_available_datasets():
    #These regression datasets have size % 2 == 0 number of graphs
    return ['ogbg-molesol', 'ogbg-molfreesolv', 'ogbg-mollipo']

In [3]:
def _load_data(name: str):
    '''
    Loads a dataset from [TUDataset, OGB]
    '''
    # if name == 'QM9':
    #     dataset = QM9(amount=10)# 1000 and 100000 ok
    if name in TUDataset.available_datasets():
        dataset = TUDataset(name)
        config['x_shape1'] = 28
    elif name in ogb_available_datasets():
        dataset= OGBDataset(name)
        config['x_shape1'] = 9
    else:
        raise ValueError(f'Dataset {name} unknown')

    return dataset, dataset.n_labels

In [4]:
def _split_data(data, train_test_split, seed):
    '''
    Split the data into train and test sets
    '''
    np.random.seed(seed)
    idxs = np.random.permutation(len(data))
    split = int(train_test_split * len(data))
    idx_train, idx_test = np.split(idxs, [split])
    train, test = data[idx_train], data[idx_test]
    train.size = len(train)
    test.size = len(test)
    return train, test

In [5]:
def get_data(config):
    seed = config['seed']
    train_test_split = config['train_test_split']
    name = config['dataset']

    # Load data
    data, config['n_out'] = _load_data(name)
    # Split data
    train_data, test_data = _split_data(data, train_test_split, seed)

    return train_data, test_data

In [6]:
config = {
    'seed': 1,
    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 0.001,
    'dataset': 'ogbg-molesol', #JA: QM9, ogbg-molesol, ogbg-molfreesolv, ogbg-mollipo, ZINC_full| NEIN: aspirin
    'train_test_split': 0.8
}

In [7]:
dataset_train, dataset_test = get_data(config)

In [8]:
dataset_train

OGBDataset(n_graphs=902)

In [9]:
config

{'seed': 1,
 'epochs': 10,
 'batch_size': 32,
 'learning_rate': 0.001,
 'dataset': 'ogbg-molesol',
 'train_test_split': 0.8,
 'x_shape1': 9,
 'n_out': 1}

In [10]:
g_1, g_2 = dataset_test[4], dataset_test[45]

In [11]:
g_1.x.shape, g_2.x.shape

((8, 9), (23, 9))

In [12]:
g_1.a.shape, g_2.a.shape

((8, 8), (23, 23))

In [13]:
g_1.e.shape, g_2.e.shape

((16, 3), (46, 3))

In [14]:
g_1.a

array([[0., 1., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1., 0., 1.],
       [0., 0., 1., 0., 0., 0., 1., 0.]])

In [15]:
g_2.a

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0.,

In [16]:
g_1.a = np.pad(g_1.a, ((0, g_2.a.shape[0]-g_1.a.shape[0]), (0, g_2.a.shape[1]-g_1.a.shape[1])), mode='constant', constant_values=0)

In [17]:
g_1.a

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0.,

In [18]:
new_g = Graph()

In [19]:
g_1.a.shape, g_2.a.shape

((23, 23), (23, 23))

In [20]:
a = np.vstack((g_1.a, g_2.a))

In [21]:
a.shape

(46, 23)

In [22]:
x = np.vstack((g_1.x, g_2.x))

In [23]:
x.shape

(31, 9)

In [24]:
e = np.vstack((g_1.e, g_2.e))

In [25]:
e.shape

(62, 3)

In [26]:
new_g.a = a
new_g.x = x
new_g.e = e

In [27]:
new_g

Graph(n_nodes=31, n_node_features=9, n_edge_features=3, n_labels=None)

In [28]:
g_2

Graph(n_nodes=23, n_node_features=9, n_edge_features=3, n_labels=1)

In [29]:
idx_a = [2,4,6,7,8,9,12,15,76,87,92,123,221,178,94,199,245,267,301,455,467,478,499]
idx_b = [512,501,500,452,409,480,212,312,5,345,7,58,2,23,99,302,303,333,387,356,401,405,406]
len(idx_a), len(idx_b)

(23, 23)

# info
ogb graphs x.shape[1] is 9 
zinc_fill graphs x.shape[1] is 28
graphs e.shape is 3
graphs a.shape is different for all graphs

In [30]:
for i, el in enumerate(dataset_train):
    if (el.x.shape[1]!=28 or el.e.shape[1]!=3):
        print(el.x.shape, el.e.shape)
    #print(el.a.shape)
print("in shape")

(13, 9) (24, 3)
(24, 9) (50, 3)
(6, 9) (12, 3)
(18, 9) (38, 3)
(11, 9) (22, 3)
(15, 9) (32, 3)
(11, 9) (24, 3)
(20, 9) (42, 3)
(23, 9) (52, 3)
(12, 9) (22, 3)
(17, 9) (34, 3)
(9, 9) (18, 3)
(16, 9) (34, 3)
(9, 9) (18, 3)
(16, 9) (32, 3)
(34, 9) (72, 3)
(4, 9) (6, 3)
(7, 9) (12, 3)
(11, 9) (24, 3)
(11, 9) (24, 3)
(18, 9) (38, 3)
(9, 9) (16, 3)
(4, 9) (6, 3)
(4, 9) (6, 3)
(7, 9) (12, 3)
(14, 9) (30, 3)
(22, 9) (46, 3)
(11, 9) (20, 3)
(8, 9) (16, 3)
(9, 9) (20, 3)
(8, 9) (16, 3)
(13, 9) (28, 3)
(9, 9) (20, 3)
(18, 9) (38, 3)
(28, 9) (56, 3)
(17, 9) (34, 3)
(16, 9) (34, 3)
(10, 9) (18, 3)
(18, 9) (42, 3)
(11, 9) (24, 3)
(26, 9) (58, 3)
(14, 9) (26, 3)
(20, 9) (40, 3)
(21, 9) (48, 3)
(4, 9) (6, 3)
(8, 9) (14, 3)
(17, 9) (36, 3)
(13, 9) (28, 3)
(9, 9) (16, 3)
(18, 9) (36, 3)
(22, 9) (42, 3)
(7, 9) (14, 3)
(5, 9) (8, 3)
(6, 9) (10, 3)
(8, 9) (16, 3)
(14, 9) (26, 3)
(17, 9) (36, 3)
(6, 9) (10, 3)
(7, 9) (12, 3)
(14, 9) (30, 3)
(22, 9) (46, 3)
(12, 9) (26, 3)
(11, 9) (22, 3)
(2, 9) (2, 3)
(12, 

In [31]:
# mode=="type-vstack"
assert(len(idx_a)==len(idx_b))
for i in range(len(idx_a)):
    g_1, g_2 = dataset_train[idx_a[i]], dataset_train[idx_b[i]]
    assert(g_1.x.shape[1]==config['x_shape1'] and g_2.x.shape[1]==config['x_shape1'])
    assert(g_2.e.shape[1]==3 and g_2.e.shape[1]==3)
    
    if not (g_1.a.shape>g_2.a.shape or g_1.a.shape<g_2.a.shape): 
        a = np.vstack((g_1.a, g_2.a))
    elif (g_1.a.shape > g_2.a.shape):
        _a = np.pad(g_2.a, ((0, g_1.a.shape[0]-g_2.a.shape[0]), (0, g_1.a.shape[1]-g_2.a.shape[1])), mode='constant', constant_values=0)
        a = np.vstack((g_1.a, _a))
    elif (g_1.a.shape < g_2.a.shape):
        _a = np.pad(g_1.a, ((0, g_2.a.shape[0]-g_1.a.shape[0]), (0, g_2.a.shape[1]-g_1.a.shape[1])), mode='constant', constant_values=0)
        a = np.vstack((_a, g_2.a))
    
    g_n = Graph()
    g_n.x = np.vstack((g_1.x, g_2.x))
    g_n.e = np.vstack((g_1.e, g_2.e))
    g_n.a = a

In [32]:
# mode=="type-hstack"
assert(len(idx_a)==len(idx_b))
for i in range(len(idx_a)):
    g_1, g_2 = dataset_train[idx_a[i]], dataset_train[idx_b[i]]
    assert(g_1.x.shape[1]==config['x_shape1'] and g_2.x.shape[1]==config['x_shape1'])
    assert(g_2.e.shape[1]==3 and g_2.e.shape[1]==3)
    #a
    if not (g_1.a.shape > g_2.a.shape or g_1.a.shape < g_2.a.shape): 
        a = np.hstack((g_1.a, g_2.a))
    elif (g_1.a.shape > g_2.a.shape):
        _a = np.pad(g_2.a, ((0, g_1.a.shape[0]-g_2.a.shape[0]), (0, g_1.a.shape[1]-g_2.a.shape[1])), mode='constant', constant_values=0)
        a = np.hstack((g_1.a, _a))
    elif (g_1.a.shape < g_2.a.shape):
        _a = np.pad(g_1.a, ((0, g_2.a.shape[0]-g_1.a.shape[0]), (0, g_2.a.shape[1]-g_1.a.shape[1])), mode='constant', constant_values=0)
        a = np.hstack((_a, g_2.a))
    #x
    if not (g_1.x.shape > g_2.x.shape or g_1.x.shape < g_2.x.shape): 
        x = np.hstack((g_1.x, g_2.x))
    elif (g_1.x.shape > g_2.x.shape):
        _x = np.pad(g_2.x, ((0, g_1.x.shape[0]-g_2.x.shape[0]), (0, g_1.x.shape[1]-g_2.x.shape[1])), mode='constant', constant_values=0)
        x = np.hstack((g_1.x, _x))
    elif (g_1.x.shape < g_2.x.shape):
        _x = np.pad(g_1.x, ((0, g_2.x.shape[0]-g_1.x.shape[0]), (0, g_2.x.shape[1]-g_1.x.shape[1])), mode='constant', constant_values=0)
        x = np.hstack((_x, g_2.x))  
    #e
    if not (g_1.e.shape > g_2.e.shape or g_1.e.shape < g_2.e.shape): 
        e = np.hstack((g_1.e, g_2.e))
    elif (g_1.e.shape > g_2.e.shape):
        _e = np.pad(g_2.e, ((0, g_1.e.shape[0]-g_2.e.shape[0]), (0, g_1.e.shape[1]-g_2.e.shape[1])), mode='constant', constant_values=0)
        e = np.hstack((g_1.e, _e))
    elif (g_1.e.shape < g_2.e.shape):
        _e = np.pad(g_1.e, ((0, g_2.e.shape[0]-g_1.e.shape[0]), (0, g_2.e.shape[1]-g_1.e.shape[1])), mode='constant', constant_values=0)
        e = np.hstack((_e, g_2.e))
    
    g_n = Graph()
    g_n.x = x
    g_n.e = e
    g_n.a = a

In [33]:
g_1, g_2 = dataset_train[2], dataset_train[512]
print(g_1.x.shape,g_2.x.shape)

(6, 9) (27, 9)


In [34]:
g_n = g_1 +g_2

TypeError: unsupported operand type(s) for +: 'Graph' and 'Graph'

In [37]:
# mode=="type-merge-mult"
assert(len(idx_a)==len(idx_b))
for i in range(len(idx_a)):
    g_1, g_2 = dataset_train[idx_a[i]], dataset_train[idx_b[i]]
    assert(g_1.x.shape[1]==config['x_shape1'] and g_2.x.shape[1]==config['x_shape1'])
    assert(g_2.e.shape[1]==3 and g_2.e.shape[1]==3)
    #a
    if not (g_1.a.shape > g_2.a.shape or g_1.a.shape < g_2.a.shape): 
        a = g_1.a * g_2.a
    elif (g_1.a.shape > g_2.a.shape):
        _a = np.pad(g_2.a, ((0, g_1.a.shape[0]-g_2.a.shape[0]), (0, g_1.a.shape[1]-g_2.a.shape[1])), mode='constant', constant_values=0)
        a = g_1.a * _a
    elif (g_1.a.shape < g_2.a.shape):
        _a = np.pad(g_1.a, ((0, g_2.a.shape[0]-g_1.a.shape[0]), (0, g_2.a.shape[1]-g_1.a.shape[1])), mode='constant', constant_values=0)
        a = _a * g_2.a
    #x
    if not (g_1.x.shape > g_2.x.shape or g_1.x.shape < g_2.x.shape): 
        x = g_1.x * g_2.x
    elif (g_1.x.shape > g_2.x.shape):
        _x = np.pad(g_2.x, ((0, g_1.x.shape[0]-g_2.x.shape[0]), (0, g_1.x.shape[1]-g_2.x.shape[1])), mode='constant', constant_values=0)
        x = g_1.x * _x
    elif (g_1.x.shape < g_2.x.shape):
        _x = np.pad(g_1.x, ((0, g_2.x.shape[0]-g_1.x.shape[0]), (0, g_2.x.shape[1]-g_1.x.shape[1])), mode='constant', constant_values=0)
        x = _x * g_2.x
    #e
    if not (g_1.e.shape > g_2.e.shape or g_1.e.shape < g_2.e.shape): 
        e = g_1.e * g_2.e
    elif (g_1.e.shape > g_2.e.shape):
        _e = np.pad(g_2.e, ((0, g_1.e.shape[0]-g_2.e.shape[0]), (0, g_1.e.shape[1]-g_2.e.shape[1])), mode='constant', constant_values=0)
        e = g_1.e * _e
    elif (g_1.e.shape < g_2.e.shape):
        _e = np.pad(g_1.e, ((0, g_2.e.shape[0]-g_1.e.shape[0]), (0, g_2.e.shape[1]-g_1.e.shape[1])), mode='constant', constant_values=0)
        e = _e * g_2.e
    g_n = Graph()
    g_n.a = a
    g_n.x = x
    g_n.e = e

In [41]:
# mode=="type-merge-add"
assert(len(idx_a)==len(idx_b))
for i in range(len(idx_a)):
    g_1, g_2 = dataset_train[idx_a[i]], dataset_train[idx_b[i]]
    assert(g_1.x.shape[1]==config['x_shape1'] and g_2.x.shape[1]==config['x_shape1'])
    assert(g_2.e.shape[1]==3 and g_2.e.shape[1]==3)
    #a
    if not (g_1.a.shape > g_2.a.shape or g_1.a.shape < g_2.a.shape): 
        a = g_1.a + g_2.a
    elif (g_1.a.shape > g_2.a.shape):
        _a = np.pad(g_2.a, ((0, g_1.a.shape[0]-g_2.a.shape[0]), (0, g_1.a.shape[1]-g_2.a.shape[1])), mode='constant', constant_values=0)
        a = g_1.a + _a
    elif (g_1.a.shape < g_2.a.shape):
        _a = np.pad(g_1.a, ((0, g_2.a.shape[0]-g_1.a.shape[0]), (0, g_2.a.shape[1]-g_1.a.shape[1])), mode='constant', constant_values=0)
        a = _a + g_2.a
    #x
    if not (g_1.x.shape > g_2.x.shape or g_1.x.shape < g_2.x.shape): 
        x = g_1.x + g_2.x
    elif (g_1.x.shape > g_2.x.shape):
        _x = np.pad(g_2.x, ((0, g_1.x.shape[0]-g_2.x.shape[0]), (0, g_1.x.shape[1]-g_2.x.shape[1])), mode='constant', constant_values=0)
        x = g_1.x + _x
    elif (g_1.x.shape < g_2.x.shape):
        _x = np.pad(g_1.x, ((0, g_2.x.shape[0]-g_1.x.shape[0]), (0, g_2.x.shape[1]-g_1.x.shape[1])), mode='constant', constant_values=0)
        x = _x + g_2.x
    #e
    if not (g_1.e.shape > g_2.e.shape or g_1.e.shape < g_2.e.shape): 
        e = g_1.e + g_2.e
    elif (g_1.e.shape > g_2.e.shape):
        _e = np.pad(g_2.e, ((0, g_1.e.shape[0]-g_2.e.shape[0]), (0, g_1.e.shape[1]-g_2.e.shape[1])), mode='constant', constant_values=0)
        e = g_1.e + _e
    elif (g_1.e.shape < g_2.e.shape):
        _e = np.pad(g_1.e, ((0, g_2.e.shape[0]-g_1.e.shape[0]), (0, g_2.e.shape[1]-g_1.e.shape[1])), mode='constant', constant_values=0)
        e = _e + g_2.e
    g_n = Graph()
    g_n.a = a
    g_n.x = x
    g_n.e = e

In [43]:
# mode=="type-merge-mean"
assert(len(idx_a)==len(idx_b))
for i in range(len(idx_a)):
    g_1, g_2 = dataset_train[idx_a[i]], dataset_train[idx_b[i]]
    assert(g_1.x.shape[1]==config['x_shape1'] and g_2.x.shape[1]==config['x_shape1'])
    assert(g_2.e.shape[1]==3 and g_2.e.shape[1]==3)
    #a
    if not (g_1.a.shape > g_2.a.shape or g_1.a.shape < g_2.a.shape): 
        a = (g_1.a + g_2.a) // 2
    elif (g_1.a.shape > g_2.a.shape):
        _a = np.pad(g_2.a, ((0, g_1.a.shape[0]-g_2.a.shape[0]), (0, g_1.a.shape[1]-g_2.a.shape[1])), mode='constant', constant_values=0)
        a = (g_1.a + _a) // 2
    elif (g_1.a.shape < g_2.a.shape):
        _a = np.pad(g_1.a, ((0, g_2.a.shape[0]-g_1.a.shape[0]), (0, g_2.a.shape[1]-g_1.a.shape[1])), mode='constant', constant_values=0)
        a = (_a + g_2.a) // 2
    #x
    if not (g_1.x.shape > g_2.x.shape or g_1.x.shape < g_2.x.shape): 
        x = (g_1.x + g_2.x) // 2
    elif (g_1.x.shape > g_2.x.shape):
        _x = np.pad(g_2.x, ((0, g_1.x.shape[0]-g_2.x.shape[0]), (0, g_1.x.shape[1]-g_2.x.shape[1])), mode='constant', constant_values=0)
        x = (g_1.x + _x) // 2
    elif (g_1.x.shape < g_2.x.shape):
        _x = np.pad(g_1.x, ((0, g_2.x.shape[0]-g_1.x.shape[0]), (0, g_2.x.shape[1]-g_1.x.shape[1])), mode='constant', constant_values=0)
        x = (_x + g_2.x) // 2
    #e
    if not (g_1.e.shape > g_2.e.shape or g_1.e.shape < g_2.e.shape): 
        e = (g_1.e + g_2.e) // 2
    elif (g_1.e.shape > g_2.e.shape):
        _e = np.pad(g_2.e, ((0, g_1.e.shape[0]-g_2.e.shape[0]), (0, g_1.e.shape[1]-g_2.e.shape[1])), mode='constant', constant_values=0)
        e = (g_1.e + _e) // 2
    elif (g_1.e.shape < g_2.e.shape):
        _e = np.pad(g_1.e, ((0, g_2.e.shape[0]-g_1.e.shape[0]), (0, g_2.e.shape[1]-g_1.e.shape[1])), mode='constant', constant_values=0)
        e = (_e + g_2.e) // 2
    g_n = Graph()
    g_n.a = a
    g_n.x = x
    g_n.e = e