In [1]:
#import spektral
import numpy as np
import tensorflow as tf
from ogb.graphproppred import GraphPropPredDataset
from spektral.data import Dataset, Graph
#from spektral.datasets import TUDataset, QM9


In [2]:
# Experiment settings for this notebook.
config = dict(
    seed=1,                  # used to seed NumPy and TensorFlow right below
    epochs=10,
    batch_size=32,
    learning_rate=0.001,
    # Author's note (translated from German):
    #   YES: QM9, ogbg-molesol, ogbg-molfreesolv, ogbg-mollipo, ZINC | NO: aspirin
    dataset='ogbg-molesol',
    train_test_split=0.8,
)

# Seed the global NumPy and TensorFlow random generators with the same value.
np.random.seed(config['seed'])
tf.random.set_seed(config['seed'])

In [3]:
class OGBDataset(Dataset):
    '''
    (spektral) Dataset class wrapper for Open Graph Benchmark datasets.

    Every entry of the OGB dataset is converted into a spektral ``Graph``
    holding the node features, a dense adjacency matrix, the edge features
    and the label.

    Parameters
    ----------
    name : str
        Dataset name handed to ``GraphPropPredDataset``.
    **kwargs
        Forwarded unchanged to the spektral ``Dataset`` initialiser.
    '''
    def __init__(self, name, **kwargs):
        # Set the name before delegating to the base class so that it is
        # already available to ``read``.
        # NOTE(review): presumably the spektral base initialiser triggers
        # ``read()`` -- confirm against the spektral version in use.
        self.name = name
        super().__init__(**kwargs)

    def read(self):
        '''
        Load the OGB dataset and convert it to a list of spektral graphs.

        Also stores the number of converted graphs in ``self.size``.

        Returns
        -------
        list of Graph
            One ``Graph(x, a, e, y)`` per entry of the OGB dataset, in the
            order the OGB dataset yields them.
        '''
        dataset = GraphPropPredDataset(name=self.name)
        graphs = []
        for data in dataset:
            # Each entry is indexed as (graph dictionary, label).
            graph_dict = data[0]
            label = data[1]

            edge_index = graph_dict['edge_index']
            edge_feat = graph_dict['edge_feat']
            node_feat = graph_dict['node_feat']

            # Dense adjacency matrix: entry [src, dst] is 1 for every column
            # (src, dst) of ``edge_index``. A single integer-array assignment
            # replaces the per-edge Python loop and yields the same matrix.
            num_nodes = node_feat.shape[0]
            adj = np.zeros((num_nodes, num_nodes))
            adj[edge_index[0], edge_index[1]] = 1

            # Create spektral Graph object
            graphs.append(Graph(x=node_feat, a=adj, e=edge_feat, y=label))

        self.size = len(graphs)

        return graphs

def ogb_available_datasets():
    '''
    Return the names of the OGB datasets this notebook can load.

    Author's note: these are regression datasets whose number of graphs is
    even (size % 2 == 0).

    Returns
    -------
    list of str
        A fresh list on every call.
    '''
    supported = (
        'ogbg-molesol',
        'ogbg-molfreesolv',
        'ogbg-mollipo',
    )
    return list(supported)

In [4]:
def _load_data(name: str):
    '''
    Load the dataset called ``name`` and report its number of labels.

    Only the names returned by ``ogb_available_datasets()`` are accepted;
    they are loaded through ``OGBDataset``.

    Parameters
    ----------
    name : str
        Dataset name.

    Returns
    -------
    tuple
        ``(dataset, dataset.n_labels)``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported dataset names.
    '''
    # Guard clause: reject unsupported names before loading anything.
    if name not in ogb_available_datasets():
        raise ValueError(f'Dataset {name} unknown')

    loaded = OGBDataset(name)
    return loaded, loaded.n_labels

In [5]:
def _split_data(data, train_test_split, seed):
    '''
    Randomly partition ``data`` into a train and a test subset.

    The global NumPy random generator is re-seeded with ``seed`` and used to
    draw a permutation of all positions; the first
    ``int(train_test_split * len(data))`` positions form the train subset,
    the remainder the test subset.

    Parameters
    ----------
    data
        Container supporting ``len()`` and indexing with an integer array;
        the indexed result must accept a ``size`` attribute.
    train_test_split : float
        Fraction of the elements assigned to the train subset.
    seed : int
        Seed passed to ``np.random.seed``.

    Returns
    -------
    tuple
        ``(train, test)``, each with ``size`` set to its own length.
    '''
    np.random.seed(seed)
    n_total = len(data)
    shuffled = np.random.permutation(n_total)
    n_train = int(train_test_split * n_total)

    train = data[shuffled[:n_train]]
    test = data[shuffled[n_train:]]

    # Refresh the size attribute on both subsets so it matches their length.
    train.size = len(train)
    test.size = len(test)
    return train, test

In [6]:
def get_data(config):
    '''
    Load the configured dataset and split it into train and test subsets.

    Parameters
    ----------
    config : dict
        Must contain the keys ``'seed'``, ``'train_test_split'`` and
        ``'dataset'``. The key ``'n_out'`` is written with the number of
        labels reported by ``_load_data``.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` as produced by ``_split_data``.
    '''
    # Read every setting first so a missing key fails before any loading.
    seed, ratio, dataset_name = (
        config['seed'],
        config['train_test_split'],
        config['dataset'],
    )

    loaded, n_labels = _load_data(dataset_name)
    config['n_out'] = n_labels

    train_part, test_part = _split_data(loaded, ratio, seed)
    return train_part, test_part

In [7]:
# Load the dataset named in `config` and split it into train/test subsets
# (this call also stores the label count in config['n_out']).
dataset_train, dataset_test = get_data(config)

In [8]:
# Sanity check: number of graphs in each split.
len(dataset_train), len(dataset_test)

(902, 226)

In [9]:
# Display the repr of the training split.
dataset_train

OGBDataset(n_graphs=902)

In [10]:
# Collect the label (`y`) of every graph in the test split.
us = [g.y for g in dataset_test]

In [11]:
# Indices that would sort the test labels along axis 0 (the graph axis).
idx_sort = np.argsort(us, axis=0)

In [12]:
# Hand-picked positions used below to try integer-array indexing of the dataset.
a = np.array([0,2,5,165])

In [16]:
# Index the test split with the integer array and print each selected graph.
for d in dataset_test[a]:
    print(d)

Graph(n_nodes=7, n_node_features=9, n_edge_features=3, n_labels=1)
Graph(n_nodes=9, n_node_features=9, n_edge_features=3, n_labels=1)
Graph(n_nodes=7, n_node_features=9, n_edge_features=3, n_labels=1)
Graph(n_nodes=12, n_node_features=9, n_edge_features=3, n_labels=1)


In [19]:
# Indexing with a single integer displays one graph.
dataset_test[165]

Graph(n_nodes=12, n_node_features=9, n_edge_features=3, n_labels=1)

In [21]:
import tensorflow as tf

class CustomDataLoader(tf.keras.utils.Sequence):
    '''
    Keras ``Sequence`` serving mini-batches of a dataset by position.

    Work in progress: ``__getitem__`` returns the indexed sub-dataset and a
    plain list of labels; the index pairs sampled per batch are computed but
    not consumed yet.

    Parameters
    ----------
    data
        Container supporting ``len()`` and indexing with an integer array;
        iterating the indexed result must yield objects with a ``y``
        attribute.
    batch_size : int
        Number of elements per batch.
    shuffle : bool
        Whether to reshuffle the element order on construction and after
        every epoch (uses the global NumPy random generator).
    seed : int
        Base seed for the pair sampling in ``iterate_train_random``.
    sampling_ratio : float
        Fraction of all unordered pairs of a batch to sample. The sample
        size is capped at the number of pairs, so any value >= 1 selects
        every pair.
    **kwargs
        Forwarded to the Keras base class initialiser.
    '''
    def __init__(self, data, batch_size=32, shuffle=True, seed=42, sampling_ratio=20, **kwargs):
        # Initialise the Keras base class; newer Keras versions expect
        # subclasses to do so.
        super().__init__(**kwargs)
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.sampling_ratio = sampling_ratio
        self.indices = np.arange(len(self.data))
        # Apply the initial shuffle (if enabled).
        self.on_epoch_end()

    def __len__(self):
        '''Number of full batches per epoch; a trailing partial batch is dropped.'''
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        '''
        Return batch number ``index`` as ``(batch_data, batch_labels)``.

        ``batch_data`` is ``self.data`` indexed with the batch's positions
        and ``batch_labels`` is the list of ``y`` attributes of its elements.
        '''
        start = index * self.batch_size
        indices = np.array(self.indices[start:start + self.batch_size])
        # TODO: decide whether the pairing belongs here or inside
        # iterate_train_random, and whether its elements should be the
        # dataset indices or something else. The pairs are not used yet.
        pairs = list(self.iterate_train_random(indices))
        # TODO: open question -- does the batch data need further processing ...
        batch_data = self.data[indices]
        # TODO: ... or only the labels?
        batch_labels = [g.y for g in batch_data]
        return batch_data, batch_labels

    def on_epoch_end(self):
        '''Reshuffle the element order in place when shuffling is enabled.'''
        if self.shuffle:
            np.random.shuffle(self.indices)

    def iterate_train_random(self, elements):
        '''
        Sample distinct unordered pairs of positions within ``elements``.

        All ``n * (n - 1) / 2`` pairs are numbered consecutively; a subset of
        these numbers is drawn without replacement and each number is decoded
        back into a pair of ranks ``(a, b)`` with ``a < b``. The ranks are
        mapped through the argsort of ``elements``, so the first position of
        every pair refers to a value that sorts before the second one.

        The generator is seeded with ``self.seed + len(elements)``, so inputs
        of equal length always receive the same sample of rank pairs.

        Parameters
        ----------
        elements : array-like
            One-dimensional values to pair up.

        Returns
        -------
        zip
            Iterator over ``(idx_a, idx_b)`` positions into ``elements``.
        '''
        objects = np.asarray(elements)
        sort_idx = np.argsort(objects)
        olen = objects.size
        seed = self.seed + olen
        pair_count = (olen * (olen - 1)) // 2
        sample_size = min(int(self.sampling_ratio * pair_count), pair_count)
        rng = np.random.default_rng(seed)

        sample = rng.choice(pair_count, sample_size, replace=False)
        # Invert the triangular numbering k = b * (b - 1) / 2 + a.
        # ``np.int`` was removed from NumPy, so use an explicit integer type.
        sample_b = (np.sqrt(sample * 2 + 1/4) + 1/2).astype(np.int64)
        sample_a = sample - (sample_b * (sample_b - 1)) // 2
        idx_a = sort_idx[sample_a]
        idx_b = sort_idx[sample_b]

        return zip(idx_a, idx_b)

    def get_graphs_from_indices(self, idx_pair):
        '''
        Placeholder: print the two indices of ``idx_pair`` and return 0.
        '''
        index_a, index_b = idx_pair
        print(index_a,index_b )
        return 0

In [23]:
# Sample data
# NOTE(review): `data` and `labels` are not referenced again in this cell.
data = np.random.rand(1000, 32)  # 1000 samples, 32 features
labels = np.random.randint(0, 2, 1000)  # Binary labels

# Create DataLoader over the graph test split.
# A sampling_ratio of 4 is >= 1, so the loader's pair sampling selects every pair.
batch_size = 64
sampling_ratio=4
data_loader = CustomDataLoader(dataset_test, batch_size=batch_size, sampling_ratio=sampling_ratio)

# Example model
# NOTE(review): this network is declared for dense inputs with 32 features
# and a binary target, which matches the random sample data above rather
# than the graph batches produced by `data_loader`.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(32,)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit model
# NOTE(review): this call fails with the AttributeError shown below the cell.
# CustomDataLoader.__getitem__ returns (dataset subset, list of labels), and
# the dataset subset has no `shape` attribute. The loader has to emit arrays
# or tensors (and the model has to accept graph inputs) before this can run.
model.fit(data_loader, epochs=1, verbose=1)

AttributeError: 'OGBDataset' object has no attribute 'shape'