In [1]:
import numpy as np
from ogb.graphproppred import GraphPropPredDataset
from spektral.datasets import TUDataset, QM9
from spektral.data import Dataset, Graph

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class OGBDataset(Dataset):
    '''
    (spektral) Dataset class wrapper for Open Graph Benchmark datasets.

    Every entry of the OGB graph-property-prediction dataset is converted
    into a spektral ``Graph`` whose adjacency is a dense matrix.

    Parameters
    ----------
    name : str
        Dataset name handed to ``GraphPropPredDataset`` (e.g. ``'ogbg-molesol'``).
    **kwargs
        Forwarded unchanged to ``Dataset.__init__``.
    '''
    def __init__(self, name, **kwargs):
        # The name is stored before the base constructor runs because read()
        # needs it. NOTE(review): presumably Dataset.__init__ calls read() - confirm.
        self.name = name
        super().__init__(**kwargs)

    def read(self):
        '''
        Load the OGB dataset ``self.name`` and convert it graph by graph.

        Returns
        -------
        list
            One ``Graph`` per dataset entry, in dataset order.
        '''
        dataset = GraphPropPredDataset(name=self.name)
        graphs = []
        for data in dataset:
            graph_dict, label = data[0], data[1]
            edge_index = graph_dict['edge_index']
            edge_feat = graph_dict['edge_feat']
            node_feat = graph_dict['node_feat']

            # Dense adjacency: one row/column per node, 1 where an edge is listed.
            num_nodes = node_feat.shape[0]
            adj = np.zeros((num_nodes, num_nodes))
            # Row 0 of edge_index holds the first endpoint of every edge, row 1
            # the second; a single fancy-index assignment sets all entries at
            # once instead of looping over the edges in Python.
            adj[edge_index[0], edge_index[1]] = 1

            # NOTE(review): edge_feat is passed exactly as OGB provides it while
            # `a` is dense - confirm downstream loaders accept this combination.
            graphs.append(Graph(x=node_feat, a=adj, e=edge_feat, y=label))

        # Record the number of graphs on the instance.
        self.size = len(graphs)

        return graphs

def ogb_available_datasets():
    '''
    Return the names of the OGB datasets this notebook accepts.

    Author's note: these regression datasets each contain an even number
    of graphs (size % 2 == 0).
    '''
    supported = [
        'ogbg-molesol',
        'ogbg-molfreesolv',
        'ogbg-mollipo',
    ]
    return supported

In [3]:
def _load_data(config):
    '''
    Loads a dataset from [TUDataset, OGB].

    Parameters
    ----------
    config : dict
        Must contain ``'dataset'`` (the dataset name). As a side effect,
        ``config['x_shape1']`` is set to the node-feature width of the
        loaded dataset.

    Returns
    -------
    tuple
        ``(dataset, dataset.n_labels)``.

    Raises
    ------
    ValueError
        If the name is neither a TU dataset nor one of
        ``ogb_available_datasets()``.
    '''
    name = config['dataset']
    # if name == 'QM9':
    #     dataset = QM9(amount=10)# 1000 and 100000 ok
    if name in TUDataset.available_datasets():
        dataset = TUDataset(name)
    elif name in ogb_available_datasets():
        dataset = OGBDataset(name)
    else:
        raise ValueError(f'Dataset {name} unknown')

    # Read the feature width from the data instead of hard-coding it per
    # dataset family (previously 28 for every TU dataset and 9 for every OGB
    # dataset), so datasets with a different width are configured correctly.
    config['x_shape1'] = dataset.n_node_features

    return dataset, dataset.n_labels

In [4]:
def _split_data(data, train_test_split, seed):
    '''
    Split the data into train and test sets
    '''
    np.random.seed(seed)
    idxs = np.random.permutation(len(data))
    split = int(train_test_split * len(data))
    idx_train, idx_test = np.split(idxs, [split])
    train, test = data[idx_train], data[idx_test]
    train.size = len(train)
    test.size = len(test)
    return train, test

In [5]:
def _rankData(data):
    indexed_graphs= list(enumerate(data))

    sorted_indexed_graphs = sorted(indexed_graphs, key=lambda x: x[1].y)

    sorted_graphs = [g for index, g in sorted_indexed_graphs]
    original_indices = [index for index, g in sorted_indexed_graphs]

    return zip(sorted_graphs, original_indices)

In [6]:
def get_data(config):
    '''
    Load, split and rank the dataset described by ``config``.

    Reads ``config['seed']``, ``config['train_test_split']`` and (through
    ``_load_data``) ``config['dataset']``; stores the second value returned
    by ``_load_data`` in ``config['n_out']``.

    Returns
    -------
    tuple
        ``(train_data, test_data, ranked_train, ranked_test)`` where the last
        two entries are the results of ``_rankData`` on each split.
    '''
    rng_seed = config['seed']
    train_fraction = config['train_test_split']

    # Load the full dataset and remember its output size in the config.
    full_data, n_out = _load_data(config)
    config['n_out'] = n_out

    # Split first, then rank each split on its own.
    train_data, test_data = _split_data(full_data, train_fraction, rng_seed)
    ranked_train = _rankData(train_data)
    ranked_test = _rankData(test_data)

    return train_data, test_data, ranked_train, ranked_test

In [7]:
# Experiment settings. get_data reads 'seed', 'train_test_split' and 'dataset';
# NOTE(review): 'epochs', 'batch_size' and 'learning_rate' are not used by any
# cell shown here - presumably consumed by training code elsewhere.
config = {
    'seed': 1,
    'epochs': 10,
    'batch_size': 32,
    'learning_rate': 0.001,
    'dataset': 'ogbg-molesol', # YES: QM9, ogbg-molesol, ogbg-molfreesolv, ogbg-mollipo, ZINC_full | NO: aspirin
    'train_test_split': 0.8  # fraction of the graphs that goes into the train split
}
# Load the dataset, split it, and rank each split by target value.
dataset_train, dataset_test, r_train, r_test = get_data(config)

In [8]:
# Display the train split returned by get_data.
dataset_train

OGBDataset(n_graphs=902)

In [9]:
# Collect the target y of every test graph in ranked (ascending-y) order;
# each element of r_test is a (graph, original_index) pair.
ys = [x[0].y for x in r_test]

In [10]:
# Display the ranked test targets.
ys

[array([-8.71]),
 array([-8.49]),
 array([-8.4]),
 array([-7.92]),
 array([-7.85]),
 array([-7.68]),
 array([-7.43]),
 array([-7.21]),
 array([-7.2]),
 array([-7.15]),
 array([-7.]),
 array([-6.9]),
 array([-6.876]),
 array([-6.86]),
 array([-6.8]),
 array([-6.726]),
 array([-6.57]),
 array([-6.49]),
 array([-6.291]),
 array([-6.25]),
 array([-6.144]),
 array([-6.124]),
 array([-6.09]),
 array([-6.025]),
 array([-6.01]),
 array([-5.72]),
 array([-5.68]),
 array([-5.64]),
 array([-5.47]),
 array([-5.46]),
 array([-5.382]),
 array([-5.37]),
 array([-5.28]),
 array([-5.26]),
 array([-5.21]),
 array([-5.19]),
 array([-5.184]),
 array([-5.153]),
 array([-5.05]),
 array([-4.9]),
 array([-4.88]),
 array([-4.873]),
 array([-4.871]),
 array([-4.805]),
 array([-4.8]),
 array([-4.799]),
 array([-4.77]),
 array([-4.76]),
 array([-4.735]),
 array([-4.678]),
 array([-4.66]),
 array([-4.63]),
 array([-4.63]),
 array([-4.62]),
 array([-4.594]),
 array([-4.57]),
 array([-4.57]),
 array([-4.53]),
 array

In [11]:
from scipy.stats import kendalltau, spearmanr

In [12]:
# Randomly reorder the ranked targets (drawn from NumPy's global RNG) to get
# an ordering to compare against the sorted one.
ys2 = np.random.permutation(ys)

In [13]:
# Kendall's tau and Spearman's rho between the sorted targets and their shuffled copy.
res1, res2 = kendalltau(ys, ys2), spearmanr(ys, ys2)
res1, res2

(KendalltauResult(correlation=0.007990553040740011, pvalue=0.8582019151934875),
 SpearmanrResult(correlation=0.012352616146247823, pvalue=0.8534817146408213))

In [14]:
# Sanity check: correlate the sorted targets with themselves.
res1, res2 = kendalltau(ys, ys), spearmanr(ys, ys)
res1, res2

(KendalltauResult(correlation=1.0, pvalue=9.794289544858485e-111),
 SpearmanrResult(correlation=1.0, pvalue=0.0))

In [15]:
# Same comparison on plain integers: 0..99 against a random permutation of them.
a = range(0,100)
b = np.random.permutation(a)
res1, res2 = kendalltau(a,b), spearmanr(a,b)
res1, res2

(KendalltauResult(correlation=-0.06989898989898992, pvalue=0.30280894319079166),
 SpearmanrResult(correlation=-0.09453345334533451, pvalue=0.3495066764689899))

In [16]:
# Here b holds single-character strings instead of numbers.
# NOTE(review): the coefficients presumably reflect how the strings sort -
# confirm this is the intended experiment.
a = [1,2,3,4,5,6]
b = ['t','ä','#','H','2','v']
res1, res2 = kendalltau(a,b), spearmanr(a,b)
res1, res2

(KendalltauResult(correlation=-0.06666666666666665, pvalue=1.0),
 SpearmanrResult(correlation=-0.14285714285714288, pvalue=0.7871720116618075))

In [17]:
# b is a in exactly reversed order.
a = [1,2,3,4,5,6]
b = [6,5,4,3,2,1]
res1, res2 = kendalltau(a,b), spearmanr(a,b)
res1, res2

(KendalltauResult(correlation=-0.9999999999999999, pvalue=0.002777777777777778),
 SpearmanrResult(correlation=-1.0, pvalue=0.0))