In [50]:
import copy
import gzip
import json
import binascii
from typing import Any, List, Optional, Iterable, Callable
from collections import OrderedDict
import numpy as np
import scipy.sparse as sp

In [2]:
def is_gzipped(path: str) -> bool:
    """
    Tell whether the file at ``path`` starts with the gzip magic number.

    :param path: location of the file to inspect.
    :return: True when the first two bytes of the file are 0x1f 0x8b, False otherwise
            (which includes files shorter than two bytes).
    """
    gzip_magic = b'\x1f\x8b'
    with open(path, 'rb') as handle:
        header = handle.read(len(gzip_magic))
    return header == gzip_magic

def read_binary(path: str) -> bytes:
    """
    Load the complete contents of a file as bytes, transparently decompressing
    it when the file is gzip-compressed (as reported by ``is_gzipped``).

    :param path: location of the file to read.
    :return: the file's bytes; for gzip files these are the decompressed bytes.
    """
    # Pick the opener once; both are used in binary read mode.
    opener = gzip.open if is_gzipped(path) else open
    with opener(path, 'rb') as stream:
        return stream.read()

def read_text(path: str) -> str:
    """
    Read a (possibly gzip-compressed) file and decode its contents as UTF-8.

    :param path: location of the file to read.
    :return: the decoded text of the whole file.
    """
    raw_bytes = read_binary(path)
    return raw_bytes.decode(encoding='utf-8')

def read_jsonl(path: str,
               error_handling: Optional[Callable[[str, Exception], None]] = None) -> Iterable[Any]:
    """
    Parse JSONL files. See http://jsonlines.org/ for more.

    Objects are decoded with ``OrderedDict`` so that key order in each line is preserved.

    :param path: location of the JSONL file; it may be gzip-compressed.
    :param error_handling: a callable that receives the original line and the exception object and takes
            over how parse error handling should happen. When it is None (the default) the
            parse error is re-raised, so one malformed line aborts the iteration.
    :return: an iterator of the parsed objects of each line.
    """
    for line in read_text(path).splitlines():
        try:
            parsed = json.loads(line, object_pairs_hook=OrderedDict)
        except json.JSONDecodeError as err:
            if error_handling is None:
                raise
            # The callback decides what to do (log, count, raise, ...); the line is skipped.
            error_handling(line, err)
        else:
            # Yield outside the try so exceptions raised by the consumer are never swallowed.
            yield parsed

In [83]:
# Parse every line of data/qm9/valid.jsonl.gz up front; read_jsonl is a generator,
# so list() forces the whole file into memory as a list of OrderedDict records.
val = list(read_jsonl("data/qm9/valid.jsonl.gz"))
# `data` is a second name for the same list (no copy); later cells index into `data`.
data = val

In [84]:
# Positions within each record's 'targets' list that are kept when the selected
# molecules are preprocessed (used to index the targets array).
target_idx = [1, 2]

In [197]:
# Hand-picked positions in `data` of the molecules to inspect.
idx = [1, 250, 66, 14, 8829]
# Deep-copy the selected records so that the fields added or overwritten on them
# later (targets, adj_matrix, edge_type) do not modify the entries of `data`.
mols = [copy.deepcopy(data[i]) for i in idx]

In [198]:
def _edge_list_to_csr(edge_list):
    edge_array = np.array(edge_list)
    edgeid_to_target, edge_type, edgeid_to_source = edge_array.T
    natoms = np.max([edgeid_to_target, edgeid_to_source]) + 1
    adj_matrix = sp.csr_matrix((edge_type, (edgeid_to_target, edgeid_to_source)),
                               (natoms, natoms))
    adj_matrix += adj_matrix.T  # Make adjacency symmetric
    edge_type = adj_matrix.data - 1  # Make edge type start from 0
    adj_matrix.data.fill(1)
    return adj_matrix, edge_type

In [199]:
# Attach the selected targets and the sparse graph representation to every molecule.
# Targets are read from the untouched record in `data` rather than from the copy
# being overwritten, so re-running this cell gives the same result instead of
# indexing into an already-reduced targets array.
for i, mol in zip(idx, mols):
    mol['targets'] = np.array(data[i]['targets'])[target_idx].flatten()
    mol['adj_matrix'], mol['edge_type'] = _edge_list_to_csr(mol['graph'])

In [88]:
# Display the selected targets as a table: one row per molecule in `mols`,
# one column per entry of `target_idx`.
np.vstack([mol['targets'] for mol in mols])

array([[-0.11144937,  0.07785303],
       [-0.21300392,  1.2068366 ],
       [ 0.66183352,  0.07785303],
       [-0.38674787, -1.30151   ],
       [ 0.68630451,  0.05964362]])

In [200]:
# Display the zero-based edge types of all selected molecules concatenated into
# one flat array (one value per stored entry of each symmetric adjacency matrix).
np.hstack([mol['edge_type'] for mol in mols])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [None]:
# Settings for the node-distance computation. 'alpha' is the weight used in the
# PPR expressions alpha * inv(I - (1 - alpha) * T) in the cells that follow;
# 'type' is not read by any of the visible cells.
dist = {'type': 'ppr', 'alpha': 0.15}

In [136]:
# Inner matrix of the PPR expression built from the raw (unnormalised) adjacency,
# shown for inspection: I - (1 - alpha) * A.
mol = mols[-1]  # explicit choice: the molecule the preprocessing loop left bound to `mol`
natoms = mol['adj_matrix'].shape[0]  # defined here so this cell runs on a fresh kernel
# .toarray() replaces the deprecated `.A` shorthand of scipy sparse matrices.
ppr_inner = np.eye(natoms) - (1 - dist['alpha']) * mol['adj_matrix'].toarray()
ppr_inner

array([[ 1.  , -0.85,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
        -0.85, -0.85, -0.85,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [-0.85,  1.  , -0.85,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  , -0.85,  1.  , -0.85,  0.  ,  0.  , -0.85,  0.  ,  0.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  , -0.85,  1.  , -0.85, -0.85,  0.  ,  0.  ,  0.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , -0.85,  1.  , -0.85,  0.  ,  0.  ,  0.  ,
         0.  ,  0.  ,  0.  , -0.85, -0.85,  0.  ,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  ,  0.  , -0.85, -0.85,  1.  , -0.85,  0.  ,  0.  ,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  , -0.85,  0.  ,  0.  ,  0.  ],
       [ 0.  ,  0.  , -0.85,  0.  ,  0.  , -0.85,  1.  , -0.85, -0.85,
         0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ,  0.  ],

In [191]:
# PPR matrix of one molecule from its symmetrically normalised adjacency:
#     ppr = alpha * inv(I - (1 - alpha) * D^-1/2 A D^-1/2)
mol = mols[-1]  # explicit choice: the molecule the preprocessing loop left bound to `mol`
adj_dense = mol['adj_matrix'].toarray()  # replaces the deprecated `.A` shorthand
natoms = adj_dense.shape[0]

eps = 1e-6  # keeps the inverse degree finite for nodes without neighbours
# Row sums of a sparse matrix come back 2-D; flatten to a plain 1-D degree vector.
deg = np.asarray(mol['adj_matrix'].sum(1)).ravel()
deg_inv_sqrt = np.sqrt(1 / (deg + eps))
# Scale columns and rows by D^-1/2 via broadcasting.
T_sym = deg_inv_sqrt[None, :] * adj_dense * deg_inv_sqrt[:, None]
ppr = dist['alpha'] * np.linalg.inv(np.eye(natoms) - (1 - dist['alpha']) * T_sym)

In [196]:
# Show the first raw record to inspect the schema; the output below lists the
# keys 'targets', 'graph', 'id' and 'node_features'.
data[0]

OrderedDict([('targets',
              [[-0.37200695],
               [-2.6319599],
               [-1.8068211],
               [-0.22489363],
               [0.61777782],
               [-0.51425904],
               [-2.3368211],
               [2.1782131],
               [2.1805403],
               [2.1799278],
               [2.1852105],
               [-0.64080489],
               [-0.011956506]]),
             ('graph',
              [[0, 1, 1],
               [1, 1, 2],
               [1, 1, 3],
               [1, 1, 4],
               [4, 1, 5],
               [4, 1, 6],
               [4, 1, 9],
               [5, 1, 6],
               [6, 1, 7],
               [6, 1, 10],
               [7, 3, 8],
               [8, 1, 11]]),
             ('id', 'qm9:133582'),
             ('node_features',
              [[0, 0, 0, 0, 1, 9, -0.059756, 1, 0, 0, 0, 0, 1, 0, 0],
               [0, 1, 0, 0, 0, 6, 0.290528, 0, 0, 0, 0, 0, 1, 0, 0],
               [0, 0, 0, 0, 1, 9, -0.072299004, 1,