This tutorial shows how to use the GraphDot package to calculate the marginalized graph kernel.

In [1]:
from graphdot.kernel.marginalized import MarginalizedGraphKernel
from graphdot.microkernel import (
    Additive,
    Constant as kC,
    TensorProduct,
    SquareExponential as sExp,
    KroneckerDelta as kDelta,
    Convolution as kConv,
    Normalize
)
from graphdot.microprobability import (
    UniformProbability,
    AssignProbability,
    Constant as kC_p,
    Additive as Additive_p
)
import rdkit.Chem.AllChem as Chem


importing from sympy.printing.cxxcode has been deprecated since SymPy
1.7. Use Import from sympy.printing.cxx instead. See
https://github.com/sympy/sympy/issues/20256 for more info.

  deprecated_since_version="1.7").warn()


In [2]:
import sys
# Append '..' to the module search path. The entry is relative, so it is
# resolved against the current working directory at import time.
# NOTE(review): presumably the chemml package lives one directory above the
# folder this notebook is run from — confirm for your checkout.
sys.path.append('..')
from chemml.graph.hashgraph import HashGraph
from chemml.kernels.GraphKernel import NormalizationMolSize
from chemml.kernels.GraphKernel import Normalization
from chemml.graph.from_rdkit import rdkit_config

First, define the labeled graphs you want to work with.
An example is shown below:

In [3]:
# Molecules to compare, written as SMILES strings.
smiles = ['CCCCO', 'CCCC']
# Create the RDKit configuration once and share it between all molecules.
shared_config = rdkit_config()
# Build one HashGraph per SMILES string. Iterating over `smiles` directly keeps
# the number of graphs tied to the number of molecules, so extending the list
# above needs no further change here.
# Each call receives (SMILES, config, SMILES); the third argument is presumably
# used as the graph's identifier — confirm against HashGraph.from_smiles.
graphs = [HashGraph.from_smiles(s, shared_config, s) for s in smiles]
# Called with inplace=True, so the graphs in the list are modified directly;
# judging by the name, this gives all graphs consistent attribute datatypes.
HashGraph.unify_datatype(graphs, inplace=True)

In [4]:
# Show the node attribute table of the first graph (built from 'CCCCO').
graphs[0].nodes.to_pandas()

Unnamed: 0,!i,Aromatic,AtomicNumber,AtomicNumber_count_1,AtomicNumber_count_2,AtomicNumber_count_3,AtomicNumber_count_4,AtomicNumber_count_5,AtomicNumber_hash_1,AtomicNumber_hash_2,...,Hcount_hash_1,Hcount_list_1,Hcount_sum_1,Hybridization,InRing,MorganHash,RingSize_hash,RingSize_list,Ring_count,SingleAtom
0,0,False,6,1,1,1,1,1,-1.716774e+18,-1.716774e+18,...,6.570881e+18,[2],2,4,False,3542456614,-1593772956042337033,[0],0,False
1,1,False,6,2,1,1,1,1,1.061998e+18,-1.716774e+18,...,-2.6791e+18,"[3, 2]",5,4,False,1685248591,-1593772956042337033,[0],0,False
2,2,False,6,2,2,1,1,1,1.061998e+18,5.221679e+18,...,7.262059e+18,"[2, 2]",4,4,False,1973275706,-1593772956042337033,[0],0,False
3,3,False,6,2,1,1,1,1,5.221679e+18,-1.716774e+18,...,-6.572324e+18,"[2, 1]",3,4,False,1828340842,-1593772956042337033,[0],0,False
4,4,False,8,1,1,1,1,1,-1.716774e+18,-1.716774e+18,...,6.570881e+18,[2],2,4,False,1535166686,-1593772956042337033,[0],0,False


In [5]:
# Show the edge attribute table of the first graph (built from 'CCCCO').
graphs[0].edges.to_pandas()

Unnamed: 0,!i,!j,Aromatic,Conjugated,InRing,Order,RingSize_hash,RingSize_list,RingStereo,Ring_count,Stereo
0,0,1,False,False,False,1.0,-1593772956042337033,[0],0.0,0,0
1,1,2,False,False,False,1.0,-1593772956042337033,[0],0.0,0,0
2,2,3,False,False,False,1.0,-1593772956042337033,[0],0.0,0,0
3,3,4,False,False,False,1.0,-1593772956042337033,[0],0.0,0,0


Next, define the kernel functions for nodes and edges.

The available elementary kernels include KroneckerDelta, SquareExponential, Constant, and Convolution (for features that are lists of variable length).

The available operations are TensorProduct and Additive.

You can combine them arbitrarily to define your kernels.

Example:

In [6]:
# Second argument handed to every KroneckerDelta below.
# NOTE(review): presumably the bounds of the kernel's hyperparameter — confirm
# against the graphdot microkernel documentation.
delta_bounds = (0.5, 1.0)

# Node kernel: a TensorProduct with one base kernel per node feature.
knode = TensorProduct(
    AtomicNumber=kDelta(0.75, delta_bounds),
    Hcount=kDelta(0.9, delta_bounds),
    Chiral=kDelta(0.9, delta_bounds),
    # RingSize_list is a list-valued feature, hence the Convolution wrapper.
    RingSize_list=kConv(kDelta(0.9, delta_bounds)),
    MorganHash=kDelta(0.9, delta_bounds),
    Ring_count=kDelta(0.9, delta_bounds),
)

# Edge kernel: a TensorProduct with one base kernel per edge feature.
kedge = TensorProduct(
    Order=kDelta(0.9, delta_bounds),
    Conjugated=kDelta(0.9, delta_bounds),
    Stereo=kDelta(0.9, delta_bounds),
    RingStereo=kDelta(0.9, delta_bounds),
)

Non-uniform start probabilities have not been well studied yet.

A uniform start probability is used, as follows:

In [7]:
# kC_p is graphdot.microprobability.Constant; a constant value of 1.0 is used
# as the (uniform) start probability passed to the graph kernel.
start_probability = kC_p(1.0)

The marginalized graph kernel is constructed as follows.

q is the stop probability.

Kernel normalization will improve the regression performance if you are working on a target property that does not scale linearly with molecular size.

In [8]:
# q is the stop probability; the pair below is handed over as q_bounds.
stop_probability = 0.05
stop_probability_bounds = (0.001, 0.5)

# Marginalized graph kernel assembled from the node/edge kernels and the
# start probability.
MGK = MarginalizedGraphKernel(
    node_kernel=knode,
    edge_kernel=kedge,
    q=stop_probability,
    q_bounds=stop_probability_bounds,
    p=start_probability,
)

# Normalized variant wrapping MGK.
# NOTE(review): the meaning of the second argument (1000) is not visible here —
# check NormalizationMolSize in chemml.kernels.GraphKernel.
nMGK = NormalizationMolSize(MGK, 1000)

Compute the kernel matrix, first with the raw kernel and then with the normalized one:

In [9]:
# Kernel matrix from the raw marginalized graph kernel, followed by a blank line.
raw_kernel_matrix = MGK(graphs)
print(raw_kernel_matrix, '\n')

# Kernel matrix from the normalized kernel.
normalized_kernel_matrix = nMGK(graphs)
print(normalized_kernel_matrix)

[[87.96350861 68.08648682]
 [68.08648682 75.71812439]] 

[[1.         0.83415087]
 [0.83415087 1.        ]]
