# Evaluation Metrics 

This script takes smiles or graph inputs of ground-truth and generated insecticidal compounds and computes:

-The distribution of Log P values

-The structural diversity of the compounds

-The insecticide-likeness of the compounds

Written by Tobias D. Muellers

## Load dependencies

In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.utils import to_dense_adj

## Graphs to SMILES

This code takes generated graphs and converts them to a dataset of SMILES for further analysis

In [2]:
# import graphs which are pytorch datasets
graphs = torch.load('insecticides_graphs.pt')

  graphs = torch.load('insecticides_graphs.pt')


In [3]:
# from https://pytorch-geometric.readthedocs.io/en/2.4.0/_modules/torch_geometric/utils/smiles.html
from typing import Any

import torch

import torch_geometric

x_map = {
    'atomic_num':
    list(range(0, 119)),
    'chirality': [
        'CHI_UNSPECIFIED',
        'CHI_TETRAHEDRAL_CW',
        'CHI_TETRAHEDRAL_CCW',
        'CHI_OTHER',
        'CHI_TETRAHEDRAL',
        'CHI_ALLENE',
        'CHI_SQUAREPLANAR',
        'CHI_TRIGONALBIPYRAMIDAL',
        'CHI_OCTAHEDRAL',
    ],
    'degree':
    list(range(0, 11)),
    'formal_charge':
    list(range(-5, 7)),
    'num_hs':
    list(range(0, 9)),
    'num_radical_electrons':
    list(range(0, 5)),
    'hybridization': [
        'UNSPECIFIED',
        'S',
        'SP',
        'SP2',
        'SP3',
        'SP3D',
        'SP3D2',
        'OTHER',
    ],
    'is_aromatic': [False, True],
    'is_in_ring': [False, True],
}

e_map = {
    'bond_type': [
        'UNSPECIFIED',
        'SINGLE',
        'DOUBLE',
        'TRIPLE',
        'QUADRUPLE',
        'QUINTUPLE',
        'HEXTUPLE',
        'ONEANDAHALF',
        'TWOANDAHALF',
        'THREEANDAHALF',
        'FOURANDAHALF',
        'FIVEANDAHALF',
        'AROMATIC',
        'IONIC',
        'HYDROGEN',
        'THREECENTER',
        'DATIVEONE',
        'DATIVE',
        'DATIVEL',
        'DATIVER',
        'OTHER',
        'ZERO',
    ],
    'stereo': [
        'STEREONONE',
        'STEREOANY',
        'STEREOZ',
        'STEREOE',
        'STEREOCIS',
        'STEREOTRANS',
    ],
    'is_conjugated': [False, True],
}


def from_smiles(smiles: str, with_hydrogen: bool = False,
                kekulize: bool = False) -> 'torch_geometric.data.Data':
    r"""Converts a SMILES string to a :class:`torch_geometric.data.Data`
    instance.

    Args:
        smiles (str): The SMILES string.
        with_hydrogen (bool, optional): If set to :obj:`True`, will store
            hydrogens in the molecule graph. (default: :obj:`False`)
        kekulize (bool, optional): If set to :obj:`True`, converts aromatic
            bonds to single/double bonds. (default: :obj:`False`)
    """
    from rdkit import Chem, RDLogger

    from torch_geometric.data import Data

    RDLogger.DisableLog('rdApp.*')

    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        mol = Chem.MolFromSmiles('')
    if with_hydrogen:
        mol = Chem.AddHs(mol)
    if kekulize:
        Chem.Kekulize(mol)

    xs = []
    for atom in mol.GetAtoms():
        x = []
        x.append(x_map['atomic_num'].index(atom.GetAtomicNum()))
        x.append(x_map['chirality'].index(str(atom.GetChiralTag())))
        x.append(x_map['degree'].index(atom.GetTotalDegree()))
        x.append(x_map['formal_charge'].index(atom.GetFormalCharge()))
        x.append(x_map['num_hs'].index(atom.GetTotalNumHs()))
        x.append(x_map['num_radical_electrons'].index(
            atom.GetNumRadicalElectrons()))
        x.append(x_map['hybridization'].index(str(atom.GetHybridization())))
        x.append(x_map['is_aromatic'].index(atom.GetIsAromatic()))
        x.append(x_map['is_in_ring'].index(atom.IsInRing()))
        xs.append(x)

    x = torch.tensor(xs, dtype=torch.long).view(-1, 9)

    edge_indices, edge_attrs = [], []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()

        e = []
        e.append(e_map['bond_type'].index(str(bond.GetBondType())))
        e.append(e_map['stereo'].index(str(bond.GetStereo())))
        e.append(e_map['is_conjugated'].index(bond.GetIsConjugated()))

        edge_indices += [[i, j], [j, i]]
        edge_attrs += [e, e]

    edge_index = torch.tensor(edge_indices)
    edge_index = edge_index.t().to(torch.long).view(2, -1)
    edge_attr = torch.tensor(edge_attrs, dtype=torch.long).view(-1, 3)

    if edge_index.numel() > 0:  # Sort indices.
        perm = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
        edge_index, edge_attr = edge_index[:, perm], edge_attr[perm]

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, smiles=smiles)



def to_smiles(data: 'torch_geometric.data.Data',
              kekulize: bool = False) -> Any:
    """Converts a :class:`torch_geometric.data.Data` instance to a SMILES
    string.

    Args:
        data (torch_geometric.data.Data): The molecular graph.
        kekulize (bool, optional): If set to :obj:`True`, converts aromatic
            bonds to single/double bonds. (default: :obj:`False`)
    """
    from rdkit import Chem

    mol = Chem.RWMol()

    for i in range(data.num_nodes):
        atom = Chem.Atom(data.x[i, 0].item())
        atom.SetChiralTag(Chem.rdchem.ChiralType.values[data.x[i, 1].item()])
        atom.SetFormalCharge(x_map['formal_charge'][data.x[i, 3].item()])
        atom.SetNumExplicitHs(x_map['num_hs'][data.x[i, 4].item()])
        atom.SetNumRadicalElectrons(
            x_map['num_radical_electrons'][data.x[i, 5].item()])
        atom.SetHybridization(
            Chem.rdchem.HybridizationType.values[data.x[i, 6].item()])
        atom.SetIsAromatic(data.x[i, 7].item())
        mol.AddAtom(atom)

    edges = [tuple(i) for i in data.edge_index.t().tolist()]
    visited = set()

    for i in range(len(edges)):
        src, dst = edges[i]
        if tuple(sorted(edges[i])) in visited:
            continue

        bond_type = Chem.BondType.values[data.edge_attr[i, 0].item()]
        mol.AddBond(src, dst, bond_type)

        # Set stereochemistry:
        stereo = Chem.rdchem.BondStereo.values[data.edge_attr[i, 1].item()]
        if stereo != Chem.rdchem.BondStereo.STEREONONE:
            db = mol.GetBondBetweenAtoms(src, dst)
            db.SetStereoAtoms(dst, src)
            db.SetStereo(stereo)

        # Set conjugation:
        is_conjugated = bool(data.edge_attr[i, 2].item())
        mol.GetBondBetweenAtoms(src, dst).SetIsConjugated(is_conjugated)

        visited.add(tuple(sorted(edges[i])))

    mol = mol.GetMol()

    if kekulize:
        Chem.Kekulize(mol)

    Chem.SanitizeMol(mol)
    Chem.AssignStereochemistry(mol)

    return Chem.MolToSmiles(mol, isomericSmiles=True)

In [106]:
test = from_smiles('CCCO', with_hydrogen= False, kekulize = False) 

In [7]:
graphs[0].x[1,0].item()

0.0

In [8]:
graphs[0].num_nodes

10

In [4]:
to_smiles(graphs[0])

ArgumentError: Python argument types in
    Atom.__init__(Atom, float)
did not match C++ signature:
    __init__(struct _object * __ptr64 self, unsigned int num)
    __init__(struct _object * __ptr64 self, class RDKit::Atom other)
    __init__(struct _object * __ptr64 self, class std::basic_string<char,struct std::char_traits<char>,class std::allocator<char> > what)

In [86]:
print(graphs[0])
print(graphs[0].num_nodes)
print(graphs[0].x[3])
expanded_adj = graphs[0].edge_index
edge_attributes = graphs[0].edge_attr#.flatten()

Data(x=[10, 79], edge_index=[2, 20], edge_attr=[20, 10], y=[1])
10
tensor([1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 1.0000, 1.0000, 0.0103, 0.3333, 0.1579, 1.0000, 0.0000, 0.0000,
        0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000])


In [87]:
edge_attributes.shape

torch.Size([20, 10])

In [97]:
to_dense_adj(expanded_adj) 

tensor([[[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [1., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 1., 0., 0., 1., 0., 0., 0., 1., 0.],
         [0., 0., 0., 1., 0., 1., 0., 0., 0., 0.],
         [0., 0., 0., 0., 1., 0., 1., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 1., 0.],
         [0., 0., 0., 1., 0., 0., 0., 1., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]])

In [90]:
graphs[0].x

tensor([[0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0447, 0.0833, 0.0263, 1.0000, 0.0000, 0.0000,
         0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.

In [47]:
G = graphs[0].node_attrs

In [48]:
nx.get_node_attributes(G, 'atomic_num')

AttributeError: 'function' object has no attribute 'nodes'

In [15]:
from __future__ import print_function

import networkx as nx
import argparse
import multiprocessing
from rdkit import Chem

NUM_PROCESSES = 2 

def nx_to_mol(G):
    mol = Chem.RWMol()
    atomic_nums = nx.get_node_attributes(G, 'atomic_num')
    chiral_tags = nx.get_node_attributes(G, 'chiral_tag')
    formal_charges = nx.get_node_attributes(G, 'formal_charge')
    node_is_aromatics = nx.get_node_attributes(G, 'is_aromatic')
    node_hybridizations = nx.get_node_attributes(G, 'hybridization')
    num_explicit_hss = nx.get_node_attributes(G, 'num_explicit_hs')
    node_to_idx = {}
    for node in G.nodes():
        a=Chem.Atom(atomic_nums[node])
        a.SetChiralTag(chiral_tags[node])
        a.SetFormalCharge(formal_charges[node])
        a.SetIsAromatic(node_is_aromatics[node])
        a.SetHybridization(node_hybridizations[node])
        a.SetNumExplicitHs(num_explicit_hss[node])
        idx = mol.AddAtom(a)
        node_to_idx[node] = idx

    bond_types = nx.get_edge_attributes(G, 'bond_type')
    for edge in G.edges():
        first, second = edge
        ifirst = node_to_idx[first]
        isecond = node_to_idx[second]
        bond_type = bond_types[first, second]
        mol.AddBond(ifirst, isecond, bond_type)

    Chem.SanitizeMol(mol)
    return mol

In [16]:
nx_to_mol(graphs[0])

AttributeError: 'GlobalStorage' object has no attribute 'is_multigraph'

In [93]:
import deepchem as dc
from rdkit import Chem
rdkit_mol, smiles_mol = Chem.MolFromSmiles('CCC'), 'C1=CC=CC=C1'
molecules = [rdkit_mol, smiles_mol]
featurizer = dc.feat.MolGanFeaturizer()
features = featurizer.featurize(molecules)
len(features)

type(features[0])

molecules = featurizer.defeaturize(features) # defeaturization
type(molecules[0])


rdkit.Chem.rdchem.Mol

In [94]:
featurizer.defeaturize(graphs[0])

array([None, None, None, None], dtype=object)

In [91]:
import logging
import numpy as np
from deepchem.utils.typing import RDKitBond, RDKitMol, List
from deepchem.feat.base_classes import MolecularFeaturizer
from deepchem.utils.typing import OneOrMany

from typing import Optional

logger = logging.getLogger(__name__)


class GraphMatrix:
  """
  This is class used to store data for MolGAN neural networks.

  Parameters
  ----------
  node_features: np.ndarray
    Node feature matrix with shape [num_nodes, num_node_features]
  edge_features: np.ndarray,
    Edge feature matrix with shape [num_nodes, num_nodes]

  Returns
  -------
  graph: GraphMatrix
    A molecule graph with some features.
  """

  def __init__(self, adjacency_matrix: np.ndarray, node_features: np.ndarray):
    self.adjacency_matrix = adjacency_matrix
    self.node_features = node_features


class MolGanFeaturizer(MolecularFeaturizer):
  """
  Featurizer for MolGAN de-novo molecular generation [1]_.
  The default representation is in form of GraphMatrix object.
  It is wrapper for two matrices containing atom and bond type information.
  The class also provides reverse capabilities.

  Examples
  --------
  >>> import deepchem as dc
  >>> from rdkit import Chem
  >>> rdkit_mol, smiles_mol = Chem.MolFromSmiles('CCC'), 'C1=CC=CC=C1'
  >>> molecules = [rdkit_mol, smiles_mol]
  >>> featurizer = dc.feat.MolGanFeaturizer()
  >>> features = featurizer.featurize(molecules)
  >>> len(features) # 2 molecules
  2
  >>> type(features[0])
  <class 'deepchem.feat.molecule_featurizers.molgan_featurizer.GraphMatrix'>
  >>> molecules = featurizer.defeaturize(features) # defeaturization
  >>> type(molecules[0])
  <class 'rdkit.Chem.rdchem.Mol'>

  """

  def __init__(
      self,
      max_atom_count: int = 9,
      kekulize: bool = True,
      bond_labels: List[RDKitBond] = None,
      atom_labels: List[int] = None,
  ):
    """
    Parameters
    ----------
    max_atom_count: int, default 9
      Maximum number of atoms used for creation of adjacency matrix.
      Molecules cannot have more atoms than this number
      Implicit hydrogens do not count.
    kekulize: bool, default True
      Should molecules be kekulized.
      Solves number of issues with defeaturization when used.
    bond_labels: List[RDKitBond]
      List of types of bond used for generation of adjacency matrix
    atom_labels: List[int]
      List of atomic numbers used for generation of node features

    References
    ---------
    .. [1] Nicola De Cao et al. "MolGAN: An implicit generative model for
       small molecular graphs" (2018), https://arxiv.org/abs/1805.11973
    """

    self.max_atom_count = max_atom_count
    self.kekulize = kekulize

    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ImportError("This class requires RDKit to be installed.")

    # bond labels
    if bond_labels is None:
      self.bond_labels = [
          Chem.rdchem.BondType.ZERO,
          Chem.rdchem.BondType.SINGLE,
          Chem.rdchem.BondType.DOUBLE,
          Chem.rdchem.BondType.TRIPLE,
          Chem.rdchem.BondType.AROMATIC,
      ]
    else:
      self.bond_labels = bond_labels

    # atom labels
    if atom_labels is None:
      self.atom_labels = [0, 6, 7, 8, 9]  # C,N,O,F
    else:
      self.atom_labels = atom_labels

    # create bond encoders and decoders
    self.bond_encoder = {l: i for i, l in enumerate(self.bond_labels)}
    self.bond_decoder = {i: l for i, l in enumerate(self.bond_labels)}
    # create atom encoders and decoders
    self.atom_encoder = {l: i for i, l in enumerate(self.atom_labels)}
    self.atom_decoder = {i: l for i, l in enumerate(self.atom_labels)}

  def _featurize(self, mol: RDKitMol) -> Optional[GraphMatrix]:
    """
    Calculate adjacency matrix and nodes features for RDKitMol.
    It strips any chirality and charges

    Parameters
    ----------
    mol: rdkit.Chem.rdchem.Mol
      RDKit mol object.

    Returns
    -------
    graph: GraphMatrix
      A molecule graph with some features.
    """

    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ImportError("This method requires RDKit to be installed.")

    if self.kekulize:
      Chem.Kekulize(mol)

    A = np.zeros(
        shape=(self.max_atom_count, self.max_atom_count), dtype=np.float32)
    bonds = mol.GetBonds()

    begin, end = [b.GetBeginAtomIdx() for b in bonds], [
        b.GetEndAtomIdx() for b in bonds
    ]
    bond_type = [self.bond_encoder[b.GetBondType()] for b in bonds]

    A[begin, end] = bond_type
    A[end, begin] = bond_type

    degree = np.sum(A[:mol.GetNumAtoms(), :mol.GetNumAtoms()], axis=-1)
    X = np.array(
        [self.atom_encoder[atom.GetAtomicNum()] for atom in mol.GetAtoms()] +
        [0] * (self.max_atom_count - mol.GetNumAtoms()),
        dtype=np.int32,
    )
    graph = GraphMatrix(A, X)

    return graph if (degree > 0).all() else None

  def _defeaturize(self,
                   graph_matrix: GraphMatrix,
                   sanitize: bool = True,
                   cleanup: bool = True) -> RDKitMol:
    """
    Recreate RDKitMol from GraphMatrix object.
    Same featurizer need to be used for featurization and defeaturization.
    It only recreates bond and atom types, any kind of additional features
    like chirality or charge are not included.
    Therefore, any checks of type: original_smiles == defeaturized_smiles
    will fail on chiral or charged compounds.

    Parameters
    ----------
    graph_matrix: GraphMatrix
      GraphMatrix object.
    sanitize: bool, default True
      Should RDKit sanitization be included in the process.
    cleanup: bool, default True
      Splits salts and removes compounds with "*" atom types

    Returns
    -------
    mol: RDKitMol object
      RDKitMol object representing molecule.
    """

    try:
      from rdkit import Chem
    except ModuleNotFoundError:
      raise ImportError("This method requires RDKit to be installed.")

    if not isinstance(graph_matrix, GraphMatrix):
      return None

    node_labels = graph_matrix.node_features
    edge_labels = graph_matrix.adjacency_matrix

    mol = Chem.RWMol()

    for node_label in node_labels:
      mol.AddAtom(Chem.Atom(self.atom_decoder[node_label]))

    for start, end in zip(*np.nonzero(edge_labels)):
      if start > end:
        mol.AddBond(
            int(start), int(end), self.bond_decoder[edge_labels[start, end]])

    if sanitize:
      try:
        Chem.SanitizeMol(mol)
      except Exception:
        mol = None

    if cleanup:
      try:
        smiles = Chem.MolToSmiles(mol)
        smiles = max(smiles.split("."), key=len)
        if "*" not in smiles:
          mol = Chem.MolFromSmiles(smiles)
        else:
          mol = None
      except Exception:
        mol = None

    return mol

  def defeaturize(self, graphs: OneOrMany[GraphMatrix],
                  log_every_n: int = 1000) -> np.ndarray:
    """
    Calculates molecules from corresponding GraphMatrix objects.

    Parameters
    ----------
    graphs: GraphMatrix / iterable
      GraphMatrix object or corresponding iterable
    log_every_n: int, default 1000
      Logging messages reported every `log_every_n` samples.

    Returns
    -------
    features: np.ndarray
      A numpy array containing RDKitMol objext.
    """

    # Special case handling of single molecule
    if isinstance(graphs, GraphMatrix):
      graphs = [graphs]
    else:
      # Convert iterables to list
      graphs = list(graphs)

    molecules = []
    for i, gr in enumerate(graphs):
      if i % log_every_n == 0:
        logger.info("Featurizing datapoint %i" % i)

      try:
        molecules.append(self._defeaturize(gr))
      except Exception as e:
        logger.warning(
            "Failed to defeaturize datapoint %d, %s. Appending empty array",
            i,
            gr,
        )
        logger.warning("Exception message: {}".format(e))
        molecules.append(np.array([]))

    return np.asarray(molecules)

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (C:\Users\tobia\anaconda3\envs\Cheminformatics\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


## Log P Distribution Metrics