# Evaluation Metrics 

This script takes SMILES strings or graph representations of ground-truth and generated insecticidal compounds and computes:

- The distribution of logP values

- The structural diversity of the compounds

- The insecticide-likeness of the compounds

Written by Tobias D. Muellers

## Load dependencies

In [1]:
import pandas as pd
import numpy as np
from typing import Any
import torch
import torch_geometric
from rdkit import Chem, RDLogger
from torch_geometric.data import Data

## Graphs to SMILES

This code takes the generated molecular graphs and converts them into a table of SMILES strings (with their logP values) for further analysis.

In [2]:
# Load the saved graphs (a PyTorch / PyTorch Geometric dataset) from disk.
# NOTE(security): weights_only=False allows torch.load to unpickle arbitrary
# Python objects, which can run code embedded in the file -- only load files
# that come from a trusted source.
graphs = torch.load('insecticides_graphs_small.pt', weights_only=False)

In [3]:
# from https://pytorch-geometric.readthedocs.io/en/2.4.0/_modules/torch_geometric/utils/smiles.html
# revised per comments
# Node feature vocabularies: a raw atom property is stored in the graph as its
# position in the matching list below.
x_map = {
    'atomic_num': list(range(119)),        # 0 .. 118
    'degree': list(range(11)),             # 0 .. 10
    'formal_charge': list(range(-5, 7)),   # -5 .. +6
}

# Edge feature vocabulary: only the bond type is encoded.  A bond is stored in
# the graph as the position of its RDKit bond-type name in this list.
# NOTE(review): these positions are this file's own encoding, not necessarily
# RDKit's numeric BondType values (RDKit defines further members, e.g. IONIC,
# that are not listed here) -- map integer codes back through these names.
e_map = {
    'bond_type': [
        'UNSPECIFIED',
        # integer bond orders
        'SINGLE', 'DOUBLE', 'TRIPLE', 'QUADRUPLE', 'QUINTUPLE', 'HEXTUPLE',
        # half-integer bond orders
        'ONEANDAHALF', 'TWOANDAHALF', 'THREEANDAHALF', 'FOURANDAHALF',
        'FIVEANDAHALF',
        # special bond kinds
        'AROMATIC', 'HYDROGEN', 'THREECENTER',
    ],
}

def from_smiles(smiles: str) -> 'torch_geometric.data.Data':
    r"""Build a :class:`torch_geometric.data.Data` graph from a SMILES string.

    Every atom becomes a node with three integer features: the positions of
    its atomic number, total degree and formal charge in the ``x_map`` lists.
    Every bond becomes two directed edges (one per direction) that share a
    single integer feature: the position of the bond-type name in
    ``e_map['bond_type']``.  A SMILES string that RDKit cannot parse is
    replaced by the empty molecule.

    Args:
        smiles (str): The SMILES string.

    Returns:
        A ``Data`` object with ``x`` of shape ``(num_atoms, 3)``,
        ``edge_index`` of shape ``(2, 2 * num_bonds)``, ``edge_attr`` of
        shape ``(2 * num_bonds, 1)`` and the input string under ``smiles``.
    """
    RDLogger.DisableLog('rdApp.*')  # silence RDKit parser messages

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparsable input: continue with a molecule that has no atoms.
        mol = Chem.MolFromSmiles('')

    atomic_nums = x_map['atomic_num']
    degrees = x_map['degree']
    charges = x_map['formal_charge']
    node_rows = [
        [
            atomic_nums.index(atom.GetAtomicNum()),
            degrees.index(atom.GetTotalDegree()),
            charges.index(atom.GetFormalCharge()),
        ]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(node_rows, dtype=torch.long).view(-1, 3)

    bond_names = e_map['bond_type']
    pairs, pair_attrs = [], []
    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        code = [bond_names.index(str(bond.GetBondType()))]

        # One undirected bond -> two directed edges with the same attribute.
        pairs.extend(([begin, end], [end, begin]))
        pair_attrs.extend((code, code))

    # The explicit view() calls keep the shapes well defined for bond-free
    # molecules, where the lists above are empty.
    edge_index = torch.tensor(pairs).t().to(torch.long).view(2, -1)
    edge_attr = torch.tensor(pair_attrs, dtype=torch.long).view(-1, 1)

    if edge_index.numel() > 0:
        # Order edges by (source, target) so the layout is deterministic.
        order = (edge_index[0] * x.size(0) + edge_index[1]).argsort()
        edge_index = edge_index[:, order]
        edge_attr = edge_attr[order]

    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr, smiles=smiles)



def to_smiles(data: 'torch_geometric.data.Data') -> Any:
    """Convert a :class:`torch_geometric.data.Data` graph to a SMILES string.

    This is the inverse of :func:`from_smiles`: column 0 of ``data.x`` is read
    as the atomic number, column 2 as a position in
    ``x_map['formal_charge']``, and column 0 of ``data.edge_attr`` as a
    position in ``e_map['bond_type']``.  Bond types are resolved by *name*
    through ``e_map`` so that a code always decodes to the bond type that
    :func:`from_smiles` encoded, even where the positions in ``e_map`` differ
    from RDKit's own numeric ``BondType`` values.

    Args:
        data (torch_geometric.data.Data): The molecular graph.

    Returns:
        The isomeric SMILES string produced by RDKit for the rebuilt molecule.

    Raises:
        IndexError: If a formal-charge or bond-type code lies outside the
            corresponding lookup list.
        Exception: Whatever RDKit raises while adding bonds or sanitizing a
            graph that is not a chemically valid molecule is propagated.
    """
    from rdkit import Chem

    mol = Chem.RWMol()

    for i in range(data.num_nodes):
        atom = Chem.Atom(data.x[i, 0].item())
        atom.SetFormalCharge(x_map['formal_charge'][data.x[i, 2].item()])
        mol.AddAtom(atom)

    bond_names = e_map['bond_type']
    seen = set()

    for i, (src, dst) in enumerate(data.edge_index.t().tolist()):
        # Both directions of a bond are stored; add each bond only once.
        key = (src, dst) if src <= dst else (dst, src)
        if key in seen:
            continue
        seen.add(key)

        bond_type = getattr(Chem.BondType, bond_names[data.edge_attr[i, 0].item()])
        mol.AddBond(src, dst, bond_type)

    mol = mol.GetMol()

    Chem.SanitizeMol(mol)
    Chem.AssignStereochemistry(mol)

    return Chem.MolToSmiles(mol, isomericSmiles=True)

In [4]:
# define function to create data
# based on adapted code from https://www.blopig.com/blog/2022/02/how-to-turn-a-smiles-string-into-a-molecular-graph-for-pytorch-geometric/
def create_py_geom_dataset(x_smiles, y):
    """Build a list of PyTorch Geometric graphs from SMILES and target values.

    Args:
        x_smiles: Column (iterable) of input SMILES strings.
        y: Column (iterable) with the property value for each SMILES string;
            paired with ``x_smiles`` by position.

    Returns:
        A list with one ``Data`` object per (SMILES, value) pair, holding the
        node features, edge index and edge features from ``from_smiles`` plus
        the value as a float tensor under ``y``.
    """
    def _to_graph(smiles, target):
        # Parse the molecule, then repackage its tensors together with the label.
        parsed = from_smiles(smiles)
        return Data(
            x=parsed.x.detach(),
            edge_index=parsed.edge_index.detach(),
            edge_attr=parsed.edge_attr.detach(),
            y=torch.tensor(target, dtype=torch.float),
        )

    return [_to_graph(smiles, target) for smiles, target in zip(x_smiles, y)]

def create_smiles_data(py_geom_df):
    """Create a table of SMILES strings and y values from a graph dataset.

    Args:
        py_geom_df: Indexable collection (supports ``len`` and integer
            indexing) of PyTorch Geometric graphs, each carrying a ``y``
            tensor.

    Returns:
        A ``pandas.DataFrame`` with one row per graph and two columns:
        ``SMILES`` (the string from ``to_smiles``) and ``logp`` (the first
        element of the graph's ``y`` tensor as a Python float).

    Raises:
        Any exception raised by ``to_smiles`` for a graph that cannot be
        converted is propagated unchanged.
    """
    smiles_store = []
    logp_store = []

    # Index-based access is kept so that dataset objects which only provide
    # __len__/__getitem__ keep working; each graph is fetched exactly once.
    for i in range(len(py_geom_df)):
        graph = py_geom_df[i]

        smiles = to_smiles(graph)
        # Take the first element with torch ops instead of going through
        # NumPy, so the tensor does not have to live on the CPU.
        logp = float(graph.y.detach().reshape(-1)[0].item())

        smiles_store.append(smiles)
        logp_store.append(logp)

    return pd.DataFrame(data={'SMILES': smiles_store, 'logp': logp_store})

In [5]:
# Work with the first 50 graphs only.
small_graphs = graphs[0:50]
# Table with one row per graph: its SMILES string and its logP value.
insecticides_out_small = create_smiles_data(small_graphs)

In [6]:
# Preview the first row of the converted table.
insecticides_out_small.head(1)

Unnamed: 0,SMILES,logp
0,O=[N+]([O-])c1ccccc1Cl,2.2482


## Log P Distribution Metrics

In [7]:
from scipy.stats import describe

In [8]:
# Summary statistics of the logP column: number of observations, min/max,
# mean, variance, skewness and kurtosis.
print(describe(insecticides_out_small['logp']))

DescribeResult(nobs=50, minmax=(-1.2941999435424805, 6.451580047607422), mean=2.891731585264206, variance=2.867706823878202, skewness=0.36445853438956816, kurtosis=-0.14949686090744319)


In [None]:
import seaborn as sns
from seaborn import kdeplot
# Kernel density estimate of the logP distribution.
sns.kdeplot(data=insecticides_out_small, x="logp")