# Script to check accounting of features
- Use John Chodera's .sdf of docked conformers of Moonshot subs
- Get features of each compound
- Compare to features for all the bits we get

In [1]:
from rdkit.Chem import AllChem
import os
from rdkit import RDConfig
from rdkit import Chem
from rdkit.Chem.Lipinski import RotatableBondSmarts
from rdkit.Chem import BRICS
import numpy as np


def getBits(mol):
    '''

    Parameters
    ----------
    mol : rdkit mol object to be broken up into fragments by breaking 
    rotable bonds

    Returns
    -------
    mols : A list of rdkit mol objects

    '''
    # find the rotatable bonds
    bonds = mol.GetSubstructMatches(RotatableBondSmarts)
    
    bonds = [((x,y),(0,0)) for x,y in bonds]
    p = BRICS.BreakBRICSBonds(mol,bonds=bonds)
 
    mols = [mol for mol in Chem.GetMolFrags(p,asMols=True)]
    
    return mols

# Function to build feature maps and score feats
fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))

keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder',
        'Aromatic', 'Hydrophobe', 'LumpedHydrophobe')

In [2]:
# Get the data - test missing features in JC's docking Moonshot compound set
compound_mols = Chem.SDMolSupplier('in_data/covid_submissions_all_info-docked-overlap.sdf')

In [5]:
# Let's compare each compounds features to it's bits
# Creat test lists
numb_feats_missing = []
all_feats = []

for compound in compound_mols:    
    # Create empty feature, x,y,z lists
    compound_feat_list = []
    bits_feat_list = []
    
    # Get compound bits
    bits = getBits(compound)
    
    # Get features of compound
    compound_feats = []
    rawFeats = fdef.GetFeaturesForMol(compound)
    
    # Filter that list down to only include the ones we're intereted in
    compound_feats.append([f for f in rawFeats if f.GetFamily() in keep])
    
    # Get features of bits
    bit_feats = []
    for bit in bits:
        rawFeats = fdef.GetFeaturesForMol(bit)
        # Filter that list down to only include the ones we're intereted in
        bit_feats.append([f for f in rawFeats if f.GetFamily() in keep])
    
    # Need to get compounds features and x,y,z coordinate lists
    for feat in compound_feats[0]:
        compound_feat_list.append((feat.GetType(), feat.GetPos().x, feat.GetPos().y, feat.GetPos().z))
    
    # Get bits features and x,y,z coordinate lists
    for feat_bit in bit_feats:
        for feat in feat_bit:
            bits_feat_list.append((feat.GetType(), feat.GetPos().x, feat.GetPos().y, feat.GetPos().z))
          
    # What featuress are missing?
    missing_feats = [elem for elem in compound_feat_list if elem not in bits_feat_list]
        
    # Whats the total story - count missing features and the number of expected compound features
    numb_feats_missing.append(len(missing_feats))
    all_feats.append(len(compound_feat_list))

In [7]:
# Overall percent misisng
(np.sum(numb_feats_missing) / np.sum(all_feats)) * 100

3.1626703354297696