Skip to content

Commit

Permalink
Improvements to lomap:
Browse files Browse the repository at this point in the history
- Added code to improve the mapping of molecules to the MCS in cases where there is
  symmetry. By default the map which minimises the number of element changes is
  chosen. However, if the -3 option is specified then the map which minimises the
  overall RMSD of the mapping (assuming fixed coordinates) is chosen.
  process.
- Added code to filter the MCS to remove atoms which match topologically but are not
  close in 3D space. By default this is off, but the -x option specifies a distance
  threshold to use for deleting MCS atoms.
- Added an atomic_number rule which returns (currently) the fraction of atoms in the
  MCS which match in terms of atomic number. Not currently used.

As part of this, cleaned up some very silly code in map_mcs_mol. It still pointlessly
maps the MCS molecule to itself, however.

Also improved the testing code in mcs.py
  • Loading branch information
mark-mackey-cresset committed Mar 13, 2019
1 parent bac6c64 commit c364e7d
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 22 deletions.
27 changes: 21 additions & 6 deletions lomap/dbmol.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ class DBMolecules(object):

# Initialization function
def __init__(self, directory, parallel=1, verbose='off',
time=20, ecrscore=0.0, output=False,
time=20, ecrscore=0.0, threed=False, max3d=1000.0, output=False,
name='out', display=False,
max=6, cutoff=0.4, radial=False, hub=None, fingerprint=False, fast=False, linksfile=None):

Expand All @@ -92,6 +92,11 @@ def __init__(self, directory, parallel=1, verbose='off',
the maximum time in seconds used to perform the MCS search
ecrscore: float
the electrostatic score to be used (if != 0) if two molecule have diffrent charges
threed: bool
If true, symmetry-equivalent MCSes are filtered to prefer the one with the best real-space alignment
max3d: float
The MCS is filtered to remove atoms which are further apart than this threshold. The default of 1000 is
effectively "no filter"
output : bool
a flag used to generate or not the output files
name : str
Expand Down Expand Up @@ -136,12 +141,14 @@ def __init__(self, directory, parallel=1, verbose='off',
radial_str = ''
fingerprint_str = ''
fast_str = ''
threed_str = ''

parser.set_defaults(output=output)
parser.set_defaults(display=display)
parser.set_defaults(radial=radial)
parser.set_defaults(fingerprint=fingerprint)
parser.set_defaults(fast=fast)
parser.set_defaults(threed=threed)
if output:
output_str = '--output'

Expand All @@ -157,11 +164,15 @@ def __init__(self, directory, parallel=1, verbose='off',
if fast:
fast_str = '--fast'

names_str = '%s --parallel %s --verbose %s --time %s --ecrscore %s --name %s --max %s --cutoff %s --hub %s %s %s %s %s %s' \
if threed:
threed_str = '--threed'

names_str = '%s --parallel %s --verbose %s --time %s --ecrscore %s --max3d %s --name %s --max %s --cutoff %s --hub %s %s %s %s %s %s %s' \
% (
directory, parallel, verbose, time, ecrscore, name, max, cutoff, hub, output_str, display_str,
radial_str, fingerprint_str, fast_str)
directory, parallel, verbose, time, ecrscore, max3d, name, max, cutoff, hub, output_str, display_str,
radial_str, fingerprint_str, fast_str, threed_str)

print(names_str)
self.options = parser.parse_args(names_str.split().extend(['--linksfile',linksfile]))

# Internal list container used to store the loaded molecule objects
Expand Down Expand Up @@ -494,7 +505,7 @@ def ecr(mol_i, mol_j):
# The scoring between the two molecules is performed by using different rules.
# The total score will be the product of all the single rules
if not fingerprint:
tmp_scr = ecr_score * MC.mncar() * MC.mcsr()
tmp_scr = ecr_score * MC.mncar() * MC.mcsr() # * MC.atomic_number_rule()
strict_scr = tmp_scr * MC.tmcsr(strict_flag=True)
loose_scr = tmp_scr * MC.tmcsr(strict_flag=False)
strict_mtx[k] = strict_scr
Expand Down Expand Up @@ -947,7 +958,7 @@ def startup():
ops = parser.parse_args()

# Molecule DataBase initialized with the passed user options
db_mol = DBMolecules(ops.directory, ops.parallel, ops.verbose, ops.time, ops.ecrscore,
db_mol = DBMolecules(ops.directory, ops.parallel, ops.verbose, ops.time, ops.ecrscore, ops.threed, ops.max3d,
ops.output, ops.name, ops.display, ops.max, ops.cutoff, ops.radial, ops.hub, ops.fingerprint, ops.fast, ops.linksfile)
# Similarity score linear array generation
strict, loose = db_mol.build_matrices()
Expand Down Expand Up @@ -983,6 +994,10 @@ def startup():
help='Set the maximum time in seconds to perform the mcs search between pair of molecules')
mcs_group.add_argument('-e', '--ecrscore', default=0.0, action=CheckEcrscore, type=float, \
help='If different from 0.0 the value is use to set the electrostatic score between two molecules with different charges')
mcs_group.add_argument('-3', '--threed', default=False, action='store_true', \
help='Use the input 3D coordinates to guide the preferred MCS mappings')
mcs_group.add_argument('-x', '--max3d', default=1000, type=float, \
help='The MCS is trimmed to remove atoms which are further apart than this distance')

out_group = parser.add_argument_group('Output setting')
out_group.add_argument('-o', '--output', default=True, action='store_true', \
Expand Down
142 changes: 126 additions & 16 deletions lomap/mcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class MCS(object):
"""

def __init__(self, moli, molj, options=argparse.Namespace(time=20, verbose='info')):
def __init__(self, moli, molj, options=argparse.Namespace(time=20, verbose='info', max3D=1000, threed=False)):
"""
Initialization function
Expand All @@ -87,6 +87,81 @@ def __init__(self, moli, molj, options=argparse.Namespace(time=20, verbose='info
"""

def best_substruct_match(moli,molj,by_rmsd=True):
"""
This function looks over all of the substructure matches and returns the one
with the best 3D correspondence (if by_rmsd is true), or the fewest number
of atomic number mismatches (if by_rmsd is false)
"""

# Sanity checking
if not moli.HasSubstructMatch(self.mcs_mol):
raise ValueError('RDkit MCS Subgraph first molecule search failed')

if not molj.HasSubstructMatch(self.mcs_mol):
raise ValueError('RDkit MCS Subgraph second molecule search failed')

moli_sub = moli.GetSubstructMatches(self.mcs_mol,uniquify=False)
molj_sub = molj.GetSubstructMatches(self.mcs_mol,uniquify=False)
best_rmsd=1e8
for mapi in moli_sub:
for mapj in molj_sub:
rmsd=0
for pair in zip(mapi,mapj):
if by_rmsd:
rmsd += (moli.GetConformer().GetAtomPosition(pair[0]) -
molj.GetConformer().GetAtomPosition(pair[1])).LengthSq()
else:
if moli.GetAtomWithIdx(pair[0]).GetAtomicNum() != molj.GetAtomWithIdx(pair[1]).GetAtomicNum():
rmsd+=1
if rmsd < best_rmsd:
besti=mapi
bestj=mapj
best_rmsd=rmsd

return (besti,bestj)

def trim_mcs_mol(max_deviation=2.0):
"""
This function is used to trim the MCS molecule to remove mismatched atoms i.e atoms
where the topological mapping does not work in 3D coordinates.
Parameters
----------
max_deviation : the maximum difference in Angstroms between mapped atoms to allow
"""

print("debug max_deviation=",max_deviation)
while True:
(mapi,mapj) = best_substruct_match(self.__moli_noh,self.__molj_noh,by_rmsd=True)
worstatomidx=-1
worstdist=0
atomidx=0
for pair in zip(mapi,mapj):
dist = (self.__moli_noh.GetConformer().GetAtomPosition(pair[0]) -
self.__molj_noh.GetConformer().GetAtomPosition(pair[1])).Length()
if dist > worstdist:
worstdist=dist
worstatomidx=atomidx
atomidx=atomidx+1

if worstdist > max_deviation:
# Remove the furthest-away atom and try again
rwm = Chem.RWMol(self.mcs_mol)
print("REMOVING ATOM",worstatomidx," with distance", worstdist)
rwm.RemoveAtom(worstatomidx)
self.mcs_mol=Chem.Mol(rwm)
else:
break




def map_mcs_mol():
"""
Expand All @@ -97,32 +172,22 @@ def map_mcs_mol():

# mcs indexes mapped back to the first molecule moli

if self.__moli_noh.HasSubstructMatch(self.mcs_mol):
moli_sub = self.__moli_noh.GetSubstructMatch(self.mcs_mol)
else:
raise ValueError('RDkit MCS Subgraph first molecule search failed')

# GAC TEST 02/17/17
# mcsi_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)

# This bit's a bit pointless - it will always map directly to itself?
if self.mcs_mol.HasSubstructMatch(self.mcs_mol):
mcsi_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
else:
raise ValueError('RDkit MCS Subgraph search failed')

(moli_sub,molj_sub) = best_substruct_match(self.__moli_noh,self.__molj_noh,by_rmsd=self.options.threed)

# mcs to moli
map_mcs_mol_to_moli_sub = zip(mcsi_sub, moli_sub)

# An RDkit atomic property is defined to store the mapping to moli
for idx in map_mcs_mol_to_moli_sub:
self.mcs_mol.GetAtomWithIdx(idx[0]).SetProp('to_moli', str(idx[1]))

# mcs indexes mapped back to the second molecule molj

if self.__molj_noh.HasSubstructMatch(self.mcs_mol):
molj_sub = self.__molj_noh.GetSubstructMatch(self.mcs_mol)
else:
raise ValueError('RDkit MCS Subgraph second molecule search failed')

if self.mcs_mol.HasSubstructMatch(self.mcs_mol):
mcsj_sub = self.mcs_mol.GetSubstructMatch(self.mcs_mol)
Expand Down Expand Up @@ -204,6 +269,8 @@ def set_ring_counter(mol):
# Set logging level and format
logging.basicConfig(format='%(levelname)s:\t%(message)s', level=logging.INFO)

self.options=options

# Local pointers to the passed molecules
self.moli = moli
self.molj = molj
Expand Down Expand Up @@ -254,6 +321,12 @@ def set_ring_counter(mol):
if sanitFail: # if not, the MCS is skipped
raise ValueError('Sanitization Failed...')

# Trim the MCS to remove atoms with too-large real-space deviations
try:
trim_mcs_mol(max_deviation=self.options.max3d)
except Exception as e:
raise ValueError(str(e))

# Mapping between the found MCS molecule and moli, molj
try:
map_mcs_mol()
Expand Down Expand Up @@ -288,6 +361,7 @@ def get_map(self):

return self.__map_moli_molj

# Note - not used in the main LOMAP calculation - here primarily for testing?
@staticmethod
def getMapping(moli, molj, hydrogens=False, fname=None, time_out=150):

Expand Down Expand Up @@ -690,11 +764,37 @@ def extend_conflict(mol, conflict):

return math.exp(-2 * beta * (orig_nha_mcs_mol - max_frag_num_heavy_atoms))

# AtomicNumber rule (Trim rule)
def atomic_number_rule(self):

"""
This rule checks how many elements have been changed in the MCS
and returns the fraction of MCS matches that are the same atomicnumber
"""
natoms=0
nmatch=0
for at in self.mcs_mol.GetAtoms():
natoms=natoms+1
moli_idx = int(at.GetProp('to_moli'))
molj_idx = int(at.GetProp('to_molj'))
moli_a = self.moli.GetAtoms()[moli_idx]
molj_a = self.molj.GetAtoms()[molj_idx]

if moli_a.GetAtomicNum() == molj_a.GetAtomicNum():
nmatch=nmatch+1

return nmatch/natoms


if "__main__" == __name__:

mola = Chem.MolFromMol2File('../test/basic/2-methylnaphthalene.mol2', sanitize=False, removeHs=False)
molb = Chem.MolFromMol2File('../test/basic/2-naftanol.mol2', sanitize=False, removeHs=False)
#mola = Chem.MolFromMol2File('../test/basic/2-methylnaphthalene.mol2', sanitize=False, removeHs=False)
#molb = Chem.MolFromMol2File('../test/basic/2-naftanol.mol2', sanitize=False, removeHs=False)
#mola = Chem.MolFromMolFile('../test/transforms/chlorotoluyl1.sdf', sanitize=False, removeHs=False)
#molb = Chem.MolFromMolFile('../test/transforms/chlorotoluyl2.sdf', sanitize=False, removeHs=False)
mola = Chem.MolFromMolFile('/home/mark/lomap-test/max/cdk2_ds/lomap_sdf/cdk2_lig13.sdf', sanitize=False, removeHs=False)
molb = Chem.MolFromMolFile('/home/mark/lomap-test/max/cdk2_ds/lomap_sdf/cdk2_lig14.sdf', sanitize=False, removeHs=False)

mp = MCS.getMapping(mola, molb, hydrogens=False, fname='mcs.png')

Expand All @@ -709,14 +809,24 @@ def extend_conflict(mol, conflict):
# # Rules calculations
mcsr = MC.mcsr()
mncar = MC.mncar()
atnum = MC.atomic_number_rule()

strict = MC.tmcsr(strict_flag=True)
loose = MC.tmcsr(strict_flag=False)

print('TMCRS STRICT = %f , TMCRS LOOSE = %f' % (strict, loose))
print('MCSR = ', mcsr)
print('MNCAR = ', mncar)
print('ATNUM = ', atnum)

tmp = mcsr * mncar

print('Total Strict = %f , Total Loose = %f' % (tmp * strict, tmp * loose))

for at in MC.mcs_mol.GetAtoms():
moli_idx = int(at.GetProp('to_moli'))
molj_idx = int(at.GetProp('to_molj'))
moli_a = mola.GetAtoms()[moli_idx]
molj_a = molb.GetAtoms()[molj_idx]
print("MCS match: ",moli_idx,moli_a.GetAtomicNum(),molj_idx,molj_a.GetAtomicNum())

0 comments on commit c364e7d

Please sign in to comment.