-
Notifications
You must be signed in to change notification settings - Fork 5
/
clean_tautomers.py
executable file
·236 lines (215 loc) · 9.94 KB
/
clean_tautomers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
from rdkit import Chem
from rdkit.Chem.MolStandardize import rdMolStandardize
import time
import ambit_tautomer
# MolVS' SMILES might not be identical to RDKit's, so RDKit is used instead of MolVS.
# Module-level tautomer enumerator shared by clean_taut_rdkit.
# NOTE(review): the name `enum` shadows the stdlib `enum` module — avoid `import enum` in this file.
enum = rdMolStandardize.TautomerEnumerator()
def clean_taut(dg, dg_execute, algorithm="CMI"):
    """
    Remove tautomeric pairs produced in a given generation.

    Note: this doesn't remove tautomers in the parent generation and does not consider
    whether or not one of the parent molecules was a tautomer.

    Keyword Arguments:
    dg -- the DG (DerivationGraph) of the network to look at
    dg_execute -- an instance of the DGExecute object of the generation in context
    algorithm -- the algorithm to be used for enumerating tautomers.
        Available algorithms include:
        "CM" for Simple Combinatorial,
        "CMI" for Improved Combinatorial,
        "IA-DFS" for Incremental Algorithm - Depth First Search,
        "combined" doesn't work; is intended for a combination of CMI and IA-DFS.
        See https://onlinelibrary.wiley.com/doi/abs/10.1002/minf.201200133 for a
        detailed discussion of the algorithms.

    Returns: the cleaned subset, universe
    """
    init_subset = time.time()
    subset = dg_execute.subset
    end_subset = time.time()
    print(f"Took {end_subset - init_subset} seconds to initialize subset")
    universe = dg_execute.universe
    end_universe = time.time()
    print(f"Took {end_universe - end_subset} seconds to create universe")
    # Materialize the subset into a list so indices remain stable while
    # `subset` itself is mutated during removal below.
    graphs = [g for g in subset]
    end_graphs = time.time()
    print(f"Took {end_graphs - end_universe} seconds to create list of graphs")
    # list of SMILES associated with each Graph object (MOD's SMILES flavour)
    mod_subset_smiles = [g.smiles for g in subset]
    cdk_molecules = [ambit_tautomer.smilesToMolecule(smiles) for smiles in mod_subset_smiles]
    # convert to CDK unique SMILES. TODO: these SMILES are perhaps not canonical.
    cdk_smiles = [ambit_tautomer.smiles_from_molecule(mol) for mol in cdk_molecules]
    end_canonicalization = time.time()
    print(f"Took {end_canonicalization - end_graphs} to canonicalize SMILES")
    # a mapping of the Graph objects with their corresponding DGVertex objects
    mod_dgverts = {v.graph: v for v in dg.vertices if v.graph in subset}
    end_dgverts = time.time()
    print(f"Took {end_dgverts - end_canonicalization} to create dg vertices")
    # a mapping {smiles: tuple(smiles)} between the SMILES of a given molecule
    # and those of all possible tautomers for that molecule.
    # BUG FIX: the original passed the undefined name `mode` here; the
    # function parameter is `algorithm`, so this raised NameError at runtime.
    taut_tautset_dict = {}
    for smiles in cdk_smiles:
        taut_tautset_dict[smiles] = ambit_tautomer.generateTautomers(smiles, algorithm)
    tautomer_sets = tuple(taut_tautset_dict.values())
    complete_enumeration = time.time()
    print(f"Took {complete_enumeration-end_dgverts} to create tautomer classes")
    start_whatever = time.time()
    # {tautomer_class (tuple of SMILES): class id}; start with one id per class
    class_ids = {}
    for i, taut_set in enumerate(tautomer_sets):
        class_ids[taut_set] = i
    # Find sets with common elements and assign them the same class id
    # (both classes get the smaller of the two ids — equivalent to the
    # original if/else but stated once).
    for i in range(len(tautomer_sets)):
        for j in range(i + 1, len(tautomer_sets)):
            for item in tautomer_sets[i]:
                if item in tautomer_sets[j]:
                    merged_id = min(class_ids[tautomer_sets[i]], class_ids[tautomer_sets[j]])
                    class_ids[tautomer_sets[i]] = merged_id
                    class_ids[tautomer_sets[j]] = merged_id
                    break
    reversed_dict = {}  # {class id: [all relevant MOD SMILES in that class]}
    for i in range(len(cdk_smiles)):
        for tautomer_class, class_id in class_ids.items():
            if cdk_smiles[i] in tautomer_class:
                # setdefault replaces the original's redundant if/elif pair
                reversed_dict.setdefault(class_id, []).append(mod_subset_smiles[i])
                # Each molecule belongs to exactly one merged class; stop at the
                # first match so merged classes sharing an id don't append duplicates.
                break
    #print("There are {0} unique tautomer classes among {1} molecules".format(len(reversed_dict.keys()), len(subset)))
    # {smiles to remove: smiles to keep}.
    # BUG FIX: the original mapped keeper -> removed, so for a class with more
    # than two observed tautomers each assignment overwrote the previous one
    # and only the LAST redundant tautomer was actually removed. Keying by the
    # removed SMILES (as clean_taut_rdkit does) removes all of them.
    to_remove = {}
    for seen_tautomers in reversed_dict.values():
        keeper = seen_tautomers[0]
        for redundant in seen_tautomers[1:]:
            if redundant != keeper:
                to_remove[redundant] = keeper
    print(f"My part took {time.time() - start_whatever} to complete")
    # GraphPrinter/postSection/Derivations/b are provided by the MOD
    # environment this script runs in; they are not defined in this module.
    p = GraphPrinter()
    p.simpleCarbons = True
    p.withColour = True
    p.collapseHydrogens = True
    start_removing = time.time()
    # For each redundant tautomer SMILES and the corresponding one to keep
    for item_to_remove, to_keep in to_remove.items():
        index = mod_subset_smiles.index(item_to_remove)
        index_to_keep = mod_subset_smiles.index(to_keep)
        postSection("Removed")
        graphs[index].print(p)
        postSection("Kept")
        graphs[index_to_keep].print(p)
        # The DGVertex associated with the molecule to be removed
        dg_vertex = mod_dgverts[graphs[index]]
        # Re-route every in-edge of the removed vertex to the kept tautomer so
        # no reaction information is lost when the graph is dropped.
        start_adding_edge = time.time()
        for e in dg_vertex.inEdges:
            for source in e.sources:
                d = Derivations()
                d.left = [source.graph]
                d.rules = e.rules
                d.right = [graphs[index_to_keep]]
                # NOTE(review): `b` is presumably the active DG builder from the
                # enclosing MOD session — confirm it is in scope at call time.
                b.addDerivation(d)
                print(f"Took {time.time()-start_adding_edge} to add edge {d}")
        subset.remove(graphs[index])
        universe.remove(graphs[index])
        #print(f"Removing {graphs[index]} and keeping {graphs[index_to_keep]}")
    print(f"Took {time.time() - start_removing} to remove tautomers from the network")
    return subset, universe
def clean_taut_rdkit(dg, dg_execute):
    """
    Clean tautomers from the derivation graph using RDKit.

    This was initially created as a test but it turns out RDKit does too many proton shifts
    and identifies unique species as tautomers. Considering that, Ambit with CMI algo could be a decent bet.

    Keyword arguments:
    dg --- a DG (DerivationGraph) object instance holding the network
    dg_execute --- a DGBuildExecute object instance

    return: the cleaned subset, universe
    """
    subset = dg_execute.subset
    universe = dg_execute.universe
    # Materialize the subset into a list so indices remain stable while
    # `subset` itself is mutated during removal below.
    graphs = [g for g in subset]
    # list of SMILES associated with each Graph object (MOD's SMILES flavour)
    mod_subset_smiles = [g.smiles for g in subset]
    # a mapping of the Graph objects with their corresponding DGVertex objects
    mod_dgverts = {v.graph: v for v in dg.vertices if v.graph in subset}
    # the list of RDKit canonical SMILES, index-aligned with mod_subset_smiles
    rdkit_subset_smiles = []
    # a mapping {smiles: tuple(smiles)} between the SMILES of a given molecule
    # and those of all possible tautomers for that molecule
    taut_tautset_dict = {}
    # a mapping {smiles: smiles} of the tautomer that needs removing to the one that should be kept
    to_remove = {}
    # Populate all possible tautomers and add to the above empty dict
    for mod_smiles in mod_subset_smiles:
        mol = Chem.MolFromSmiles(mod_smiles)
        # convert into RDKit canonical SMILES
        rdkit_smiles = Chem.MolToSmiles(mol)
        rdkit_subset_smiles.append(rdkit_smiles)
        # generate Mol objects of all possible tautomers
        # (`enum` is the module-level rdMolStandardize.TautomerEnumerator)
        all_tauts = enum.Enumerate(mol)
        # convert each tautomer Mol back into canonical SMILES
        all_taut_smiles = tuple(Chem.MolToSmiles(taut) for taut in all_tauts)
        taut_tautset_dict[mod_smiles] = all_taut_smiles
    # a tuple of tuples, containing tautomer classes as elements
    list_tautset = tuple(taut_tautset_dict.values())
    # I thought assigning class ids was a decent way of seeing which tautomer classes are identical
    # {tautomer_class: class_id}
    class_ids = {}
    # create initial class ids (one per class, by position)
    for i in range(len(list_tautset)):
        class_ids.update({list_tautset[i]: i})
    # for debugging purpose
    #ids_tonote = []
    #f = open('incomplete_match.txt', 'w')
    # search for matching tautomer classes: any shared tautomer SMILES means
    # the two classes describe the same molecule, so j inherits i's id
    for i in range(len(list_tautset)):
        for j in range(i+1, len(list_tautset)):
            for item in list_tautset[i]:
                if item in list_tautset[j]:
                    #if list_tautset[j] != list_tautset[i]:
                        #ids_tonote.append(class_ids[list_tautset[i]])
                        #f.write('Classes {0} and {1} have common elements but are not identical'.format(list_tautset[i], list_tautset[j]))
                        #f.write('\n')
                    #print(f'Classes {i} and {j} have common elements')
                    #class_ids[list_tautset[i]] = class_ids[list_tautset[i]]
                    class_ids[list_tautset[j]] = class_ids[list_tautset[i]]
                    #print('Updated class ids to ', class_ids[list_tautset[i]])
                    break
    #f.close()
    # Now, since we're done finding identical class ids:
    # a dictionary of observed tautomer classes mapped as {class id: [list of smiles]}
    dict_observed_tauts = {}
    for taut_class, class_id in class_ids.items():
        for i in range(len(rdkit_subset_smiles)):
            if rdkit_subset_smiles[i] in taut_class:
                # note that these are the SMILES of the graph generated by MOD (not RDKit's SMILES)
                if class_id in dict_observed_tauts.keys():
                    dict_observed_tauts[class_id].append(mod_subset_smiles[i])
                else :
                    dict_observed_tauts[class_id] = [mod_subset_smiles[i]]
    print('{0} unique tautomer classes in {1} molecules'.format(len(dict_observed_tauts.keys())
    , len(subset)))
    # debugging which molecules are causing trouble
    #print(class_ids)
    # add to the dictionary of items to be removed: keep the first observed
    # tautomer of each class, map every other one to it for removal
    for taut_class in dict_observed_tauts.values():
        if len(taut_class) > 1:
            for i in range(1, len(taut_class)):
                to_remove[taut_class[i]] = taut_class[0]
    for smiles in to_remove.keys():
        # The DGVertex associated with the molecule to be removed
        dg_vertex = mod_dgverts[graphs[mod_subset_smiles.index(smiles)]]
        # add a fake edge to the preserved graph to account for the removed graph (to avoid loss of reaction info)
        # NOTE(review): `Derivations` and `b` are not defined in this module —
        # presumably injected by the enclosing MOD session; confirm.
        for e in dg_vertex.inEdges:
            for source in e.sources:
                d = Derivations()
                d.left = [source.graph]
                d.rules = e.rules
                d.right = [graphs[mod_subset_smiles.index(to_remove[smiles])]]
                b.addDerivation(d)
        subset.remove(graphs[mod_subset_smiles.index(smiles)])
        universe.remove(graphs[mod_subset_smiles.index(smiles)])
    return subset, universe