In [None]:
# import molli as ml
from molli.external import rdkit as mrd
import pickle
from rdkit import Chem
from rdkit.Chem.PropertyMol import PropertyMol
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
import pandas as pd

def update_visualize_mols(
        name: str,
        rdkit_mol_list: list,
        subImgSize = (700,700),
        molsPerRow=5,
        prop: str = "_Name",
        svg=True
):
    """Draw the given molecules as a grid and write it to ``{name}.svg``.

    Args:
        name: Output path without extension; ``.svg`` is appended.
        rdkit_mol_list: Molecules to draw. ``GetProp(prop)`` is called on
            each one to obtain its legend.
        subImgSize: ``(width, height)`` of a single grid panel.
        molsPerRow: Number of panels per grid row.
        prop: Name of the molecule property used as the legend text.
        svg: Accepted for call compatibility but not read by this function;
            the output is always an SVG file.
    """
    captions = [mol.GetProp(prop) for mol in rdkit_mol_list]

    # Ceiling division: a partially filled last row still takes a full row.
    complete_rows, leftover = divmod(len(rdkit_mol_list), molsPerRow)
    row_count = complete_rows + 1 if leftover else complete_rows

    panel_width = subImgSize[0]
    panel_height = subImgSize[1]
    canvas = rdMolDraw2D.MolDraw2DSVG(
        molsPerRow * panel_width,
        row_count * panel_height,
        panel_width,
        panel_height,
    )
    canvas.drawOptions().legendFontSize = 30
    canvas.DrawMolecules(rdkit_mol_list, legends=captions)
    canvas.FinishDrawing()

    with open(f"{name}.svg", "w") as svg_file:
        svg_file.write(canvas.GetDrawingText())

# First pass: load the SMILES -> alkene-type mapping, build a canonicalized
# RDKit molecule for every entry, and name each one "<Type>_<running index>".
#
# SECURITY NOTE: pickle.load can execute arbitrary code while deserializing.
# Only load this file if it comes from a trusted source.
with open('problem_mol_type_dict.pkl', 'rb') as f:
    smi_dict = pickle.load(f)

smi_dict: dict

# One list of molecules per alkene type. The dictionary below is the single
# place that defines which type labels are recognised.
mono_list = list()
gem_list = list()
cis_list = list()
trans_list = list()
tri_list = list()
tetra_list = list()
type_lists = {
    'Mono': mono_list,
    'Gem': gem_list,
    'Cis': cis_list,
    'Trans': trans_list,
    'Tri': tri_list,
    'Tetra': tetra_list,
}
# Running count per type; also supplies the index used in each "_Name".
type_counts = dict.fromkeys(type_lists, 0)

all_mols = list()

for smi, alk_type in smi_dict.items():
    parsed = Chem.MolFromSmiles(smi)
    if parsed is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending SMILES instead of deeper inside PropertyMol.
        raise ValueError(f'Could not parse SMILES {smi!r} (labelled {alk_type!r})')
    can_rdmol = mrd.canonicalize_rdkit_mol(PropertyMol(parsed), sanitize=True)

    if alk_type in type_lists:
        type_counts[alk_type] += 1
        can_rdmol.SetProp("_Name", f'{alk_type}_{type_counts[alk_type]}')
        can_rdmol.SetProp("_Alkene_Type", alk_type)
        type_lists[alk_type].append(can_rdmol)

    # Entries with an unrecognised label are still collected here, but
    # without the "_Name" / "_Alkene_Type" properties set above.
    all_mols.append(can_rdmol)

# Expose the per-type totals under their individual names.
mono = type_counts['Mono']
gem = type_counts['Gem']
cis = type_counts['Cis']
trans = type_counts['Trans']
tri = type_counts['Tri']
tetra = type_counts['Tetra']

# Per-type totals keyed by the same labels stored in "_Alkene_Type".
all_types = {
    'Mono': mono,
    'Gem' : gem,
    'Cis' : cis,
    'Trans' : trans,
    'Tri': tri,
    'Tetra' : tetra
}
print(f'There are {mono} Monosubstituted alkenes!')
print(f'There are {gem} Gem Disubstituted substituted alkenes!')
print(f'There are {cis} Cis Disubstituted alkenes!')
print(f'There are {trans} Trans Disubstituted alkenes!')
print(f'There are {tri} Trisubstituted alkenes!')
print(f'There are {tetra} Tetrasubstituted alkenes!')

# update_visualize_mols('Mono', mono_list, molsPerRow=5, svg=True)
# update_visualize_mols('Gem', gem_list, molsPerRow=5, svg=True)
# update_visualize_mols('Cis', cis_list, molsPerRow=5, svg=True)
# update_visualize_mols('Trans', trans_list, molsPerRow=5, svg=True)
# update_visualize_mols('Tri', tri_list, molsPerRow=5, svg=True)
# update_visualize_mols('Tetra', tetra_list, molsPerRow=5, svg=True)

There are 971 Monosubstituted alkenes!
There are 272 Gem Disubstituted substituted alkenes!
There are 191 Cis Disubstituted alkenes!
There are 1108 Trans Disubstituted alkenes!
There are 386 Trisubstituted alkenes!
There are 62 Tetrasubstituted alkenes!


Certain alkenes were mislabeled even though the code was correct. In addition, there were other problematic alkenes, such as porphyrins, structures with multiple alkenes, deuteration, etc. These were removed for the purposes of the external dataset.

In [None]:

# Second pass: apply the manually curated problem sheet. The sheet is indexed
# by the "_Name" assigned in the first pass and has a free-text "Problem"
# column. Molecules are sorted into three lists:
#   correct_mols      - not listed, or relabelled ("actually <type>") with no
#                       further issue other than deuteration
#   multiple_problems - more than one alkene present
#   not_usable        - everything else that is listed
# NOTE: molecules are renamed in place, so re-running this cell without first
# re-running the classification cell will no longer match the sheet's index.
df = pd.read_excel('Problems_with_External.xlsx', index_col=0)
multiple_problems = list()
not_usable = list()

correct_mols = list()

# Next free index per alkene type, seeded from the first-pass totals. Each
# relabelled molecule takes a fresh index so its new name cannot collide with
# an existing molecule of that type or with another relabelled one. A copy is
# used so that all_types itself is left unchanged.
next_index = dict(all_types)

# Words in the problem text that mark a structure with several alkenes.
multi_keywords = {'multiple', 'present', 'many', 'both'}

for rdmol in all_mols:
    name = rdmol.GetProp("_Name")
    if name not in df.index:
        correct_mols.append(rdmol)
        continue

    problem = df.loc[name, 'Problem']
    print(problem)
    split = problem.split(' ')
    if 'actually' in problem:
        # "actually <type> ...": the second word is the corrected alkene type.
        alk_type = split[1].capitalize()
        rdmol.SetProp("_Alkene_Type", alk_type)
        rdmol.SetProp("_OldName", name)

        next_index[alk_type] += 1
        rdmol.SetProp('_Name', f'{alk_type}_{next_index[alk_type]}')

        # A pure relabel, optionally flagged as deuterated, is usable as is.
        if (len(split) == 2) or ((len(split) == 3) and ('deuterated' in split)):
            correct_mols.append(rdmol)
            continue

    if multi_keywords.intersection(split):
        rdmol.SetProp("_Name", f'{rdmol.GetProp("_Name")}_mul')
        multiple_problems.append(rdmol)
    else:
        rdmol.SetProp("_Name", f'{rdmol.GetProp("_Name")}_not')
        not_usable.append(rdmol)

print(f'There are {len(multiple_problems)} alkenes with multiple that would need to be sorted')
print(f'There are {len(not_usable)} alkenes that we should avoid')


# update_visualize_mols('multiple_alkenes', multiple_problems)       
# update_visualize_mols('not_usable', not_usable)

multiple mono
many trans
actually cis deuterated
multiple trans
actually cis tri present
cis tri and trans present
multiple mono
actually cis
actually cis deuterated
actually cis
actually mono
both trans and cis
multiple mono
multiple tri
tri present
many trans
porphyrin
multiple trans
actually cis
porphyrin
actually tri deuterated
allene multiple tri
actually tri deuterated
actually cis
multiple tri cis present
multiple mono
multiple tri
actually cis
multiple tri
actually cis
multiple tri
porphyrin
many mono
actually cis
allene multiple tri
tri present
multiple mono
cis present
multiple tri
actually cis
actually cis
tetra present
actually cis
tetra present
gem present
cis present
multiple cis
tetra present
porphyrin
porphyrin
actually cis
both trans and mono present
tetra present
cis present
actually cis
porphyrin
both trans and tri
many mono
actually cis
porphyrin
both trans and mono present
actually cis tri present
many tri mono present
actually cis
actually cis
multiple tri
multipl

In [None]:
# Final pass: check that the retained molecules are unique within each alkene
# type (by canonical SMILES), renumber them consecutively as "<Type>_<index>",
# and pickle the resulting list.
mono_set = set()
mono_list = list()
gem_set = set()
gem_list = list()
cis_set = set()
cis_list = list()
trans_set = set()
trans_list = list()
tri_set = set()
tri_list = list()
tetra_set = set()
tetra_list = list()

# All six types go through the same code path below, so none of them can miss
# the renaming or the list append.
seen_smiles = {
    'Mono': mono_set,
    'Gem': gem_set,
    'Cis': cis_set,
    'Trans': trans_set,
    'Tri': tri_set,
    'Tetra': tetra_set,
}
typed_mols = {
    'Mono': mono_list,
    'Gem': gem_list,
    'Cis': cis_list,
    'Trans': trans_list,
    'Tri': tri_list,
    'Tetra': tetra_list,
}
final_counts = dict.fromkeys(seen_smiles, 0)

final_mols = list()

for rdmol in correct_mols:
    rdmol = PropertyMol(rdmol)
    alk_type = rdmol.GetProp("_Alkene_Type")
    smi = Chem.MolToSmiles(rdmol, canonical=True)

    if alk_type in final_counts:
        seen = seen_smiles[alk_type]
        seen.add(smi)
        final_counts[alk_type] += 1
        count = final_counts[alk_type]
        # A duplicate SMILES leaves the set one short of the running count.
        assert len(seen) == count, f'The {alk_type.lower()} set is {len(seen)} while the index is {count}'
        rdmol.SetProp("_Name", f'{alk_type}_{count}')
        typed_mols[alk_type].append(rdmol)

    # Molecules with an unrecognised type are kept but not renamed.
    final_mols.append(rdmol)

# Expose the per-type totals under their individual names.
mono = final_counts['Mono']
gem = final_counts['Gem']
cis = final_counts['Cis']
trans = final_counts['Trans']
tri = final_counts['Tri']
tetra = final_counts['Tetra']

print(f'There are {mono} Monosubstituted alkenes!')
print(f'There are {gem} Gem Disubstituted substituted alkenes!')
print(f'There are {cis} Cis Disubstituted alkenes!')
print(f'There are {trans} Trans Disubstituted alkenes!')
print(f'There are {tri} Trisubstituted alkenes!')
print(f'There are {tetra} Tetrasubstituted alkenes!')

# update_visualize_mols('Mono', mono_list, molsPerRow=5, svg=True)
# update_visualize_mols('Gem', gem_list, molsPerRow=5, svg=True)
# update_visualize_mols('Cis', cis_list, molsPerRow=5, svg=True)
# update_visualize_mols('Trans', trans_list, molsPerRow=5, svg=True)
# update_visualize_mols('Tri', tri_list, molsPerRow=5, svg=True)
# update_visualize_mols('Tetra', tetra_list, molsPerRow=5, svg=True)

with open('fix_external_mols.pkl', 'wb') as f:
    pickle.dump(final_mols, f)

There are 862 Monosubstituted alkenes!
There are 260 Gem Disubstituted substituted alkenes!
There are 291 Cis Disubstituted alkenes!
There are 785 Trans Disubstituted alkenes!
There are 292 Trisubstituted alkenes!
There are 34 Tetrasubstituted alkenes!
