### This implementation exists within molli 1.2.1, but the syntax has since changed with newer versions of RDKit. As a result, this workflow uses its own native implementation of the canonicalization.

In [3]:
import molli as ml
from molli.external import rdkit as mrd
import pickle
from rdkit import Chem
import numpy as np
from rdkit.Chem.PropertyMol import PropertyMol
from tqdm import tqdm
from rdkit.Chem import AllChem as ac

def can_mol_order(rdmol:PropertyMol) -> tuple[PropertyMol, list[int], list[int]]:
    '''Canonicalize an RDKit mol (with explicit hydrogens) and report the atom/bond output orders.

    Hydrogens are ALWAYS added (made explicit) with ``Chem.AddHs`` before the canonical
    SMILES is written. The canonical SMILES is then parsed back (``sanitize=False``) into
    a new ``PropertyMol``, which is what gets returned.

    Important Notes:
    - The returned mol carries the canonical SMILES with hydrogens as the property
      "_Canonical_SMILES_w_H".
    - Properties listed by ``rdmol.GetPropNames()`` are copied onto the returned mol when
      it does not already have them. ``GetPropNames`` is called with its default
      arguments; per the RDKit API those defaults skip private (underscore-prefixed) and
      computed properties.
      NOTE(review): this means properties such as "_Kekulize_Issue" are presumably NOT
      propagated by this function - confirm whether callers rely on that.
    - The property cache of the returned mol is not updated, so consider calling
      ``UpdatePropertyCache()`` on it before using it for further RDKit work.

    Parameters
    ----------
    rdmol : PropertyMol
        RDKit Mol to canonicalize

    Returns
    -------
    tuple[PropertyMol, list[int], list[int]]
        `1. Canonical RDKit Mol Object with Hydrogens\n`
        `2. A List for reordering the Atom Indices after canonicalization\n`
        `3. A list for reordering the Bond Indices after canonicalization\n`

    Raises
    ------
    ValueError
        If RDKit cannot parse the canonical SMILES it has just produced.
    '''
    # Function-scope import so this block does not depend on the notebook's import cell
    from ast import literal_eval

    # This is here to deal with any smiles strings or mol objects that do not get assigned hydrogens
    new_rdmol = Chem.AddHs(rdmol)

    # Writing the SMILES is what populates the "_smilesAtomOutputOrder" and
    # "_smilesBondOutputOrder" properties on new_rdmol, so it must happen before they are read.
    can_smiles_w_h = Chem.MolToSmiles(new_rdmol, canonical=True)

    # Both properties are read back as strings holding a list literal (e.g. "[0,1,2,]").
    # literal_eval parses that literal without executing arbitrary code, unlike eval.
    can_atom_reorder = literal_eval(new_rdmol.GetProp("_smilesAtomOutputOrder"))
    canonical_bond_reorder_list = literal_eval(new_rdmol.GetProp("_smilesBondOutputOrder"))

    # sanitize=False allows maintaining of hydrogens when the Mol object is created
    parsed_mol = Chem.MolFromSmiles(can_smiles_w_h, sanitize=False)
    if parsed_mol is None:
        raise ValueError(
            f"RDKit could not parse the canonical SMILES: {can_smiles_w_h}"
        )
    can_mol_w_h = PropertyMol(parsed_mol)
    # Certain odd molecules result in some odd calculated properties, so the property
    # cache is deliberately NOT updated here.

    # Helps new rdkit object maintain original properties of rdkit mol put in
    for prop in list(rdmol.GetPropNames()):
        if not can_mol_w_h.HasProp(prop):
            can_mol_w_h.SetProp(prop, rdmol.GetProp(prop))

    can_mol_w_h.SetProp("_Canonical_SMILES_w_H", f"{can_smiles_w_h}")

    return can_mol_w_h, can_atom_reorder, canonical_bond_reorder_list

def reorder_molecule(
    ml_mol: ml.Molecule,
    can_rdmol_w_h: PropertyMol,
    can_atom_reorder: list,
    can_bond_reorder: list,
) -> dict[ml.Molecule, PropertyMol]:
    '''Reorder a Molli molecule in place using the outputs of can_mol_order.

    The atoms, bonds and coordinates of ``ml_mol`` are rearranged so that position ``k``
    of the new ordering holds what used to be at index ``can_atom_reorder[k]``
    (``can_bond_reorder[k]`` for bonds). The element order that would result is compared
    against the atoms of ``can_rdmol_w_h`` BEFORE anything is written, so a mismatching
    molecule is left untouched.

    Parameters
    ----------
    ml_mol : Molecule
        Molli Molecule Object to be reordered (modified in place)
    can_rdmol_w_h : PropertyMol
        Canonical RDKit Object to be matched with
    can_atom_reorder : list
        List of integers associated with the atom reordering
    can_bond_reorder : list
        List of integers associated with the bond reordering

    Returns
    -------
    dict[ml.Molecule, PropertyMol]
        Dictionary linking Molli Molecule Object to RDKit Object

    Raises
    ------
    AssertionError
        If the element symbols of the reordered Molli atoms do not match the element
        symbols of the canonical RDKit atoms.
    '''

    # Build the new orderings first; nothing on ml_mol is modified yet
    original_atoms = ml_mol.atoms
    original_bonds = ml_mol.bonds
    fixed_atom_order_list = [original_atoms[idx] for idx in can_atom_reorder]
    fixed_bond_order_list = [original_bonds[idx] for idx in can_bond_reorder]

    # This checks to see if the atom order in the canonical smiles matches the new molli order of atoms
    can_rdkit_atom_elem = np.array(
        [rd_atom.GetSymbol() for rd_atom in can_rdmol_w_h.GetAtoms()]
    )
    new_molli_elem = np.array(
        [atom.element.symbol for atom in fixed_atom_order_list]
    )

    # Raised explicitly (rather than via `assert`) so the check survives `python -O`
    if not np.array_equal(can_rdkit_atom_elem, new_molli_elem):
        raise AssertionError(
            f"Array of rdkit atoms: {can_rdkit_atom_elem} is not equal to array of molli atoms: {new_molli_elem}"
        )

    # The orderings agree, so commit atoms, bonds and geometry to the molecule
    ml_mol._atoms = fixed_atom_order_list
    ml_mol._bonds = fixed_bond_order_list
    ml_mol.coords = ml_mol.coords[can_atom_reorder]

    return {ml_mol: can_rdmol_w_h}


In [4]:
# Library holding the molecules in their original (unordered) atom order
mlib = ml.MoleculeLibrary("2_Diol_Unordered.mlib")

# Library that the reordered molecules are written into; opened with
# overwrite=True and readonly=False
reorder_mlib = ml.MoleculeLibrary("3_Diol_Reordered.mlib", overwrite=True, readonly=False)

# RDKit molecules matching the unordered library.
# NOTE: pickle.load can execute arbitrary code, so only load files you produced yourself.
with open("2_Diol_Unordered_w_H.pkl", "rb") as f:
    rdmols = pickle.load(f)

# Look-up of each RDKit molecule by its "_Name" property
rdmol_dict = {mol.GetProp("_Name"): mol for mol in rdmols}

# Names of the properties handled explicitly by the rest of the workflow
ex_props = [
    "_Name",
    "_Alkene_Type",
    "_Diol",
    "_CIP",
    "_Canonical_SMILES",
]

# Accumulators for the rest of the workflow
can_rdmols = []
problem_can_rdmols = []

In [5]:
# Canonicalize every molecule of the unordered library, store the reordered Molli
# molecule in the new library and collect the matching canonical RDKit molecules.
with mlib.reading(), reorder_mlib.writing():
    for name in tqdm(mlib):
        mlmol = mlib[name]
        rdmol_w_h = rdmol_dict[name]

        # Canonical RDKit mol plus the atom/bond output orders of its canonical SMILES
        can_rdmol_w_h, can_rdatom_w_h, can_rdbond = can_mol_order(rdmol_w_h)

        # Reorders the molli object in place; the returned mapping is keyed by the molli
        # object and gives back the canonical RDKit object
        reorder_rdmol = reorder_molecule(
            mlmol,
            can_rdmol_w_h,
            can_rdatom_w_h,
            can_rdbond,
        )[mlmol]

        # Carries the selected properties of the original molecule over to the new one
        for prop in ex_props:
            original_value = rdmol_w_h.GetProp(prop)
            reorder_rdmol.SetProp(prop, original_value)

        # NOTE(review): can_mol_order already sets "_Canonical_SMILES_w_H"; this
        # overwrites it with the SMILES of the input mol - confirm this is intended.
        smiles_of_input = Chem.MolToSmiles(rdmol_w_h, canonical=True)
        reorder_rdmol.SetProp("_Canonical_SMILES_w_H", smiles_of_input)

        # Confirms the element order of the RDKit mol equals that of the molli molecule
        rd_reorder_el = np.array(
            [ml.Element[atom.GetSymbol()] for atom in reorder_rdmol.GetAtoms()]
        )
        equal_check = np.array_equal(rd_reorder_el, mlmol.elements)
        assert equal_check, f"Array of rdkit atoms: {rd_reorder_el} is not equal to array of molli atoms: {mlmol.elements}"

        # Store the reordered molecule and remember its RDKit counterpart
        reorder_mlib[name] = mlmol
        can_rdmols.append(reorder_rdmol)


# Persist the canonical RDKit molecules for later use
with open("3_Canonical_Diol_w_h.pkl", "wb") as f:
    pickle.dump(can_rdmols, f)

100%|██████████| 987/987 [00:05<00:00, 191.78it/s]
