In [1]:
import molli as ml
from collections import deque
from molli.external.rdkit import atom_filter as af
from rdkit import Chem
import numpy as np
from tqdm import tqdm
ml.visual.configure()

def yield_bfsd(m: ml.Molecule, start:ml.Atom, first:ml.Atom, done: list):
    '''
    Breadth-first walk over the bond graph of ``m`` (adapted from the
    Molli 1.0 molli.chem.bond implementation).

    Parameters
    ----------
    m : ml.Molecule
        Molecule whose connectivity is traversed via ``m.connected_atoms``
    start : ml.Atom
        Origin atom; it is marked as visited and never yielded
    first : ml.Atom
        If not None, the walk is seeded with this atom at distance 1
        (and it is the first item yielded). If None, the walk is seeded
        with ``start`` at distance 0.
    done : list
        Atoms that are treated as already visited and therefore skipped

    Yields
    ------
    tuple
        ``(atom, distance)`` pairs in breadth-first order
    '''
    seen = {start, *done}
    pending = deque()

    if first is not None:
        # Force the walk to leave `start` through `first` only
        seen.add(first)
        pending.append((first, 1))
        yield (first, 1)
    else:
        pending.append((start, 0))

    while pending:
        # FIFO order: oldest queued atom is expanded first
        current, depth = pending.popleft()
        next_depth = depth + 1
        for neighbor in m.connected_atoms(current):
            if neighbor in seen:
                continue
            yield (neighbor, next_depth)
            seen.add(neighbor)
            pending.append((neighbor, next_depth))

def bfsd_sub(
        m: ml.Molecule,
        visited_atoms:list,
        ap: ml.Atom,
        a1:ml.Atom,
        limit: int | None
)-> ml.Substructure:
    
    '''Builds a substructure of ``m`` from a breadth-first search (BFS) that
    starts at the attachment point. With ``limit=None`` every reachable atom is
    kept; otherwise only atoms whose BFS distance is ``<= limit`` are kept.
    The attachment point itself is always appended to the result.

    Parameters
    ----------
    m : ml.Molecule
        Molecule to Traverse
    visited_atoms : list
        Atoms that should be avoided within the BFS
    ap : ml.Atom
        Atom attachment point; used as the BFS origin and appended to the
        atom list of the substructure
    a1 : ml.Atom
        First atom of the walk (passed as ``first`` to ``yield_bfsd``). If
        None, the walk expands from ``ap`` directly.
    limit : int | None
        Maximum BFS distance to keep. ``None`` disables the limit. ``0`` is a
        real limit (only the attachment point survives when ``a1`` is given,
        since ``a1`` sits at distance 1).

    Returns
    -------
    ml.Substructure
    '''
    reachable = yield_bfsd(m, ap, a1, visited_atoms)

    # Compare against None explicitly: a plain truthiness test would make
    # limit=0 behave like "no limit".
    if limit is None:
        final_bfsd = [a for a, _ in reachable]
    else:
        final_bfsd = [a for a, dist in reachable if dist <= limit]

    final_bfsd.append(ap)

    return ml.Substructure(m,final_bfsd)

def join_tma(
        frag:ml.Molecule, 
        frag_ap:ml.Atom, 
        tma: ml.Molecule, 
        tma_ap:ml.Atom) -> ml.Molecule:
    '''Joins a fragment with the tetramethyl ammonium (TMA) molecule through
    ``ml.Molecule.join`` (called with ``optimize_rotation=True``) and gives the
    result the name of the fragment.

    Parameters
    ----------
    frag : ml.Molecule
        Fragment to be joined
    frag_ap : ml.Atom
        Attachment point atom on the fragment
    tma : ml.Molecule
        TMA molecule
    tma_ap : ml.Atom
        Attachment point atom on the TMA molecule

    Returns
    -------
    ml.Molecule
        Joined molecule, named after ``frag``
    '''
    final_frag = ml.Molecule.join(
        frag,
        tma,
        frag_ap,
        tma_ap,
        optimize_rotation=True
    )
    final_frag.name = frag.name
    return final_frag

def frag_esp_alkene(
    m:ml.Molecule,
    ap: ml.Atom,
    ap_num: str,
    c0: ml.Atom,
    c1: ml.Atom,
    qatom: ml.Atom,
    qnum: int,
    tma:ml.Molecule,
    tma_ap:ml.Atom,
    alk_case:str='Branch'
) -> ml.Molecule:
    '''This is meant to be an overarching function for fragmenting alkenes with a tetramethyl ammonium (TMA) ion. There are 3 classes of alkenes: endo, exo, and branched that need to 
    be treated slightly differently depending on their use. This function uses BFSD to create a substructure, and then creates a full molecule. From there, it does further 
    processing to determine the atoms that will be joined with the TMA and which one will get an additional hydrogen.

    Parameters
    ----------
    m : ml.Molecule
        Molecule to be Fragmented
    ap : ml.Atom
        Atom that will become the attachment point (will be c0 or c1)
    ap_num : str
        The label associated with the atom (C0 or C1); looked up as a key in each atom's ``attrib``
    c0 : ml.Atom
        This is the alkene carbon associated with the largest volume
    c1 : ml.Atom
        This is the alkene carbon NOT associated with the largest volume
    qatom : ml.Atom
        This is the atom that will be used to create the fragment
    qnum : int
        This is the quadrant number of the qatom (1, 2, 3, 4)
    tma : ml.Molecule
        This is the tetramethyl ammonium fragment with attachment point
    tma_ap : ml.Atom
        This is the attachment point of the TMA group
    alk_case : str, optional
        This is the alkene type mainly for creation of substructures, by default 'Branch'.
        Only 'Endo' changes the behavior (both alkene carbons are excluded from the search);
        any other value excludes only the attachment point.

    Returns
    -------
    ml.Molecule
        Fragmented molecule with added implicit hydrogens and tetramethyl ammonium ion appended

    Raises
    ------
    ValueError
        If no atom of the fragment carries ``ap_num`` in its ``attrib``
    '''

    # Endocyclic case: both alkene carbons are blocked so the search cannot
    # pass through the other alkene carbon. Otherwise only the attachment
    # point is blocked.
    blocked = [c0, c1] if alk_case == 'Endo' else [ap]
    q_sub = bfsd_sub(
        m=m,
        visited_atoms=blocked,
        ap=ap,
        a1=qatom,
        limit=None
    )

    qm = ml.Molecule(q_sub, name=f'{m.name}_Q{qnum}')

    # Locate the attachment point inside the new molecule (last match wins)
    qm_ap = None
    for a in qm.atoms:
        if ap_num in a.attrib:
            qm_ap = a

    if qm_ap is None:
        raise ValueError(
            f'No atom labeled {ap_num!r} found in fragment {qm.name!r}'
        )

    # Iterate over a snapshot: add_implicit_hydrogens modifies qm inside the loop
    # NOTE(review): assumes newly added hydrogens carry no 'Q' attribute — confirm
    for a in list(qm.atoms):
        if 'Q' in a.attrib and qnum != a.attrib['Q']:
            #Removes excess bond for exocyclic to attachment point 
            if (bond_del := qm.lookup_bond(a, qm_ap)):
                qm.del_bond(bond_del)
            #Adds hydrogen to missing connection
            qm.add_implicit_hydrogens(a)

    #This joins the TMA and fragment created
    final_qm = join_tma(
        frag=qm,
        frag_ap=qm_ap,
        tma=tma,
        tma_ap=tma_ap
    )

    final_qm.charge = 1

    return final_qm

def proc_rings(
        rings: tuple,
        c0_idx: int,
        c1_idx: int,
        q1a_idx: int,
        q2a_idx: int,
        q3a_idx: int,
        q4a_idx: int
):
    '''Sorts the quadrant atom indices by the kind of ring they share with the
    alkene carbons.

    Parameters
    ----------
    rings : tuple
        Iterable of rings, each ring being a collection of atom indices
    c0_idx : int
        Index of alkene carbon C0
    c1_idx : int
        Index of alkene carbon C1
    q1a_idx, q2a_idx, q3a_idx, q4a_idx : int
        Indices of the quadrant atoms Q1-Q4

    Returns
    -------
    tuple[list, list]
        ``(endo_rings, exo_rings)``. ``endo_rings`` collects every quadrant
        index found in a ring containing both C0 and C1. ``exo_rings``
        collects Q1/Q4 found in a ring containing only C0, and Q2/Q3 found in
        a ring containing only C1. An index is repeated if it appears in
        several qualifying rings.
    '''
    endo_rings = []
    exo_rings = []

    for members in rings:
        has_c0 = c0_idx in members
        has_c1 = c1_idx in members

        if has_c0 and has_c1:
            # Ring spans the double bond: any quadrant atom may be part of it
            candidates, bucket = (q1a_idx, q2a_idx, q3a_idx, q4a_idx), endo_rings
        elif has_c0:
            # Ring on the C0 side only: check the C0 quadrants
            candidates, bucket = (q1a_idx, q4a_idx), exo_rings
        elif has_c1:
            # Ring on the C1 side only: check the C1 quadrants
            candidates, bucket = (q2a_idx, q3a_idx), exo_rings
        else:
            continue

        bucket.extend(q_idx for q_idx in candidates if q_idx in members)

    return endo_rings, exo_rings

In [3]:
mlib = ml.MoleculeLibrary('5_3_DB_OPT_AlignMaxVol.mlib')
frag_mlib = ml.MoleculeLibrary('6_1_DB_All_ESPFrags.mlib', readonly=False, overwrite=True)
tma = ml.Molecule.load_mol2('6_1_TMA.mol2')
tma_ap, = tma.get_atoms(*tma.yield_atoms_by_label("AP1"))
with mlib.reading(), frag_mlib.writing():
    for k in tqdm(mlib):

        mlmol = mlib[k]

        #Pulls out necessary information on alkenes
        c0,c1 = [mlmol.get_atom(x) for x in mlmol.attrib['C Order']]
        c0_idx, c1_idx = [mlmol.get_atom_index(x) for x in [c0,c1]]

        q1a,q2a,q3a,q4a = [mlmol.get_atom(x) for x in mlmol.attrib['Q Order']]
        q1a_idx,q2a_idx,q3a_idx,q4a_idx = [mlmol.get_atom_index(x) for x in [q1a,q2a,q3a,q4a]]

        alk_bool = np.array([True if v == '1' else False for v in mlmol.attrib['_Alkene_w_H']])

        rdmol = Chem.MolFromSmiles(mlmol.attrib['_Canonical_SMILES_H'],sanitize=False)
        Chem.SanitizeMol(rdmol)

        #Identifies if an alkene is in a ring
        ring_alk_bool = alk_bool & af(rdmol).in_ring()

        ring_num = np.count_nonzero(ring_alk_bool)
        match ring_num:
            #This should be endocyclic or special exocyclic alkenes (cis, tri, tetra, and cyclohexylidene alkenes )
            case 2:
                rings = rdmol.GetRingInfo().AtomRings()
                endo_ring, exo_ring = proc_rings(
                rings=rings,
                c0_idx=c0_idx,
                c1_idx=c1_idx, 
                q1a_idx=q1a_idx,
                q2a_idx=q2a_idx,
                q3a_idx=q3a_idx,
                q4a_idx=q4a_idx)

                if endo_ring:
                    #If Q1 is in an endo ring, it must be processed as endo
                    if (q1a_idx in endo_ring) or (q1a_idx in exo_ring):
                        q1_sub = frag_esp_alkene(
                            m=mlmol,
                            ap=c0, #Q1 is attached to C0
                            ap_num='C0', #Q1 is attached to C0
                            c0=c0,
                            c1=c1,
                            qatom=q1a, #Q1 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=1, #Q1 substructure
                            alk_case='Endo' #Endo Processing
                        )
                    else:
                        q1_sub = frag_esp_alkene(
                            m=mlmol,
                            ap=c0, #Q1 is attached to C0
                            ap_num='C0', #Q1 is attached to C0
                            c0=c0,
                            c1=c1,
                            qatom=q1a, #Q1 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=1, #Q1 substructure
                            alk_case='Branch' #Branch Processing
                        )
                    if (q2a_idx in endo_ring) or (q2a_idx in exo_ring):
                        q2_sub = frag_esp_alkene(
                            m=mlmol,
                            ap=c1, #Q2 is attached to C1
                            ap_num='C1', #Q2 is attached to C1
                            c0=c0,
                            c1=c1,
                            qatom=q2a, #Q2 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=2, #Q2 substructure
                            alk_case='Endo' #Endo Processing
                        )
                    else:
                        q2_sub = frag_esp_alkene(
                            m=mlmol,
                            ap=c1, #Q2 is attached to C1
                            ap_num='C1', #Q2 is attached to C1
                            c0=c0,
                            c1=c1,
                            qatom=q2a, #Q2 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=2, #Q2 substructure
                            alk_case='Branch' #Branch Processing
                        )
                    if (q3a_idx in endo_ring) or (q3a_idx in exo_ring):
                        q3_sub = frag_esp_alkene(
                            m=mlmol, 
                            ap=c1, #Q3 is attached to C1
                            ap_num='C1', #Q3 is attached to C1
                            c0=c0,
                            c1=c1,
                            qatom=q3a, #Q3 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=3, #Q3 substructure
                            alk_case='Endo' #Endo Processing
                        )
                    else:
                        q3_sub = frag_esp_alkene(
                            m=mlmol, 
                            ap=c1, #Q3 is attached to C1
                            ap_num='C1', #Q3 is attached to C1
                            c0=c0,
                            c1=c1,
                            qatom=q3a, #Q3 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=3, #Q3 substructure
                            alk_case='Branch' #Branch Processing
                        )
                    if (q4a_idx in endo_ring) or (q4a_idx in exo_ring):
                        q4_sub = frag_esp_alkene(
                            m=mlmol,
                            ap=c0, #Q4 is attached to C0
                            ap_num='C0', #Q4 is attached to C0
                            c0=c0,
                            c1=c1,
                            qatom=q4a, #Q4 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=4, #Q4 substructure
                            alk_case='Endo' #Endo Processing
                        )
                    else:
                        q4_sub = frag_esp_alkene(
                            m=mlmol,
                            ap=c0, #Q4 is attached to C0
                            ap_num='C0', #Q4 is attached to C0
                            c0=c0,
                            c1=c1,
                            qatom=q4a, #Q4 substructure
                            tma=tma,
                            tma_ap=tma_ap,
                            qnum=4, #Q4 substructure
                            alk_case='Branch' #Branch Processing
                        )
                elif exo_ring:
                    #Alkene carbons in different ring (cyclohexylidene/only exo)
                    q1_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c0, #Q1 is attached to C0
                        ap_num='C0', #Q1 is attached to C0
                        c0=c0,
                        c1=c1,
                        qatom=q1a, #Q1 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=1, #Q1 substructure
                        alk_case='Exo' #Exo Processing
                    )
                    q2_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c1, #Q2 is attached to C1
                        ap_num='C1', #Q2 is attached to C1
                        c0=c0,
                        c1=c1,
                        qatom=q2a, #Q2 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=2, #Q2 substructure
                        alk_case='Exo' #Endo Processing
                    )
                    q3_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c1, #Q3 is attached to C1
                        ap_num='C1', #Q3 is attached to C1
                        c0=c0,
                        c1=c1,
                        qatom=q3a, #Q3 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=3,
                        alk_case='Exo' #Exo Processing
                    )
                    q4_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c0, #Q4 is attached to C0
                        ap_num='C0', #Q4 is attached to C0
                        c0=c0,
                        c1=c1,
                        qatom=q4a, #Q4 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=4, #Q4 substructure
                        alk_case='Exo' #Exo Processing
                    )
                else:
                    raise ValueError('Not behaving Correctly for cyclohexylidene type complex')
                
            #This should be exocyclic alkenes (gem, tri, tetra)
            case 1:
                #One side should be a branch, the other side should be cyclic fragment
                iso_ring_atom = mlmol.get_atom(int(np.where(ring_alk_bool)[0][0]))
                #If C0 is the one with the ring, Q1/Q4 are exocyclic connections
                if c0 == iso_ring_atom:
                    q1_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c0, #Q1 is attached to C0
                        ap_num='C0', #Q1 is attached to C0
                        c0=c0,
                        c1=c1,
                        qatom=q1a, #Q1 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=1, #Q1 substructure
                        alk_case='Exo' #Exo Processing
                    )

                    q4_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c0, #Q4 is attached to C0
                        ap_num='C0', #Q4 is attached to C0
                        c0=c0,
                        c1=c1,
                        qatom=q4a, #Q4 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=4, #Q4 substructure
                        alk_case='Exo' #Exo Processing
                    )
                    #Q2/Q3 are just branched alkenes
                    q2_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c1, #Q2 is attached to C1
                        ap_num='C1', #Q2 is attached to C1
                        c0=c0,
                        c1=c1,
                        qatom=q2a, #Q2 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=2, #Q2 substructure
                        alk_case='Branch' #Branch Processing
                    )
                    q3_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c1, #Q3 is attached to C1
                        ap_num='C1', #Q3 is attached to C1
                        c0=c0,
                        c1=c1,
                        qatom=q3a, #Q3 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=3,
                        alk_case='Branch' #Branch Processing
                    )
                #If C1 is the one with the ring, Q2/Q3 are exocyclic connections
                if c1 == iso_ring_atom:
                    q2_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c1, #Q2 is attached to C1
                        ap_num='C1', #Q2 is attached to C1
                        c0=c0,
                        c1=c1,
                        qatom=q2a, #Q2 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=2, #Q2 substructure
                        alk_case='Exo' #Exo Processing
                    )
                    q3_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c1, #Q3 is attached to C1
                        ap_num='C1', #Q3 is attached to C1
                        c0=c0,
                        c1=c1,
                        qatom=q3a, #Q3 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=3,
                        alk_case='Exo' #Exo Processing
                    )
                    #Q1/Q4 are just branched alkenes
                    q1_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c0, #Q1 is attached to C0
                        ap_num='C0', #Q1 is attached to C0
                        c0=c0,
                        c1=c1,
                        qatom=q1a, #Q1 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=1, #Q1 substructure
                        alk_case='Branch' #Branch Processing
                    )
                    q4_sub = frag_esp_alkene(
                        m=mlmol,
                        ap=c0, #Q4 is attached to C0
                        ap_num='C0', #Q4 is attached to C0
                        c0=c0,
                        c1=c1,
                        qatom=q4a, #Q4 substructure
                        tma=tma,
                        tma_ap=tma_ap,
                        qnum=4, #Q4 substructure
                        alk_case='Branch' #Branch Processing
                    )
                    
            #This should be any branched alkene
            #Q1/Q4 should be connected to C0, Q2/Q3 should be connected to C1
            case 0:
                q1_sub = frag_esp_alkene(
                    m=mlmol,
                    ap=c0, #Q1 is attached to C0
                    ap_num='C0', #Q1 is attached to C0
                    c0=c0,
                    c1=c1,
                    qatom=q1a, #Q1 substructure
                    tma=tma,
                    tma_ap=tma_ap,
                    qnum=1, #Q1 substructure
                    alk_case='Branch' #Branch Processing
                )

                q2_sub = frag_esp_alkene(
                    m=mlmol,
                    ap=c1, #Q2 is attached to C1
                    ap_num='C1', #Q2 is attached to C1
                    c0=c0,
                    c1=c1,
                    qatom=q2a, #Q2 substructure
                    tma=tma,
                    tma_ap=tma_ap,
                    qnum=2, #Q2 substructure
                    alk_case='Branch' #Branch Processing
                )

                q3_sub = frag_esp_alkene(
                    m=mlmol,
                    ap=c1, #Q3 is attached to C1
                    ap_num='C1', #Q3 is attached to C1
                    c0=c0,
                    c1=c1,
                    qatom=q3a, #Q3 substructure
                    tma=tma,
                    tma_ap=tma_ap,
                    qnum=3,
                    alk_case='Branch' #Branch Processing
                )

                q4_sub = frag_esp_alkene(
                    m=mlmol,
                    ap=c0, #Q4 is attached to C0
                    ap_num='C0', #Q4 is attached to C0
                    c0=c0,
                    c1=c1,
                    qatom=q4a, #Q4 substructure
                    tma=tma,
                    tma_ap=tma_ap,
                    qnum=4, #Q4 substructure
                    alk_case='Branch' #Branch Processing
                )

        frag_mlib[q1_sub.name] = q1_sub
        frag_mlib[q2_sub.name] = q2_sub
        frag_mlib[q3_sub.name] = q3_sub
        frag_mlib[q4_sub.name] = q4_sub

100%|██████████| 784/784 [00:14<00:00, 55.62it/s]


In [4]:
# Inspect the last molecule loaded by the fragmentation loop (variable leaks from the loop)
mlmol

Molecule(name='react_218', formula='C12 H12 O1')

In [5]:
# Inspect the Q1 fragment produced for the last molecule processed
q1_sub

Molecule(name='react_218_Q1', formula='C14 H20 N1 O1')

In [6]:
# Inspect the Q2 fragment produced for the last molecule processed
q2_sub

Molecule(name='react_218_Q2', formula='C4 H12 N1')

In [7]:
# Inspect the Q3 fragment produced for the last molecule processed
q3_sub

Molecule(name='react_218_Q3', formula='C4 H12 N1')

In [None]:
# Inspect the Q4 fragment produced for the last molecule processed
# TODO(review): this cell has no execution count and its saved output names a
# different molecule than the neighboring cells — re-run after the loop to refresh
q4_sub

Molecule(name='react_216_Q4', formula='C4 H12 N1')