In [6]:
import molli as ml
from collections import deque
import pickle
import pandas as pd
from tqdm import tqdm
import networkx as nx

def yield_bfsd(m: ml.Molecule, start:ml.Atom, first:ml.Atom, done: list):
    '''
    Breadth-first traversal of a molecule that reports each atom with its depth.
    Adapted from the Molli 1.0 molli.chem.bond implementation.

    Parameters
    ----------
    m : ml.Molecule
        Object providing `connected_atoms(atom)`, used to find neighbours
    start : ml.Atom
        Origin of the search. It is never yielded itself
    first : ml.Atom
        If not None, the search is seeded with this atom at depth 1 (and it is
        yielded first). If None, the search expands from `start` at depth 0
    done : list
        Atoms that are treated as already visited and therefore never yielded
        or expanded (unless one of them is `first`)

    Yields
    ------
    tuple
        (atom, depth) pairs in breadth-first order
    '''
    seen = {start, *done}
    frontier = deque()

    if first is not None:
        seen.add(first)
        frontier.append((first, 1))
        yield (first, 1)
    else:
        frontier.append((start, 0))

    # FIFO queue: atoms are expanded in the order they were discovered
    while frontier:
        current, depth = frontier.popleft()
        next_depth = depth + 1
        for neighbor in m.connected_atoms(current):
            if neighbor in seen:
                continue
            yield (neighbor, next_depth)
            seen.add(neighbor)
            frontier.append((neighbor, next_depth))

def bfsd_sub(m: ml.Molecule,
             visited_atoms: list,
             ap: ml.Atom,
             a1: ml.Atom,
             limit: int | None) -> ml.Substructure:
    
    '''Create a substructure from a breadth-first search (BFS) that starts at an
    attachment point. The search is either exhaustive or capped at a maximum depth.

    Parameters
    ----------
    m : ml.Molecule
        Molecule to traverse
    visited_atoms : list
        Atoms that should be avoided within the BFS
    ap : ml.Atom
        Attachment point. It is the origin of the search and is always appended
        as the last atom of the substructure
    a1 : ml.Atom
        First atom of the search (depth 1). If None, the search expands from `ap`
    limit : int | None
        Maximum BFS depth to keep (inclusive). None keeps every atom reached.
        A limit of 0 keeps only `ap`, since every yielded atom has depth >= 1

    Returns
    -------
    ml.Substructure
        Substructure of `m` holding the selected atoms followed by `ap`
    '''
    # Compare against None explicitly: a truthiness test would treat limit=0
    # as "no limit" and silently return the full search.
    selected = [
        atom
        for atom, dist in yield_bfsd(m, ap, a1, visited_atoms)
        if limit is None or dist <= limit
    ]

    selected.append(ap)

    return ml.Substructure(m, selected)

def frag_quadrants(m:ml.Molecule,
    ap: ml.Atom,
    c0: ml.Atom,
    c1: ml.Atom,
    qatom: ml.Atom,
    qnum: int,
    alk_case:str='Branch'
):
    '''Build the fragment for one quadrant of an alkene.

    A BFS substructure is grown from `qatom` (see `bfsd_sub`) and copied into a
    standalone molecule named `{m.name}_Q{qnum}`. The copy is then cleaned up:
    bonds between atoms of a *different* quadrant and the copied alkene carbon are
    removed, and every atom carrying an 'Arb C' attribute is deleted.

    NOTE(review): no tetramethyl ammonium (TMA) ion or hydrogens are added in this
    function; if that step is required it has to happen downstream.

    Parameters
    ----------
    m : ml.Molecule
        Molecule to be fragmented
    ap : ml.Atom
        Atom that will become the attachment point (will be c0 or c1)
    c0 : ml.Atom
        This is the alkene carbon associated with the largest volume
    c1 : ml.Atom
        This is the alkene carbon NOT associated with the largest volume
    qatom : ml.Atom
        Atom the fragment is grown from. Must carry a 'Q' attribute if any atom
        of the fragment does
    qnum : int
        This is the quadrant number of the qatom (1, 2, 3, 4)
    alk_case : str, optional
        Alkene type, by default 'Branch'. Only 'Endo' is treated specially (both
        alkene carbons are blocked in the BFS); any other value blocks only `ap`

    Returns
    -------
    tuple
        `(q_sub, qm)`: the `ml.Substructure` of `m` and the cleaned-up
        `ml.Molecule` built from it
    '''

    # Endocyclic quadrants must not walk around the ring through either alkene carbon
    blocked_atoms = [c0, c1] if alk_case == 'Endo' else [ap]
    q_sub = bfsd_sub(
        m=m,
        visited_atoms=blocked_atoms,
        ap=ap,
        a1=qatom,
        limit=None
    )

    qm = ml.Molecule(q_sub, name=f'{m.name}_Q{qnum}')

    # Locate the copies of the alkene carbons in the new molecule. They are
    # matched by comparing attribute dictionaries; the last match wins.
    qm_c0 = None
    qm_c1 = None

    for a in qm.atoms:
        if c0.attrib == a.attrib:
            qm_c0 = a
        elif c1.attrib == a.attrib:
            qm_c1 = a

    # Atoms are collected here and deleted after the loop: deleting from
    # qm.atoms while iterating over it would skip the atom that follows each
    # deleted one.
    atoms_to_delete = []

    for a in qm.atoms:
        if 'Q' in a.attrib:
            #Removes excess bond for exocyclic to attachment point 
            if qatom.attrib['Q'] != a.attrib['Q']:
                if qm_c0: 
                    if (bond_del0 := qm.lookup_bond(a, qm_c0)):
                        qm.del_bond(bond_del0)
                elif qm_c1:
                    if (bond_del1 := qm.lookup_bond(a, qm_c1)):
                        qm.del_bond(bond_del1)
        if 'Arb C' in a.attrib:
            atoms_to_delete.append(a)

    for a in atoms_to_delete:
        qm.del_atom(a)

    return q_sub, qm

def frag_q1q4(diol_alk_dict:dict, diols: list, ref_mlib: ml.MoleculeLibrary, q1q4_connect_mlib: ml.MoleculeLibrary, q1q4_mlib: ml.MoleculeLibrary):

    with ref_mlib.reading(), q1q4_connect_mlib.writing(), q1q4_mlib.writing():

        # for diol in diols:
        for diol in tqdm(diols):
            diol_name = diol.GetProp("_Name")
            alk_name = diol_alk_dict[diol_name]
            if (f'{alk_name}_Q1' in q1q4_mlib) | (f'{alk_name}_Q4' in q1q4_mlib) | (f'{alk_name}_Q1Q4' in q1q4_connect_mlib):
                continue
            #Retrieves necessary information
            alk_mlmol = ref_mlib[alk_name]
            alk_c0,alk_c1 = [alk_mlmol.get_atom(x) for x in alk_mlmol.attrib['C Order']]
            alk_c0_idx, alk_c1_idx = [alk_mlmol.get_atom_index(x) for x in [alk_c0,alk_c1]]

            alk_q_atoms = [alk_mlmol.get_atom(x) for x in alk_mlmol.attrib['Q Order']]
            alk_q1a,alk_q2a,alk_q3a,alk_q4a = alk_q_atoms
            alk_q1a_idx,alk_q2a_idx,alk_q3a_idx,alk_q4a_idx = [alk_mlmol.get_atom_index(x) for x in alk_q_atoms]

            nx_mol = alk_mlmol.to_nxgraph()

            ring_num = 0
            rings = nx.cycle_basis(nx_mol)
            endo_ring = list()
            exo_ring = list()
            
            #Iterates through subgraphs to assign endo/exo ring behavior
            for ra_subgraph in rings:
                if (alk_c0 in ra_subgraph) and (alk_c1 in ra_subgraph):
                    if alk_q1a in ra_subgraph:
                        endo_ring.append(alk_q1a)
                    if alk_q4a in ra_subgraph:
                        endo_ring.append(alk_q4a)
                    ring_num = 2

                elif (alk_c0 in ra_subgraph) and (alk_c1 in ra_subgraph):
                    if alk_q1a in ra_subgraph:
                        exo_ring.append(alk_q1a)
                    if alk_q4a in ra_subgraph:
                        exo_ring.append(alk_q4a)
                    ring_num = 1
            
            #Matches number for alkene rings
            match ring_num:
                #Endocyclic alkene
                case 2:
                    if len(endo_ring) > 0:
                        if (alk_q1a in endo_ring) or (alk_q1a in exo_ring):
                            q1_sub, q1_m = frag_quadrants(
                                m=alk_mlmol,
                                ap=alk_c0,
                                c0=alk_c0,
                                c1=alk_c1,
                                qatom=alk_q1a,
                                qnum=1,
                                alk_case='Endo'
                            )
                        else:
                            q1_sub, q1_m = frag_quadrants(
                            m=alk_mlmol,
                            ap=alk_c0,
                            c0=alk_c0,
                            c1=alk_c1,
                            qatom=alk_q1a,
                            qnum=1,
                            alk_case='Branch'
                            )
                        if (alk_q4a in endo_ring) or (alk_q4a in exo_ring):
                            q4_sub, q4_m = frag_quadrants(
                                m=alk_mlmol,
                                ap=alk_c0,
                                c0=alk_c0,
                                c1=alk_c1,
                                qatom=alk_q4a,
                                qnum=4,
                                alk_case='Endo'
                            )
                        else:
                            q4_sub, q4_m = frag_quadrants(
                            m=alk_mlmol,
                            ap=alk_c0,
                            c0=alk_c0,
                            c1=alk_c1,
                            qatom=alk_q4a,
                            qnum=4,
                            alk_case='Branch'
                            )
                    elif len(exo_ring) > 0:
                        q1_sub, q1_m = frag_quadrants(
                            m=alk_mlmol,
                            ap=alk_c0,
                            c0=alk_c0,
                            c1=alk_c1,
                            qatom=alk_q1a,
                            qnum=1,
                            alk_case='Exo'
                        )
                        q4_sub, q4_m = frag_quadrants(
                        m=alk_mlmol,
                        ap=alk_c0,
                        c0=alk_c0,
                        c1=alk_c1,
                        qatom=alk_q4a,
                        qnum=4,
                        alk_case='Exo'
                        )
                    else:
                        raise ValueError('Not behaving Correctly for cyclohexylidene type complex')
                #Exocyclic alkene
                case 1:
                    if alk_c0 in exo_ring:
                        q1_sub, q1_m = frag_quadrants(
                            m=alk_mlmol,
                            ap=alk_c0,
                            c0=alk_c0,
                            c1=alk_c1,
                            qatom=alk_q1a,
                            qnum=1,
                            alk_case='Exo'
                        )
                        q4_sub, q4_m = frag_quadrants(
                        m=alk_mlmol,
                        ap=alk_c0,
                        c0=alk_c0,
                        c1=alk_c1,
                        qatom=alk_q4a,
                        qnum=4,
                        alk_case='Exo'
                        )
                    elif alk_c1 in exo_ring:
                        q1_sub, q1_m = frag_quadrants(
                            m=alk_mlmol,
                            ap=alk_c0,
                            c0=alk_c0,
                            c1=alk_c1,
                            qatom=alk_q1a,
                            qnum=1,
                            alk_case='Branch'
                        )
                        q4_sub, q4_m = frag_quadrants(
                        m=alk_mlmol,
                        ap=alk_c0,
                        c0=alk_c0,
                        c1=alk_c1,
                        qatom=alk_q4a,
                        qnum=4,
                        alk_case='Branch'
                        )
                
                #Branched Alkene
                case 0:
                    q1_sub, q1_m = frag_quadrants(
                        m=alk_mlmol,
                        ap=alk_c0,
                        c0=alk_c0,
                        c1=alk_c1,
                        qatom=alk_q1a,
                        qnum=1,
                        alk_case='Branch'
                    )
                    q4_sub, q4_m = frag_quadrants(
                    m=alk_mlmol,
                    ap=alk_c0,
                    c0=alk_c0,
                    c1=alk_c1,
                    qatom=alk_q4a,
                    qnum=4,
                    alk_case='Branch'
                    )
            
            q1_sub: ml.Substructure
            q1_m: ml.Molecule
            q4_sub: ml.Substructure
            q4_m: ml.Molecule

            q1q4_mlib[q1_m.name] = q1_m
            q1q4_mlib[q4_m.name] = q4_m

            q1q4_full_sub_atoms = [alk_c0]


            q1q4_full_sub_atoms.extend(alk_mlmol.get_atoms(*q1_sub.parent_atom_indices))

            q1q4_full_sub_atoms.extend(alk_mlmol.get_atoms(*q4_sub.parent_atom_indices))

            q1q4_full_sub_atoms = list(set(q1q4_full_sub_atoms))
            if alk_c1 in q1q4_full_sub_atoms:
                
                q1q4_full_sub_atoms.remove(alk_c1)

            q1q4_sub = ml.Substructure(alk_mlmol,q1q4_full_sub_atoms)
            q1q4_m = ml.Molecule(q1q4_sub, name=f'{alk_mlmol.name}_Q1Q4')

            q1q4_connect_mlib[q1q4_m.name] = q1q4_m
        print(q1q4_connect_mlib)
        print(q1q4_mlib)

In [7]:
# Database table linking every product (diol) ID to its reactant (alkene) ID
DB_df = pd.read_csv("SAD_Database.csv")

diol_alk_dict = {
    product_id: reactant_id
    for product_id, reactant_id in DB_df[["Product ID", "Reactant ID"]].values
}

# Diols saved by an earlier step of the workflow. Pickle executes code on load,
# so only open files that were produced by this project.
with open("4_Diol_w_H_Filter.pkl", "rb") as f:
    diols = pickle.load(f)

In [8]:
# Reference alkenes aligned by the 3-BFS volume
BFSVol_mlib = ml.MoleculeLibrary("6_7_Realign_3BFSVol.mlib")

# Output libraries are recreated from scratch on every run (overwrite=True)
q1q4_connect_BFSVol_mlib = ml.MoleculeLibrary(
    "5_1_q1q4_connect_3BFSVol.mlib",
    readonly=False,
    overwrite=True,
)
q1q4_BFSVol_mlib = ml.MoleculeLibrary(
    "5_1_q1q4_frags_3BFSVol.mlib",
    readonly=False,
    overwrite=True,
)

frag_q1q4(
    diol_alk_dict=diol_alk_dict,
    diols=diols,
    ref_mlib=BFSVol_mlib,
    q1q4_connect_mlib=q1q4_connect_BFSVol_mlib,
    q1q4_mlib=q1q4_BFSVol_mlib,
)

100%|██████████| 987/987 [00:11<00:00, 89.48it/s] 


MoleculeLibrary(backend=UkvCollectionBackend('5_1_q1q4_connect_3BFSVol.mlib'), n_items=784)
MoleculeLibrary(backend=UkvCollectionBackend('5_1_q1q4_frags_3BFSVol.mlib'), n_items=1568)


In [9]:
# Reference alkenes aligned by the maximum volume
maxvol_mlib = ml.MoleculeLibrary("6_7_Realign_MaxVol.mlib")

# Output libraries are recreated from scratch on every run (overwrite=True)
q1q4_connect_maxvol_mlib = ml.MoleculeLibrary(
    "5_1_q1q4_connect_MaxVol.mlib",
    readonly=False,
    overwrite=True,
)
q1q4_maxvol_mlib = ml.MoleculeLibrary(
    "5_1_q1q4_frags_MaxVol.mlib",
    readonly=False,
    overwrite=True,
)

frag_q1q4(
    diol_alk_dict=diol_alk_dict,
    diols=diols,
    ref_mlib=maxvol_mlib,
    q1q4_connect_mlib=q1q4_connect_maxvol_mlib,
    q1q4_mlib=q1q4_maxvol_mlib,
)

100%|██████████| 987/987 [00:10<00:00, 93.04it/s] 


MoleculeLibrary(backend=UkvCollectionBackend('5_1_q1q4_connect_MaxVol.mlib'), n_items=784)
MoleculeLibrary(backend=UkvCollectionBackend('5_1_q1q4_frags_MaxVol.mlib'), n_items=1568)
