## This script assigns quadrants and aligns the structures so that the quadrant ordering follows substituent volume

This script adds the following properties:

#### To 4 Atoms:
- 'Q' = int, This is the quadrant the atom index is associated with (i.e. 1,2,3,4)

#### To 1 Atom:
- 'C0' = int, This is the alkene atom associated with the highest volume substituent

#### To 1 Atom:
- 'C1' = int, This is the other alkene atom NOT associated with the highest volume substituent


In [7]:
import molli as ml
import numpy as np
from molli.math import rotation_matrix_from_vectors
from scipy.spatial.transform import Rotation
from tqdm import tqdm

def id_cis_trans(
        m: ml.Molecule,
        q1a: ml.Atom,
        c0: ml.Atom,
        a1a2_tup: tuple
)-> tuple:
    '''Orders the two substituents on the far alkene carbon relative to Q1.

    The pair is returned as (cis, trans): the atom cis to q1a first and the
    atom trans to q1a second, i.e. (Q2 atom, Q3 atom).

    Parameters
    ----------
    m : ml.Molecule
        Molecule of Interest
    q1a : ml.Atom
        Atom Defined by Q1
    c0 : ml.Atom
        Alkene Carbon connected to Q1
    a1a2_tup : tuple
        Tuple of the two atoms on the alkene carbon connected to Q2/Q3

    Returns
    -------
    tuple
        The two atoms of a1a2_tup, kept in their given order when the
        q1a -> c0 direction has a positive projection on the
        first -> second direction, and swapped otherwise (including a zero
        projection).
    '''
    # Direction pointing from the Q1 atom toward its alkene carbon
    q1_to_c0 = m.get_atom_coord(c0) - m.get_atom_coord(q1a)

    first, second = a1a2_tup
    # Direction pointing from the first candidate atom toward the second one
    first_to_second = m.get_atom_coord(second) - m.get_atom_coord(first)

    projection_sign = np.sign(np.dot(q1_to_c0, first_to_second))
    if projection_sign != 1:
        # First candidate is trans to Q1 (i.e. second == Q2, first == Q3)
        return second, first

    # First candidate is cis to Q1 (i.e. first == Q2, second == Q3)
    return first, second

def set_origin(m:ml.Molecule,i: int):
    '''Shifts the molecule so that the atom at coordinate index i lands on the origin.

    Parameters
    ----------
    m : ml.Molecule
        Molecule to move; it is modified through m.translate
    i : int
        Row of m.coords whose negated value is used as the translation
    '''
    shift = -1*m.coords[i]
    m.translate(shift)

def rot_xy(m: ml.Molecule, q1a:ml.Atom, c0:ml.Atom, c1:ml.Atom):
    r'''Rotates the molecule so the normal of the Q1 / alkene plane maps onto +z.

    Q1
      \
        Q1C = Q2C

    The normal is cross(q1a -> c0, c0 -> c1); it should be positive with
    respect to the right hand rule, so it is mapped onto [0, 0, 1].

    The docstring is a raw string because the diagram contains a backslash,
    which would otherwise be read as an (invalid) escape sequence.

    Parameters
    ----------
    m : ml.Molecule
        Molecule to rotate; it is modified through m.transform
    q1a : ml.Atom
        Q1 Atom
    c0 : ml.Atom
        Alkene carbon bonded to Q1
    c1 : ml.Atom
        The other alkene carbon
    '''
    q1_to_c0 = m.get_atom_coord(c0) - m.get_atom_coord(q1a)
    c0_to_c1 = m.get_atom_coord(c1) - m.get_atom_coord(c0)
    plane_normal = np.cross(q1_to_c0, c0_to_c1)
    # NOTE(review): rotation_matrix_from_vectors is presumed to return the
    # rotation taking its first argument onto its second - confirm in molli.math
    t_matrix = rotation_matrix_from_vectors(plane_normal, np.array([0,0,1]))
    m.transform(t_matrix)

def rot_rz(m: ml.Molecule, c0:ml.Atom, c1: ml.Atom):
    '''Rotates the molecule using the c0 -> c1 vector and the x axis [1, 0, 0].

    Parameters
    ----------
    m : ml.Molecule
        Molecule to rotate; it is modified through m.transform
    c0 : ml.Atom
        Alkene carbon the vector starts from
    c1 : ml.Atom
        Alkene carbon the vector points to
    '''
    x_axis = np.array([1,0,0])
    alkene_vec = m.get_atom_coord(c1) - m.get_atom_coord(c0)
    # NOTE(review): presumably yields the rotation that carries the alkene
    # vector onto the x axis - confirm against molli.math
    m.transform(rotation_matrix_from_vectors(alkene_vec, x_axis))

def check_q_align(m: ml.Molecule,q1a:ml.Atom,q2a:ml.Atom,q3a:ml.Atom):
    '''Asserts that Q1 -> Q2 -> Q3 turns in the negative sense about the z axis.

    The cross product of the Q1->Q2 vector with the Q2->Q3 vector must have a
    negative z component. This confirms that the quadrants are ordered
    correctly after alignment.

    Parameters
    ----------
    m : ml.Molecule
    q1a : ml.Atom
        Q1 Atom
    q2a : ml.Atom
        Q2 Atom
    q3a : ml.Atom
        Q3 Atom

    Raises
    ------
    AssertionError
        If the z component of the cross product is zero or positive
    '''
    q1_xyz, q2_xyz, q3_xyz = (m.get_atom_coord(a) for a in (q1a, q2a, q3a))

    turn_normal = np.cross(q2_xyz - q1_xyz, q3_xyz - q2_xyz)
    z_sign = np.sign(np.dot([0,0,1], turn_normal))

    assert z_sign == -1, f'Sign is not negative! {np.sign(turn_normal)}'

def check_alk_types(m:ml.Molecule, q1a:ml.Atom,q2a:ml.Atom,q3a:ml.Atom,q4a:ml.Atom):
    '''
    This function serves to check that alkene types are being assigned correctly.
    '''
    # {h_count}\n{m.name}\n{m.attrib["_Canonical_SMILES_H"]}
    h_count = [x.element for x in [q1a,q2a,q3a,q4a]].count(ml.Element.H)
    d_count = [x.isotope for x in [q1a,q2a,q3a,q4a]].count(2)
    try:
        match m.attrib['_Alkene_Type']:
            case 'Mono':
                assert (h_count == 3), f'Mono has {h_count}'
                assert q1a.element != ml.Element.H, f'Mono has Q1 {q1a}'
            case 'Gem':
                assert ((h_count == 2) & (d_count ==  0)) | ((h_count == 3) & (d_count ==  1)), f'Gem has H Count {h_count} and D Count {d_count}'
                assert (q1a.element != ml.Element.H), f'Gem has Q1 {q1a}, Q4 {q4a}'
            case 'Cis':
                assert ((h_count == 2) & (d_count ==  0)) | ((h_count == 3) & (d_count ==  1)), f'Cis has {h_count}'
                assert (q1a.element != ml.Element.H), f'Cis has Q1 {q1a}, Q2 {q2a}'
            case 'Trans':
                assert ((h_count == 2) & (d_count ==  0)) | ((h_count == 3) & (d_count ==  1)), f'Trans has H Count {h_count} and D Count {d_count}'
                assert (q1a.element != ml.Element.H), f'Trans has Q1 {q1a}'
            case 'Tri_Q2':
                assert ((h_count == 1) & (d_count ==  0)) | ((h_count == 2) & (d_count ==  1)) | ((h_count == 3) & (d_count ==  2)), f'Tri_Q2 has H Count {h_count} and D Count {d_count}'
                assert (q1a.element != ml.Element.H), f'Tri_Q2 has Q1 {q1a}'
            case 'Tri_Q3':
                assert ((h_count == 1) & (d_count ==  0)) | ((h_count == 2) & (d_count ==  1)) | ((h_count == 3) & (d_count ==  2)), f'Tri_Q3 has H Count {h_count} and D Count {d_count}'
                assert (q1a.element != ml.Element.H) , f'Tri_Q3 has Q1 {q1a}'
            case 'Tri_Q4':
                assert ((h_count == 1) & (d_count ==  0)) | ((h_count == 2) & (d_count ==  1)) | ((h_count == 3) & (d_count ==  2)), f'Tri_Q4 has H Count {h_count} and D Count {d_count}'
                assert (q1a.element != ml.Element.H), f'Tri_Q4 has Q1 {q1a}'
            case 'Tetra':
                assert ((h_count== 0 & d_count == 0) | (h_count == 1) & (d_count ==  1)) | ((h_count == 2) & (d_count ==  2)) | ((h_count == 3) & (d_count ==  3)), f'Tetra has H Count {h_count} and D Count {d_count}'
    except Exception as e:
        print(e)
        print(f'\n{m.name}\n{m.attrib["_Canonical_SMILES"]})')


  '''


In [8]:
#The number of atoms starts at 0, so "2" refers to 3-atom breadth-first search
bfs_limit = 2
vol_types = ['MaxVol', '3BFSVol']
mlib = ml.MoleculeLibrary('5_2_DB_OPT_Sterimol.mlib')

#Sterimol descriptor key holding the volume that each alignment scheme ranks by
vol_keys = {'MaxVol': 'vol', '3BFSVol': f'bfs{bfs_limit}_vol'}

for vol_type in vol_types:
    print(f'Realigning based on {vol_type}')
    vol_key = vol_keys[vol_type]
    mlib_align = ml.MoleculeLibrary(f'5_3_DB_OPT_Align{vol_type}.mlib', readonly=False, overwrite=True)

    with mlib.reading(), mlib_align.writing():
        for k in tqdm(mlib):
            m = mlib[k]

            #Retrieves the arbitrarily numbered alkene carbons and the substituent atoms connected to each
            arb_c1 = m.get_atom(m.attrib['Arb C0'])
            arb_c1_con = tuple([m.get_atom(x) for x in m.attrib['Arb C0 Con']])
            arb_c2 = m.get_atom(m.attrib['Arb C1'])
            arb_c2_con = tuple([m.get_atom(x) for x in m.attrib['Arb C1 Con']])

            arb_c_dict = {arb_c1:arb_c1_con, arb_c2:arb_c2_con}

            desc = m.attrib['All Arb Sterimol']

            #Identifies True Q1 Atom: the first substituent with the largest volume
            #(max returns the first maximal entry, matching a first-match scan)
            q1_idx = max(desc, key=lambda idx: desc[idx][vol_key])
            q1a = m.get_atom(int(q1_idx))

            #Identifies True C0 Atom
            true_c0 = m.get_atom(q1a.attrib['Arb C Con'])

            #Candidates for True C1 Atom (the other alkene carbon) and
            #True Q4 Atom (the other substituent on True C0)
            other_carbons = [a for a in arb_c_dict if a != true_c0]
            other_subs = [a for a in arb_c_dict[true_c0] if a != q1a]

            #Fail loudly instead of silently reusing atoms left over from the previous molecule
            if not other_carbons or not other_subs:
                raise ValueError(f'Could not identify C1/Q4 atoms for {m.name}')

            true_c1 = other_carbons[-1]
            q4a = other_subs[-1]

            q2a, q3a = id_cis_trans(
                m=m,
                q1a=q1a,
                c0=true_c0,
                a1a2_tup=arb_c_dict[true_c1]
            )

            q_atoms = (q1a,q2a,q3a,q4a)

            true_c0.attrib['C0'] = m.get_atom_index(true_c0)
            true_c1.attrib['C1'] = m.get_atom_index(true_c1)

            q1a.attrib['Q'] = 1
            q2a.attrib['Q'] = 2
            q3a.attrib['Q'] = 3
            q4a.attrib['Q'] = 4

            #Trisubstituted alkenes are relabelled by the quadrant that holds the hydrogen
            if m.attrib['_Alkene_Type'] == 'Tri':
                h_count = [x.element for x in q_atoms].count(ml.Element.H)

                assert h_count == 1, f'Tri H count = {h_count}\n{m.name}\n{m.attrib["_Canonical_SMILES_H"]}'

                if q2a.element == ml.Element.H:
                    m.attrib['_Alkene_Type'] = 'Tri_Q2'
                elif q3a.element == ml.Element.H:
                    m.attrib['_Alkene_Type'] = 'Tri_Q3'
                elif q4a.element == ml.Element.H:
                    m.attrib['_Alkene_Type'] = 'Tri_Q4'

            m.attrib['C Order'] = tuple(m.get_atom_index(x) for x in [true_c0, true_c1])
            m.attrib['Q Order'] = tuple(m.get_atom_index(x) for x in q_atoms)
            m.attrib['True Sterimol'] = {f'Q{i+1}':a.attrib['Sterimol'] for i,a in enumerate(q_atoms)}

            #Sets Alkene Carbon C0 to be the origin
            set_origin(m,m.get_atom_index(true_c0))
            #Rotates molecule such that Q1 and alkene atoms are in the XY plane
            rot_xy(m,q1a,true_c0,true_c1)
            #Rotates molecule such that alkene atoms are along the X axis
            rot_rz(m,true_c0,true_c1)
            #This asserts that the vectors formed after alignment are correct
            check_q_align(m,q1a,q2a,q3a)
            #This asserts some key aspects of the alkenes to ensure they are behaving correctly
            check_alk_types(m,q1a,q2a,q3a,q4a)
            mlib_align[k] = m
        print(mlib_align)


Realigning based on MaxVol


100%|██████████| 784/784 [00:02<00:00, 347.42it/s]


MoleculeLibrary(backend=UkvCollectionBackend('5_3_DB_OPT_AlignMaxVol.mlib'), n_items=784)
Realigning based on 3BFSVol


100%|██████████| 784/784 [00:02<00:00, 351.31it/s]

MoleculeLibrary(backend=UkvCollectionBackend('5_3_DB_OPT_Align3BFSVol.mlib'), n_items=784)





## Note about Alignment Scheme

For the alignment scheme here, the Q1 is placed in the top left quadrant of the alkene:

```
Q1           Q2
  \          /
    Q1C = Q2C
  /          \
Q4            Q3
```

The main body of work uses the same ordering of the quadrants; the picture is simply flipped vertically to match the mnemonic used during discussions:

```
Q4           Q3
  \          /
    Q1C = Q2C
  /          \
Q1            Q2
```

This does not change any of the conclusions or the actual descriptor calculation/alignment. It is just viewed at a different angle for the purposes of discussion.