In [1]:
import molli as ml
from pprint import pprint
import pandas as pd
import numpy as np
from tqdm import tqdm

def sort_ids(s: str):
    '''Returns the integer sort key of a reaction ID such as 'react_12'.

    The key is the integer after the LAST underscore, so prefixes that
    themselves contain underscores (e.g. 'alt_react_12') are accepted.

    Raises ValueError if `s` has no underscore or its suffix is not an integer.
    '''
    # rsplit with maxsplit=1 always yields at most two parts, so the unpacking
    # only fails when there is no underscore at all.
    _, suffix = s.rsplit('_', 1)
    return int(suffix)

def _alkene_descriptors(m, vol_type: str) -> dict:
    '''Flattens the descriptors stored on one library entry into a single dict.

    Parameters
    ----------
    m
        Library entry exposing `attrib`, `get_atom` and `get_atom_index`
        (presumably a molli molecule - TODO confirm).
    vol_type
        Key of the volume entry read from each quadrant atom's 'Sterimol'
        attribute, alongside 'b1', 'b5' and 'l'.

    Returns
    -------
    dict
        Column name -> value, in the column order of the output table.
        'pert1'..'pert3' and the two pi orbital energies are NaN when the
        entry holds no matching data.
    '''
    desc = dict()

    # Isolates quadrant atoms (exactly four are expected under 'Q Order')
    q1a, q2a, q3a, q4a = [m.get_atom(i) for i in m.attrib['Q Order']]
    q_atoms = [('Q1', q1a), ('Q2', q2a), ('Q3', q3a), ('Q4', q4a)]

    # Isolates alkene carbons and their atom indices
    c0, c1 = [m.get_atom(i) for i in m.attrib['C Order']]
    c0_idx, c1_idx = [m.get_atom_index(a) for a in [c0, c1]]

    def on_alkene(indices) -> bool:
        # An entry belongs to the alkene only if it involves both carbons
        return (c0_idx in indices) and (c1_idx in indices)

    sterimol_props = ['b1', 'b5', 'l', vol_type]
    esp_props = ['NWESPmin', '99ESPMax', "Natural Charge"]

    # Fragment-based descriptors, one set of columns per quadrant atom
    for q_num, qatom in q_atoms:
        for k in sterimol_props:
            desc[f"{q_num}_{str.upper(k)}"] = qatom.attrib['Sterimol'][k]
        for k in esp_props:
            desc[f"{q_num}_{str.upper(k)}"] = qatom.attrib[k]

    # Descriptors associated with the full alkene
    c0_nat_charge = c0.attrib['Natural Charge']
    c1_nat_charge = c1.attrib['Natural Charge']

    # Isolates the homo and lumo energies out of the tuple
    homo, lumo = m.attrib['HOMO_LUMO']

    # Top 3 perturbation energies associated with the alkene, largest first.
    # Slots stay NaN when fewer than three entries involve both carbons.
    pert_vals = [value for (indices, value) in m.attrib["Perturbation Energies"] if on_alkene(indices)]
    found = np.sort(np.array(pert_vals, dtype=float))[::-1][:3]
    top3_pert = np.full(3, np.nan)
    top3_pert[:found.size] = found

    # Pi / pi* orbital energies. Initialised for every entry so that a value
    # from a previously processed molecule can never leak into this one.
    pi_orb_energy = np.nan
    anti_pi_orb_energy = np.nan
    for (bd_type, indices, bd_order, energy) in m.attrib["NBO Orbital Energies"]:
        if bd_order != '2' or not on_alkene(indices):
            continue
        if bd_type == 'BD':
            pi_orb_energy = energy
        elif bd_type == 'BD*':
            anti_pi_orb_energy = energy

    desc.update({
        'HOMO': homo,
        'LUMO': lumo,
        'C0_nat_charge': c0_nat_charge,
        'C1_nat_charge': c1_nat_charge,
        'pert1': float(top3_pert[0]),
        'pert2': float(top3_pert[1]),
        'pert3': float(top3_pert[2]),
        'pi_orb_energy': pi_orb_energy,
        'pi*_orb_energy': anti_pi_orb_energy,
    })

    # RDF values calculated for c0 and c1; the stored keys are shifted to 1-based column names
    rdf = m.attrib["RDF Series"]
    desc.update({f'AlkC1_RDF_{int(k)+1}': v for k, v in rdf['C0 RDF'].items()})
    desc.update({f'AlkC2_RDF_{int(k)+1}': v for k, v in rdf['C1 RDF'].items()})

    desc.update({'Alkene Type': m.attrib["_Alkene_Type"]})
    return desc


def construct_desc(DB_df:pd.DataFrame, mlib: ml.MoleculeLibrary, vol_type: str = 'vol', file_name: str='test'):
    '''Builds the alkene descriptor table for a library and writes it to CSV.

    Every entry of `mlib` whose name is in `DB_df.index` contributes one row.
    Rows are ordered by the integer suffix of their name (see `sort_ids`),
    printed, and written to `file_name` with the names as the index column.

    Parameters
    ----------
    DB_df
        Table whose index lists the entry names to keep.
    mlib
        Library to read; opened with `mlib.reading()` for the whole pass.
    vol_type
        'Sterimol' key used as the volume descriptor of each quadrant atom.
    file_name
        Path of the CSV file to write.

    Returns
    -------
    None
        The result is only printed and written to disk.
    '''
    print(f'Working on:\n{mlib}\nWriting To:\n{file_name}')

    # Collect single-row frames and concatenate once: concatenating inside the
    # loop re-copies the whole table on every iteration.
    rows = list()
    with mlib.reading():
        for name in tqdm(mlib):
            if name not in DB_df.index:
                continue
            rows.append(pd.DataFrame(_alkene_descriptors(mlib[name], vol_type), index=[name]))

    # pd.concat rejects an empty list, so fall back to an empty table
    full_df = pd.concat(rows, axis=0) if rows else pd.DataFrame()

    # A plain list of keys (rather than np.vectorize) also works when no entry matched
    react_argsort = np.argsort([sort_ids(i) for i in full_df.index], kind='stable')
    sort_react_df = full_df.iloc[react_argsort]
    print(sort_react_df)
    sort_react_df.to_csv(file_name,index=True)

In [2]:
# Reaction table; construct_desc only keeps library entries whose name is in this index.
DB_df = pd.read_csv("SAD_Database.csv", index_col=0)

# NOTE(review): the variable name and output CSV say "MaxVol", but the library opened
# here is '..._3BFSVol.mlib' (and the bfs_vol_mlib cell opens '..._MaxVol.mlib').
# Confirm this pairing is intended and the two library paths are not swapped.
max_vol_mlib = ml.MoleculeLibrary('6_8_2_RDF_Realign_3BFSVol.mlib')
# Uses the 'vol' Sterimol entry as the quadrant volume descriptor.
construct_desc(DB_df, max_vol_mlib, 'vol', '7_1_Full_Alkene_Desc_MaxVol.csv')

Working on:
MoleculeLibrary(backend=UkvCollectionBackend('6_8_2_RDF_Realign_3BFSVol.mlib'), n_items=0)
Writing To:
7_1_Full_Alkene_Desc_MaxVol.csv


100%|██████████| 789/789 [00:03<00:00, 224.34it/s]


              Q1_B1     Q1_B5      Q1_L      Q1_VOL  Q1_NWESPMIN  Q1_99ESPMAX  \
react_0    2.234000  4.119573  4.640026   44.228142    72.699257   561.253601   
react_1    1.700205  3.809141  7.035902   74.742134   102.298416   531.245239   
react_2    2.279648  4.045498  6.590853   68.432106   193.320679   512.047058   
react_3    1.700618  3.750941  6.774720   51.694172   229.643890   500.717560   
react_4    1.715506  6.349541  7.581655  108.862221   187.786942   480.918671   
...             ...       ...       ...         ...          ...          ...   
react_784  2.086305  3.866715  9.611885   94.293869   129.558670   480.171661   
react_785  2.067270  4.075103  9.866652  104.012665   192.906601   482.613800   
react_786  2.017458  6.086262  7.514786   94.293869   212.495544   471.484009   
react_787  1.715609  4.076038  7.524441   79.264069   230.204086   491.079895   
react_788  1.712434  3.859910  7.510997   71.533554   221.053696   492.679504   

           Q1_NATURAL CHARG

In [3]:
# NOTE(review): the variable name and output CSV say "3BFSVol", but the library opened
# here is '..._MaxVol.mlib' (and the max_vol_mlib cell opens '..._3BFSVol.mlib').
# Confirm this pairing is intended and the two library paths are not swapped.
bfs_vol_mlib = ml.MoleculeLibrary('6_8_2_RDF_Realign_MaxVol.mlib')
# Uses the 'bfs2' Sterimol entry as the quadrant volume descriptor.
construct_desc(DB_df, bfs_vol_mlib, 'bfs2', '7_1_Full_Alkene_Desc_3BFSVol.csv')

Working on:
MoleculeLibrary(backend=UkvCollectionBackend('6_8_2_RDF_Realign_MaxVol.mlib'), n_items=0)
Writing To:
7_1_Full_Alkene_Desc_3BFSVol.csv


100%|██████████| 789/789 [00:03<00:00, 236.53it/s]


              Q1_B1     Q1_B5      Q1_L    Q1_BFS2  Q1_NWESPMIN  Q1_99ESPMAX  \
react_0    2.234000  4.119573  4.640026  34.549614    72.699257   561.253601   
react_1    1.700205  3.809141  7.035902  53.748077   102.298416   531.245239   
react_2    2.279648  4.045498  6.590853  34.549614   193.320679   512.047058   
react_3    1.700618  3.750941  6.774720  44.029278   229.643890   500.717560   
react_4    1.715506  6.349541  7.581655  49.898579   187.786942   480.918671   
...             ...       ...       ...        ...          ...          ...   
react_784  2.086305  3.866715  9.611885  49.898579   129.558670   480.171661   
react_785  2.067270  4.075103  9.866652  49.898579   192.906601   482.613800   
react_786  2.017458  6.086262  7.514786  49.898579   212.495544   471.484009   
react_787  1.715609  4.076038  7.524441  49.898579   230.204086   491.079895   
react_788  1.713101  4.075352  7.749741  49.898579   170.722931   498.263794   

           Q1_NATURAL CHARGE  Q2_B1  Q2