In [10]:
import molli as ml
import pandas as pd
from pprint import pprint
import numpy as np
from tqdm import tqdm

def assign_diol_desc(
        diol_name: str, 
        alk_name: str, 
        index:int,
        new_entry:pd.DataFrame,
        start_df: pd.DataFrame,
        alk_mlib: ml.MoleculeLibrary,
        diol_mlib: ml.MoleculeLibrary,
        full_react_desc: pd.DataFrame,
        diol_cat_df: pd.DataFrame,):
    """Build the descriptor row for one diol and optionally append it to a table.

    The row is ``new_entry`` joined column-wise with the descriptors of the
    parent alkene (taken from ``full_react_desc``, without the
    ``'ddG er (kcal/mol)'`` column) plus two extra columns: ``"Addition Face"``
    (the diol's ``attrib['Addition']``) and ``"Alkene Type"`` (the alkene's
    ``attrib['_Alkene_Type']``).

    Parameters
    ----------
    diol_name : key of the diol in ``diol_mlib`` / value of "Product ID".
    alk_name : key of the alkene in ``alk_mlib`` and row label in
        ``full_react_desc`` / value of "Reactant ID".
    index : row label given to the descriptor part of the row. It must equal
        the index label of ``new_entry`` so the column-wise concat aligns.
    new_entry : single-row frame for this diol.
    start_df : table accumulated so far. Ignored when ``index == 0``; may be
        ``None`` to request just the new row.
    alk_mlib, diol_mlib : libraries supporting ``lib[name]`` lookups whose
        items expose an ``attrib`` mapping.
    full_react_desc : descriptor table indexed by alkene name.
    diol_cat_df : frame with "Product ID", "Catalyst Designation" and
        "Reactant ID" columns used for the consistency checks.

    Returns
    -------
    pd.DataFrame
        The new row alone when ``index == 0`` or ``start_df is None``;
        otherwise ``start_df`` with the new row appended.

    Raises
    ------
    AssertionError
        If a product maps to several catalyst designations or reactants, or if
        two products of the same alkene share the same addition face.
    """
    # Find the diol's addition face and the alkene's type.
    ml_diol = diol_mlib[diol_name]
    diol_addn_face = ml_diol.attrib['Addition']
    ml_alk = alk_mlib[alk_name]
    alk_type = ml_alk.attrib['_Alkene_Type']

    # A product listed more than once must always carry the same catalyst
    # designation and come from the same reactant.
    cat_des_test = diol_cat_df[diol_cat_df["Product ID"] == diol_name]
    if cat_des_test.shape != (1,3):
        assert cat_des_test['Catalyst Designation'].nunique() == 1, f'Same Products have different values!\n {cat_des_test}'
        assert cat_des_test['Reactant ID'].nunique() == 1, f'Different Reactants detected! {cat_des_test}'

    enantiomer_test = diol_cat_df[diol_cat_df["Reactant ID"] == alk_name]

    # When the database holds more than one product for this alkene, the other
    # diol must have been assigned a different addition face.
    if enantiomer_test["Product ID"].nunique() != 1:
        other_diol_name = enantiomer_test[enantiomer_test["Product ID"] != diol_name]["Product ID"].values[0]
        other_ml_diol = diol_mlib[other_diol_name]
        other_addn_face = other_ml_diol.attrib['Addition']
        assert other_addn_face != diol_addn_face, f'Both Diols have the same face! Alkene={alk_name}, Diol1={diol_name}, Diol2={other_diol_name}'
        assert cat_des_test['Reactant ID'].nunique() == 1, f'Different Reactants detected! {cat_des_test}'

    # One-row frame of alkene descriptors, relabelled so it aligns with new_entry.
    alk_desc = full_react_desc.loc[alk_name].to_frame().T
    alk_desc.index = [index]

    # drop() returns an independent frame, so adding columns below does not
    # write into a slice of alk_desc (avoids SettingWithCopyWarning).
    alk_desc_no_y = alk_desc.drop(columns='ddG er (kcal/mol)', errors='ignore')
    alk_desc_no_y["Addition Face"] = diol_addn_face
    alk_desc_no_y['Alkene Type'] = alk_type

    final_df = pd.concat([new_entry, alk_desc_no_y], axis=1)

    # First row, or caller only wants this row: nothing to append to.
    if index == 0 or start_df is None:
        return final_df
    return pd.concat([start_df, final_df], axis=0)

def create_desc(
    DB_df: pd.DataFrame,
    full_react_desc:pd.DataFrame,  
    alk_mlib: ml.MoleculeLibrary, 
    diol_mlib: ml.MoleculeLibrary, 
    vol_type: str):
    """Assemble the diol descriptor table and write it to CSV.

    For every row of ``DB_df`` whose "Product ID" is present in ``diol_mlib``,
    one descriptor row is built with ``assign_diol_desc``. The rows are
    stacked, de-duplicated, printed, and written to
    ``6_2_Diol_Selectivity_{vol_type}_Desc.csv`` (without the index).

    Parameters
    ----------
    DB_df : database frame with at least "Product ID", "Reactant ID" and
        "Catalyst Designation" columns. NOTE: its "Catalyst Designation"
        column is capitalised in place on the caller's frame.
    full_react_desc : alkene descriptor table indexed by reactant ID.
    alk_mlib, diol_mlib : alkene and diol molecule libraries.
    vol_type : label inserted into the output file name.

    Raises
    ------
    ValueError
        If no database product is found in ``diol_mlib``.
    AssertionError
        If the number of de-duplicated rows differs from ``len(diol_mlib)``,
        or if a consistency check inside ``assign_diol_desc`` fails.
    """
    # In-place normalisation kept on purpose: later code may rely on DB_df
    # carrying the capitalised designations after this call.
    DB_df['Catalyst Designation'] = DB_df['Catalyst Designation'].str.capitalize()

    diol_alk_dict = dict(DB_df[["Product ID", "Reactant ID"]].values)

    diol_cat_df = DB_df[["Product ID", "Catalyst Designation", "Reactant ID"]]

    # Rows are collected and concatenated once at the end rather than growing
    # a DataFrame inside the loop.
    row_frames = []

    with diol_mlib.reading(), alk_mlib.reading():
        for i in tqdm(range(len(diol_cat_df))):
            # Positional access, then relabel with the position so the row
            # aligns with the descriptor part built by assign_diol_desc even
            # when DB_df does not have a default RangeIndex.
            new_entry: pd.DataFrame = diol_cat_df.iloc[i].to_frame().T
            new_entry.index = [i]

            diol_name = new_entry["Product ID"].values[0]
            if diol_name not in diol_mlib:
                continue

            alk_name = diol_alk_dict[diol_name]

            # start_df=None asks for just this row: pd.concat silently drops
            # None entries, so nothing is appended on the helper's side. This
            # also removes the dependency on row 0 having been processed.
            row_frames.append(
                assign_diol_desc(
                    diol_name=diol_name,
                    alk_name=alk_name,
                    index=i,
                    new_entry=new_entry,
                    start_df=None,
                    alk_mlib=alk_mlib,
                    diol_mlib=diol_mlib,
                    full_react_desc=full_react_desc,
                    diol_cat_df=diol_cat_df
                )
            )

        if not row_frames:
            raise ValueError('None of the products in DB_df were found in the diol library!')

        final_df = pd.concat(row_frames, axis=0).drop_duplicates()
        assert final_df.shape[0] == len(diol_mlib), f'Length of the library does not match the number of descriptors there should be! Length of lib: {len(diol_mlib)}, final_df shape = {final_df.shape}'

        print(final_df)
        
        final_df.to_csv(f"6_2_Diol_Selectivity_{vol_type}_Desc.csv",index=False)

In [11]:
# Iteration count embedded in the diol library file names loaded below.
max_iter = 10_000

# Database table passed to create_desc as DB_df for every descriptor set.
DB_df = pd.read_csv("SAD_Database.csv")

In [12]:
# 3BFSVol descriptor set: alkene library, diol library and alkene descriptors.
# Plain string literals are used where there is nothing to interpolate.
alk_BFSVol_mlib = ml.MoleculeLibrary("6_7_Realign_3BFSVol.mlib")
diol_BFSVol_mlib = ml.MoleculeLibrary(f"6_1_Diol_3BFSVol_Assign_{max_iter}iter.mlib")
# index_col=0: the first CSV column becomes the row index used for lookups.
BFSVol_react_desc = pd.read_csv("7_3_Ignore_Diff_Full_3BFSVol_Avg.csv", index_col=0)

# Prints the table and writes 6_2_Diol_Selectivity_3BFSVol_Desc.csv.
create_desc(
    DB_df=DB_df,
    full_react_desc=BFSVol_react_desc,
    alk_mlib=alk_BFSVol_mlib, 
    diol_mlib=diol_BFSVol_mlib, 
    vol_type='3BFSVol')


100%|██████████| 1007/1007 [00:10<00:00, 92.69it/s]


     Product ID Catalyst Designation Reactant ID     Q1_B1     Q1_B5  \
0        prod_0                 Beta     react_0  2.234000  4.119573   
1        prod_1                Alpha     react_1  1.700205  3.809141   
2        prod_2                Alpha     react_2  2.279648  4.045498   
3      prod_997                Alpha     react_3  1.700618  3.750941   
4        prod_3                 Beta     react_3  1.700618  3.750941   
...         ...                  ...         ...       ...       ...   
1002   prod_989                 Beta   react_784  2.086305  3.866715   
1003   prod_990                 Beta   react_785  2.067270  4.075103   
1004   prod_991                 Beta   react_786  2.017458  6.086262   
1005   prod_992                 Beta   react_787  1.715609  4.076038   
1006   prod_993                 Beta   react_788  1.713101  4.075352   

          Q1_L    Q1_BFS2  Q1_NWESPMIN  Q1_99ESPMAX  Q1_NATURAL CHARGE  ...  \
0     4.640026  34.549614    72.699257   561.253601     

In [13]:
# MaxVol descriptor set: alkene library, diol library and alkene descriptors.
# Plain string literals are used where there is nothing to interpolate.
alk_MaxVol_mlib = ml.MoleculeLibrary("6_7_Realign_MaxVol.mlib")
diol_MaxVol_mlib = ml.MoleculeLibrary(f"6_1_Diol_MaxVol_Assign_{max_iter}iter.mlib")
# index_col=0: the first CSV column becomes the row index used for lookups.
MaxVol_react_desc = pd.read_csv("7_3_Ignore_Diff_Full_MaxVol_Avg.csv", index_col=0)

# Prints the table and writes 6_2_Diol_Selectivity_MaxVol_Desc.csv.
create_desc(
    DB_df=DB_df,
    full_react_desc=MaxVol_react_desc,
    alk_mlib=alk_MaxVol_mlib, 
    diol_mlib=diol_MaxVol_mlib, 
    vol_type='MaxVol')

100%|██████████| 1007/1007 [00:11<00:00, 86.23it/s]


     Product ID Catalyst Designation Reactant ID     Q1_B1     Q1_B5  \
0        prod_0                 Beta     react_0  2.234000  4.119573   
1        prod_1                Alpha     react_1  1.700205  3.809141   
2        prod_2                Alpha     react_2  2.279648  4.045498   
3      prod_997                Alpha     react_3  1.700618  3.750941   
4        prod_3                 Beta     react_3  1.700618  3.750941   
...         ...                  ...         ...       ...       ...   
1002   prod_989                 Beta   react_784  2.086305  3.866715   
1003   prod_990                 Beta   react_785  2.067270  4.075103   
1004   prod_991                 Beta   react_786  2.017458  6.086262   
1005   prod_992                 Beta   react_787  1.715609  4.076038   
1006   prod_993                 Beta   react_788  1.712434  3.859910   

          Q1_L      Q1_VOL  Q1_NWESPMIN  Q1_99ESPMAX  Q1_NATURAL CHARGE  ...  \
0     4.640026   44.228142    72.699257   561.253601   