In [1]:
import numpy as np
import pandas as pd

def create_final_desc_df(desc_df: pd.DataFrame, db_df:pd.DataFrame, vol_type: str) -> None:
    """Join per-reactant descriptors with an averaged ddG value and write two CSV files.

    A ddG value is computed for every row of ``db_df`` from its temperature and
    ``er`` columns, averaged per ``'Reactant ID'``, and joined onto the
    descriptor table (minus its ``'Alkene Type'`` column). Two tables are written
    to the current working directory:

    * ``7_3_Ignore_Diff_Full_{vol_type}_Avg.csv`` - every reactant present in
      both the descriptor table and the database.
    * ``7_3_With_Diff_Full_{vol_type}_Avg.csv`` - only reactants that have at
      least one database row with ``'Solvent 1 ID' == 'sol1_0'``,
      ``'Solvent 2 ID' == 'sol2_0'`` and ``'Oxidant ID' == 'ox_0'``.

    Neither input frame is modified.

    Parameters
    ----------
    desc_df : pd.DataFrame
        Descriptor table whose index labels are matched against the
        ``'Reactant ID'`` values of ``db_df``.
    db_df : pd.DataFrame
        Reaction database. Must contain the columns ``'Temperature ©'``,
        ``'er'``, ``'Reactant ID'``, ``'Product ID'``, ``'Solvent 1 ID'``,
        ``'Solvent 2 ID'``, ``'Oxidant ID'``, ``'Catalyst ID'`` and
        ``'Olefin Type'``.
    vol_type : str
        Label interpolated into the two output file names.

    Raises
    ------
    AssertionError
        If the temperature, ``er`` or averaged ddG values contain NaN, or if
        the filtered database does not hold exactly one solvent 1 / solvent 2 /
        oxidant ID.
    KeyError
        If a required column is missing, or if a reactant that survives the
        filter has no row in ``desc_df``.
    """
    desc_df_no_type = desc_df.loc[:, desc_df.columns != 'Alkene Type']

    # Temperatures recorded as "Not Reported" are treated as 0.0. The cleaned values
    # live in a local Series so the caller's db_df is left untouched (the same
    # database frame is passed to this function more than once).
    temps_c = db_df['Temperature ©'].replace("Not Reported", 0.0)
    assert (temp_nan := np.count_nonzero(temps_c.isna())) == 0, f'There are {temp_nan} nan values in the temperature series.'

    # Validate er before it is fed to the logarithm, so a missing value is reported
    # as an er problem rather than surfacing later as a NaN ddG.
    assert np.count_nonzero(db_df['er'].isna().values) == 0, f'er has nan values: {db_df[db_df["er"].isna()]}'

    # ddG = T * R * ln(er), expressed in kcal/mol.
    gas_constant = 8.314    # R in J/(mol*K)
    j_to_kcal = 0.000239    # J -> kcal, matching the 'kcal/mol' column label
    temps_k = temps_c.values.astype(float) + 273.15
    er = db_df['er'].values
    # Multiplication order is kept as in the original expression so results are bit-identical.
    ddG_vals = temps_k * gas_constant * np.log(er) * j_to_kcal

    # Average every reported ddG value of a reactant into a single number.
    ddG_df = pd.DataFrame(data=ddG_vals, columns=['ddG er (kcal/mol)'], index=db_df.index)
    react_id_ddG_df = pd.concat([db_df['Reactant ID'], ddG_df], axis=1)
    averaged_ddG_by_react = react_id_ddG_df.groupby('Reactant ID').mean().reset_index()
    average_ddG_react_ser = pd.Series(
        data=averaged_ddG_by_react['ddG er (kcal/mol)'].values,
        index=averaged_ddG_by_react['Reactant ID'].values,
        name='ddG er (kcal/mol)',
    )
    assert np.count_nonzero(average_ddG_react_ser.isna().values) == 0, f'ddG er has nan values: {average_ddG_react_ser.isna()}'

    # Table 1: every reactant that has both descriptors and an averaged ddG.
    ignore_diff_final = pd.concat([desc_df_no_type, average_ddG_react_ser], join='inner', axis=1)

    # Table 2: restrict to a single solvent pair and oxidant
    # (per the original author's note: tBuOH/Water and the same oxidant).
    isolate_db_df = db_df[['Reactant ID','Product ID','Solvent 1 ID','Solvent 2 ID', 'Oxidant ID', 'Catalyst ID', 'er', 'Olefin Type']]
    isolate_db_df = isolate_db_df[isolate_db_df['Solvent 1 ID'] == 'sol1_0']
    isolate_db_df = isolate_db_df[isolate_db_df['Solvent 2 ID'] == 'sol2_0']
    isolate_db_df = isolate_db_df[isolate_db_df['Oxidant ID'] == 'ox_0']

    # Sanity checks on the filter; these also fail when the filter leaves no rows.
    assert isolate_db_df['Solvent 1 ID'].nunique() == 1, f'There are more than one solvent1 values: {isolate_db_df["Solvent 1 ID"].unique()}'
    assert isolate_db_df['Solvent 2 ID'].nunique() == 1, f'There are more than one solvent2 values: {isolate_db_df["Solvent 2 ID"].unique()}'
    assert isolate_db_df['Oxidant ID'].nunique() == 1, f'There are more than one oxidant ID values: {isolate_db_df["Oxidant ID"].unique()}'

    isolate_desc_df = desc_df_no_type.loc[isolate_db_df['Reactant ID'].unique()]

    # Original author's note: only 759 unique reactants remain here, versus the 789 available before filtering.
    # The outer join followed by dropna() keeps only reactants that have descriptors.
    # NOTE(review): the ddG averages joined here were computed over ALL rows of db_df,
    # not only the filtered rows - confirm that is the intended average for this table.
    diff_final = pd.concat([isolate_desc_df, average_ddG_react_ser], join='outer', axis=1).dropna()

    ignore_diff_final.to_csv(f'7_3_Ignore_Diff_Full_{vol_type}_Avg.csv')
    diff_final.to_csv(f'7_3_With_Diff_Full_{vol_type}_Avg.csv')


In [2]:
# Load the reaction database once; the BFSVol run in the following cell reuses it.
db = pd.read_csv('SAD_Database.csv')

# MaxVol descriptor table; the first CSV column becomes the index.
max_vol_df = pd.read_csv('7_1_Full_Alkene_Desc_MaxVol.csv', index_col=0)
create_final_desc_df(desc_df=max_vol_df, db_df=db, vol_type='MaxVol')

In [3]:
# BFSVol descriptor table; the first CSV column becomes the index.
bfs_vol_df = pd.read_csv('7_1_Full_Alkene_Desc_3BFSVol.csv', index_col=0)
create_final_desc_df(desc_df=bfs_vol_df, db_df=db, vol_type='BFSVol')