In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem

In [5]:
# Read the numbered reaction table from '../data/reaction_data'
# (the path is resolved relative to the current working directory).
df_reactions = pd.read_csv(
    filepath_or_buffer='../data/reaction_data/numbered_reaction.csv',
)


In [None]:
# Build the reactant/conditions table: canonicalise the reactant SMILES,
# normalise the solvent / time / temperature notations, rename 'Catalyst' to
# 'Dioxirane', and drop the duplicate rows that arise when a reaction is
# listed once per product.

# Spelling / case / ordering variants of solvent names -> one label each.
# Some labels contain '\n', presumably so long names wrap when used as tick
# labels.
# NOTE(review): the three ' Acetone/\nTrifluoroacetone' labels start with a
# space, unlike every other label. This looks unintentional, but it changes
# how the label sorts, so confirm before removing it.
# NOTE(review): both the '.../TFP' and '.../TFA' spellings are mapped to
# Trifluoroacetone -- confirm that 'TFA' is meant this way in the source data.
solvent = {
           'Acetone '      : 'Acetone',
           'CH2Cl2/ACETONE': 'Acetone/CH2Cl2',
           'CH2Cl2/Acetone': 'Acetone/CH2Cl2',
           'CH2Cl2/TFP'    : 'CH2Cl2/\nTrifluoroacetone',
           'CH2Cl2/TFA'    : 'CH2Cl2/\nTrifluoroacetone',
           'Acetone/ch2cl2': 'Acetone/CH2Cl2',
           'TFP/Acetone'   : ' Acetone/\nTrifluoroacetone',
           'Acetone/TFP'   : ' Acetone/\nTrifluoroacetone',
           'Acetone/TFA'   : ' Acetone/\nTrifluoroacetone',
           }
# Entries recorded as 'does not say' are treated as missing values.
time = {'does not say': np.nan}
# 'rt' is replaced by the string '25'.
temp = {'rt': '25'}

condition_columns = ['Catalyst', 'Solvent', 'Time (min)', 'Temperature (ºC)', 'rxn_ID', 'Reactant_SMILES']

# .copy() makes this an independent frame, so the column assignments below
# write to it unambiguously instead of to a slice of df_reactions (which
# triggers SettingWithCopyWarning).
df_reactant_conditions = df_reactions[condition_columns].copy()

# Canonical SMILES make identical reactants compare equal in drop_duplicates().
df_reactant_conditions['Reactant_SMILES'] = [
    Chem.CanonSmiles(smiles) for smiles in df_reactant_conditions['Reactant_SMILES']
]
df_reactant_conditions['Solvent'] = df_reactant_conditions['Solvent'].replace(solvent)
df_reactant_conditions['Time (min)'] = df_reactant_conditions['Time (min)'].replace(time)
df_reactant_conditions['Temperature (ºC)'] = df_reactant_conditions['Temperature (ºC)'].replace(temp)

# Rename and de-duplicate without in-place mutation so re-running the cell
# always starts from df_reactions and yields the same result.
df_reactant_conditions = (
    df_reactant_conditions
    .rename(columns={'Catalyst': 'Dioxirane'})
    .drop_duplicates()
)

In [None]:
# Report how many rows of the conditions table use each dioxirane reagent.
is_tfdo = df_reactant_conditions['Dioxirane'] == 'TFDO'
is_dmdo = df_reactant_conditions['Dioxirane'] == 'DMDO'
print("TFDO:", int(is_tfdo.sum()))
print("DMDO:", int(is_dmdo.sum()))

In [None]:
# plot DMDO/TFDO distribution
# (seaborn / matplotlib are imported in the notebook's import cell, so every
# plotting cell works regardless of which plotting cell is run first.)

# Relabel every missing value in the table as 'not specified'. The shared
# frame is rebound on purpose: cells that plot from df_reactant_conditions
# afterwards show 'not specified' as its own category.
df_reactant_conditions = df_reactant_conditions.replace(np.nan, 'not specified')

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df_reactant_conditions,
    x='Dioxirane',
    hue='Dioxirane',   # one colour per reagent
    shrink=0.8,
    legend=False,      # the x axis already names each reagent
    ax=ax,             # draw on this figure explicitly, not on "current axes"
)
ax.set_xlabel('Dioxirane')
ax.set_title('Number of reactions per dioxirane reagent')
fig.savefig('dioxirane_distribution.png', dpi=300)

In [None]:
# plot solvent distribution: stacked count of reactions per solvent,
# split by dioxirane reagent
fig, ax = plt.subplots(figsize=(10, 6))

# Sort rows by solvent so the categories appear alphabetically on the x axis.
df_reactant_conditions = df_reactant_conditions.sort_values('Solvent')

sns.histplot(
    data=df_reactant_conditions,
    x='Solvent',
    hue='Dioxirane',
    multiple='stack',
    shrink=0.8,
    ax=ax,  # draw on this figure explicitly, not on "current axes"
)
# Rotate the tick labels through tick_params: re-setting the labels with
# set_xticklabels(get_xticklabels()) pins them to a fixed formatter.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Solvent')
ax.set_title('Number of reactions per solvent')
fig.tight_layout()
fig.savefig('solvent_distribution.png', dpi=300)

In [None]:
# plot reaction time distribution: stacked histogram of reaction time in
# minutes, split by dioxirane reagent
fig, ax = plt.subplots(figsize=(10, 6))

# Parse the raw 'Time (min)' entries as floats. Entries that are not numbers
# become NaN and are left out of the histogram. errors='coerce' only
# converts parse failures, unlike a bare `except:` which would also hide
# unrelated errors.
time_float = (
    pd.to_numeric(df_reactant_conditions['Time (min)'], errors='coerce')
    .astype(float)
    .tolist()
)

# The numeric values go in a separate lower-case column; the raw
# 'Time (min)' column is kept as is.
df_reactant_conditions['time (min)'] = time_float
df_reactant_conditions = df_reactant_conditions.sort_values('time (min)')

sns.histplot(
    data=df_reactant_conditions,
    x='time (min)',
    hue='Dioxirane',
    shrink=0.8,
    multiple='stack',
    ax=ax,  # draw on this figure explicitly, not on "current axes"
)
# Rotate the tick labels through tick_params: on a numeric axis,
# set_xticklabels(get_xticklabels()) freezes the labels as fixed text.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Time (min)')
ax.set_title('Number of reactions per Time (min)')
fig.tight_layout()

fig.savefig('time_distribution.png', dpi=300)

In [None]:
# plot temperature distribution: stacked histogram of reaction temperature
# in ºC, split by dioxirane reagent
fig, ax = plt.subplots(figsize=(10, 6))

# Parse the raw 'Temperature (ºC)' entries as floats. Entries that are not
# numbers become NaN and are left out of the histogram. errors='coerce'
# only converts parse failures, unlike a bare `except:` which would also
# hide unrelated errors.
temperature_float = pd.to_numeric(
    df_reactant_conditions['Temperature (ºC)'], errors='coerce'
).astype(float)

# The numeric values go in a separate lower-case column; the raw
# 'Temperature (ºC)' column is kept as is.
df_reactant_conditions['temperature (ºC)'] = temperature_float

# Sort by the column being plotted. The previous sort key, 'time (min)',
# was a copy-paste leftover and raised KeyError unless the reaction-time
# cell had been run first.
df_reactant_conditions = df_reactant_conditions.sort_values('temperature (ºC)')

sns.histplot(
    data=df_reactant_conditions,
    x='temperature (ºC)',
    hue='Dioxirane',
    bins=20,
    shrink=0.8,
    multiple='stack',
    ax=ax,  # draw on this figure explicitly, not on "current axes"
)
# Rotate the tick labels through tick_params: on a numeric axis,
# set_xticklabels(get_xticklabels()) freezes the labels as fixed text.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Temperature (ºC)')
ax.set_title('Number of reactions per Temperature (ºC)')
fig.tight_layout()

fig.savefig('temp_distribution.png', dpi=300)