## General definitions for both CSVs and plots

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import time

# CSV with the aggregated benchmark statistics; read by the plotting
# functions and by the CSV export below.
FINAL_DF_NAME = 'benchmark_stats.csv'

# Internal method identifier -> display name. The insertion order is reused
# as the category order of the 'method' column in the plotting functions.
METHOD_NAMES = dict(
    ringointel='Ringo',
    rdkit='RDKit',
    mtd='XTB metadynamics',
    mmbasic='MacroModel LowMode+torsional sampling',
    mmring='Macrocycle sampling in MacroModel',
    crest='CREST',
)

# Method identifiers kept by plot_all_molecules.
ALL_METHODS = ['ringointel', 'rdkit', 'mtd']

# Test cases shown by plot_ten_molecules, in plotting order.
FULL_CASES_ORDER = [
    'pdb_3M6G',
    'csd_FINWEE10',
    'pdb_2IYA',
    'csd_YIVNOG',
    'pdb_1NWX',
    'csd_RULSUN',
    'csd_MIWTER',
    'pdb_2C6H',
    'csd_RECRAT',
    'pdb_2QZK',
]


def _time_per_low_energy_conformer(df):
    """Return 'time' / 'n_lower_unique' * 'thread_avg' for every row of df.

    NOTE(review): 'thread_avg' is presumably the average number of threads
    used by the run -- confirm against the code that writes the CSV.
    """
    return df['time'] / df['n_lower_unique'] * df['thread_avg']


def _time_per_any_conformer(df):
    """Return 'time' / ('n_lower_unique' + 'n_higher_unique') * 'thread_avg'."""
    return df['time'] / (df['n_lower_unique'] + df['n_higher_unique']) * df['thread_avg']


# Per-mode plot settings:
#   timing_compute -- callable mapping the benchmark DataFrame to the plotted value
#   time_cutoff    -- upper cap for the bar heights, seconds per conformer
#   y_caption      -- y-axis label (also reused as a CSV column header)
# The 'result_file' entry is filled in by the cells that call the plotters.
PLOT_MODES = {
    'low_energy': {
        'timing_compute': _time_per_low_energy_conformer,
        'time_cutoff': 100.0,
        'y_caption': 'Time per unique\nlow-energy conformer, s',
    },
    'any_energy': {
        'timing_compute': _time_per_any_conformer,
        'time_cutoff': 10.0,
        'y_caption': 'Time per unique\nconformer, s',
    },
}

## Generate plots

In [None]:
def plot_df(df, y_caption, result_file, show_legend, time_cutoff=None):
    """Draw a dodged bar chart of per-conformer timings, print it and save it.

    Args:
        df: DataFrame with the columns 'time_per_conf' (bar height),
            'testcase' (x axis) and 'Sampling method' (fill colour).
        y_caption: Label of the y axis.
        result_file: Path the figure is saved to.
        show_legend: When false, the legend is hidden.
        time_cutoff: If not None, the y axis is limited to [0, time_cutoff].
    """
    from plotnine import (
        aes,
        element_blank,
        element_line,
        element_rect,
        element_text,
        geom_bar,
        ggplot,
        labs,
        position_dodge,
        scale_fill_manual,
        theme,
        theme_bw,
        ylim,
    )

    # These hex codes match ColorBrewer's Set1 palette (orange and yellow omitted)
    palette = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#a65628", "#f781bf"]

    # Black-and-white theme without grid lines, with a black frame and larger fonts
    frame_and_fonts = theme(
        panel_grid_major=element_blank(),
        panel_grid_minor=element_blank(),
        panel_border=element_rect(colour='black', fill=None, size=1),
        axis_line=element_line(colour='black'),
        axis_title=element_text(size=16, face='bold', ma='center'),
        axis_text=element_text(size=14),
        legend_title=element_text(size=14, face='bold'),
        legend_text=element_text(size=14),
        figure_size=(10, 5),
    )
    base_theme = theme_bw() + frame_and_fonts

    chart = (
        ggplot(df, aes(y='time_per_conf', x='testcase', fill='Sampling method'))
        + geom_bar(stat='identity', position=position_dodge(width=0.9))
        + labs(y=y_caption, x='')
        + base_theme
        + theme(axis_text_x=element_text(angle=90))  # vertical test-case labels
        + scale_fill_manual(values=palette)
    )
    if time_cutoff is not None:
        chart += ylim(0, time_cutoff)
    if not show_legend:
        chart += theme(legend_position="none")

    print(chart)
    chart.save(result_file, verbose=False)

def plot_ten_molecules(timing_compute, **kwargs):
    """Plot per-conformer timings for the test cases listed in FULL_CASES_ORDER.

    Reads FINAL_DF_NAME, computes the plotted value with ``timing_compute``,
    keeps only the test cases from FULL_CASES_ORDER and hands the result to
    ``plot_df``.

    Args:
        timing_compute: Callable taking the benchmark DataFrame and returning
            the values stored in the 'time_per_conf' column.
        **kwargs: Forwarded unchanged to ``plot_df`` (``y_caption``,
            ``result_file``, ``show_legend`` and optionally ``time_cutoff``).
            When ``time_cutoff`` is present, bar heights are capped at it.
    """
    df = pd.read_csv(FINAL_DF_NAME)
    df['time_per_conf'] = timing_compute(df)
    df['method'] = df['method'].replace(METHOD_NAMES)
    method_type = CategoricalDtype(categories=list(METHOD_NAMES.values()), ordered=True)
    df['method'] = df['method'].astype(method_type)

    # Explicit copy: columns of the filtered frame are reassigned below
    df = df[df['testcase'].isin(FULL_CASES_ORDER)].copy()

    # Strip the database prefixes BEFORE casting to categorical: `.str` methods
    # on a categorical column return plain object dtype, which would silently
    # discard the predefined ordering.
    df['testcase'] = df['testcase'].str.replace('csd_', '', regex=False)
    df['testcase'] = df['testcase'].str.replace('pdb_', '', regex=False)

    # Use predefined ordering of testcases on the plot
    ordered_labels = [
        name.replace('csd_', '').replace('pdb_', '') for name in FULL_CASES_ORDER
    ]
    testcase_type = CategoricalDtype(categories=ordered_labels, ordered=True)
    df['testcase'] = df['testcase'].astype(testcase_type)

    df = df.rename(columns={'method': 'Sampling method'})

    # Cut bar heights
    if 'time_cutoff' in kwargs:
        df['time_per_conf'] = df['time_per_conf'].clip(upper=kwargs['time_cutoff'])

    plot_df(df, **kwargs)

def plot_all_molecules(timing_compute, **kwargs):
    """Plot per-conformer timings for every test case outside FULL_CASES_ORDER.

    Only the methods in ALL_METHODS are shown. Test cases are ordered by
    descending Ringo time and split into consecutive groups of 20; one figure
    is produced per group via ``plot_df``. Test cases without a Ringo row do
    not appear in any group because the grouping is derived from the Ringo
    rows.

    Args:
        timing_compute: Callable taking the benchmark DataFrame and returning
            the values stored in the 'time_per_conf' column.
        **kwargs: Forwarded to ``plot_df``. ``result_file`` must contain a
            ``{plot_type}`` placeholder; it is filled with the offset of the
            first test case of each group (0, 20, 40, ...). When
            ``time_cutoff`` is present, bar heights are capped at it.
    """
    df = pd.read_csv(FINAL_DF_NAME)
    df['time_per_conf'] = timing_compute(df)
    # Cut the bar heights if required
    if 'time_cutoff' in kwargs:
        df['time_per_conf'] = df['time_per_conf'].clip(upper=kwargs['time_cutoff'])

    # Explicit copy: columns of the filtered frame are reassigned below
    selected = df['method'].isin(ALL_METHODS) & ~df['testcase'].isin(FULL_CASES_ORDER)
    df = df[selected].copy()

    df['method'] = df['method'].replace(METHOD_NAMES)
    method_type = CategoricalDtype(categories=list(METHOD_NAMES.values()), ordered=True)
    df['method'] = df['method'].astype(method_type)

    df['testcase'] = df['testcase'].str.replace('csd_', '', regex=False)
    df['testcase'] = df['testcase'].str.replace('pdb_', '', regex=False)

    # Order the test cases by descending Ringo time
    order_df = df.loc[df['method'] == METHOD_NAMES['ringointel']].sort_values(by=['time_per_conf'], ascending=False)
    order_df = order_df.reset_index()
    cat_type = CategoricalDtype(categories=order_df['testcase'], ordered=True)

    df = df.rename(columns={'method': 'Sampling method'})
    df = df.reset_index()

    step_size = 20
    for i in range(0, len(order_df['testcase']), step_size):
        current_testcases = order_df['testcase'].iloc[i : i + step_size]
        cur_df = df[df['testcase'].isin(current_testcases)].copy()
        cur_df['testcase'] = cur_df['testcase'].astype(cat_type)
        cur_kwargs = {
            **kwargs,
            'result_file': kwargs['result_file'].format(plot_type=i)
        }
        plot_df(cur_df, **cur_kwargs)

    # # Initial ideas of choosing ordering of testcases on the plot
    # # sort the dataframe by time in descending order for 'noref' and 'succ'
    # order_df = df.loc[df['method'] == METHOD_NAMES['crest']].sort_values(by=['time_per_conf'], ascending=False)
    # order_df = df.loc[df['method'] == METHOD_NAMES['rdkit']].sort_values(by=['time_per_conf'], ascending=False)
    # hardest_cases = order_df['testcase'] # .tail(30).head(40)
    # df = df[df['testcase'].isin(hardest_cases)]
    # # convert 'testcase' column to categorical datatype with sorted order
    # cat_type = CategoricalDtype(categories=order_df['testcase'], ordered=True)
    # df['testcase'] = df['testcase'].astype(cat_type)

    # Some selections for in-depth analysis
    # df = df[(df['method'] == METHOD_NAMES['ringointel']) | (df['method'] == METHOD_NAMES['rdkit'])]
    # df = df[df['method'] != METHOD_NAMES['mmbasic']]

In [None]:
# Figures for the main text: one SVG per plot mode, with legend and with the
# bar heights capped at the mode's 'time_cutoff'.
# Hatches were added by hand in Adobe Illustrator
for plotting_mode, basic_kwargs in PLOT_MODES.items():
    print(f'============\n{plotting_mode}\n============')
    plotting_kwargs = dict(
        basic_kwargs,
        show_legend=True,
        result_file=f'./{plotting_mode}_timings.svg',
    )
    plot_ten_molecules(**plotting_kwargs)

In [None]:
# Plots without a bar-height cap and without legend, saved as PNG
# (presumably for the SI, judging by the `_si` file suffix).
for plotting_mode, basic_kwargs in PLOT_MODES.items():
    print(f'============\n{plotting_mode}\n============')
    plotting_kwargs = dict(
        basic_kwargs,
        show_legend=False,
        result_file=f'./{plotting_mode}_si.png',
    )
    # No cutoff here: bars keep their full height
    plotting_kwargs.pop('time_cutoff')
    plot_ten_molecules(**plotting_kwargs)

    # Remaining molecules: '{plot_type}' is filled in by plot_all_molecules
    plotting_kwargs['result_file'] = f'./{plotting_mode}_{{plot_type}}.png'
    plot_all_molecules(**plotting_kwargs)


## Generate CSVs for SI with testset overview and benchmark data

In [None]:
import ringo, json
from charges import CHARGES, CHARGES_MOLS
    
# JSON mapping from testcase name to the argument passed to ringo.Molecule
# (presumably SDF file paths, judging by the variable name -- confirm).
TESTSET_JSON = 'testcases.json'
with open(TESTSET_JSON, 'r') as f:
    sdf_mapping = json.load(f)

df = pd.read_csv(FINAL_DF_NAME)
# Load testset overview
testcases = df['testcase'].unique()

# Print the structure of testset: one numbered line per name prefix
# (the part before the first underscore) listing its molecule ids.
testcases_list = list(testcases)
testset_parts = set(name.split('_')[0] for name in testcases_list)
# Iterate in sorted order: the iteration order of a set of strings is not
# stable between interpreter runs, which made the numbering irreproducible.
for part_idx, testset_part in enumerate(sorted(testset_parts)):
    # Compare the exact prefix so that a part whose name is a prefix of
    # another part's name does not pick up the other part's molecules.
    molecule_ids = [
        name.split('_')[1]
        for name in testcases_list
        if name.split('_')[0] == testset_part
    ]
    print(f'{part_idx+1}) {testset_part.upper()}: {", ".join(sorted(molecule_ids))}')

# Generate overview
overview_df = pd.DataFrame()
overview_df['testcase'] = testcases
def get_overview(row):
    """Build the overview entry for one testcase row.

    Loads the molecule registered for ``row['testcase']`` in ``sdf_mapping``,
    takes the statistics returned by ``ringo.get_molecule_statistics``,
    flattens the 'composition' mapping into a single string of
    ``<element><count>`` pairs and adds a 'charge' entry.

    Returns:
        pandas.Series built from the statistics dictionary.
    """
    molname = row['testcase']

    stats = ringo.get_molecule_statistics(ringo.Molecule(sdf_mapping[molname]))
    stats['composition'] = ''.join(
        f'{element}{count}' for element, count in stats['composition'].items()
    )

    # Molecules not registered in CHARGES_MOLS are treated as neutral.
    # NOTE(review): membership is tested with an extra 'pdb_' prefix while the
    # lookup uses the bare testcase name -- confirm against the charges module.
    stats['charge'] = CHARGES[molname] if f'pdb_{molname}' in CHARGES_MOLS else 0

    return pd.Series(stats)
# Compute the statistics row by row and append them next to the 'testcase' column
new_columns = overview_df.apply(get_overview, axis=1)
overview_df = pd.concat([overview_df, new_columns], axis=1)

# Raw column name -> header used in the exported table
overview_column_titles = dict(
    method='Sampling method',
    composition='Composition',
    charge='Total charge',
    num_atoms='Number of atoms',
    num_heavy_atoms='Number of heavy atoms',
    num_bonds='Number of bonds',
    num_rotatable_bonds='Number of rotatable bonds',
    num_cyclic_rotatable_bonds='Number of cyclic rotatable bonds',
    largest_macrocycle_size='Largest macrocycle size',
    num_dofs='Number of kinematic DOFs',
    cyclomatic_number='Cyclomatic number (n_edges-n_atoms+1)',
)
overview_df = overview_df.rename(columns=overview_column_titles)
overview_df.to_csv('testset_overview.csv', index=False)

# Load benchmark data
# One extra column per plot mode, named after the mode's y-axis caption
# (note that these captions contain a line break).
for mode_settings in PLOT_MODES.values():
    compute_timing = mode_settings['timing_compute']
    df[mode_settings['y_caption']] = compute_timing(df)

# Show display names instead of internal method identifiers
df['method'] = df['method'].replace(METHOD_NAMES)

# Raw column name -> header used in the exported table
benchmark_column_titles = {
    'method': 'Sampling method',
    'n_total_generated': 'Conformations generated',
    'n_unique_generated': 'Unique conformers generated',
    'n_duplicates': 'Duplicates after optimization',
    'n_failed_topo': 'Incorrect topology after optimization',
    'n_higher_unique': 'High-energy conformations (>15 kcal/mol)',
    'n_lower_unique': 'Low-energy conformations (<15 kcal/mol)',
}
df = df.drop(columns=['n_failed_opts']).rename(columns=benchmark_column_titles)
df.to_csv('timings_results.csv', index=False)