In [4]:
import pandas as pd
from pandas.api.types import CategoricalDtype

# CSV file holding the benchmark table that every cell below reads.
FINAL_DF_NAME = 'benchmark_stats.csv'

# Display names for the identifiers found in the 'method' column of the
# benchmark table. The insertion order matters: it is reused as the category
# order of the 'method' column when plotting.
METHOD_NAMES = dict(
    ringointel='Ringo',
    rdkit='RDKit',
    mtd='XTB metadynamics',
    mmbasic='MacroModel LowMode+torsional sampling',
    mmring='Macrocycle sampling in MacroModel',
    crest='CREST',
)

def plot_timings(timing_compute, time_cutoff, result_file, y_caption):
    """Draw a grouped bar chart of per-conformer timings for every sampling method.

    The benchmark table is read from ``FINAL_DF_NAME``. ``timing_compute`` turns
    it into one value per row ('time_per_conf'), which is capped at
    ``time_cutoff`` before plotting. One group of bars is drawn per test
    macrocycle, in the fixed ``CASES_ORDER`` defined below, with one bar per
    sampling method. The figure is printed and saved to ``result_file``.

    Side effect: the full table with the computed timings is also written to
    'check.csv' (NOTE(review): looks like a debugging dump - confirm whether it
    is still needed).

    Parameters
    ----------
    timing_compute : callable
        Called with the benchmark DataFrame; its result is stored in the
        'time_per_conf' column.
    time_cutoff : float
        Values of 'time_per_conf' above this are replaced by it; it is also
        the upper limit of the y axis.
    result_file : str
        Destination passed to ``plot.save``.
    y_caption : str
        Label of the y axis.
    """
    # plotnine is used by this function only, hence the local import.
    from plotnine import (aes, element_blank, element_line, element_rect, element_text,
                          geom_bar, ggplot, labs, position_dodge, scale_fill_manual,
                          theme, theme_bw, ylim)

    # Need the same ordering of testcases for all plotting modes
    CASES_ORDER = ['pdb_3M6G', 'csd_FINWEE10', 'pdb_2IYA', 'csd_YIVNOG', 'pdb_1NWX',
                   'csd_RULSUN', 'csd_MIWTER', 'pdb_2C6H', 'csd_RECRAT', 'pdb_2QZK']

    df = pd.read_csv(FINAL_DF_NAME)
    df['time_per_conf'] = timing_compute(df)

    # Ordered categorical so that methods always appear in METHOD_NAMES order.
    method_type = CategoricalDtype(categories=list(METHOD_NAMES.values()), ordered=True)
    df['method'] = df['method'].replace(METHOD_NAMES).astype(method_type)
    df.to_csv('check.csv', index=False)

    # Keep only testcases that have a CREST result and that belong to the fixed
    # plotting order. Filtering on CASES_ORDER explicitly prevents any other
    # testcase from being turned into a NaN category by the astype() below.
    crest_cases = df.loc[df['method'] == METHOD_NAMES['crest'], 'testcase']
    keep = df['testcase'].isin(crest_cases) & df['testcase'].isin(CASES_ORDER)
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice of the original (avoids chained-assignment warnings).
    df = df.loc[keep].copy()

    # Some selections for in-depths analysis
    # df = df[(df['method'] == METHOD_NAMES['ringointel']) | (df['method'] == METHOD_NAMES['rdkit'])]
    # df = df[df['method'] != METHOD_NAMES['mmbasic']]

    testcase_type = CategoricalDtype(categories=CASES_ORDER, ordered=True)
    df['testcase'] = df['testcase'].astype(testcase_type)
    # Cap the bars at the cutoff so they stay inside the y limits set below.
    df['time_per_conf'] = df['time_per_conf'].clip(upper=time_cutoff)
    df = df.rename(columns={'method': 'Sampling method'})

    normal_theme = (theme_bw() +
                    theme(panel_grid_major = element_blank(),
                        panel_grid_minor = element_blank(),
                        panel_border = element_rect(colour="black", fill=None, size=1),
                        axis_line = element_line(colour="black"),
                        axis_title = element_text(size=16, face="bold"),
                        axis_text = element_text(size=14),
                        legend_title = element_text(size=14, face="bold"),
                        legend_text = element_text(size=14),
                        figure_size=(10, 5)))

    # Colors from ColorBrewer Set1 (its orange and yellow are skipped)
    colorscheme = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3", "#a65628", "#f781bf"]

    plot = (ggplot(df, aes(y='time_per_conf', x='testcase', fill='Sampling method'))
            + geom_bar(stat='identity', position=position_dodge(width=0.9))
            + labs(y=y_caption, x='Test macrocycle')
            + normal_theme
            + theme(axis_text_x = element_text(angle = 90, hjust = 1))
            + scale_fill_manual(values=colorscheme)
            + ylim(0, time_cutoff))

    print(plot)
    plot.save(result_file, verbose=False)

In [None]:
def _time_per_low_energy_conformer(df):
    """Return df['time'] / df['n_lower_unique'] * df['thread_avg'], row by row."""
    return df['time'] / df['n_lower_unique'] * df['thread_avg']


def _time_per_any_conformer(df):
    """Same as above, but divided by the sum of 'n_lower_unique' and 'n_higher_unique'."""
    return df['time'] / (df['n_lower_unique'] + df['n_higher_unique']) * df['thread_avg']


# One entry per figure; each value holds the keyword arguments of plot_timings.
plot_modes = dict(
    low_energy=dict(
        timing_compute=_time_per_low_energy_conformer,
        result_file='./low_energy_timings.svg',
        time_cutoff=100.0,  # seconds per conformer
        y_caption='Time per unique\nlow-energy conformer, s',
    ),
    any_energy=dict(
        timing_compute=_time_per_any_conformer,
        result_file='./any_energy_timings.svg',
        time_cutoff=10.0,  # seconds per conformer
        y_caption='Time per unique\nconformer, s',
    ),
)

# Produce every figure, announcing each mode with a banner first.
for plotting_mode in plot_modes:
    plotting_kwargs = plot_modes[plotting_mode]
    print(f'============\n{plotting_mode}\n============')
    plot_timings(**plotting_kwargs)
# Hatches were added by hand in Adobe Illustrator

## Generate CSVs for SI with testset overview and benchmark data

In [29]:
import ringo, json

# JSON file whose entries are looked up by testcase name; each value is later
# handed to ringo.Molecule.
TESTSET_JSON = 'testcases.json'
with open(TESTSET_JSON) as f:
    sdf_mapping = json.load(f)

df = pd.read_csv(FINAL_DF_NAME)
# Load testset overview: start from one row per distinct testcase of the benchmark table
testcases = df['testcase'].unique()
overview_df = pd.DataFrame({'testcase': testcases})
def get_overview(row):
    """Collect molecule statistics for the testcase named in ``row['testcase']``.

    The testcase name is looked up in ``sdf_mapping``, the entry is loaded with
    ``ringo.Molecule`` and passed to ``ringo.get_molecule_statistics``. The
    'composition' entry of the result (presumably an element -> count mapping;
    verify against ringo) is flattened into a single string such as 'C10H12'.

    Returns the statistics as a ``pd.Series``.
    """
    molecule = ringo.Molecule(sdf_mapping[row['testcase']])
    stats = ringo.get_molecule_statistics(molecule)

    # Concatenate "<element><count>" pairs in the order ringo reports them.
    formula_parts = []
    for element, count in stats['composition'].items():
        formula_parts.append(f'{element}{count}')
    stats['composition'] = ''.join(formula_parts)

    return pd.Series(stats)
# Compute the statistics of every testcase (one call of get_overview per row).
new_columns = overview_df.apply(get_overview, axis=1)

# Human-readable column titles for the exported table. Keys that do not match
# an existing column are ignored by DataFrame.rename.
overview_column_titles = {
    'method': 'Sampling method',
    'composition': 'Composition',
    'num_atoms': 'Number of atoms',
    'num_heavy_atoms': 'Number of heavy atoms',
    'num_bonds': 'Number of bonds',
    'num_rotatable_bonds': 'Number of rotatable bonds',
    'num_cyclic_rotatable_bonds': 'Number of cyclic rotatable bonds',
    'largest_macrocycle_size': 'Largest macrocycle size',
    'num_dofs': 'Number of kinematic DOFs',
    'cyclomatic_number': 'Cyclomatic number (n_edges-n_atoms+1)',
}

# Put the statistics next to the testcase names, relabel and export.
overview_df = (
    pd.concat([overview_df, new_columns], axis=1)
    .rename(columns=overview_column_titles)
)
overview_df.to_csv('testset_overview.csv', index=False)

# Load benchmark data
# Add one timing column per plotting mode, titled after the plot's y-axis
# caption. The captions contain line breaks meant for the figure; whitespace is
# collapsed here so that the CSV header record stays on a single line.
for item in plot_modes.values():
    column_title = ' '.join(item['y_caption'].split())
    df[column_title] = item['timing_compute'](df)

# Replace method identifiers by their display names.
df['method'] = df['method'].replace(METHOD_NAMES)

# Drop the column that is not exported and give the rest readable titles.
df = df.drop(columns=[
    'n_failed_opts',
])
df = df.rename(columns={
    'method': 'Sampling method',
    'n_total_generated': 'Conformations generated',
    'n_unique_generated': 'Unique conformers generated',
    'n_duplicates': 'Duplicates after optimization',
    'n_failed_topo': 'Incorrect topology after optimization',
    'n_higher_unique': 'High-energy conformations (>15 kcal/mol)',
    'n_lower_unique': 'Low-energy conformations (<15 kcal/mol)',
})
df.to_csv('timings_results.csv', index=False)