Load optimiser result files (identified by filename) and compute summary statistics for each

In [1]:
import pandas as pd
import itertools

The following output file will hold summary statistics from running the optimiser multiple times with different configurations.

In [2]:
# The summary CSV is accumulated in memory as one string and written to disk
# in a single step. It starts with the header row; column names carry no
# surrounding whitespace so they can be used directly as keys when the file
# is read back.
output_file = 'reaction,sampling method,batch size,number of experiments,mean yield,standard deviation,average loss,worst-case loss\n'
output_filename = "Optimiser Summary Statistics.csv"

In [3]:
def get_summary_statistics(filename, max_yield=100.0):
    """
    Compute summary statistics for the yields stored in a CSV file.

    -------
    Parameters:

    filename: string
        Path to a CSV file readable by pandas. The file must have a header
        row containing a column named 'maximum observed yield'; any other
        columns (such as a seed column) are ignored.

    max_yield: float
        Reference yield that losses are measured against, i.e. the loss of
        a run is max_yield minus that run's yield. Defaults to 100.0.

    -------
    Returns:

    tuple
        (mean yield, standard deviation of the yields as given by pandas'
        Series.std(), average loss, worst-case loss)
    """

    observed = pd.read_csv(filename)['maximum observed yield']

    mean_value = observed.mean()
    spread = observed.std()
    lowest = observed.min()

    # Losses do not need a per-row computation:
    #   mean(max_yield - y) == max_yield - mean(y)
    #   max(max_yield - y)  == max_yield - min(y)
    return (mean_value, spread, max_yield - mean_value, max_yield - lowest)

Now let's run it on all the desired files.

In [4]:
# Maximum yield for each reaction; passed as max_yield when computing losses.
reaction_yields = {
    'suzuki': 100.0,
    'aryl_amination': 99.99999,
    'direct_arylation': 100.0,
}

# Sampling methods, spelled as they appear in the result filenames.
sampling_methods = [
    'random',
    'randomts',
    'randomtsei',
]

# Filename suffixes. The first field is reported as the batch size and the
# second as the number of experiments; the third field is not used here.
batch_combinations = [
    '1_25_50',
    '1_50_50',
    '2_24_50',
    '2_50_50',
    '3_24_50',
    '3_51_50',
    '4_24_50',
    '4_48_50',
    '5_25_50',
    '5_50_50',
    '10_50_50',
]

# Visit every (reaction, sampling method, batch configuration) combination,
# appending one CSV row per result file that exists.
for (reaction, reference_yield), sampling_method, batch_combination in itertools.product(
        reaction_yields.items(), sampling_methods, batch_combinations):
    results_path = f"{reaction}_{sampling_method}_{batch_combination}.csv"
    try:
        avg, std, avg_loss, worst_loss = get_summary_statistics(results_path, max_yield=reference_yield)
    except FileNotFoundError as e:
        # Not every combination has been run; report it and carry on.
        print("Missing file: ", e)
        continue

    batch_fields = batch_combination.split('_')
    batch_size = batch_fields[0]
    experiment_count = batch_fields[1]
    output_file += f"{reaction},{sampling_method},{batch_size},{experiment_count},{avg:.2f},{std:.2f},{avg_loss:.2f},{worst_loss:.2f}\n"



In [5]:
# Write the accumulated CSV text in one go. The encoding is given explicitly
# so the resulting file does not depend on the platform's locale default.
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write(output_file)