Notebook for loading optimiser result files (identified by filename) and computing summary statistics from them

In [1]:
import pandas as pd
import itertools

The following string holds the contents of the output file: summary statistics from running the optimiser multiple times with different configurations. It is written to disk at the end of the notebook.

In [2]:
# Name of the CSV file the summary is written to at the end of the notebook.
output_filename = "Optimiser Summary Statistics.csv"

# In-memory contents of that file, seeded with the header row; one line per
# processed results file is appended to it later.
# NOTE(review): the header has a space before "worst-case loss"; it is kept
# as-is here because changing it would change the emitted file.
output_file = 'filename,mean yield,standard deviation,average loss, worst-case loss\n'

In [3]:
def update_output_file_with_data(filename, output_file, max_yield=100.0):
    """
    Summarise the yields stored in `filename` and append them, as one CSV
    row, to `output_file`.


    -------
    Parameters:

    filename: string
        Path to a CSV file of results. Only the column named
        'maximum observed yield' is read; any other column (e.g. the seed)
        is ignored. Example:
        seed,maximum observed yield
        124142,99.4
        092402,96.8
        ...

    output_file: string
        Summary text accumulated so far. It is not modified; the extended
        text is returned.

    max_yield: float
        The maximum observed yield across all data points for this experiment.
        Both loss figures are measured relative to this value.

    -------
    Returns:

    string
        `output_file` followed by one line of the form
        filename,mean yield,standard deviation,average loss,worst-case loss
        with every number formatted to two decimal places.
    """

    # Only the yield column matters; everything else in the file is dropped.
    observed = pd.read_csv(filename)['maximum observed yield']

    mean_yield = observed.mean()
    # pandas' default standard deviation (ddof=1).
    yield_spread = observed.std()
    lowest_yield = observed.min()

    # mean(max_yield - y) equals max_yield - mean(y), and
    # max(max_yield - y) equals max_yield - min(y), so neither loss needs a
    # per-row computation.
    mean_loss = max_yield - mean_yield
    largest_loss = max_yield - lowest_yield

    summary_row = f"{filename},{mean_yield:.2f},{yield_spread:.2f},{mean_loss:.2f},{largest_loss:.2f}\n"
    return output_file + summary_row

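Now let's run it on every results file, i.e. one file per combination of reaction, sampling method and batch combination. Files that do not exist are reported and skipped.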

In [4]:
# Maximum yield per reaction; passed on as `max_yield` when the loss
# statistics for that reaction's files are computed.
reaction_yields = {
    'suzuki': 100.0,
    'aryl_amination': 99.99999,
    'direct_arylation': 100.0,
}

sampling_methods = ['random', 'randomts', 'randomtsei']

batch_combinations = [
    '1_25_50', '1_50_50',
    '2_24_50', '2_50_50',
    '3_24_50', '3_51_50',
    '4_24_50', '4_48_50',
    '5_25_50', '5_50_50',
    '10_50_50',
]

# Result files are named <reaction>_<sampling method>_<batch combination>.csv.
# NOTE(review): `output_file` is extended on every pass, so re-running this
# cell without first re-running the cell that initialises `output_file`
# appends duplicate rows.
all_configurations = itertools.product(reaction_yields, sampling_methods, batch_combinations)
for reaction, sampling_method, batch_combination in all_configurations:
    results_csv = f"{reaction}_{sampling_method}_{batch_combination}.csv"
    try:
        output_file = update_output_file_with_data(
            results_csv,
            output_file,
            max_yield=reaction_yields[reaction],
        )
    except FileNotFoundError as e:
        # A missing results file is reported and skipped rather than
        # aborting the whole run.
        print("Missing file: ", e)



In [5]:
# Show the accumulated summary text: the header row followed by one row for
# each results file that was found and processed.
output_file

'filename,mean yield,standard deviation,average loss, worst-case loss\nsuzuki_random_1_25_50.csv,97.52,1.62,2.48,6.31\nsuzuki_random_1_50_50.csv,98.71,0.78,1.29,4.15\nsuzuki_random_2_24_50.csv,97.41,1.68,2.59,7.18\nsuzuki_random_2_50_50.csv,98.88,0.86,1.12,3.84\nsuzuki_random_3_24_50.csv,97.73,1.78,2.27,6.72\nsuzuki_random_3_51_50.csv,98.85,0.93,1.15,3.80\nsuzuki_random_4_24_50.csv,97.48,1.83,2.52,7.12\nsuzuki_random_4_48_50.csv,98.66,1.36,1.34,6.57\nsuzuki_random_5_25_50.csv,97.30,2.24,2.70,11.63\nsuzuki_random_5_50_50.csv,98.49,1.26,1.51,5.18\nsuzuki_random_10_50_50.csv,98.92,0.85,1.08,3.96\nsuzuki_randomts_1_25_50.csv,96.50,1.83,3.50,7.23\nsuzuki_randomts_1_50_50.csv,98.25,1.29,1.75,5.67\nsuzuki_randomts_2_24_50.csv,95.89,1.96,4.11,9.21\nsuzuki_randomts_2_50_50.csv,98.16,1.46,1.84,6.27\nsuzuki_randomts_3_24_50.csv,95.55,2.06,4.45,9.43\nsuzuki_randomts_3_51_50.csv,98.41,1.11,1.59,5.18\nsuzuki_randomts_4_24_50.csv,95.81,2.32,4.19,10.97\nsuzuki_randomts_4_48_50.csv,98.27,1.35,1.73,5.67

In [6]:
# Persist the accumulated summary text, replacing any existing file of the
# same name.
with open(output_filename, mode='w') as summary_handle:
    summary_handle.write(output_file)