In [None]:
from datetime import date
import subprocess
import os
import pickle
import heapq

# Limit numpy to 1 thread so that
# we can parallelize the error analysis
# properly

os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import matplotlib.pyplot as plt
import numpy as np
from netCDF4 import Dataset
import pandas as pd

import droplet_approximation

## General Settings

In [None]:
# Record the commit of the analysis code so generated reports can be traced back
# to the version that produced them.  The command output is decoded as text
# (rather than slicing the repr() of a bytes object), and a placeholder is used
# when the notebook runs outside of a Git checkout, or without git installed,
# so that a missing SHA does not halt the whole analysis.
try:
    current_SHA = subprocess.check_output( ["git", "rev-parse", "HEAD"],
                                           stderr=subprocess.DEVNULL,
                                           text=True ).strip()
except (subprocess.CalledProcessError, OSError) as git_error:
    print( "Could not determine the current Git SHA: {}".format( git_error ) )
    current_SHA = "unknown"

# Path to the top-level simulations/ data directory.
#
# NOTE: This is a site-specific path as there is no way to set a sensible
#       portable default.  A later cell halts execution if it is None.
#
simulations_data_root = "/groups/drichte2/gthomsen/"

# Name of the simulation we're investigating.
#
# NOTE: This must match one of the names handled by the cell that maps
#       simulation names to directories!
#
#simulation_name = "Pi Chamber - RH~103%, 1-way Coupling"
simulation_name = "Pi Chamber - 2-way Coupling"
#simulation_name = "cfog"
#simulation_name = "Fatima"
#simulation_name = "Spray"

# Number of processes to use for parallel operations.  Zero means use one process
# per core on the system.
number_processes = 0

# Use a large fanout degree for our raw particle files directory hierarchy.
#
# NOTE: This must match how the particle directories were constructed.  Do *not* change
#       this unless you've updated the common datasets as well.
#
dirs_per_level   = 256

# HISTOGRAM PARAMETERS
#
# histogram_count     - number of simulation times at which histograms are drawn.
# radbins_count       - number of radius bin edges.
# tempbins_count      - number of temperature bin edges.
# averages_count      - number of simulation times at which averages are sampled.
# background_averages - names of the background quantities to average alongside
#                       radius and temperature.
histogram_count     = 6
radbins_count       = 400
tempbins_count      = 400
averages_count      = 1000
background_averages = []

## Error Analysis Settings

In [None]:
# This controls how much of the data to load.  The particle index is strided by
# this value, so the notebook loads 1/subset_fraction of the overall dataset.
subset_fraction = 1

# Good CUSUM parameters for non-iterative, could probably be dialed in more
cusum_error_tolerance = np.array( [ 0.005, 0.05 ] )
cusum_error_threshold = np.array( [ 0.02, 0.20 ] )

norm = droplet_approximation.standard_norm

# How many processes/batches to run the analysis with.  Zero processes means
# one process per core; a later cell resolves it with os.cpu_count().
number_processes = 0
number_batches   = 10

# Maximum number of deviation clusters to identify
#
# NOTE(review): the ScoringReport construction later in this notebook passes a
#               literal max_clusters value, so this setting appears to be unused
#               - confirm which value is intended.
max_clusters = 7

# This sets the x-y-z limits for the deviation cluster graph.
# Since there are some deviations at very far flung parts of
# parameter space, without explicitly setting these ranges,
# the deviations are all smooshed together on the graph.
set_deviation_graph_limits = False
deviation_graph_x_range    = ( -.5,2.0 )
deviation_graph_y_range    = ( -6.75, -2.50 )
deviation_graph_z_range    = ( 1.0,1.07 )

# How many of the worst particles to graph the trajectory of
number_graphs = 3

# The worst `number_graphs` particles will be picked
# only from particles with deviations from all of the listed
# clusters. If None, select any particle.
# If using strict_graph_cluster_filter, select particles
# with ONLY the specified deviations.

#deviation_graph_cluster_filter = np.array( [0,1,2] )
deviation_graph_cluster_filter = None
strict_graph_cluster_filter    = False

# Boolean flag intended to control whether the computed score report is
# pickled.  The destination (score_report_path) is derived from the analysis
# directory by the output structure cell.
save_scores              = True

# Determines whether to load scores from a pickled file.
# If True, deviation analysis WILL NOT BE RUN. Instead,
# the notebook loads the previously pickled score report
# from score_report_path.
load_scores              = False

# Filters forwarded to the particle readers and to the scoring report.
filter_be_failures = False
cold_threshold     = -np.inf

# Each mapping has a single entry; its key is used as the evaluation's tag
# in titles and directory names.
reference_evaluation  = {"bdf iterative": "bdf_iterative"}
comparison_evaluation = {"mlp gifted finale": "mlp_gifted_finale"}

# Optional free-form description of this analysis run.
additional_description = ""
time_range = [-np.inf, np.inf]

## Change Settings Here

In [None]:
# Per-run overrides.  Values assigned here take precedence over the defaults
# assigned in the settings cells above because this cell runs after them.

# Simulation to analyze; uncomment exactly one.
#simulation_name = "Pi Chamber - RH~103%, 1-way Coupling"
simulation_name = "Pi Chamber - 2-way Coupling"
#simulation_name = "cfog"
#simulation_name = "Fatima"
#simulation_name = "Spray"

# Fraction of the dataset to load (1/subset_fraction).
subset_fraction = 1

# Particle filters.
cold_threshold     = -np.inf
filter_be_failures = False

# Background quantities averaged alongside the droplet radius and temperature.
background_averages = [
    "salt masses",
    "air temperatures",
    "relative humidities",
    "air densities",
]

# Score report persistence.
load_scores = False
save_scores = True

# Good CUSUM parameters for iterative, could probably be dialed in more
#cusum_error_tolerance = np.array( [ 0.02, 0.10 ] )
#cusum_error_threshold = np.array( [ 0.08, 0.40 ] )

# Explicit axis limits for the deviation cluster graph.
#set_deviation_graph_limits = True
#deviation_graph_x_range    = ( -.5,2.0 )
#deviation_graph_y_range    = ( -6.75, -2.50 )
#deviation_graph_z_range    = ( 1.0,1.07 )

# Restrict the graphed particles to those with deviations in these clusters.
# deviation_graph_cluster_filter = np.array( [1] )
# If True, this will select the worst particle with EXACTLY the deviations in the filter
# strict_graph_cluster_filter    = True

# Output Structure
The following cell uses the user settings to automatically generate an output folder. It also builds a table of the filenames under which each graph is stored for later analysis.

In [None]:
# The key of each (single entry) evaluation mapping is used as its tag in
# figure titles and in the analysis directory name.
reference_tag  = next( iter( reference_evaluation ) )
comparison_tag = next( iter( comparison_evaluation ) )

# Every evaluation that must be read alongside the particle data.  With the
# dictionary union the reference entry wins if both mappings share a key.
evaluations = comparison_evaluation | reference_evaluation

# Descriptors generated from the enabled filters.
filter_descriptors = []
if filter_be_failures:
    filter_descriptors.append( "BE Failure Filtered" )
if cold_threshold != -np.inf:
    filter_descriptors.append( "Cold_Threshold_{:.1f}K".format( cold_threshold ) )

# Note- this affects the save location default!
#
# Keep any description supplied in the settings instead of discarding it, and
# only add descriptors it does not already contain so that re-running this
# cell does not append them a second time.
descriptors  = [additional_description] if additional_description != "" else []
descriptors += [descriptor for descriptor in filter_descriptors
                if descriptor not in additional_description]
additional_description = ", ".join( descriptors )

analysis_root_dir = "/groups/drichte2/droplet_approximation/data/analysis/"
analysis_suffix   = "-{:s}".format( additional_description.lower().replace( ", ", "-" ).replace( " ", "_" ) ) if additional_description != "" else ""

# Reduce the simulation name to a directory-friendly form.  The substitutions
# are applied in order, so " - " is collapsed before lone dashes and spaces.
simulation_slug = simulation_name
for original_text, replacement_text in ( (",", ""),
                                         ("~", "_"),
                                         ("%", ""),
                                         (" - ", "_"),
                                         ("-", "_"),
                                         (" ", "_") ):
    simulation_slug = simulation_slug.replace( original_text, replacement_text )

analysis_dir_name = "error_analysis-{:s}-{:s}_vs_{:s}{:s}/".format( simulation_slug,
                                                                    reference_tag,
                                                                    comparison_tag,
                                                                    analysis_suffix ).lower().replace(" ", "_")

# All paths keep a trailing slash as later cells append filenames directly.
analysis_dir_path          = analysis_root_dir + analysis_dir_name
figures_dir_path           = analysis_dir_path + "figures/"
deviation_figures_dir_path = figures_dir_path  + "deviations/"
score_report_path          = analysis_dir_path + "score_report.pkl"
substitutions_dir_path     = analysis_dir_path + "substitutions/"

os.makedirs(figures_dir_path, exist_ok=True)
os.makedirs(substitutions_dir_path, exist_ok=True)
os.makedirs(deviation_figures_dir_path, exist_ok=True)

# Figure identifier -> filename beneath figures_dir_path.  The PARTICLE_ID
# placeholder is filled in once the graphed particles are known.
figure_filenames = {
    "RT AVERAGES": "radius_temperature_averages.png",
    "RT AVERAGES SHORT": "radius_temperature_averages_short.png",
    "RADIUS HISTOGRAM": "radius_histogram.png",
    "TEMPERATURE HISTOGRAM": "temperature_histogram.png",
    "RADIUS MULTI HISTOGRAM": "radius_multi_histogram.png",
    "TEMPERATURE MULTI HISTOGRAM": "temperature_multi_histogram.png",
    "DEVIATION CLUSTERS": "deviation_clusters.png",
    "PPNRMSE HISTOGRAM": "per_particle_nrmse_histogram.png",
    "BEST TRAJECTORY": "best_trajectory-pid=PARTICLE_ID.png",
    "RANDOM TRAJECTORY": "random_trajectory-pid=PARTICLE_ID.png",
    "WORST TRAJECTORY": "worst_trajectory-pid=PARTICLE_ID.png"
}

# Display where the score report is read from/written to.
score_report_path

In [None]:
# Without a data root there is nothing to analyze, so refuse to continue.
if simulations_data_root is None:
    raise ValueError( "Must set simulations_data_root to run this notebook!" )

# A process count of zero requests one process per core.
if number_processes == 0:
    number_processes = os.cpu_count()

# Directory, beneath the simulations data root, holding each known simulation.
simulation_directory_names = {
    "Pi Chamber - RH~103%, 1-way Coupling": "pi_chamber-1way_rh103",
    "Pi Chamber - 2-way Coupling":          "pi_chamber-2way",
    "cfog":                                 "cfog",
    "Fatima":                               "fatima",
    "Spray":                                "spray",
}

if simulation_name not in simulation_directory_names:
    raise ValueError( "Unknown simulation_name!" )

simulation_directory_name = simulation_directory_names[simulation_name]

# Root of the selected simulation, followed by the top of its raw particle
# files hierarchy and the index stored inside of it.
simulation_root      = f"{simulations_data_root}/{simulation_directory_name}"
particles_root       = f"{simulation_root}/particles"
particles_index_path = f"{particles_root}/particles.index"


In [None]:
# Take every subset_fraction-th particle id from the on-disk index.
unique_particle_ids = np.fromfile( particles_index_path, dtype=np.int32 )[::subset_fraction]

# Arguments shared by the serial and the parallel reader.
reader_arguments = ( particles_root, unique_particle_ids, dirs_per_level )
reader_options   = dict( cold_threshold=cold_threshold,
                         filter_be_failures=filter_be_failures,
                         evaluations=evaluations )

# Select the reader: the batched one distributes the work across processes.
parallel_read_flag = True
if not parallel_read_flag:
    particles_df = droplet_approximation.read_particles_data( *reader_arguments,
                                                              **reader_options )
else:
    particles_df = droplet_approximation.batch_read_particles_data( *reader_arguments,
                                                                    **reader_options,
                                                                    number_processes=number_processes )

particle_count_message = "{:d} particles in the DataFrame.".format( len( particles_df ) )
print( particle_count_message )

In [None]:
# Bin edges for the histograms: radii are spaced logarithmically between
# 10^-8 and 10^-3, temperatures linearly between 273 and 310.
radbins  = np.logspace( start=-8.0, stop=-3.0, num=radbins_count )
tempbins = np.linspace( start=273.0, stop=310.0, num=tempbins_count )

In [None]:
# NOTE(review): the indexing below assumes simulation_times is a sorted,
#               one-dimensional NumPy array (np.searchsorted() requires
#               sorted input) - confirm against the library.
simulation_times = droplet_approximation.get_particles_data_simulation_times( particles_df )

# Evenly spaced targets across the simulation are snapped to times that
# actually exist in the data.  The array returned by np.searchsorted() is used
# as the index directly rather than being wrapped in a list and unwrapped
# again, since the meaning of that nested form depends on the NumPy version.
histogram_targets = np.linspace( simulation_times[0], simulation_times[-1], histogram_count )
histogram_times   = simulation_times[np.searchsorted( simulation_times, histogram_targets )]

averages_targets = np.linspace( simulation_times[0], simulation_times[-1], averages_count )
averages_indexes = np.searchsorted( simulation_times, averages_targets )
averages_times   = simulation_times[averages_indexes]

In [None]:
# Both evaluations are binned and averaged together so they can be overlaid.
evaluation_tags = [reference_tag, comparison_tag]

# Radius/temperature histograms at each of the histogram times.
histograms = droplet_approximation.bin_particles_data( particles_df,
                                                       evaluation_tags,
                                                       histogram_times,
                                                       radbins,
                                                       tempbins )

# NOTE(review): background_averages is passed in as the requested names and is
#               rebound to the returned value, so this cell is not safe to
#               re-run without first re-running the settings cell.
rt_averages, background_averages = droplet_approximation.average_particles_data( particles_df,
                                                                                 evaluation_tags,
                                                                                 averages_times,
                                                                                 background_averages )

In [None]:
# Averages of the droplet radius/temperature together with the background
# quantities, saved as the "RT AVERAGES" figure.
fig, ax_h = droplet_approximation.plot_droplet_size_temperatures( averages_times,
                                                                  rt_averages,
                                                                  background_parameters=background_averages )

fig.suptitle("Average Droplet Radii/Temperature for {:s} vs. {:s} in {:s}".format( reference_tag, comparison_tag, simulation_name ) )
fig.set_size_inches( 10, 10 )

# (row, column, text) for each labeled panel.  The background panels use the
# same text for their title and their y axis label.
droplet_panels    = [ (0, 0, "Average Radius (m)"),
                      (0, 1, "Average Temperature (K)") ]
background_panels = [ (2, 0, "Average Salt Mass (kg)"),
                      (2, 1, "Average Air Temperature (K)"),
                      (3, 0, "Average Relative Humidity (%)"),
                      (3, 1, "Average Air Density (g/cm^3)") ]

for panel_row, panel_column, panel_text in droplet_panels + background_panels:
    ax_h[panel_row][panel_column].set_title( panel_text )

for panel_row, panel_column, panel_text in background_panels:
    ax_h[panel_row][panel_column].set_ylabel( panel_text )

plt.savefig(figures_dir_path + figure_filenames["RT AVERAGES"], bbox_inches="tight" )

In [None]:
# Same averages as the full figure but without the background quantities,
# saved as the "RT AVERAGES SHORT" figure.
fig, ax_h = droplet_approximation.plot_droplet_size_temperatures( averages_times, rt_averages )

averages_title = "Average Droplet Radii/Temperature for {:s} vs. {:s} in {:s}".format( reference_tag,
                                                                                       comparison_tag,
                                                                                       simulation_name )
fig.suptitle( averages_title )
fig.set_size_inches( 10, 10 )

for panel_column, panel_title in enumerate( ["Average Radius (m)", "Average Temperature (K)"] ):
    ax_h[0][panel_column].set_title( panel_title )

plt.savefig(figures_dir_path + figure_filenames["RT AVERAGES SHORT"], bbox_inches="tight" )

In [None]:
# Radius histograms of both evaluations at each histogram time, two per row.
histogram_count = len(histogram_times)

# squeeze=False keeps axs two-dimensional even when a single row is needed,
# so that the [row, column] indexing below always works.
row_count = int( np.ceil( histogram_count / 2.0 ) )
fig, axs  = plt.subplots( row_count, 2, squeeze=False, constrained_layout=True )

# Title the figure with the evaluations actually being compared.
fig.suptitle("Radius Histograms of {:s} vs. {:s} for {:s}".format( reference_tag, comparison_tag, simulation_name ) )
fig.set_figheight(10)
fig.set_figwidth(10)

for histogram_index, histogram_time in enumerate( histogram_times ):
    print("Histogram: ", histogram_time)
    ax = axs[histogram_index // 2, histogram_index % 2]

    # The counts are already binned, so each bin's left edge is plotted once
    # and weighted by its count.
    for tag in (reference_tag, comparison_tag):
        counts = histograms[tag][0][histogram_index, :]
        ax.hist( radbins[:len(counts)], bins=radbins, weights=counts, alpha=0.5, label=tag )

    ax.legend()
    ax.set_title(f"Radius Histogram at Time {histogram_time:.2f}s")
    # The axis shows the radius itself on a logarithmic scale.
    ax.set(xlabel="Radius (m)", ylabel="# of Particles", xlim=(1.0e-8, 1.0e-3) )
    ax.set_xscale( "log" )
    ax.grid( alpha=0.1, color="black" )
    ax.minorticks_on()

# Hide the unused trailing panel when there is an odd number of histograms.
for unused_ax in axs.flat[histogram_count:]:
    unused_ax.set_visible( False )

plt.savefig(figures_dir_path + figure_filenames["RADIUS MULTI HISTOGRAM"], bbox_inches="tight" )

In [None]:
# Temperature histograms of both evaluations at each histogram time, two per row.
histogram_count = len(histogram_times)

# squeeze=False keeps axs two-dimensional even when a single row is needed,
# so that the [row, column] indexing below always works.
row_count = int( np.ceil( histogram_count / 2.0 ) )
fig, axs  = plt.subplots( row_count, 2, squeeze=False, constrained_layout=True )

# Title the figure with the evaluations actually being compared.
fig.suptitle("Temperature Histograms of {:s} vs. {:s} for {:s}".format( reference_tag, comparison_tag, simulation_name ) )
fig.set_figheight(10)
fig.set_figwidth(10)

for histogram_index, histogram_time in enumerate( histogram_times ):
    print("Histogram: ", histogram_time)
    ax = axs[histogram_index // 2, histogram_index % 2]

    # The counts are already binned, so each bin's left edge is plotted once
    # and weighted by its count.
    for tag in (reference_tag, comparison_tag):
        counts = histograms[tag][1][histogram_index, :]
        ax.hist( tempbins[:len(counts)], bins=tempbins, weights=counts, alpha=0.5, label=tag )

    ax.legend()
    ax.set_title(f"Temperature Histogram at Time {histogram_time:.2f}s")
    ax.set(xlabel="Particle Temperature (K)", ylabel="# of Particles", xlim=(273, 310) )
    ax.grid( alpha=0.1, color="black" )
    ax.minorticks_on()

# Hide the unused trailing panel when there is an odd number of histograms.
for unused_ax in axs.flat[histogram_count:]:
    unused_ax.set_visible( False )

plt.savefig(figures_dir_path + figure_filenames["TEMPERATURE MULTI HISTOGRAM"], bbox_inches="tight" )

In [None]:
# Radius histogram of both evaluations at a single histogram time.
histogram_sample = 0

fig = plt.figure()
# Title the figure with the evaluations actually being compared.
plt.title("Radius Histogram of {:s} vs. {:s} for {:s} at time {:.2f}s".format( reference_tag,
                                                                               comparison_tag,
                                                                               simulation_name,
                                                                               histogram_times[histogram_sample] ) )

fig.set_figheight(10)
fig.set_figwidth(10)

# The counts are already binned, so each bin's left edge is plotted once and
# weighted by its count.
for tag in (reference_tag, comparison_tag):
    counts = histograms[tag][0][histogram_sample, :]
    plt.hist( radbins[:len(counts)], bins=radbins, weights=counts, alpha=0.5, label=tag )
plt.legend()

plt.xlabel("Radius (m)")
plt.ylabel("# of Particles")
plt.xlim( (1.0e-8, 1.0e-3) )
plt.xscale( "log" )
plt.grid(alpha=0.1, color="black")
plt.minorticks_on()

plt.savefig(figures_dir_path + figure_filenames["RADIUS HISTOGRAM"], bbox_inches="tight" )

In [None]:
# Temperature histogram of both evaluations at a single histogram time.
histogram_sample = 1

fig = plt.figure()
# Title the figure with the evaluations actually being compared.
plt.title("Temperature Histogram of {:s} vs. {:s} for {:s} at time {:.2f}s".format( reference_tag,
                                                                                    comparison_tag,
                                                                                    simulation_name,
                                                                                    histogram_times[histogram_sample] ) )

fig.set_figheight(10)
fig.set_figwidth(10)

# The counts are already binned, so each bin's left edge is plotted once and
# weighted by its count.
for tag in (reference_tag, comparison_tag):
    counts = histograms[tag][1][histogram_sample, :]
    plt.hist( tempbins[:len(counts)], bins=tempbins, weights=counts, alpha=0.5, label=tag )
plt.legend()

plt.xlabel("Temperature (K)")
plt.ylabel("# of Particles")
plt.xlim( (273, 310) )
plt.grid(alpha=0.1, color="black")
plt.minorticks_on()

plt.savefig(figures_dir_path + figure_filenames["TEMPERATURE HISTOGRAM"], bbox_inches="tight" )

In [None]:
# Drop the notebook's reference to the particles DataFrame now that the
# histograms and averages have been computed from it.  Later cells read the
# handful of particles they graph into a new DataFrame of the same name.
del particles_df

# Error Statistics Calculations/Plotting
Now that averages and histograms have been generated, we unload our particles dataframe to free up memory and then run the error analysis pipeline.

In [None]:
# Load or calculate the score report.

# Ids of every particle in the simulation.  These are read regardless of how
# the score report is obtained so that ids_index is defined for later cells.
ids_index = np.fromfile( particles_root + "/particles.index", dtype=np.int32 )

if load_scores:
    # NOTE: Unpickling can execute arbitrary code.  Only load score reports
    #       that were written by this notebook.
    try:
        with open( score_report_path, "rb" ) as score_file:
            score_report = pickle.load( score_file )
    except Exception as e:
        print(f"Failed to open {score_report_path}. Encountered error:\n {e}")

        # Nothing below can run without a score report, so surface the failure
        # here rather than as an undefined name in a later cell.
        raise
else:
    # Score every subset_fraction-th particle in the index.
    filtered_ids = ids_index[::subset_fraction]

    # NOTE(review): max_clusters is passed as a literal rather than taken from
    #               the max_clusters setting - confirm which value is intended.
    score_report = droplet_approximation.ScoringReport( particles_root,
                                  filtered_ids,
                                  dirs_per_level,
                                  reference_evaluation,
                                  comparison_evaluation,
                                  cusum_error_tolerance=cusum_error_tolerance,
                                  cusum_error_threshold=cusum_error_threshold,
                                  norm=norm,
                                  number_processes=number_processes,
                                  number_batches=number_batches,
                                  filter_be_failures=filter_be_failures,
                                  cold_threshold=cold_threshold,
                                  max_clusters=12 )

    # Dump pickled results.  save_scores is a boolean, so test its truth value;
    # comparing it against None saved the report even when it was False.
    if save_scores:
        # Saving is best effort: a failed write is reported, but the freshly
        # computed report is still used by the rest of the notebook.
        try:
            with open( score_report_path, "wb" ) as score_file:
                pickle.dump( score_report, score_file )
        except Exception as e:
            print("Failed to save comparison between {:s}/{:s} to file {:s} due to the following exception: \n {:}".format( reference_tag,
                                                                                                                             comparison_tag,
                                                                                                                             score_report_path,
                                                                                                                             e ))


In [None]:
# Printing precision is limited while plotting as, otherwise, labels are
# rendered with very long decimals.  This only matters if label_centers=True
# and can be fixed more thoroughly later.
with np.printoptions( precision=2 ):
    fig, ax = score_report.plot_deviations( thinning_ratio=1, label_centers=False )

fig.set_size_inches( 10, 10 )

# Optionally zoom in on the configured region of parameter space.
#
# NOTE(review): the configured "x" range is applied to the y axis and the "y"
#               range to the x axis.  This is kept as is because the ranges
#               may have been tuned against this behavior - confirm which
#               pairing is intended.
if set_deviation_graph_limits:
    axis_limits = ( ( ax.set_ylim3d, deviation_graph_x_range ),
                    ( ax.set_xlim3d, deviation_graph_y_range ),
                    ( ax.set_zlim3d, deviation_graph_z_range ) )

    for set_axis_limit, axis_range in axis_limits:
        set_axis_limit( axis_range )

plt.savefig( figures_dir_path + figure_filenames["DEVIATION CLUSTERS"], bbox_inches="tight" )

In [None]:
%matplotlib inline
ppNRMSE = np.array( list( score_report.per_particle_nrmse.values() ) )
fig = plt.figure()
fig.set_figwidth(10)
fig.set_figheight(10)
plt.minorticks_on()
plt.grid(color="black", alpha=0.1)
plt.xlabel("Per Particle NRMSE")
plt.ylabel("Particle Count")
plt.title("Per Partficle NRMSE Histogram for {:s} {:s} vs. {:s} - {:s}".format( simulation_name, reference_tag, comparison_tag, additional_description ) )
plt.hist( ppNRMSE, bins=100 )
plt.yscale( "log" )

plt.savefig( figures_dir_path + figure_filenames["PPNRMSE HISTOGRAM"], bbox_inches="tight" )
plt.close()

In [None]:
# Graph the trajectories of the best, a random, and the worst particle.
#
# The random particle is drawn from the particles that were scored.  This
# keeps it inside the scored subset and works whether the score report was
# computed or loaded from disk.
scored_particle_ids = list( score_report.per_particle_nrmse )
target_particle_ids = [min( score_report.per_particle_nrmse,
                            key=score_report.per_particle_nrmse.get ),
                       np.random.choice( scored_particle_ids, 1 )[0],
                       max( score_report.per_particle_nrmse,
                            key=score_report.per_particle_nrmse.get )]
figure_identifiers = ["BEST TRAJECTORY", "RANDOM TRAJECTORY", "WORST TRAJECTORY"]

particles_df = droplet_approximation.read_particles_data( particles_root,
                                                          target_particle_ids,
                                                          dirs_per_level,
                                                          cold_threshold=cold_threshold,
                                                          filter_be_failures=filter_be_failures,
                                                          evaluations=evaluations )

for figure_identifier, target_particle_id in zip( figure_identifiers, target_particle_ids ):
    particle_df = particles_df.loc[target_particle_id]
    background_parameters = {
        "Salt Mass (kg)": particle_df["salt masses"],
        "Air Temperature (K)": particle_df["air temperatures"],
        "Relative Humidity (%)": particle_df["relative humidities"],
        "Air Density (g/cm^3)": particle_df["air densities"]
    }
    fig, ax_h = droplet_approximation.plot_droplet_size_temperatures_scoring( particle_df, score_report, background_parameters=background_parameters)
    fig.set_figwidth( 14 )
    fig.set_figheight( 12 )

    # Rebuild the filename from everything before "pid=" rather than replacing
    # the PARTICLE_ID placeholder.  The placeholder only exists the first time
    # this cell runs, which previously left a stale particle id on re-runs.
    filename_prefix = figure_filenames[figure_identifier].split( "pid=" )[0]
    figure_filenames[figure_identifier] = "{:s}pid={:d}.png".format( filename_prefix, particle_df.name )

    plt.savefig( figures_dir_path + figure_filenames[figure_identifier], bbox_inches="tight" )
    plt.close()

# Collect the ids and NRMSE for the particles in each deviation cluster.
#
# NOTE(review): this assumes deviation_particle_ids and deviation_clusters are
#               parallel NumPy arrays in which each particle's deviations are
#               contiguous - confirm against ScoringReport.
deviation_particle_ids = score_report.deviation_particle_ids
deviation_clusters     = score_report.deviation_clusters
deviation_count        = len( deviation_particle_ids )

# Cluster indices start at zero.  Without any deviations there are no clusters.
cluster_count = np.max( deviation_clusters ) + 1 if deviation_count > 0 else 0

cluster_particle_ids        = [ [] for _ in range( cluster_count ) ]
cluster_ppNRMSEs            = [ [] for _ in range( cluster_count ) ]

# Positions where the particle id changes are the boundaries between particles.
change_points = np.where( deviation_particle_ids[1:] != deviation_particle_ids[:-1] )[0] + 1

# Each particle's deviations end at the next change point, and the last
# particle's end with the array itself.  That final end point has to be added
# explicitly, otherwise the last particle is never assigned to its clusters.
if deviation_count > 0:
    segment_ends = np.append( change_points, deviation_count )
else:
    segment_ends = change_points

start_index = 0
for count, end_index in enumerate( segment_ends ):
    particle_id       = deviation_particle_ids[start_index]
    particle_clusters = np.unique( deviation_clusters[start_index:end_index] )

    if count % 100000 == 0:
        print(f"At {count} out of {len( segment_ends )}")
    for cluster_index in particle_clusters:
        cluster_particle_ids[cluster_index].append( particle_id )
        cluster_ppNRMSEs[cluster_index].append( score_report.per_particle_nrmse[particle_id] )

    start_index = end_index

In [None]:
# Columns of the per-cluster substitution tables, in output order.
image_substitution_columns = ["CLUSTER PPNRMSE HISTOGRAM",
                              "CLUSTER TRAJECTORY 1",
                              "CLUSTER TRAJECTORY 2",
                              "CLUSTER TRAJECTORY 3"]
text_substitution_columns  = ["CLUSTER NAME",
                              "CLUSTER COUNT",
                              "CLUSTER CENTER",
                              "CLUSTER MEAN PPNRMSE",
                              "CLUSTER MEDIAN PPNRMSE"]
image_substitution_rows = []
text_substitution_rows  = []

# One trajectory is graphed for each "CLUSTER TRAJECTORY" column.
trajectories_per_cluster = 3

# Graph/Tabulate the results for each cluster
for cluster_index in range( cluster_count ):
    ppNRMSE        = np.array( cluster_ppNRMSEs[cluster_index] )
    particle_count = ppNRMSE.shape[0]

    # Figures that cannot be produced for this cluster are left blank.
    image_substitution_row = dict.fromkeys( image_substitution_columns, "" )
    text_substitution_row  = {
        "CLUSTER NAME": "Cluster {:d}".format( cluster_index ),
        "CLUSTER COUNT": particle_count,
        "CLUSTER CENTER": ["{:.2e}".format( cluster_center ) for cluster_center in score_report.cluster_centers[cluster_index]],
        "CLUSTER MEAN PPNRMSE": "{:.3e}".format( np.mean( ppNRMSE ) ) if particle_count > 0 else "nan",
        "CLUSTER MEDIAN PPNRMSE": "{:.3e}".format( np.median( ppNRMSE ) ) if particle_count > 0 else "nan"
    }

    image_substitution_rows.append( image_substitution_row )
    text_substitution_rows.append( text_substitution_row )

    # A cluster without particles has nothing to histogram or sample from, so
    # it is only tabulated.
    if particle_count == 0:
        continue

    fig = plt.figure()
    fig.set_figwidth(10)
    fig.set_figheight(10)
    plt.minorticks_on()
    plt.grid(color="black", alpha=0.1)
    plt.xlabel("Per Particle NRMSE")
    plt.ylabel("Particle Count")
    # Title the histogram with the simulation actually being analyzed.
    plt.title("Per Particle NRMSE Histogram for {:s} {:s} vs. {:s} - {:s}".format( simulation_name, reference_tag, comparison_tag, additional_description ) )
    # Square root rule for the number of bins.
    plt.hist( ppNRMSE, bins=int(np.ceil( np.sqrt( particle_count ) ) ) )
    plt.yscale( "log" )

    histogram_file_name = "{:s}cluster_{:d}_ppNRMSE_histogram.png".format( deviation_figures_dir_path, cluster_index )
    plt.savefig( histogram_file_name, bbox_inches="tight" )
    image_substitution_row["CLUSTER PPNRMSE HISTOGRAM"] = "cluster_{:d}_ppNRMSE_histogram.png".format( cluster_index )
    plt.close()

    # Sample distinct particles, and no more than the cluster contains, so
    # that the same trajectory is not graphed more than once.
    sample_size         = min( trajectories_per_cluster, particle_count )
    target_particle_ids = np.random.choice( cluster_particle_ids[cluster_index], sample_size, replace=False )
    particles_df = droplet_approximation.read_particles_data( particles_root,
                                                              target_particle_ids,
                                                              dirs_per_level,
                                                              cold_threshold=cold_threshold,
                                                              filter_be_failures=filter_be_failures,
                                                              evaluations=evaluations )
    for graph_index, target_particle_id in enumerate( target_particle_ids ):
        particle_df = particles_df.loc[target_particle_id]
        background_parameters = {
            "Salt Mass (kg)": particle_df["salt masses"],
            "Air Temperature (K)": particle_df["air temperatures"],
            "Relative Humidity (%)": particle_df["relative humidities"],
            "Air Density (g/cm^3)": particle_df["air densities"]
        }
        fig, ax_h = droplet_approximation.plot_droplet_size_temperatures_scoring( particle_df, score_report, background_parameters=background_parameters)

        fig.set_figwidth( 10 )
        fig.set_figheight( 10 )

        figure_file_name = "cluster_{:d}-trajectory_{:d}-pid={}.png".format( cluster_index, graph_index, particle_df.name )
        image_substitution_row["CLUSTER TRAJECTORY {:d}".format( graph_index + 1 )] = figure_file_name
        plt.savefig( deviation_figures_dir_path + figure_file_name, bbox_inches="tight" )

        plt.close()

# Build each table once from its rows.  This also produces an empty table with
# the expected columns when there are no clusters.
cluster_image_substitution_table = pd.DataFrame( image_substitution_rows, columns=image_substitution_columns )
cluster_text_substitution_table  = pd.DataFrame( text_substitution_rows, columns=text_substitution_columns )

    

# Output Substitution Tables
We create tables mapping each figure's name to its filename, and each text placeholder to its value. These tables enable a script to use the images and text to auto-populate a Google Slides presentation for this error analysis.

In [None]:

# Figure identifier -> figure filename, written without a header or an index.
image_substitution_table = pd.DataFrame( {
    "Figure Name":     list( figure_filenames.keys() ),
    "Figure Filename": list( figure_filenames.values() )
} )
image_substitution_table.to_csv( substitutions_dir_path + "image_substitution_table.csv", header=False, index=False )

# Summary statistics are taken across every scored particle.
ppNRMSE = np.array( list( score_report.per_particle_nrmse.values() ) )

comparison_summary = "{:s} vs. {:s}".format( reference_tag, comparison_tag )

# Text placeholder -> replacement text.
text_substitutions = {
    "TITLE": "Error Analysis of {:s} in {:s}".format( comparison_summary, simulation_name ),
    "DATE": date.today().strftime("%d/%m/%Y"),
    "SHA": current_SHA,
    "SUBTITLE": "{:s} - {:s}".format( comparison_summary, simulation_name ),
    "NRMSE": "{:.3e}".format( score_report.net_nrmse ),
    "MEAN PPNRMSE": "{:.3e}".format( np.mean( ppNRMSE ) ),
    "MEDIAN PPNRMSE": "{:.3e}".format( np.median( ppNRMSE ) )
}

# The column names are never written as the header is omitted from the file.
text_substitution_table = pd.DataFrame( {
    "Figure Name":     list( text_substitutions.keys() ),
    "Figure Filename": list( text_substitutions.values() )
} )
text_substitution_table.to_csv( substitutions_dir_path + "text_substitution_table.csv", header=False, index=False )

# The per-cluster tables keep their headers.
for cluster_table, cluster_table_filename in ( ( cluster_image_substitution_table, "cluster_image_substitution_table.csv" ),
                                               ( cluster_text_substitution_table,  "cluster_text_substitution_table.csv" ) ):
    cluster_table.to_csv( substitutions_dir_path + cluster_table_filename, index=False )