In [None]:
import heapq
import multiprocessing
import os

# Limit numpy to 1 thread so that
# we can parallelize the error analysis
# properly
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import matplotlib.pyplot as plt
from matplotlib import colors

import numpy as np
import torch
import pandas as pd

import pickle

from droplet_approximation import *

# Likewise limit PyTorch to 1 thread
torch.set_num_threads( 1 )
torch.set_num_interop_threads( 1 )


In [None]:
# Default configuration for the notebook.
#
# Do NOT edit this cell. Instead, make any changes you want in the cell below it by setting
# these variables.

# Commit for each model
current_SHA = "79a3442545133bfe38cecf9b67ab928538842b23"

# Change this to fit wherever testing data is stored. TODO: update this to match particles exploration
simulation_name = "Pi Chamber 1way RH103"
particles_root  = "/groups/drichte2/droplet_approximation/data/simulations/pi_chamber-1way-rh103/particles"
#particles_root  = "../data/particles"
dirs_per_level  = 256

# This controls how much of the data to load.
# The notebook will load every `subset_fraction`-th particle id,
# i.e. 1/subset_fraction of the overall dataset.
subset_fraction = 1

# Good CUSUM parameters for non-iterative, could probably be dialed in more
cusum_error_tolerance = np.array( [ 0.005, 0.05 ] )
cusum_error_threshold = np.array( [ 0.02, 0.20 ] )

norm = standard_norm

# How many processes/batches to run the analysis with.
# Leaving number_processes at 0 makes a later cell replace it with
# multiprocessing.cpu_count() - 1.
number_processes = 0
number_batches = 1


# Maximum number of deviation clusters to identify
max_clusters = 7

# This sets the x-y-z limits for the deviation cluster graph.
# Since there are some deviations at very far flung parts of
# parameter space, without explicitly setting these ranges,
# the deviations are all smooshed together on the graph.
set_deviation_graph_limits = False
deviation_graph_x_range    = ( -.5,2.0 )
deviation_graph_y_range    = ( -6.75, -2.50 )
deviation_graph_z_range    = ( 1.0,1.07 )

# How many of the worst particles to graph the trajectory of
number_graphs = 3

# The worst `number_graphs` particles will be picked
# only from particles with deviations from all of the listed
# clusters. If None, select any particle.
# If using strict_graph_cluster_filter, select particles
# with ONLY the specified deviations.

#deviation_graph_cluster_filter = np.array( [0,1,2] )
deviation_graph_cluster_filter = None
strict_graph_cluster_filter    = False


# Flag requesting that the scoring report be pickled once it has
# been computed. The destination path is assembled in the settings
# cell. Must not be enabled together with `load_scores`.
save_scores              = True

# Determines whether to load scores from a pickled file.
# If true, deviation analysis WILL NOT BE RUN. Instead,
# the notebook will load a previous deviation analysis
# from the pickled score path.
load_scores              = False

# Filtering options forwarded to the scoring and particle loading
# calls. A cold_threshold of -inf is treated as "no threshold" when
# the run description is assembled.
filter_be_failures = False
cold_threshold     = -np.inf

# Single-entry dictionaries; the key of each is used as the tag that
# labels the evaluation in file names and figure titles.
reference_evaluation  = {"be": ""}
comparison_evaluation = {"bdf iterative": "bdf_iterative"}

additional_description = ""

In [None]:
# Edit settings here:

# Each evaluation is labelled by the first key of its dictionary.
reference_tag  = next( iter( reference_evaluation ) )
comparison_tag = next( iter( comparison_evaluation ) )

#filter_be_failures = True
#cold_threshold     = 284.0

# Describe every non-default filter that is active.
# Note- this affects the save location default!
descriptors = []
if filter_be_failures:
    descriptors += [ "BE Failure Filtered" ]
if cold_threshold != -np.inf:
    descriptors += [ "Cold_Threshold={:.1f}K".format( cold_threshold ) ]
additional_description = ", ".join( descriptors )

# Assemble the pickle location: directory + lower-cased, space-free file name.
score_report_dir = "/groups/drichte2/droplet_approximation/data/analysis/tmp/"

if additional_description != "":
    score_report_suffix = "-{:s}".format(
        additional_description.lower().replace( ", ", "-" ).replace( " ", "_" ) )
else:
    score_report_suffix = ""

score_report_filename = "scoring_report-{:s}-{:s}_vs_{:s}{:s}.pkl".format(
    simulation_name.replace(" ", "_"),
    reference_tag,
    comparison_tag,
    score_report_suffix )
score_report_filename = score_report_filename.lower().replace(" ", "_")

pickled_score_path = f"{score_report_dir}{score_report_filename}"
save_scores        = True
load_scores        = False

number_batches = 10

#iterative = True
# Good CUSUM parameters for iterative, could probably be dialed in more
#cusum_error_tolerance = np.array( [ 0.02, 0.10 ] )
#cusum_error_threshold = np.array( [ 0.08, 0.40 ] )
# You might want more clusters to be identified for iterative as well. This number has not been tuned thoroughly
#max_clusters = 12

#set_deviation_graph_limits = True
#deviation_graph_x_range    = ( -.5,2.0 )
#deviation_graph_y_range    = ( -6.75, -2.50 )
#deviation_graph_z_range    = ( 1.0,1.07 )

# This will select the worst particles deviations from ONLY clusters 1 and 3
#strict_graph_cluster_filter    = True
#deviation_graph_cluster_filter = np.array( [1, 3] )

# Display the resulting file name as the cell output.
score_report_filename

In [None]:
# Saving and loading the report in the same run is contradictory:
# stop before any analysis starts.
if load_scores and save_scores:
    raise Exception( "save_scores and load_scores set to True! Set at least one to false to continue." )

In [None]:
# A value of 0 means "choose automatically": use all but one core,
# but never fewer than one worker (cpu_count() - 1 is 0 on a
# single-core machine).
if number_processes == 0:
    number_processes = max( 1, multiprocessing.cpu_count() - 1 )

# Display the resolved worker count as the cell output.
number_processes

In [None]:
# Load or calculate the score report.
if load_scores:
    # NOTE(security): unpickling can execute arbitrary code. Only load
    # reports from a trusted source.
    try:
        with open( pickled_score_path, "rb" ) as score_file:
            score_report = pickle.load( score_file )
    except Exception as e:
        print(f"Failed to open {pickled_score_path}. Encountered error:\n {e}")
        # Without a report every later cell would fail with a confusing
        # NameError, so surface the original error instead.
        raise
else:
    # Score every `subset_fraction`-th particle listed in the index file.
    ids_index = np.fromfile( particles_root + "/particles.index", dtype=np.int32 )
    filtered_ids = ids_index[::subset_fraction]

    score_report = ScoringReport( particles_root,
                                  filtered_ids,
                                  dirs_per_level,
                                  reference_evaluation,
                                  comparison_evaluation,
                                  cusum_error_tolerance=cusum_error_tolerance,
                                  cusum_error_threshold=cusum_error_threshold,
                                  norm=norm,
                                  number_processes=number_processes,
                                  number_batches=number_batches,
                                  filter_be_failures=filter_be_failures,
                                  cold_threshold=cold_threshold,
                                  # Honor the configured cluster limit rather
                                  # than a hard-coded value.
                                  max_clusters=max_clusters )

    # Dump pickled results. Saving is best-effort: a failure is reported
    # but does not discard the report that was just computed.
    if save_scores and pickled_score_path is not None:
        try:
            with open( pickled_score_path, "wb" ) as score_file:
                # Write the report itself (not the path string).
                pickle.dump( score_report, score_file )
        except Exception as e:
            print("Failed to save comparison between {:s}/{:s} to file {:s} due to the following exception: \n {:}".format( reference_tag,
                                                                                                                             comparison_tag,
                                                                                                                             pickled_score_path,
                                                                                                                             e ))


In [None]:
# We set precision to 2 because otherwise everything is labeled
# with very long decimals. We can fix this more thoroughly later
# This only matters if label_centers=True
with np.printoptions( precision=2 ):
    fig,ax = score_report.plot_deviations(label_centers=False, thinning_ratio=10)

fig.set_size_inches( ( 8,12 ) )
if set_deviation_graph_limits:
    # Apply each configured range to the axis it is named after
    # (the x and y ranges were previously applied to each other's axis).
    ax.set_xlim3d( deviation_graph_x_range )
    ax.set_ylim3d( deviation_graph_y_range )
    ax.set_zlim3d( deviation_graph_z_range )

In [None]:
# Gather the per-particle NRMSE values into an array and print the
# overall, mean and median scores for this comparison.
ppNRMSE = np.array( [ *score_report.per_particle_nrmse.values() ] )

nrmse_summary_lines = [
    f"For comparison {reference_tag}/{comparison_tag}:\n",
    f"Overall NRMSE: {score_report.net_nrmse}\n",
    f"Mean Per Particle NRMSE: {np.mean( ppNRMSE )}\n",
    f"Median Per Particle NRMSE: {np.median( ppNRMSE )}\n",
]
print( "".join( nrmse_summary_lines ) )

In [None]:
# Select the worst particles based on NRMSE and the optional cluster
# filter, then load the corresponding particle DataFrame.

if deviation_graph_cluster_filter is None:
    target_particle_ids = heapq.nlargest( number_graphs, score_report.per_particle_nrmse,
                                          key=score_report.per_particle_nrmse.get )
else:
    # Iterates over each particle's deviations to see if they contain the deviations
    # in the cluster filter. Deviations belonging to one particle are taken to be
    # stored contiguously, so a change of id marks the start of the next particle.
    deviation_particle_ids      = score_report.deviation_particle_ids
    change_points               = np.array( np.where( deviation_particle_ids[1:] != deviation_particle_ids[:-1] )[0] ) + 1
    # Close the final run as well; otherwise the last particle (or the only
    # particle, when there are no change points) would never be examined.
    run_end_indices             = np.append( change_points, len( deviation_particle_ids ) )
    filtered_per_particle_nrmse = {}

    start_index = 0
    for end_index in run_end_indices:
        # An empty run only occurs when there are no deviations at all.
        if end_index == start_index:
            continue

        particle_id       = deviation_particle_ids[start_index]
        particle_clusters = score_report.deviation_clusters[start_index:end_index]
        start_index       = end_index

        has_all_filter_clusters  = np.all( np.isin( deviation_graph_cluster_filter, particle_clusters ) )
        has_only_filter_clusters = np.all( np.isin( particle_clusters, deviation_graph_cluster_filter ) )

        # With strict filtering, particles with additional deviations are skipped.
        if has_all_filter_clusters and ( has_only_filter_clusters or not strict_graph_cluster_filter ):
            filtered_per_particle_nrmse[particle_id] = score_report.per_particle_nrmse[particle_id]

    target_particle_ids = heapq.nlargest( number_graphs, filtered_per_particle_nrmse,
        key=filtered_per_particle_nrmse.get )

#ids_index = np.fromfile( particles_root + "/particles.index", dtype=np.int32 )
#target_particle_ids = np.random.choice( ids_index, 3 )

# Load the comparison evaluation that was configured for scoring rather
# than a hard-coded one, so the plots match the scored data.
df = read_particles_data( particles_root, target_particle_ids, dirs_per_level, evaluations=comparison_evaluation, cold_threshold=cold_threshold )
df

In [None]:
# Histogram of the per-particle NRMSE values on a logarithmic count axis.
hist_fig = plt.figure()
hist_ax  = hist_fig.gca()

hist_ax.hist( ppNRMSE )
hist_ax.set_yscale( "log" )

hist_ax.set_title( "Per Particle NRMSE Histogram for {:s} vs. {:s} {:s}".format( reference_tag, comparison_tag, additional_description ) )
hist_ax.set_xlabel( "Per Particle NRMSE" )
hist_ax.set_ylabel( "Particle Count" )

hist_ax.minorticks_on()
hist_ax.grid( color="black", alpha=0.1 )

plt.show()

In [None]:
# For each selected particle, plot the reference evaluation's outputs
# against the comparison evaluation's outputs.
# The deviation-highlighting and CUSUM figures are currently disabled
# (see the TODO further down in this loop).

# This is the same colormap as in `scoring.py` for plotting deviations
colormap = plt.get_cmap("tab20")

for particle_id in target_particle_ids:
    particle_df      = df.loc[particle_id]
    input_parameters = np.stack( particle_df[[
        "input be radii",
        "input be temperatures",
        "salt masses",
        "air temperatures",
        "relative humidities",
        "air densities",
        "integration times"
    ]].to_numpy(), axis=-1 )

    # Column names are derived from the configured tags so that the
    # plotted data always matches the labels used in the title.
    reference_output  = np.stack( particle_df[["output {:s} radii".format( reference_tag ),
                                               "output {:s} temperatures".format( reference_tag )]].to_numpy(),
                                  axis=-1 )

    comparison_output = np.stack( particle_df[["output {:s} radii".format( comparison_tag ),
                                               "output {:s} temperatures".format( comparison_tag )]].to_numpy(),
                                  axis=-1 )
    times = particle_df["times"]

    if filter_be_failures:
        # Keep only the steps whose BE status is 0.
        be_mask           = particle_df["be statuses"] == 0
        input_parameters  = input_parameters[be_mask, :]
        reference_output  = reference_output[be_mask, :]
        comparison_output = comparison_output[be_mask, :]
        times             = times[be_mask]

    fig, ax_h = analyze_model_particle_performance( times, reference_output, comparison_output )

    figure_title = "Comparison between {:s}/{:s} with {:s} \n On particle {:d} from trace {:s}\n ppNRMSE: {:.4f} \n SHA: {:s}".format(
        reference_tag,
        comparison_tag,
        additional_description,
        particle_id,
        simulation_name,
        score_report.per_particle_nrmse[ particle_id ],
        current_SHA )
    fig.suptitle( figure_title )
    
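    # TODO: reintroduce deviation graphing. The string literal below is disabled
    # legacy code kept for reference; it uses names (e.g. model_count, score_reports,
    # model_names, commit_SHAs) that are not defined in the visible cells of this
    # notebook and must be adapted before it is re-enabled.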
    """
    be_mask             = be_success_mask( input_parameters[:, 0] )
    actual_particle_times = np.delete( np.cumsum( np.insert( input_parameters[:, -1],
                                                             0,
                                                             0.0 )[:-1] ),
                                      ~be_mask ) + particle_df["birth time"]
    input_parameters = input_parameters[be_mask]
    times            = np.insert( np.cumsum( input_parameters[:, -1] ), 0, 0.0 )[:-1] + particle_df["birth time"]


    model_output = np.stack(particle_df[["output test_bdf radii", "output test_bdf temperatures"]].to_numpy(), axis=-1)[be_mask]
    be_output  = np.stack(particle_df[["output be radii", "output be temperatures"]].to_numpy(), axis=-1)[be_mask]

    model_outputs = [model_output]
    
    print(len(model_outputs[0][:-1]))
    print(len(be_output))

    for model_index in range( model_count ): 
        fig_h_be, ax_h_be = analyze_model_particle_performance(
            times,
            input_parameters[:, :2], 
            model_outputs[model_index],
            norm
        )

        fig_h_be.suptitle( f"Droplet trajectory overview for particle {particle_id} on model {model_names[model_index]} vs. BE\n SHA: {commit_SHAs[model_index]}" ) 

        fig_h_cusum, ax_h_cusum = plt.subplots( 2, 2, figsize=(9,8) )
        fig_h_cusum.suptitle( f"Droplet trajectory overview part 2 for particle {particle_id} on model { model_names[model_index]}\n SHA: {commit_SHAs[model_index] }" ) 

        #model_cusum = np.abs( calculate_cusum( ( normed_be_output - normed_model_outputs[model_index] ).T, cusum_error_tolerance ) )

        #ax_h_cusum[0][0].set_title("Radius CUSUM chart") 
        #ax_h_cusum[0][0].plot( times, model_cusum[0].T, label=["positive radius cusum", "negative radius cusum"] )
        #ax_h_cusum[0][0].set_xlabel( "time (s)" )
        #ax_h_cusum[0][0].axhline( y=cusum_error_threshold[0], color="red",linewidth=1, linestyle="--",label="cusum divergence threshold" )

        #ax_h_cusum[0][0].set_ylabel("CUSUM")

        #ax_h_cusum[0][1].plot( times, particle_df["relative humidities"][be_mask] )
        #ax_h_cusum[0][1].set_title( "RH versus time for Particle " + str( model_index ) )
        #ax_h_cusum[0][1].set_xlabel( "time (s)" )
        #ax_h_cusum[0][1].set_ylabel( "Relative Humidity (%)" ) 


        #ax_h_cusum[1][0].plot( times, particle_df["air temperatures"][be_mask] - particle_df["input temperatures"][be_mask] )
        #ax_h_cusum[1][0].set_title( f"Temperature Difference for Particle {particle_id}" )
        #ax_h_cusum[1][0].set_xlabel( "time (s)" )
        #ax_h_cusum[1][0].set_ylabel( "Air Temperature (K)" ) 

        #ax_h_cusum[1][1].plot( times, particle_df["air temperatures"][be_mask] )
        #ax_h_cusum[1][1].set_title( f"Air Temperatures for Particle {particle_id}")
        #ax_h_cusum[1][1].set_xlabel( "time (s)" )
        #ax_h_cusum[1][1].set_ylabel( "time step (s)" ) 

        for k, deviation_index in enumerate( np.where( score_reports[model_index].deviation_particle_ids == particle_id )[0] ):
            deviation_parameter = score_reports[model_index].deviation_parameters[deviation_index]
            deviation_time      = score_reports[model_index].deviation_times[deviation_index]
            deviation_cluster   = score_reports[model_index].deviation_clusters[deviation_index]

            line_label = f"{deviation_parameter.name.lower()} deviation, cluster {deviation_cluster}"

            if deviation_parameter == DeviationParameter.RADIUS:
                ax_h_be[0][0].axvline( x=deviation_time,linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_be[1][0].axvline( x=deviation_time,linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                
                ax_h_cusum[0][0].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_cusum[0][1].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_cusum[1][0].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_cusum[1][1].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
            else:
                ax_h_be[0][1].axvline( x=deviation_time,linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_be[1][1].axvline( x=deviation_time,linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                
                ax_h_cusum[0][1].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_cusum[1][0].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )
                ax_h_cusum[1][1].axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )

        ax_h_cusum[0][0].legend()
        ax_h_cusum[0][1].legend()
        ax_h_cusum[1][0].legend()
        ax_h_cusum[1][1].legend()

        ax_h_be[0][0].legend()
        ax_h_be[0][1].legend()
        ax_h_be[1][0].legend()
        ax_h_be[1][1].legend()
        
        fig_h_be.tight_layout()
        fig_h_cusum.tight_layout()

        fig_h_be.show()
        fig_h_cusum.show()
    """
