In [None]:
import heapq
import multiprocessing
import os

# Limit numpy to 1 thread so that
# we can parallelize the error analysis
# properly. These variables are set before numpy and torch
# are imported below so they are already in place when
# those libraries load.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import matplotlib.pyplot as plt
from matplotlib import colors

import numpy as np
import torch
import pandas as pd

import pickle

from droplet_approximation import *

# Likewise limit PyTorch to 1 thread
torch.set_num_threads( 1 )
torch.set_num_interop_threads( 1 )


In [None]:
# Do NOT edit this cell. Instead, make any changes you want in the cell below it by setting
# these variables.

# This model corresponds to "Model Box Uncoupled 400M l1 Residual 14 Epochs" in the group drive
model_load_paths = ["../models/network_box_uncoupled_400M_l1_residual_epoch_14.pt"]
# Display name per model: the file name without its directory, cut at the
# first ".", with underscores replaced by spaces.
model_names      = [ path.split( "/" )[-1].split( "." )[0].replace( "_", " " ) for path in model_load_paths ]

# This gets passed to `set_parameter_ranges` in the notebook and in
# the analysis processes.
parameter_ranges = None

# Commit for each model (one entry per model, shown in the figure titles)
commit_SHAs = ["369bacc0ba5c7367f17ec71707a1df64afd1b6f5"]

# Change this to fit wherever testing data is stored
particles_root  = "/afs/crc.nd.edu/group/RichterLab/droplet_approximation/data/simulations/pi_chamber-rh103-1way/particles"
dirs_per_level  = 256

# This controls how much of the data to load.
# The notebook will load subset_fraction / dirs_per_level
# of the overall dataset.
subset_fraction = 256

# Controls whether to do the graphing/analysis with iterative or direct inference
iterative = False

# Good CUSUM parameters for non-iterative, could probably be dialed in more
cusum_error_tolerance = np.array( [ 0.005, 0.05 ] )
cusum_error_threshold = np.array( [ 0.02, 0.20 ] )

norm = standard_norm

# How many processes/batches to run the analysis with.
# A number_processes of 0 is replaced by multiprocessing.cpu_count() - 1
# in a later cell.
number_processes = 0
number_batches = 1


# Maximum number of deviation clusters to identify
max_clusters = 7

# This sets the x-y-z limits for the deviation cluster graph.
# Since there are some deviations at very far flung parts of
# parameter space, without explicitly setting these ranges,
# the deviations are all smooshed together on the graph.
set_deviation_graph_limits = False
deviation_graph_x_range    = ( -.5,2.0 )
deviation_graph_y_range    = ( -6.75, -2.50 )
deviation_graph_z_range    = ( 1.0,1.07 )

# How many of the worst particles to graph the trajectory of
number_graphs = 3

# The worst `number_graphs` particles will be picked
# only from particles with deviations from all of the listed
# clusters. If None, select any particle.
# If using strict_graph_cluster_filter, select particles
# with ONLY the specified deviations.

#deviation_graph_cluster_filter = np.array( [0,1,2] )
deviation_graph_cluster_filter = None
strict_graph_cluster_filter    = False


# This array determines where to pickle the analysis reports
# to. Does not save if `None`. There must be one path for each
# model.
save_scores              = True
pickled_score_save_paths = [ None for _ in range( len( model_load_paths ) ) ]

# Determines whether to load scores from pickled files.
# If true, deviation analysis WILL NOT BE RUN. Instead,
# the notebook will load previous deviation analysis files
# from the supplied paths.
# There must be one pickled score path for each model.
load_scores              = False
pickled_score_load_paths = [ "PATH_TO_PICKLED_SCORE.pkl" for _ in range( len( model_load_paths ) ) ]

In [None]:
# Edit settings here:

# These ranges correspond to "Model Box Uncoupled 400M l1 Residual 14 Epochs" in the group drive
parameter_ranges = {
    "radius": ( -6.75, -3.00 ),
    "relative_humidity": ( 0.98, 1.11 )
}

score_report_dir = "/afs/crc.nd.edu/group/RichterLab/droplet_approximation/data/analysis/tmp/"

# If you want to load less of the data, set the following to what fraction out of 256 you want loaded.
# This allows for quick testing with the notebook without rerunning the entire dataset.
#subset_fraction = 5

#iterative = True
# Good CUSUM parameters for iterative, could probably be dialed in more
#cusum_error_tolerance = np.array( [ 0.02, 0.10 ] )
#cusum_error_threshold = np.array( [ 0.08, 0.40 ] )
# You might want more clusters to be identified for iterative as well. This number has not been tuned thoroughly
#max_clusters = 12

#set_deviation_graph_limits = True
#deviation_graph_x_range    = ( -.5,2.0 )
#deviation_graph_y_range    = ( -6.75, -2.50 )
#deviation_graph_z_range    = ( 1.0,1.07 )

# General format for these files will be "(score_report_dir)(model_name)_(iterative if iterative)_scoring_report.pkl"
# The replacement fields use single quotes on purpose: reusing the enclosing
# double quote inside an f-string is a SyntaxError before Python 3.12.
#save_scores              = True
#pickled_score_save_paths = [ f"{score_report_dir}{model_name.replace( ' ', '_' )}{'_iterative' if iterative else ''}_scoring_report.pkl" for model_name in model_names ]
load_scores              = True
pickled_score_load_paths = [ f"{score_report_dir}{model_name.replace( ' ', '_' )}{'_iterative' if iterative else ''}_scoring_report.pkl" for model_name in model_names ]

# This will select the worst particles deviations from ONLY clusters 1 and 3
#strict_graph_cluster_filter    = True
#deviation_graph_cluster_filter = np.array( [1, 3] )

In [None]:
# A number_processes of 0 means "choose automatically": leave one core free,
# but never drop below a single worker (cpu_count() - 1 is 0 on a one-core host).
if number_processes == 0:
    number_processes = max( 1, multiprocessing.cpu_count() - 1 )

In [None]:
# This is to make it work with the current model. Remove if using a new model.
# Passes on the `parameter_ranges` chosen in the settings cells above
# (None unless it was overridden there).
set_parameter_ranges( parameter_ranges )

In [None]:
# Build one network per checkpoint path and load its saved weights.
model_count = len( model_load_paths )

model_list  = [ ResidualNet() for i in range( model_count ) ]

# map_location="cpu" because every inference call in this notebook runs on
# the CPU; without it a checkpoint saved from a GPU fails to load on a
# machine that has no GPU.
for i in range( model_count ):
    model_list[i].load_state_dict( torch.load( model_load_paths[i], map_location="cpu" ) )

In [None]:
# Load or calculate score_reports
if load_scores:
    score_reports = []
    for load_path in pickled_score_load_paths:
        try:
            # NOTE: unpickling can execute arbitrary code. Only point these
            # paths at reports written by this notebook or another trusted source.
            with open( load_path, "rb" ) as score_file:
                score_reports.append( pickle.load( score_file ) )
        except Exception as e:
            print(f"Failed to open {load_path}. Encountered error:\n {e}")
            # Stop here: skipping a report would leave score_reports shorter
            # than model_names, so later cells would index the wrong model
            # (or fail with an unrelated IndexError).
            raise
else:
    # Extract subset_fraction/256 of the particles and score
    # NOTE(review): 256 is hard-coded in the filter below while dirs_per_level
    # is configurable -- confirm whether the two are meant to be the same value.
    ids_index = np.fromfile( particles_root + "/particles.index", dtype=np.int32 )
    filtered_ids = ids_index[ ( ( ids_index // 256 ) % 256 < subset_fraction ) ]

    # max_clusters comes from the settings cells instead of a fixed 12 so the
    # configured value is actually honored.
    score_reports = [ ScoringReport( particles_root,
                                     filtered_ids,
                                     dirs_per_level,
                                     model_list[model_i],
                                     model_names[model_i],
                                     "cpu",
                                     cusum_error_tolerance=cusum_error_tolerance,
                                     cusum_error_threshold=cusum_error_threshold,
                                     iterative=iterative,
                                     norm=norm,
                                     number_processes=number_processes,
                                     number_batches=number_batches,
                                     max_clusters=max_clusters,
                                     parameter_ranges=parameter_ranges )
                      for model_i in range ( model_count ) ]

    # Dump pickled results. Saving is skipped entirely when save_scores is
    # False, and per model when its path is None.
    if save_scores:
        for model_index, save_path in enumerate( pickled_score_save_paths ):
            if save_path is None:
                continue
            # The try is per model so one failed write does not prevent the
            # remaining reports from being saved.
            try:
                with open( save_path, "wb" ) as score_file:
                    pickle.dump( score_reports[model_index], score_file )
            except Exception as e:
                print(f"Failed to save model {model_names[model_index]} to file {save_path} due to the following exception: \n {e}")


In [None]:
# Draw the deviation cluster graph of every score report.
for score_report in score_reports:
    # Two-digit precision keeps the labels from being printed as very long
    # decimals. We can fix this more thoroughly later.
    with np.printoptions( precision=2 ):
        fig, ax = score_report.plot_deviations( thinning_ratio=1, label_centers=False )

    fig.set_size_inches( 8, 12 )
    if not set_deviation_graph_limits:
        continue

    # NOTE(review): the configured x range is applied to the y axis and the
    # y range to the x axis. Kept exactly as found because the axis layout of
    # plot_deviations is not visible here -- confirm the swap is intentional.
    ax.set_ylim3d( deviation_graph_x_range )
    ax.set_xlim3d( deviation_graph_y_range )
    ax.set_zlim3d( deviation_graph_z_range )

In [None]:
# Overall NRMSE of each model, in the same order as `model_names`.
# Written as the cell's final expression so the notebook displays it.
[ score_report.net_nrmse for score_report in score_reports ]

In [None]:
# Small integer test sequence, written as (value, repeat count) pairs.
# NOTE(review): no other visible cell reads `arr` -- it looks like leftover
# scratch data; confirm and remove if so.
arr = np.repeat( [ 1, 2, 3, 5, -2, 1, 2, 1 ],
                 [ 4, 3, 2, 5,  3, 1, 2, 2 ] )

In [None]:
# Per-particle NRMSE values of the FIRST score report only, as a flat array.
ppNRMSE = np.array( [ *score_reports[0].per_particle_nrmse.values() ] )

In [None]:
# Print the NRMSE summary of every model. The per-particle statistics are
# computed from each model's own report so that, with several models, they
# are not all reported from the first one.
for model_index, score_report in enumerate( score_reports ):
    model_per_particle_nrmse = np.array( list( score_report.per_particle_nrmse.values() ) )

    print( f"For model {model_names[model_index]}:\n"
         + f"Overall NRMSE: {score_report.net_nrmse}\n"
         + f"Mean Per Particle NRMSE: {np.mean( model_per_particle_nrmse )}\n"
         + f"Median Per Particle NRMSE: {np.median( model_per_particle_nrmse )}\n" )

In [None]:
# Load corresponding particle df
# Select the worst particles based on NRMSE (taken from the first score report)

if deviation_graph_cluster_filter is None:
    target_particle_ids = heapq.nlargest( number_graphs, score_reports[0].per_particle_nrmse,
                                          key=score_reports[0].per_particle_nrmse.get )
else:
    # Iterates over each particle's deviations to see if they contain the deviations
    # in the cluster filter. Relies on each particle's deviations being stored
    # as one contiguous run in deviation_particle_ids.
    deviation_particle_ids      = np.asarray( score_reports[0].deviation_particle_ids )
    deviation_clusters          = np.asarray( score_reports[0].deviation_clusters )
    filtered_per_particle_nrmse = {}

    # Indices at which the particle id changes, i.e. where a new run starts.
    change_points = np.flatnonzero( deviation_particle_ids[1:] != deviation_particle_ids[:-1] ) + 1

    # Run boundaries include both 0 and the array length so the final
    # particle's run is examined too (it has no change point after it).
    if deviation_particle_ids.size > 0:
        run_edges = np.concatenate( ( [0], change_points, [deviation_particle_ids.size] ) )
    else:
        # No deviations at all: nothing to filter.
        run_edges = np.array( [0] )

    for start_index, end_index in zip( run_edges[:-1], run_edges[1:] ):
        particle_id       = deviation_particle_ids[start_index]
        particle_clusters = deviation_clusters[start_index:end_index]

        # The particle must have a deviation in every requested cluster.
        if not np.all( np.isin( deviation_graph_cluster_filter, particle_clusters ) ):
            continue

        # If strict filtering is on and there are additional deviations
        # continue without adding the particle to the list.
        if strict_graph_cluster_filter and not np.all( np.isin( particle_clusters, deviation_graph_cluster_filter ) ):
            continue

        filtered_per_particle_nrmse[particle_id] = score_reports[0].per_particle_nrmse[particle_id]

    target_particle_ids = heapq.nlargest( number_graphs, filtered_per_particle_nrmse,
        key=filtered_per_particle_nrmse.get )


# A bare expression in the middle of a cell is not displayed, so print the ids.
print( target_particle_ids )

df = read_particles_data( particles_root, target_particle_ids, dirs_per_level )
df

In [None]:
# For each particle, each model will yield 3 figures.
# The first will be the figure against BDF
# The second will be the figure against BE (with deviations highlighted)
# The third will be the figure's CUSUM analysis (calculated within the notebook)

# This is the same colormap as in `scoring.py` for plotting deviations
colormap = colors.ListedColormap( ["red", "blue", "green", "orange", "black", "yellow", "teal", "gold", "magenta", "lightgreen", "navy", "dimgray", "lightcoral"] )

for particle_id in target_particle_ids:
    particle_df      = df.loc[particle_id]
    input_parameters = np.stack( particle_df[[
        "input radii",
        "input temperatures",
        "salt masses",
        "air temperatures",
        "relative humidities",
        "air densities",
        "integration times"
    ]].to_numpy(), axis=-1 )

    be_mask             = be_success_mask( input_parameters[:, 0] )
    # Start time of every step that passes be_mask, measured on the unfiltered
    # timeline. NOTE(review): not read anywhere else in this cell -- confirm
    # whether it is still needed.
    actual_particle_times = np.delete( np.cumsum( np.insert( input_parameters[:, -1],
                                                             0,
                                                             0.0 )[:-1] ),
                                      ~be_mask ) + particle_df["birth time"]
    # From here on only the steps selected by be_mask are used; `times` is the
    # start time of each kept step, accumulated from the kept integration times.
    input_parameters = input_parameters[be_mask]
    times            = np.insert( np.cumsum( input_parameters[:, -1] ), 0, 0.0 )[:-1] + particle_df["birth time"]

    if iterative:
        model_outputs = [ do_iterative_inference(
                                input_parameters[:, :-1],
                                times,
                                model_list[model_index],
                                "cpu"
                            ) for model_index in range( model_count ) ]
    else:
        # Direct inference: drop the final prediction and prepend the first
        # input radius/temperature so the outputs line up with `times`.
        model_outputs = [ np.insert(
                                    do_inference(
                                        input_parameters[:, :-1],
                                        input_parameters[:, -1],
                                        model_list[model_index],
                                        "cpu"
                                    )[:-1, :],
                                    0,
                                    input_parameters[0, :2],
                                    axis=0
                                ) for model_index in range( model_count ) ]


    bdf_output = do_iterative_bdf(
        input_parameters[:, :-1],
        times
    )
    # The first two input columns (radius, temperature) serve as the BE reference.
    be_output  = input_parameters[:, :2]

    normed_model_outputs = [ norm( model_output ) for model_output in model_outputs ]
    normed_bdf_output    = norm( bdf_output )
    normed_be_output     = norm( be_output )

    for model_index in range( model_count ):
        fig_h_bdf, ax_h_bdf = analyze_model_particle_performance(
            times,
            bdf_output,
            model_outputs[model_index],
            norm
        )

        fig_h_bdf.suptitle( f"Droplet trajectory overview for particle {particle_id} on model {model_names[model_index]} vs. BDF\n SHA: {commit_SHAs[model_index]}" )

        fig_h_be, ax_h_be = analyze_model_particle_performance(
            times,
            be_output,
            model_outputs[model_index],
            norm
        )

        fig_h_be.suptitle( f"Droplet trajectory overview for particle {particle_id} on model {model_names[model_index]} vs. BE\n SHA: {commit_SHAs[model_index]}" )

        fig_h_cusum, ax_h_cusum = plt.subplots( 2, 2, figsize=(9,8) )
        fig_h_cusum.suptitle( f"Droplet trajectory overview part 2 for particle {particle_id} on model { model_names[model_index]}\n SHA: {commit_SHAs[model_index] }" )

        model_cusum = np.abs( calculate_cusum( ( normed_be_output - normed_model_outputs[model_index] ).T, cusum_error_tolerance ) )

        # Top left: radius CUSUM with its divergence threshold.
        ax_h_cusum[0][0].set_title("Radius CUSUM chart")
        ax_h_cusum[0][0].plot( times, model_cusum[0].T, label=["positive radius cusum", "negative radius cusum"] )
        ax_h_cusum[0][0].set_xlabel( "time (s)" )
        ax_h_cusum[0][0].axhline( y=cusum_error_threshold[0], color="red",linewidth=1, linestyle="--",label="cusum divergence threshold" )

        ax_h_cusum[0][0].set_ylabel("CUSUM")

        # Top right: relative humidity. The title names the particle, not the
        # model index.
        ax_h_cusum[0][1].plot( times, particle_df["relative humidities"][be_mask] )
        ax_h_cusum[0][1].set_title( f"RH versus time for Particle {particle_id}" )
        ax_h_cusum[0][1].set_xlabel( "time (s)" )
        # NOTE(review): confirm the plotted values really are percentages.
        ax_h_cusum[0][1].set_ylabel( "Relative Humidity (%)" )


        # Bottom left: air temperature minus the particle's input temperature.
        ax_h_cusum[1][0].plot( times, particle_df["air temperatures"][be_mask] - particle_df["input temperatures"][be_mask] )
        ax_h_cusum[1][0].set_title( f"Temperature Difference for Particle {particle_id}" )
        ax_h_cusum[1][0].set_xlabel( "time (s)" )
        ax_h_cusum[1][0].set_ylabel( "Temperature Difference (K)" )

        # Bottom right: air temperature.
        ax_h_cusum[1][1].plot( times, particle_df["air temperatures"][be_mask] )
        ax_h_cusum[1][1].set_title( f"Air Temperatures for Particle {particle_id}")
        ax_h_cusum[1][1].set_xlabel( "time (s)" )
        ax_h_cusum[1][1].set_ylabel( "Air Temperature (K)" )

        # Mark every deviation recorded for this particle with a dashed
        # vertical line colored by its cluster.
        score_report = score_reports[model_index]
        for deviation_index in np.where( score_report.deviation_particle_ids == particle_id )[0]:
            deviation_parameter = score_report.deviation_parameters[deviation_index]
            deviation_time      = score_report.deviation_times[deviation_index]
            deviation_cluster   = score_report.deviation_clusters[deviation_index]

            line_label = f"{deviation_parameter.name.lower()} deviation, cluster {deviation_cluster}"

            if deviation_parameter == DeviationParameter.RADIUS:
                # Radius deviations: left column of the BE figure and all
                # four CUSUM panels.
                marked_axes = [ ax_h_be[0][0], ax_h_be[1][0],
                                ax_h_cusum[0][0], ax_h_cusum[0][1],
                                ax_h_cusum[1][0], ax_h_cusum[1][1] ]
            else:
                # Other deviations: right column of the BE figure and every
                # CUSUM panel except the radius CUSUM chart.
                marked_axes = [ ax_h_be[0][1], ax_h_be[1][1],
                                ax_h_cusum[0][1],
                                ax_h_cusum[1][0], ax_h_cusum[1][1] ]

            for marked_axis in marked_axes:
                marked_axis.axvline( x=deviation_time, linewidth=1, linestyle="--", label=line_label, color=colormap( deviation_cluster ) )

        # Legends on all four panels of the CUSUM and BE figures.
        for axes_grid in ( ax_h_cusum, ax_h_be ):
            for row in range( 2 ):
                for column in range( 2 ):
                    axes_grid[row][column].legend()

        fig_h_bdf.tight_layout()
        fig_h_be.tight_layout()
        fig_h_cusum.tight_layout()

        fig_h_bdf.show()
        fig_h_be.show()
        fig_h_cusum.show()
