Import Statements

In [None]:
import os
import statistics as st
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

Define Global Variables

In [None]:
# Display names of the 10 graph nodes (geographical regions); plots look names up by node index.
continents = [
    "Africa",
    "North America",
    "South America",
    "Oceania",
    "Eastern Europe",
    "Western Europe",
    "Middle East",
    "South Asia",
    "Southeast-East Asia",
    "Central Asia",
]

# Folder of the multi-model training run to analyse (placeholder, fill in the datetime stamp).
ANALYSIS_DIR = "<datetime-stamp of multiple model training run>"

MODEL_NAME = "DCSAGE"  # Used in figure titles and saved file names
NUM_MODELS = 100       # Number of trained models whose predictions are analysed
WINDOW_SIZE = 7        # Days in the input window handed to the model
REC_PRED_LEN = 30      # Days predicted recursively after the input window

## Load Perturbed Predictions from Models
These cells load the roll_win_pert_preds.npy saved array of perturbed recursive predictions, which
were saved by the node perturbation analysis file. The array contains recursive predictions under
perturbations of each of the 10 nodes. 

Here we load the saved array and turn the NumPy array back into nested Python lists, working from the
outermost dimension inward. We do this because the Pandas DataFrames stored in the array need to be
recovered, and the DataFrames are easily accessible once they sit in Python lists.

In [None]:
roll_win_pert_pred_nested_np = np.load("./analysis-runs-multiple-models/{}/prediction_saves/roll_win_pert_preds.npy".format(ANALYSIS_DIR))

# Column names given to the innermost 2-D block of every prediction entry.
pert_pred_columns = ["Regular Predictions", "Ground Truth", "Extended Recursive Predictions", "Day Index"]

# Peel the array apart one axis at a time (window -> model -> perturbed node -> node) and wrap each
# innermost block in a DataFrame, giving nested Python lists of DataFrames.
roll_win_pert_pred_nested_list = [
    [
        [
            [
                pd.DataFrame(data=node_preds, columns=pert_pred_columns)
                for node_preds in perturbed_node_preds
            ]
            for perturbed_node_preds in model_preds
        ]
        for model_preds in window_preds
    ]
    for window_preds in roll_win_pert_pred_nested_np
]

## Loading Unperturbed Predictions for Models
These cells load the roll_win_unpert_preds.npy array, which contain the unperturbed recursive 
predictions saved during node perturbation analysis. We again load the array and turn it from 
a Numpy array back into a Python list of Pandas DataFrames.

Since there are 10 countries to perturb but only 1 unperturbed dataloader, this array will have
one fewer dimension than roll_win_pert_preds.npy.

In [None]:
roll_win_unpert_pred_nested_np = np.load("./analysis-runs-multiple-models/{}/prediction_saves/roll_win_unpert_preds.npy".format(ANALYSIS_DIR))

# Column names given to the innermost 2-D block of every prediction entry.
unpert_pred_columns = ["Regular Predictions", "Ground Truth", "Extended Recursive Predictions", "Day Index"]

# Peel the array apart one axis at a time (window -> model -> node) and wrap each innermost block
# in a DataFrame. The raw array is kept as well because the bias correction reads it directly.
roll_win_unpert_pred_nested_list = [
    [
        [
            pd.DataFrame(data=node_preds, columns=unpert_pred_columns)
            for node_preds in model_preds
        ]
        for model_preds in window_preds
    ]
    for window_preds in roll_win_unpert_pred_nested_np
]

## Calculate Bias Correction

Correction factor is average of daywise ratio of ground truth over median curve.

In [None]:
def bias_correction(window_prediction_np_array, window_idx):
    """
    This function accepts a numpy array of unperturbed predictions on 1 window by all models, and
    returns one multiplicative bias correction factor per node for the specified window.

    For every node, the ground truth is divided day by day by the median (across models) of the
    recursive predictions, and the daily ratios are averaged over the prediction horizon.

    Days on which the median prediction is exactly zero have no defined ratio and are left out of
    the average, so a single zero does not turn the whole factor into inf/NaN. A node whose median
    prediction is zero on every day gets NaN.

    Args:
        - window_prediction_np_array: unperturbed predictions for all models on one window,
            shape (num_models, num_nodes, num_days, 4), e.g. (100, 10, 30, 4). Index 1 of the
            last axis holds the ground truth, index 2 the extended recursive predictions.
        - window_idx: index of the window. Not used in the computation; kept so existing calls
            keep working.

    Returns:
        Numpy array of shape (num_nodes,) holding the averaged ratio of each node.
    """
    # Ground truth is the same for all models, so it is read from the 1st model. Shape (nodes, days)
    ground_truth = np.asarray(window_prediction_np_array[0, :, :, 1], dtype=float)

    all_model_recursive_preds = window_prediction_np_array[:, :, :, 2]  # shape (models, nodes, days)
    median_recursive_pred = np.asarray(np.median(all_model_recursive_preds, axis=0), dtype=float)

    # Only divide where the denominator is non-zero; skipped entries stay 0 and are not counted.
    usable_days = median_recursive_pred != 0
    daywise_ratios = np.zeros(ground_truth.shape, dtype=float)
    np.divide(ground_truth, median_recursive_pred, out=daywise_ratios, where=usable_days)

    # Average over the usable days of each node; nodes without any usable day remain NaN.
    usable_day_counts = usable_days.sum(axis=1)
    averaged_ratios = np.full(usable_day_counts.shape, np.nan)
    np.divide(daywise_ratios.sum(axis=1), usable_day_counts, out=averaged_ratios, where=usable_day_counts > 0)
    return averaged_ratios
    

In [None]:
# One row of per-node correction factors for every rolling window of the raw unperturbed array.
rolling_window_bias_corrections = np.array([
    bias_correction(window_preds, window)
    for window, window_preds in enumerate(roll_win_unpert_pred_nested_np)
])
print(rolling_window_bias_corrections.shape)

# Average the factors over the window axis so that every node ends up with exactly 1 correction
# factor, shared by all recursive prediction windows.
window_bias_corrections = rolling_window_bias_corrections.mean(axis=0)
print(window_bias_corrections.shape)

## Plot 100 model Recursive Prediction Visual After Bias Correction

In [None]:
from dataloader.node_perturbation_dataloader import Covid10CountriesUnperturbedDataset

In [1]:
def plot_bias_corrected_100_rec_pred_with_starting_input(
        roll_win_unpert_pred_nested_list,
        rolling_window,
        dataset_npz_path="/Users/syedrizvi/Desktop/Projects/GNN_Project/DCSAGE/Node-Perturbation/datasets/10_continents_dataset_v19_node_pert.npz"):
    """
    This function plots the 5x2 figure where each subplot represents one node. For each subplot, the
    ground truth for the node is plotted as a thick orange line, the input window that was fed to the
    models as a thick blue line, and the recursive predictions of the NUM_MODELS models as gray lines.

    Bias corrections are applied on this plot: every model's extended recursive prediction is
    multiplied by the node's entry of the global `window_bias_corrections` before plotting. The
    DataFrames passed in are not modified. The figure is saved to
    "./window<rolling_window>_corrected_<NUM_MODELS>models_rec_pred_coverage.png".

    Values that are exactly 0 are replaced by NaN before plotting. This hides the zero padding that
    aligns the series on a common day axis, and it also hides genuine zeros.

    Args:
        - roll_win_unpert_pred_nested_list: nested lists of DataFrames, e.g. (523, 100, 10, 30, 4)
        - rolling_window: The window for which to create this plot
        - dataset_npz_path: path of the dataset .npz handed to Covid10CountriesUnperturbedDataset.
            The default is the location that used to be hard-coded here.

    Raises:
        AssertionError: if the dataset's window count does not match the prediction list.
    """
    dataset = Covid10CountriesUnperturbedDataset(
        dataset_npz_path=dataset_npz_path,
        window_size=WINDOW_SIZE,
        data_split="entire-dataset-smooth",
        avg_graph_structure=False)

    assert len(dataset.all_window_edge_attr) - WINDOW_SIZE - REC_PRED_LEN == len(roll_win_unpert_pred_nested_list), "Inconsistent window counts."

    window_model_preds = roll_win_unpert_pred_nested_list[rolling_window]
    pred_len = len(window_model_preds[0][0]["Extended Recursive Predictions"])

    # Starting input: the WINDOW_SIZE input days followed by empty (NaN) prediction days. The buffer
    # spans only the plotted horizon; it used to be sized by the number of rolling windows.
    first_window_cases = np.zeros((WINDOW_SIZE + pred_len, 10))
    first_window_cases[:WINDOW_SIZE, :] = dataset.all_window_node_feat[rolling_window, :, :, 1]  # expected shape [7, 10]
    first_window_cases[first_window_cases == 0] = np.nan
    starting_input_day_index = list(range(-1 * WINDOW_SIZE, len(first_window_cases) - WINDOW_SIZE))

    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(40, 16))
    plt.rcParams.update({'font.size': 20})
    fig.suptitle("DCSAGE Unperturbed Median Ratio Corrected Recursive Predictions Coverage (Window {})".format(rolling_window), fontsize= 30)

    # ax.flat walks the 2x5 grid row by row, so idx is the node index of the subplot.
    for idx, col in enumerate(ax.flat):
        # Bias-corrected recursive prediction of every model, shifted right by the input window.
        visual_dict = {}
        for i in range(NUM_MODELS):
            vals = np.array(window_model_preds[i][idx]["Extended Recursive Predictions"])  # copy, e.g. shape (30,)
            vals *= window_bias_corrections[idx]
            vals = np.pad(vals, (WINDOW_SIZE, 0), "constant", constant_values=(0,0))
            vals[vals == 0] = np.nan
            visual_dict["Model {}".format(i)] = vals

        visual_dict["Day Index"] = list(range(-1 * WINDOW_SIZE, len(window_model_preds[0][idx]["Extended Recursive Predictions"])))
        visual_df = pd.DataFrame(visual_dict)

        # Ground truth is same for all models, pick from first model
        vals2 = np.array(window_model_preds[0][idx]["Ground Truth"])
        vals2 = np.pad(vals2, (WINDOW_SIZE, 0), "constant", constant_values=(0,0))
        vals2[vals2 == 0] = np.nan
        visual_df2 = pd.DataFrame({
            "Ground Truth": vals2,
            "Day Index": list(range(-1 * WINDOW_SIZE, len(window_model_preds[0][idx]["Ground Truth"]))),
        })

        visual_df3 = pd.DataFrame({
            "Starting Input": first_window_cases[:,idx],
            "Day Index": starting_input_day_index,
        })

        sns.lineplot(ax=col, x='Day Index', y='value', hue='variable', data=pd.melt(visual_df, ['Day Index']), palette=['gray'] * NUM_MODELS)
        sns.lineplot(ax=col, x='Day Index', y='value', hue='variable', data=pd.melt(visual_df2, ['Day Index']), linewidth = 8, palette=['orange'])
        sns.lineplot(ax=col, x='Day Index', y='value', hue='variable', data=pd.melt(visual_df3, ['Day Index']), linewidth = 8, palette=['blue'])
        col.set_title(continents[idx])
        col.set_ylim([0, 6])
        box = col.get_position()
        col.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        col.legend().remove()

    plt.savefig("./window{}_corrected_{}models_rec_pred_coverage.png".format(rolling_window, NUM_MODELS), bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Plot every 50th rolling window. The correction factors applied inside the plotting function are
# the per-node values held in `window_bias_corrections`, i.e. the averages over all windows.
sampled_windows = range(0, len(roll_win_unpert_pred_nested_list), 50)
for window in sampled_windows:
    plot_bias_corrected_100_rec_pred_with_starting_input(roll_win_unpert_pred_nested_list, rolling_window=window)

## Apply bias correction to all rolling window prediction lists

In [None]:
# Scale the extended recursive predictions of every perturbed run by the correction factor of the
# predicted node. Nesting: window -> model -> perturbed node -> node, e.g. (523, 100, 10, 10, 30, 4).
# NOTE(review): the scaling happens in place, so running this cell twice applies the factors twice.
for window_models in roll_win_pert_pred_nested_list:
    for model_perturbations in window_models:
        for perturbed_node_dfs in model_perturbations:
            for country_idx, country_df in enumerate(perturbed_node_dfs):
                country_df["Extended Recursive Predictions"] *= window_bias_corrections[country_idx]

# Same scaling for the unperturbed runs. Nesting: window -> model -> node, e.g. (523, 100, 10, 30, 4).
for window_models in roll_win_unpert_pred_nested_list:
    for model_node_dfs in window_models:
        for country_idx, country_df in enumerate(model_node_dfs):
            country_df["Extended Recursive Predictions"] *= window_bias_corrections[country_idx]

## Compute Node Sensitivity Scores

In [None]:
def node_perturbation_prediction_difference_heatmap(perturb_df_nested_lists, regular_df_nested_list):
    """
    This function is a helper function for computing the difference between perturbed and unperturbed
    predictions, a precursor step for computing sensitivity scores later on. It is meant to be called
    for a single model on a single window, and returns the prediction differences as a square matrix
    (list of lists) where the row is the perturbed geographical region and the column is the affected
    region. Each cell is the absolute difference between perturbed and unperturbed extended recursive
    prediction, summed over the days of recursive prediction. The diagonal (a region's effect on
    itself) is np.nan.

    The number of regions is taken from the inputs, so graphs of any size are supported.

    Args:
        - perturb_df_nested_lists: (10, 10, 30, 4): 10 perturbed countries,
            10 countries in graph, pd.DataFrame of shape (30, 4)
        - regular_df_nested_list: (10, 30, 4): 10 countries in graph, pd.DataFrame (30, 4)

    Returns:
        List with one entry per perturbed region, each a list with one value per affected region.
    """
    prediction_column = 'Extended Recursive Predictions'
    aggreg_differences_lists = []

    for perturbed_country_idx, perturbed_dfs in enumerate(perturb_df_nested_lists):
        aggreg_differences = []
        for country_idx, regular_df in enumerate(regular_df_nested_list):
            if country_idx == perturbed_country_idx:
                # The perturbed region itself is excluded from its own sensitivity.
                aggreg_differences.append(np.nan)
                continue

            difference = perturbed_dfs[country_idx][prediction_column] - regular_df[prediction_column]
            # Unsigned difference: the magnitude of the change matters, not its direction.
            aggreg_differences.append(np.abs(difference).sum())

        aggreg_differences_lists.append(aggreg_differences)
    return aggreg_differences_lists

Calculate prediction difference array across entire dataset. If you have previously computed this, skip
ahead to load the precomputed array.

In [None]:
# Prediction difference matrix of every model on every rolling window.
roll_win_aggreg_diff_nested_list = []
for roll_win_idx, unpert_window_models in enumerate(roll_win_unpert_pred_nested_list):
    # Progress output every 20 windows.
    if roll_win_idx % 20 == 0:
        print("Rolling window", roll_win_idx)

    pert_window_models = roll_win_pert_pred_nested_list[roll_win_idx]
    roll_win_aggreg_diff_nested_list.append([
        node_perturbation_prediction_difference_heatmap(pert_window_models[model_idx], unpert_model_dfs)
        for model_idx, unpert_model_dfs in enumerate(unpert_window_models)
    ])

# roll_win_aggreg_diff_nested_list ends up being [num_windows, num_models, num_nodes, num_nodes]

In [None]:
# Turn the nested lists into one numpy array and write it to the working directory.
# NOTE(review): the cell that loads the precomputed array reads it from the analysis run's
# prediction_saves folder, not from the working directory - confirm the file is moved there.
roll_win_aggreg_diff_nested_list = np.array(roll_win_aggreg_diff_nested_list)
print(roll_win_aggreg_diff_nested_list.shape)

prediction_difference_save_path = "./{}_7day_100model_meanagg_v19_10x10_bias_corrected_unsigned.npy".format(MODEL_NAME)
np.save(prediction_difference_save_path, roll_win_aggreg_diff_nested_list)

If you have previously computed and saved the prediction difference array, then run this cell to 
load in the precomputed array.

In [None]:
# Read the previously saved prediction difference array from the analysis run's prediction_saves folder.
precomputed_difference_path = "./analysis-runs-multiple-models/{}/prediction_saves/DCSAGE_7day_100model_meanagg_v19_10x10_bias_corrected_unsigned.npy".format(ANALYSIS_DIR)
roll_win_aggreg_diff_nested_list = np.load(precomputed_difference_path)
print(roll_win_aggreg_diff_nested_list.shape)

Compute Sensitivity Scores From Prediction Difference Arrays

In [None]:
print("Shape of prediction difference array:", roll_win_aggreg_diff_nested_list.shape)

# Sum each row of the per-model difference matrix (last axis). nansum treats the NaN diagonal as
# zero, which leaves one score per perturbed node.
prediction_difference_array = np.array(roll_win_aggreg_diff_nested_list)
sensitivty_score_nested_np = np.nansum(prediction_difference_array, axis=3)
print("Shape of sensitivity score array:", sensitivty_score_nested_np.shape)

## Plot Sensitivity Score Trends Over Rolling Windows (All Models)

In [None]:
def plot_multiple_model_sens_score_trends_lineplot(roll_win_aggreg_diff_nested_list):
    """
    Save a 5x2 figure of sensitivity score trends, one subplot per node and one line per model.

    The x-axis of every subplot is the rolling window index; nothing is averaged across models.
    A sensitivity score is the NaN-ignoring sum over the last axis of the difference array.
    The figure is written to "./<NUM_MODELS>_models_sens_score_trends.png".

    Args:
        - roll_win_aggreg_diff_nested_list: 10x10 sensitivity array, shape (num_windows, NUM_MODELS, 10, 10)
    """
    # (num_windows, NUM_MODELS, 10, 10) -> (num_windows, NUM_MODELS, 10)
    scores = np.nansum(np.array(roll_win_aggreg_diff_nested_list), axis=3)
    window_indices = list(range(len(roll_win_aggreg_diff_nested_list)))

    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(30,15))
    plt.rcParams.update({'font.size': 20})
    fig.suptitle("{} {} Model Sensitivity Score Trends".format(MODEL_NAME, NUM_MODELS), fontsize= 30)

    # ax.flat walks the grid row by row, so the position in the grid is the node index.
    for node_idx, col in enumerate(ax.flat):
        per_model_scores = {}
        for model_idx in range(NUM_MODELS):
            per_model_scores["Model_{}".format(model_idx)] = scores[:, model_idx, node_idx]
        per_model_scores["Rolling Window Index"] = window_indices

        # Long format: one row per (window, model) pair, as expected by seaborn's hue grouping.
        long_df = pd.melt(
            pd.DataFrame(per_model_scores),
            ['Rolling Window Index'],
            value_name="Sensitivity Scores",
            var_name="Model",
        )
        sns.lineplot(ax=col, x='Rolling Window Index', y='Sensitivity Scores', hue='Model', data=long_df)
        col.set_title(continents[node_idx])
        col.legend().remove()
        col.set_ylim([-70, 70])

    filename = str(NUM_MODELS) + "_models_sens_score_trends"
    plt.tight_layout()
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Draw and save the per-model sensitivity score trend figure.
plot_multiple_model_sens_score_trends_lineplot(roll_win_aggreg_diff_nested_list)

## Plot Sensitivity Score Distribution

In [None]:
def plot_multiple_model_roll_win_sens_distribution(roll_win_aggreg_diff_nested_list):
    """
    Save a 5x2 figure with the sensitivity score histogram (plus KDE curve) of each node.

    Every histogram pools the scores of all windows and all models for that node, i.e.
    num_windows * num_models values; the count is also printed. Mode, mean, median and standard
    deviation of the pooled scores are shown in the subplot title. A sensitivity score is the
    NaN-ignoring sum over the last axis of the difference array. The figure is written to
    "./<NUM_MODELS>_models_sens_score_distrib.png".

    Args:
        - roll_win_aggreg_diff_nested_list: 10x10 sensitivity array, shape (num_windows, NUM_MODELS, 10, 10)
    """
    scores = np.nansum(np.array(roll_win_aggreg_diff_nested_list), axis=3)

    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(35,16))
    plt.rcParams.update({'font.size': 20})
    fig.suptitle("{} {} Model Sensitivity Score Distributions".format(MODEL_NAME, NUM_MODELS), fontsize=30)

    # ax.flat walks the grid row by row, so the position in the grid is the node index.
    for pert_node_idx, col in enumerate(ax.flat):
        continent = continents[pert_node_idx]
        country_sensitivity_scores = list(scores[:, :, pert_node_idx].flatten())
        print("{} has {} sensitivity scores in distribution.".format(continent, len(country_sensitivity_scores)))

        histogram_df = pd.DataFrame({"Sensitivity Score": country_sensitivity_scores})
        sns.histplot(ax=col, x='Sensitivity Score', data=histogram_df, kde=True)

        # Summary statistics shown in the subplot title, rounded to 2 decimals.
        mode = st.mode(country_sensitivity_scores)
        pooled_scores = np.array(country_sensitivity_scores)
        median = np.median(pooled_scores)
        mean = np.mean(pooled_scores)
        stddev = pooled_scores.std()
        title_parts = [
            continent,
            "\nMode ", str(round(mode, 2)),
            ", Mean: ", str(round(mean, 2)),
            "\nMedian: ", str(round(median, 2)),
            ", Std: ", str(round(stddev, 2)),
        ]
        col.set_title("".join(title_parts))
        col.set_xlim([-40, 40])

    plt.tight_layout()
    filename = "{}_models_sens_score_distrib".format(NUM_MODELS)
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor="white")
    plt.clf()
    plt.close()

In [None]:
# Draw and save the pooled sensitivity score distribution figure.
plot_multiple_model_roll_win_sens_distribution(roll_win_aggreg_diff_nested_list)

## Fit EV Distribution For Each Node on All Windows

In [None]:
# Notebook display of the sensitivity score array's shape before fitting distributions to it.
sensitivty_score_nested_np.shape

In [None]:
# Chooses the distribution fitted to the sensitivity scores in the cells below:
# True fits stats.norm, False fits stats.gumbel_r (the "EV" distribution).
FIT_NORMAL = False

Fit EV distribution on each node on each window, save location and scale parameters

In [None]:
# Distribution fitted to the per-model scores; use stats.gumbel_l here to fit the left-skewed
# extreme value distribution instead of stats.gumbel_r.
fitted_score_distribution = stats.norm if FIT_NORMAL else stats.gumbel_r

roll_win_loc_params = []
roll_win_scale_params = []
for window_scores in sensitivty_score_nested_np:
    # One fit per node over the scores of all models on this window. The first two fitted
    # parameters are the location and the scale.
    node_fits = [fitted_score_distribution.fit(window_scores[:, node_idx]) for node_idx in range(10)]
    roll_win_loc_params.append([fit_params[0] for fit_params in node_fits])
    roll_win_scale_params.append([fit_params[1] for fit_params in node_fits])

In [None]:
# Stack the per-window parameter lists into arrays and show their shapes.
roll_win_loc_params = np.array(roll_win_loc_params)
roll_win_scale_params = np.array(roll_win_scale_params)
for fitted_params in (roll_win_loc_params, roll_win_scale_params):
    print(fitted_params.shape)

Renaming arrays for plotting later on

In [None]:
# Aliases used by the plotting functions below.
# NOTE: despite the names, these hold the fitted location and scale parameters. For the Gumbel
# fit they are not the distribution's mean and standard deviation.
roll_win_fitted_means = roll_win_loc_params
roll_win_fitted_stds = roll_win_scale_params

Visualize Fitted EV Distributions

In [None]:
# Everything that depends on the fitted distribution family is selected once, up front.
if FIT_NORMAL:
    fitted_pdf_distribution, pdf_label = stats.norm, 'normal pdf'
    title_template = "Normal Distribution fitted on {} on window {}\nMean: {:.4f}, Median: {:.4f}, STD: {:.4f}"
    figure_path_template = "./normal_unfiltered_fit_{}_win{}.png"
else:
    # Use stats.gumbel_l here if the left-skewed extreme value distribution was fitted instead.
    fitted_pdf_distribution, pdf_label = stats.gumbel_r, 'EV pdf'
    title_template = "EV Distribution fitted on {} on window {}\nMean: {:.4f}, Median: {:.4f}, STD: {:.4f}"
    figure_path_template = "./ev_unfiltered_fit_{}_win{}.png"

# One figure per node for a handful of windows: fitted pdf (red) over the density histogram of
# the per-model sensitivity scores, with the empirical statistics in the title.
for single_country_idx in range(10):
    continent = continents[single_country_idx]
    for single_window_idx in [0, 100, 200, 300, 400, 500]:
        fitted_loc = roll_win_loc_params[single_window_idx, single_country_idx]
        fitted_scale = roll_win_scale_params[single_window_idx, single_country_idx]
        window_scores = sensitivty_score_nested_np[single_window_idx, :, single_country_idx]

        x = np.linspace(60, 0, 100)
        plt.plot(x, fitted_pdf_distribution.pdf(x, fitted_loc, fitted_scale), 'r-', label=pdf_label)

        plt.rcParams.update({'font.size': 16})
        sns.histplot(x=window_scores, stat="density", label=continent + " scores")

        title = title_template.format(
            continent,
            single_window_idx,
            np.mean(window_scores),
            np.median(window_scores),
            np.std(window_scores),
        )
        plt.title(title, fontsize=16)
        plt.savefig(figure_path_template.format(continent, single_window_idx), bbox_inches="tight", facecolor="white")

        # plt.show()
        plt.clf()

## Plot Model Average Sensitivity Score Trends After Filtering

In [None]:
def plot_multiple_model_average_sens_score_trend_after_filtering(roll_win_fitted_means, roll_win_fitted_stds):
    """
    Save a 5x2 figure showing, for each node, the fitted centre value over rolling windows together
    with a shaded band reaching one fitted spread value above and below it.

    In this notebook the two arguments are the location and scale parameters obtained by fitting a
    distribution to the scores of all models, separately for every window and node. The figure is
    written to "./<NUM_MODELS>_models_avg_sens_scores_unfiltered.png".

    Args:
        - roll_win_fitted_means: shape (num_windows, 10)
        - roll_win_fitted_stds: shape (num_windows, 10)
    """
    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(30,15))
    plt.rcParams.update({'font.size': 20})
    fig.suptitle("{} {} Model Average Fitted Sensitivity Score Mean".format(MODEL_NAME, NUM_MODELS), fontsize= 30)

    window_indices = list(range(len(roll_win_fitted_means)))

    # ax.flat walks the grid row by row, so the position in the grid is the node index.
    for node_idx, col in enumerate(ax.flat):
        centre = roll_win_fitted_means[:, node_idx]
        spread = roll_win_fitted_stds[:, node_idx]

        col.plot(centre)
        col.fill_between(window_indices, (centre - spread), (centre + spread), color='b', alpha=.1)
        col.set_title(continents[node_idx])
        col.set_xlabel("Rolling Window Index")
        col.set_ylabel("Fitted Sensitivity Score")
        col.set_ylim([0, 40])

    filename = "{}_models_avg_sens_scores_unfiltered".format(NUM_MODELS)
    plt.tight_layout()
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Draw and save the fitted-parameter trend figure (location with a +/- scale band per node).
plot_multiple_model_average_sens_score_trend_after_filtering(roll_win_fitted_means, roll_win_fitted_stds)

In [None]:
def plot_violin_plot_fitted_mu(roll_win_fitted_means):
    """
    Save a violin plot with one violin per continent showing the distribution, over rolling windows,
    of the fitted sensitivity score mu parameter.

    The values are divided by the largest entry of the whole array so that all continents share one
    y-axis. Violins are ordered by descending median, and each continent keeps the palette colour
    that belongs to its node index. The figure is written to
    "./<NUM_MODELS>_models_fitted_mu_violin_plot_unfiltered.png".

    Args:
        - roll_win_fitted_means: [num_windows, 10] array
    """
    y_label = "Fitted Sensitivity Mu Parameter (Scaled to 0 - 1)"
    normalized_means = roll_win_fitted_means / roll_win_fitted_means.max()

    # Multi-word names are broken over several lines so the x tick labels do not overlap.
    continent_names = ["\n".join(name.split(" ")) for name in continents]
    colors = sns.color_palette().as_hex()

    continent_mu_parameters = []
    corresp_country_name = []
    medians = []
    for node_idx in range(10):
        node_values = normalized_means[:, node_idx].flatten()
        continent_mu_parameters.extend(node_values)
        corresp_country_name.extend([continent_names[node_idx]] * len(node_values))
        medians.append(np.median(node_values))

    # Highest median first; colours and names are ordered by the same medians.
    colors_sorted = [color for _, color in sorted(zip(medians, colors), reverse=True)]
    continent_names_sorted = [name for _, name in sorted(zip(medians, continent_names), reverse=True)]

    visual_df = pd.DataFrame({
        "Continent": corresp_country_name,
        y_label: continent_mu_parameters,
    })

    plt.figure(figsize=(20,8))
    plt.rcParams.update({'font.size': 20})
    sns.violinplot(data=visual_df, x="Continent", y=y_label,
                   order=continent_names_sorted, palette=colors_sorted)
    plt.title("{} {} Models Fitted Sensitivity Mu Parameter Violin Plot (Unfiltered)".format(MODEL_NAME, NUM_MODELS), fontsize=24)

    filename = "{}_models_fitted_mu_violin_plot_unfiltered".format(NUM_MODELS)
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Draw and save the violin plot of the normalized fitted mu parameters.
plot_violin_plot_fitted_mu(roll_win_fitted_means)

In [None]:
def plot_node_avg_sens_trend(roll_win_fitted_means):
    """
    Save a single line plot with one line per continent showing the fitted sensitivity mu parameter
    over rolling windows.

    The values are divided by the largest entry of the whole array before plotting. The figure is
    written to "./<NUM_MODELS>_models_node_avg_sens_score_unfiltered.png".

    Args:
        - roll_win_fitted_means: shape (num_windows, 10)
    """
    y_label = "Fitted Sensitivity Mu Parameter (Scaled 0 - 1)"
    normalized_means = roll_win_fitted_means / roll_win_fitted_means.max()

    # One column per continent plus the window index, then melted to long format for seaborn.
    node_columns = {}
    for node_idx in range(10):
        node_columns[continents[node_idx]] = normalized_means[:, node_idx]
    node_columns["Rolling Window Index"] = list(range(len(normalized_means)))
    long_df = pd.melt(
        pd.DataFrame(node_columns),
        ['Rolling Window Index'],
        value_name=y_label,
        var_name="Continent",
    )

    plt.figure(figsize=(16, 8), dpi=80)
    plt.rcParams.update({'font.size': 20})
    sns.lineplot(x='Rolling Window Index', y=y_label, hue='Continent', data=long_df)
    plt.title("{} {} Models Fitted Sensitivity Mu Parameter Trend Unfiltered".format(MODEL_NAME, NUM_MODELS), fontsize=24)
    plt.xlabel('Rolling Window Idx')
    plt.ylabel('Fitted Sensitivity Mu Parameter (Scaled 0 - 1)')
    plt.ylim(0, 1)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    filename = "{}_models_node_avg_sens_score_unfiltered".format(NUM_MODELS)
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Draw and save the per-continent trend of the normalized fitted mu parameter.
plot_node_avg_sens_trend(roll_win_fitted_means)

## Plotting Rankings

In [None]:
def plot_ranks_by_sensitivty_score_mean(roll_win_aggreg_diff_nested_list):
    """
    Save a plot of the continent rankings over rolling windows, where continents are ranked on every
    window by their sensitivity score averaged over models (rank 1 = highest average score).

    This is not the final ranking; it exists for comparison with the ranking based on the fitted
    distributions. The figure is written to "./ranking_by_sensitivity_mean_unfiltered.png".

    Args:
        - roll_win_aggreg_diff_nested_list: 10x10 sensitivity array, shape (num_windows, NUM_MODELS, 10, 10)
    """
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
              'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']

    sensitivity_scores = np.nansum(np.array(roll_win_aggreg_diff_nested_list), axis=3)
    model_avg_scores = np.nanmean(sensitivity_scores, axis=1)  # (num_windows, 10)

    # Rank the nodes within every window. Sorting the negated scores puts the highest score first;
    # the argsort of that ordering gives each node's 0-based position, hence the + 1.
    ranks = np.zeros((len(model_avg_scores), len(model_avg_scores[0])))
    for window_idx, window_scores in enumerate(model_avg_scores):
        descending_order = (-window_scores).argsort()
        ranks[window_idx, :] = descending_order.argsort() + 1

    plt.figure(figsize=(20, 6))
    plt.rcParams.update({'font.size': 20})
    for node_idx in range(10):
        plt.plot(ranks[:, node_idx], "o-", mfc="w", label=continents[node_idx], color=colors[node_idx])

    # Rank 1 is drawn at the top.
    plt.gca().invert_yaxis()
    plt.title("Ranking by Sensitivity Score Mean Unfiltered", fontsize=24)
    plt.xlabel("Rolling Window Index")
    plt.ylabel("Ranking")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    filename = "ranking_by_sensitivity_mean_unfiltered"
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Draw and save the ranking based on the model-averaged sensitivity scores.
plot_ranks_by_sensitivty_score_mean(roll_win_aggreg_diff_nested_list)

In [None]:
def plot_ranks_by_fitted_mean(roll_win_fitted_means):
    """
    Save a plot of the continent rankings over rolling windows, where continents are ranked on every
    window by their fitted distribution parameter (rank 1 = highest value).

    The figure is written to "./ranking_by_fitted_distrib_mean_unfiltered.png".

    Args:
        - roll_win_fitted_means: shape (num_windows, 10)
    """
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple',
              'tab:brown', 'tab:pink', 'tab:gray', 'tab:olive', 'tab:cyan']

    # Rank the nodes within every window. Sorting the negated values puts the highest value first;
    # the argsort of that ordering gives each node's 0-based position, hence the + 1.
    ranks = np.zeros(roll_win_fitted_means.shape)
    for roll_win, window_values in enumerate(roll_win_fitted_means):
        descending_order = (-window_values).argsort()
        ranks[roll_win, :] = descending_order.argsort() + 1

    plt.figure(figsize=(20, 6))
    plt.rcParams.update({'font.size': 20})
    for node_idx in range(10):
        plt.plot(ranks[:, node_idx], "o-", mfc="w", label=continents[node_idx], color=colors[node_idx])

    # Rank 1 is drawn at the top.
    plt.gca().invert_yaxis()
    plt.title("Ranking by Fitted Distribution Mean Unfiltered", fontsize=24)
    plt.xlabel("Rolling Window Index")
    plt.ylabel("Ranking")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    filename = "ranking_by_fitted_distrib_mean_unfiltered"
    plt.savefig("./" + filename + '.png', bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Draw and save the ranking based on the fitted distribution parameter.
plot_ranks_by_fitted_mean(roll_win_fitted_means)

In [None]:
def save_csv_for_sensitivity_rank_bump_chart(roll_win_fitted_means):
    """
    Export the data behind a bump chart to ./bump_chart_data.csv so it can be plotted on a
    third-party website (the area bump chart was generated with rawgraphs.io).

    The file holds one row per (rolling window, region) pair with three columns: the region
    name, the rolling window index, and the fitted mu divided by the largest fitted mu found
    anywhere in the input.

    Args:
        - roll_win_fitted_means: shape (num_windows, 10)
    """
    scaled_means = roll_win_fitted_means / roll_win_fitted_means.max()

    # Window-major order: all regions of window 0, then all regions of window 1, ...
    window_region_pairs = [
        (window_idx, region_idx)
        for window_idx in range(len(scaled_means))
        for region_idx in range(10)
    ]

    bump_chart_df = pd.DataFrame({
        "Geographical Region": [continents[region_idx] for _, region_idx in window_region_pairs],
        "Rolling Window Index": [window_idx for window_idx, _ in window_region_pairs],
        "Fitted Mu": [scaled_means[window_idx, region_idx] for window_idx, region_idx in window_region_pairs],
    })
    bump_chart_df.to_csv("./bump_chart_data.csv", index=False)

In [None]:
# Write ./bump_chart_data.csv (region, rolling window index, normalized fitted mu) for the external bump-chart tool.
save_csv_for_sensitivity_rank_bump_chart(roll_win_fitted_means)

## Correlation Plot Between Flight Rankings and Fitted Mu Rankings

In [None]:
# Open the dataset archive; for an .npz file np.load returns a dict-like NpzFile whose
# arrays are read on access, so it is kept open for the cells below.
datasetv19 = np.load("./datasets/10_continents_dataset_v19_node_pert.npz")
# Display the names of the arrays stored in the archive.
datasetv19.files

In [None]:
# Sum the unscaled flight matrix over axis 1, counting NaN entries as 0.
# NOTE(review): presumably the matrix is laid out as (day, region, region); confirm that
# collapsing axis 1 (rather than axis 2) gives *outgoing* and not incoming flights per region.
daywise_outgoing_flights = np.nansum(datasetv19['flight_matrix_unscaled'], axis=1)
# Display the resulting shape as a sanity check.
daywise_outgoing_flights.shape

Convert the flight counts from per-day values to per-rolling-window values by summing over each recursive-prediction rolling window. A rolling window is 30 + WINDOW_SIZE days long, because the model takes one input window of WINDOW_SIZE days and then predicts 30 days, so one rolling window covers 30 + WINDOW_SIZE days of flight data in total.

The number of rolling windows is `len(daywise_outgoing_flights) - 30 - WINDOW_SIZE - WINDOW_SIZE`: the first `30 + WINDOW_SIZE` is the length of one rolling window, and the second `WINDOW_SIZE` accounts for the dataloader building windows out of days, which yields WINDOW_SIZE fewer windows than days.

In [None]:
# One rolling window covers the WINDOW_SIZE input days plus the REC_PRED_LEN recursively
# predicted days. The shared constant replaces a hard-coded 30 so this cell stays in sync
# with the global settings at the top of the notebook.
rolling_window_len = REC_PRED_LEN + WINDOW_SIZE
# A further WINDOW_SIZE is subtracted because the dataloader turns days into windows and
# therefore produces WINDOW_SIZE fewer windows than there are days.
num_flight_windows = len(daywise_outgoing_flights) - rolling_window_len - WINDOW_SIZE

# For every rolling window start day, sum each region's daily flights over the window.
rollwin_outgoing_flights = np.array([
    daywise_outgoing_flights[start_day: start_day + rolling_window_len, :].sum(axis=0)
    for start_day in range(num_flight_windows)
])
# Display the resulting shape as a sanity check.
rollwin_outgoing_flights.shape

In [None]:
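# Uncomment to log10-transform the flight sums. `out` is supplied because with `where=` alone
# NumPy leaves the excluded (zero) entries uninitialized instead of keeping them at 0.
# rollwin_outgoing_flights = np.log10(rollwin_outgoing_flights, where=rollwin_outgoing_flights != 0, out=np.zeros_like(rollwin_outgoing_flights, dtype=float))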

### Plotting Fitted Distribution Mu Parameter against number of flights Scatterplot

In [None]:
# Divide every fitted mu by the largest value across all windows and regions.
roll_win_fitted_means_normalized = roll_win_fitted_means / roll_win_fitted_means.max()
num_windows = len(roll_win_fitted_means_normalized)

# Flatten region by region (all windows of region 0, then region 1, ...) so that the
# three columns of the DataFrame line up row for row.
fitted_means = [
    mu
    for region_idx in range(10)
    for mu in roll_win_fitted_means_normalized[:, region_idx]
]
flight_numbers = [
    flights
    for region_idx in range(10)
    for flights in rollwin_outgoing_flights[:, region_idx]
]
continents_name = [
    continents[region_idx]
    for region_idx in range(10)
    for _ in range(num_windows)
]

visual_df = pd.DataFrame({
    "Normalized μ Value": fitted_means,
    "Summed Outgoing Flights Over Rolling Window": flight_numbers,
    "Continent": continents_name
})

# Scatter of normalized mu against summed flights, one colour per continent.
plt.figure(figsize=(8,8))
plt.rcParams.update({'font.size': 16})
sns.scatterplot(
    data=visual_df,
    x="Summed Outgoing Flights Over Rolling Window",
    y="Normalized μ Value",
    hue="Continent",
)
# Put the legend outside the axes, to the right of the plot.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
flights_plot_path = "./{}_models_fitted_mu_vs_flights_scatterplot.png".format(NUM_MODELS)
plt.savefig(flights_plot_path, bbox_inches="tight", facecolor="white")
plt.clf()
plt.close()

### Plotting Fitted Distribution Mu Parameter Against Num Cases Scatterplot

In [None]:
# Take index 1 along the last axis of the smoothed feature matrix as the case counts.
num_cases_matrix = datasetv19['feature_matrix_smooth'][:,:,1]
# Print max, min and mean (one per line) to check the scale of the values.
print(num_cases_matrix.max(), num_cases_matrix.min(), num_cases_matrix.mean(), sep="\n")
num_cases_matrix.shape

In [None]:
# Scale back up to normal scale to avoid summing log values.
# Entries equal to 0 are excluded by `where` and are meant to stay 0, so a zero-filled `out`
# array is passed explicitly: with `where=` alone NumPy leaves those positions uninitialized
# (arbitrary memory), which would corrupt the sums computed from this matrix.
# Note: this rebinds num_cases_matrix, so re-running the cell exponentiates the values again.
num_cases_matrix = np.power(
    10,
    num_cases_matrix,
    where=num_cases_matrix != 0,
    out=np.zeros_like(num_cases_matrix, dtype=float),
)

In [None]:
# One rolling window covers the WINDOW_SIZE input days plus the REC_PRED_LEN recursively
# predicted days. The shared constant replaces a hard-coded 30 so this cell stays in sync
# with the global settings at the top of the notebook.
rolling_window_len = REC_PRED_LEN + WINDOW_SIZE
# A further WINDOW_SIZE is subtracted because the dataloader turns days into windows and
# therefore produces WINDOW_SIZE fewer windows than there are days.
num_case_windows = len(num_cases_matrix) - rolling_window_len - WINDOW_SIZE

# For every rolling window start day, sum each region's daily cases over the window.
rollwin_ncases = np.array([
    num_cases_matrix[start_day: start_day + rolling_window_len, :].sum(axis=0)
    for start_day in range(num_case_windows)
])
# Display the resulting shape as a sanity check.
rollwin_ncases.shape

In [None]:
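# Uncomment to log10-transform the summed cases back down to log10 scale. `out` is supplied
# because with `where=` alone NumPy leaves the excluded (zero) entries uninitialized.
# rollwin_ncases = np.log10(rollwin_ncases, where=rollwin_ncases != 0, out=np.zeros_like(rollwin_ncases, dtype=float))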

In [None]:
# Column labels are defined once and used for both the DataFrame and the plot call.
# Before, the DataFrame column was labelled differently from the `y=` argument, so
# seaborn could not find the column and raised a ValueError.
mu_column = "Normalized μ Value"
cases_column = "Summed Cases Over Rolling Window"
region_column = "Continents"

# Recomputed here so the cell does not depend on an earlier plotting cell having been run:
# every fitted mu is divided by the largest value across all windows and regions.
roll_win_fitted_means_normalized = roll_win_fitted_means / roll_win_fitted_means.max()

fitted_mus = []
ncases = []
continents_name = []

# Flatten region by region so the three lists line up element for element.
for i in range(10):
    fitted_mus += list(roll_win_fitted_means_normalized[:,i])
    ncases += list(rollwin_ncases[:,i])
    continents_name += [continents[i]] * len(roll_win_fitted_means_normalized)

# Sanity check: all three lists must have the same length to form a DataFrame.
print(len(fitted_mus))
print(len(ncases))
print(len(continents_name))

visual_df = pd.DataFrame({
    mu_column: fitted_mus,
    cases_column: ncases,
    region_column: continents_name
})

# Scatter of normalized mu against summed cases, one colour per continent.
plt.figure(figsize=(8,8))
plt.rcParams.update({'font.size': 16})
sns.scatterplot(data=visual_df, x=cases_column, y=mu_column, hue=region_column)
# Put the legend outside the axes, to the right of the plot.
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig("./{}_models_fitted_mu_vs_ncases_scatterplot.png".format(NUM_MODELS), bbox_inches="tight", facecolor="white")
plt.clf()
plt.close()