In [None]:
import os
import statistics as st
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Run selection and model metadata.
ANALYSIS_DIR = "2022-04-13-21_31_29"  # sub-directory of ./feature-pert-runs holding this run's saved outputs
MODEL_NAME = "DCSAGE"  # used in plot titles
NUM_MODELS = 100  # number of model curves drawn / reported in titles and file names

# Window settings.
WINDOW_SIZE = 7  # offset used when building the ground-truth rolling windows
REC_PRED_LEN = 30  # presumably the number of predicted days per rolling window -- TODO confirm

# Node names, in the order used to index the last axis of the per-node arrays when titling subplots.
continents = [
    "Africa",
    "North America",
    "South America",
    "Oceania",
    "Eastern Europe",
    "Western Europe",
    "Middle East",
    "South Asia",
    "Southeast-East Asia",
    "Central Asia",
]

In [None]:
# Load the saved rolling-window feature-perturbation predictions of the selected run.
feature_perturbation_array = np.load(
    os.path.join(
        "./feature-pert-runs",
        ANALYSIS_DIR,
        "prediction_saves",
        "roll_win_feat_pert_preds.npy",
    )
)
# Later cells index this array with five indices and take one slice along axis 2.
feature_perturbation_array.shape

Take the anti-log (10^x) of all predictions: values must be summed on the real scale, not the log10 scale.

In [None]:
# Undo the log10 transform so predictions can be summed as real values.
# Entries equal to 0 are kept at 0 instead of becoming 10**0 == 1. The zero-filled
# `out` array is required for that: with `where=` and no `out`, NumPy leaves the
# masked-out positions uninitialized rather than 0.
# NOTE: this overwrites the loaded array, so re-running this cell on its own applies
# the anti-log a second time; re-run the load cell first.
feature_perturbation_array = np.power(
    10,
    feature_perturbation_array,
    out=np.zeros_like(feature_perturbation_array),
    where=feature_perturbation_array != 0,
)

Load cases over rolling windows

In [None]:
# Load the dataset archive; `.files` lists the names of the arrays stored in the .npz.
datasetv19 = np.load("./10_continents_dataset_v19_node_pert.npz")
datasetv19.files

In [None]:
# Feature index 1 along the last axis is used as the case-count series
# (still on the log10 scale at this point, per the rescaling cell that follows).
smoothed_features = datasetv19['feature_matrix_smooth']
num_cases_matrix = smoothed_features[:, :, 1]

# Quick range check: max, min, mean (one value per line).
for summary_value in (num_cases_matrix.max(), num_cases_matrix.min(), num_cases_matrix.mean()):
    print(summary_value)
num_cases_matrix.shape

In [None]:
# Scale back up to normal scale, cannot sum log values.
# Zeros are kept at 0; the zero-filled `out` is needed because `where=` without
# `out` leaves the masked-out positions uninitialized.
# NOTE: overwrites `num_cases_matrix`, so re-running this cell alone rescales twice.
num_cases_matrix = np.power(
    10,
    num_cases_matrix,
    out=np.zeros_like(num_cases_matrix),
    where=num_cases_matrix != 0,
)

In [None]:
# One row per rolling window: real-scale cases summed over the REC_PRED_LEN rows
# starting at `idx`, kept separate per column (node). REC_PRED_LEN replaces the
# previously hard-coded 30 so the window length is defined in one place.
rollwin_ncases = np.array([
    num_cases_matrix[idx: idx + REC_PRED_LEN, :].sum(axis=0)
    for idx in range(WINDOW_SIZE, len(num_cases_matrix) - REC_PRED_LEN - WINDOW_SIZE)
])
rollwin_ncases.shape

In [None]:
# Uncomment if want to log10 transform summation back down to log10 scale
# rollwin_ncases = np.log10(rollwin_ncases, where=rollwin_ncases != 0)

## Plot 100 model curves of unperturbed, min containment, and max containment

In [None]:
def global_cases_vs_rolling_windows(feat_pert_arr, plot_type="unpert"):
    """Plot each model's log10 global case total per rolling window for one setting.

    Takes one slice of ``feat_pert_arr`` along axis 2 (0 = unperturbed,
    1 = min containment, 2 = max containment), sums it over its last two axes,
    converts the totals to log10 and draws one line per model. The figure is saved
    to ``./global_cases_vs_rolling_window_{NUM_MODELS}_models_{plot_type}.png`` and
    then closed.

    Args:
        feat_pert_arr: 5-D array of real-scale predictions. Per the notebook's shape
            comments each slice is (523, 100, 30, 10), summed over days and nodes.
        plot_type: "unpert", "min_containment" or "max_containment".

    Raises:
        AssertionError: if ``plot_type`` is not one of the three names above.
    """
    # plot_type -> (index along axis 2, line colour). Colours follow the other plots
    # in this notebook: red for min containment, green for max containment.
    plot_settings = {
        "unpert": (0, "gray"),
        "min_containment": (1, "red"),
        "max_containment": (2, "green"),
    }
    assert plot_type in plot_settings
    pert_idx, line_color = plot_settings[plot_type]

    # Only the requested setting is reduced: sum the last two axes -> (windows, models)
    global_cases = feat_pert_arr[:, :, pert_idx, :, :].sum(axis=(2, 3))

    # Scale back down to log scale. Zero totals stay 0; the zero-filled `out` avoids
    # the uninitialized entries NumPy leaves where `where` is False.
    global_cases = np.log10(
        global_cases,
        out=np.zeros_like(global_cases, dtype=float),
        where=global_cases != 0,
    )

    # Wide frame (one column per model), melted to long form for seaborn
    visual_dict = {"Model {}".format(i): global_cases[:, i] for i in range(NUM_MODELS)}
    visual_dict["Rolling Window Index"] = list(range(len(global_cases)))
    visual_df = pd.DataFrame(visual_dict)

    plt.figure(figsize=(8, 8))
    sns.lineplot(
        x='Rolling Window Index',
        y='value',
        hue='variable',
        data=pd.melt(visual_df, ['Rolling Window Index']),
        palette=[line_color] * NUM_MODELS,
    )
    plt.title("{} {} Models Global Cases vs Rolling Window".format(MODEL_NAME, NUM_MODELS))
    plt.ylabel("Global Summed Cases (Log10 Scale)")  # replaces melt's default "value" label
    plt.ylim([5, 7.5])
    plt.legend().remove()  # one legend entry per model would swamp the plot
    plt.savefig("./global_cases_vs_rolling_window_{}_models_{}.png".format(NUM_MODELS, plot_type), bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Save one per-model figure for each perturbation setting, in the same order as before.
for pert_plot_type in ("unpert", "min_containment", "max_containment"):
    global_cases_vs_rolling_windows(feature_perturbation_array, plot_type=pert_plot_type)

## Fitting Normal and extreme-value (EV) distributions on each rolling window to see which fits better

In [None]:
def kl_divergence(p, q):
    """Return the Kullback-Leibler divergence D(p || q) in bits.

    Computes ``sum(p[i] * log2(p[i] / q[i]))`` over all entries.

    Args:
        p: sequence of values for the reference distribution.
        q: sequence of values for the approximating distribution, same length as ``p``.

    Returns:
        The divergence as a float; ``inf`` if ``q`` is 0 where ``p`` is not.

    Raises:
        ValueError: if ``p`` and ``q`` do not have the same shape.
    """
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if p.shape != q.shape:
        raise ValueError("p and q must have the same shape, got {} and {}".format(p.shape, q.shape))

    # Terms with p == 0 contribute 0 (the limit of x*log(x) as x -> 0); evaluating
    # them directly would give 0 * -inf == nan and poison the whole sum.
    support = p != 0
    with np.errstate(divide="ignore"):
        return np.sum(p[support] * np.log2(p[support] / q[support]))

In [None]:
# Setting analysed by the distribution-comparison cell that follows; index 2 along axis 2.
containment_idx = 2

# Slice out that setting (shape comment from the original: (523, 100, 30, 10)) ...
cases_arr = feature_perturbation_array[:, :, containment_idx, :, :]
# ... and collapse its last two axes into one total per (window, model): (523, 100).
global_cases = np.sum(cases_arr, axis=(2, 3))

In [None]:
# Fit several distributions per window and record which one gives the lowest
# KL divergence from the empirical (KDE) curve of that window.
#
# NOTE(review): `empirical_probs` are the y-values of the KDE line drawn by
# sns.histplot and `theoretical_probs` are raw pdf values; neither is renormalised
# to sum to 1 before being passed to kl_divergence, and zeros are replaced by a
# fixed 0.001. This is kept as-is so results stay comparable with the counts
# recorded below -- confirm whether normalising is wanted.
distributions = ['gumbel_l', 'norm']
roll_win_list = []

for roll_win in range(len(global_cases)):
    if roll_win % 50 == 0:
        print("Rolling window", roll_win)

    values = global_cases[roll_win].flatten()

    # The empirical curve does not depend on the candidate distribution, so it is
    # computed once per window instead of once per (window, distribution).
    hist_ax = sns.histplot(x=values, kde=True, stat="probability")
    xvals, empirical_probs = hist_ax.get_lines()[0].get_data()
    empirical_probs = np.array(empirical_probs, dtype=float)  # private copy, safe to edit
    plt.clf()
    # Replace zeros with a small value, qk cannot be zero
    empirical_probs[empirical_probs == 0] = 0.001

    distrib_KL_diverg_scores = []
    for distrib_name in distributions:
        # Only location/scale families are supported by the pdf call below
        if distrib_name not in ("gumbel_l", "gumbel_r", "norm"):
            raise ValueError("Unknown distribution specified: {}".format(distrib_name))

        distribution = getattr(stats, distrib_name)
        parameters = distribution.fit(values)
        theoretical_probs = distribution.pdf(xvals, loc=parameters[0], scale=parameters[1])
        theoretical_probs[theoretical_probs == 0] = 0.001

        kl_diverg = kl_divergence(p=empirical_probs, q=theoretical_probs)
        distrib_KL_diverg_scores.append(kl_diverg)

    # First distribution with the minimal score wins
    idx_min_KL_diverg = distrib_KL_diverg_scores.index(min(distrib_KL_diverg_scores))
    roll_win_list.append(distributions[idx_min_KL_diverg])

# Release the scratch figure used to obtain the KDE curves
plt.close()

In [None]:
# Tally how many rolling windows each candidate distribution won.
distrib_counts = {name: 0 for name in ("gumbel_l", "norm")}
for best_fit_name in roll_win_list:
    distrib_counts[best_fit_name] += 1

distrib_counts

Distribution best fit counts unperturbed: <br/>
{'gumbel_l': 91, 'norm': 432} <br/>

Distribution best fit counts min containment: <br/>
{'gumbel_l': 397, 'norm': 126} <br/>

Distribution best fit counts max containment: <br/>
{'gumbel_l': 113, 'norm': 410}

In [None]:
# Best-fit totals over the three settings recorded above: gumbel_l first, then norm.
for best_fit_total in (91 + 397 + 113, 432 + 126 + 410):
    print(best_fit_total)

In [None]:
# Output directories for per-window distribution artefacts, one per setting.
unpert_distrib_save_path, min_cont_distrib_save_path, max_cont_distrib_save_path = (
    os.path.join("./feature-pert-runs", ANALYSIS_DIR, subdir)
    for subdir in (
        "roll_win_distribs_unpert",
        "roll_win_distribs_min_cont",
        "roll_win_distribs_max_cont",
    )
)

# exist_ok=True replaces the exists()-then-mkdir() pair: it has no race between the
# check and the creation, and makedirs also creates missing parent directories.
for distrib_save_path in (
    unpert_distrib_save_path,
    min_cont_distrib_save_path,
    max_cont_distrib_save_path,
):
    os.makedirs(distrib_save_path, exist_ok=True)

In [None]:
# Selects the distribution used by the fitting and plotting cells below:
# True -> stats.norm, False -> stats.gumbel_r (labelled "EV" in plot titles).
FIT_NORMAL = True

In [None]:
def fit_parameters(feature_perturbation_array, containment_type_idx=0, fit_normal=None):
    """Fit a location/scale distribution to the per-model global totals of each window.

    The slice ``containment_type_idx`` along axis 2 is summed over its last two axes,
    giving one total per (window, model); one distribution is then fitted per window
    across the model totals.

    Args:
        feature_perturbation_array: 5-D array of real-scale predictions; the selected
            slice is (523, 100, 30, 10) per the notebook's shape comments.
        containment_type_idx: 0 is unpert, 1 is min containment, 2 is max containment.
        fit_normal: True fits ``stats.norm``, False fits ``stats.gumbel_r``. ``None``
            (default) falls back to the notebook-level ``FIT_NORMAL`` flag, which was
            previously the only way to choose.

    Returns:
        Tuple ``(loc_params, scale_params)`` of 1-D arrays, one entry per window.

    Raises:
        AssertionError: if ``containment_type_idx`` is not 0, 1 or 2.
    """
    assert containment_type_idx in [0, 1, 2]  # 0 is unpert, 1 is min containment, 2 is max containment
    if fit_normal is None:
        fit_normal = FIT_NORMAL

    # Choose the distribution once instead of branching inside the loop
    distribution = stats.norm if fit_normal else stats.gumbel_r

    cases_arr = feature_perturbation_array[:, :, containment_type_idx, :, :]
    global_cases = cases_arr.sum(axis=(2, 3))  # one total per (window, model)

    # fit() returns (loc, scale) for these families; reshape keeps the zero-window case 2-D
    fitted = np.array(
        [distribution.fit(window_values)[:2] for window_values in global_cases],
        dtype=float,
    ).reshape(-1, 2)

    return fitted[:, 0].copy(), fitted[:, 1].copy()

In [None]:
# unpert_roll_win_loc_params, unpert_roll_win_scale_params = fit_parameters(feature_perturbation_array, containment_type_idx=0)
# min_cont_roll_win_loc_params, min_cont_roll_win_scale_params = fit_parameters(feature_perturbation_array, containment_type_idx=1)
# max_cont_roll_win_loc_params, max_cont_roll_win_scale_params = fit_parameters(feature_perturbation_array, containment_type_idx=2)

# Setting used for the per-window fit plots: index 0 along axis 2 (unperturbed).
containment_idx = 0

# Per-(window, model) totals for that setting: slice, then collapse the last two axes.
cases_arr = feature_perturbation_array[:, :, containment_idx, :, :]  # (523, 100, 30, 10)
global_cases = np.sum(cases_arr, axis=(2, 3))  # (523, 100)

# Fitted location / scale per rolling window for the same setting.
roll_win_loc_params, roll_win_scale_params = fit_parameters(
    feature_perturbation_array,
    containment_type_idx=containment_idx,
)

In [None]:
# Plot the fitted pdf over the histogram of per-model totals for every 50th window.
# The distribution, legend label, title and file name depend only on FIT_NORMAL, so
# they are chosen once here rather than in duplicated branches inside the loop.
if FIT_NORMAL:
    fitted_distribution = stats.norm
    pdf_label = 'Normal pdf'
    title_template = "Normal Distribution fitted on window {}\nMean: {:.4f}, Median: {:.4f}, STD: {:.4f}"
    file_template = "./normal_unfiltered_fit_win{}_cont_idx_{}.png"
else:
    fitted_distribution = stats.gumbel_r
    pdf_label = 'EV pdf'
    title_template = "EV Distribution fitted on window {}\nMean: {:.4f}, Median: {:.4f}, STD: {:.4f}"
    file_template = "./ev_unfiltered_fit_win{}_cont_idx_{}.png"

for single_window_idx in range(0, len(global_cases), 50):
    window_values = global_cases[single_window_idx].flatten()
    max_val, min_val = window_values.max(), window_values.min()
    # Evaluate the pdf slightly beyond the observed range
    x = np.linspace(max_val + 50, min_val - 50, 100)

    plt.plot(
        x,
        fitted_distribution.pdf(x, roll_win_loc_params[single_window_idx], roll_win_scale_params[single_window_idx]),
        'r-',
        label=pdf_label,
    )
    sns.histplot(x=window_values, stat="density", label="Global Summed Cases")
    # Labels were already passed to both artists but never rendered without this call
    plt.legend()

    mean = np.mean(window_values)
    median = np.median(window_values)
    std = np.std(window_values)

    title = title_template.format(single_window_idx, mean, median, std)
    plt.title(title)
    plt.savefig(file_template.format(single_window_idx, containment_idx), bbox_inches="tight", facecolor="white")

    # plt.show()
    plt.clf()

# Release the figure that was reused (and cleared) across iterations
plt.close()

## Plot Raw Global Cases Sum vs Rolling Window Trend Plot

In [None]:
# Display the shape of the ground-truth rolling-window sums used by the plots below.
rollwin_ncases.shape

In [None]:
def global_cases_vs_rolling_windows(feat_pert_arr):
    """Plot model-averaged log10 global case totals per rolling window against ground truth.

    For each setting (axis 2: 0 = unperturbed, 1 = min containment, 2 = max
    containment) the predictions are summed over the last two axes, averaged over
    models (axis 1) and converted to log10. The ground-truth curve comes from the
    notebook-level ``rollwin_ncases`` summed over its second axis. The figure is
    saved to ``./global_cases_vs_rolling_window.png`` and closed.

    NOTE: this definition reuses the name of the per-model plotting function defined
    earlier in the notebook (the one taking ``plot_type``); once this cell has run,
    the earlier function is shadowed.

    Args:
        feat_pert_arr: 5-D array of real-scale predictions; each setting's slice is
            (523, 100, 30, 10) per the notebook's shape comments.
    """
    def _log10_keep_zeros(values):
        # Zero totals stay 0. The zero-filled `out` matters: with `where=` and no
        # `out`, NumPy leaves the masked-out entries uninitialized.
        values = np.asarray(values)
        return np.log10(values, out=np.zeros_like(values, dtype=float), where=values != 0)

    def _mean_global_cases(pert_idx):
        # Sum the last two axes -> (windows, models), then average over models.
        # (np.median over axis=1 is the alternative the notebook experimented with.)
        return feat_pert_arr[:, :, pert_idx, :, :].sum(axis=(2, 3)).mean(axis=1)

    # Scale the averaged totals back down to log scale
    unpert_global_cases = _log10_keep_zeros(_mean_global_cases(0))
    min_containment_global_cases = _log10_keep_zeros(_mean_global_cases(1))
    max_containment_global_cases = _log10_keep_zeros(_mean_global_cases(2))
    x_values = list(range(len(unpert_global_cases)))

    # Ground truth: sum the per-node rolling-window cases, then log10
    summed_gt_cases = _log10_keep_zeros(rollwin_ncases.sum(axis=1))

    plt.figure(figsize=(8, 8))
    sns.lineplot(x=x_values, y=unpert_global_cases, label="Unperturbed Global Cases Sum", color="gray")
    sns.lineplot(x=x_values, y=min_containment_global_cases, label="Min Containment Global Cases Sum", color="red")
    sns.lineplot(x=x_values, y=max_containment_global_cases, label="Max Containment Global Cases Sum", color="green")
    sns.lineplot(x=x_values, y=summed_gt_cases, label="Ground Truth Global Cases Sum", linewidth=4, color="orange")
    plt.title("{} {} Models Global Cases vs Rolling Window".format(MODEL_NAME, NUM_MODELS))
    plt.xlabel("Rolling Window Index")
    plt.ylabel("Global Summed Cases (Log10 Scale)")
    plt.savefig("./global_cases_vs_rolling_window.png", bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Calls the single-argument definition from the preceding cell (model-averaged curves plus ground truth).
global_cases_vs_rolling_windows(feature_perturbation_array)

## Plot Continent-Wise Cases vs Rolling Windows

In [None]:
def continent_cases_vs_rolling_windows(feat_pert_arr):
    """Plot model-averaged log10 case totals per rolling window, one subplot per continent.

    For each setting (axis 2: 0 = unperturbed, 1 = min containment, 2 = max
    containment) the predictions are summed over axis 2 of the slice only, so the
    last (node) axis is preserved, then averaged over models and converted to log10.
    Ground truth comes from the notebook-level ``rollwin_ncases``. The 2x5 grid is
    saved to ``./continent_cases_vs_rolling_window.png`` and closed.

    Args:
        feat_pert_arr: 5-D array of real-scale predictions; each setting's slice is
            (523, 100, 30, 10) per the notebook's shape comments.
    """
    def _log10_keep_zeros(values):
        # Zero totals stay 0. The zero-filled `out` matters: with `where=` and no
        # `out`, NumPy leaves the masked-out entries uninitialized.
        values = np.asarray(values)
        return np.log10(values, out=np.zeros_like(values, dtype=float), where=values != 0)

    def _mean_node_cases(pert_idx):
        # Sum over the days axis only -> (windows, models, nodes), then average
        # over models -> (windows, nodes)
        return feat_pert_arr[:, :, pert_idx, :, :].sum(axis=2).mean(axis=1)

    # Scale the averaged per-node totals back down to log scale
    unpert_global_cases = _log10_keep_zeros(_mean_node_cases(0))
    min_containment_global_cases = _log10_keep_zeros(_mean_node_cases(1))
    max_containment_global_cases = _log10_keep_zeros(_mean_node_cases(2))
    x_values = list(range(len(unpert_global_cases)))

    # Get Ground Truth Cases
    rollwin_ncases_log10 = _log10_keep_zeros(rollwin_ncases)

    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(40, 16))
    fig.suptitle("{} Continent-Wise Summed Cases vs Rolling Windows".format(MODEL_NAME), fontsize= 30)

    # ax.flat walks the grid row by row, matching the order of `continents`
    for idx, col in enumerate(ax.flat):
        sns.lineplot(ax=col, x=x_values, y=unpert_global_cases[:,idx], label="Unperturbed Continent Cases Sum", color="gray")
        sns.lineplot(ax=col, x=x_values, y=min_containment_global_cases[:,idx], label="Min Containment Summed Continent Cases", color="red")
        sns.lineplot(ax=col, x=x_values, y=max_containment_global_cases[:,idx], label="Max Containment Summed Continent Cases", color="green")
        sns.lineplot(ax=col, x=x_values, y=rollwin_ncases_log10[:,idx], label="Ground Truth Summed Continent Cases", linewidth=4, color="orange")
        col.set_title(continents[idx])
        col.set_xlabel("Rolling Window Index")
        # Each panel shows one continent, so the axis label no longer says "Global"
        col.set_ylabel("Continent Summed Cases (Log10 Scale)")
        col.set_ylim([2.5,7.5])
    plt.savefig("./continent_cases_vs_rolling_window.png", bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Save the 2x5 per-continent figure of summed cases for all three settings plus ground truth.
continent_cases_vs_rolling_windows(feature_perturbation_array)

In [None]:
def continent_cases_percent_change_vs_rolling_windows(feat_pert_arr):
    """Plot the relative change of min/max containment vs unperturbed, per continent.

    For each setting (axis 2: 0 = unperturbed, 1 = min containment, 2 = max
    containment) the predictions are summed over the days axis, averaged over models
    and converted to log10. The plotted quantity is
    ``(perturbed_log10 - unperturbed_log10) / unperturbed_log10``. The 2x5 grid is
    saved to ``./continent_percent_change_vs_rolling_window.png`` and closed.

    NOTE(review): the change is computed on log10 values and is a fraction (not
    multiplied by 100), while the title and legend say "Percent Change in Cases".
    The computation is left unchanged here -- confirm whether a real-scale percent
    change ``100 * (perturbed - unperturbed) / unperturbed`` was intended.

    Args:
        feat_pert_arr: 5-D array of real-scale predictions; each setting's slice is
            (523, 100, 30, 10) per the notebook's shape comments.
    """
    def _log10_keep_zeros(values):
        # Zero totals stay 0. The zero-filled `out` matters: with `where=` and no
        # `out`, NumPy leaves the masked-out entries uninitialized.
        values = np.asarray(values)
        return np.log10(values, out=np.zeros_like(values, dtype=float), where=values != 0)

    def _mean_node_cases(pert_idx):
        # Sum over the days axis only -> (windows, models, nodes), then average
        # over models -> (windows, nodes)
        return feat_pert_arr[:, :, pert_idx, :, :].sum(axis=2).mean(axis=1)

    def _relative_change(perturbed, baseline):
        # Undefined where the baseline is 0: mark as NaN instead of dividing by zero
        return np.divide(
            perturbed - baseline,
            baseline,
            out=np.full_like(baseline, np.nan, dtype=float),
            where=baseline != 0,
        )

    # Scale the averaged per-node totals back down to log scale
    unpert_global_cases = _log10_keep_zeros(_mean_node_cases(0))
    min_containment_global_cases = _log10_keep_zeros(_mean_node_cases(1))
    max_containment_global_cases = _log10_keep_zeros(_mean_node_cases(2))
    x_values = list(range(len(unpert_global_cases)))

    # Get relative changes with respect to the unperturbed predictions
    min_cont_percent_changes = _relative_change(min_containment_global_cases, unpert_global_cases)
    max_cont_percent_changes = _relative_change(max_containment_global_cases, unpert_global_cases)

    fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(40, 16))
    fig.suptitle("{} Continent-Wise Percent Change in Cases vs Rolling Windows".format(MODEL_NAME), fontsize= 30)

    # ax.flat walks the grid row by row, matching the order of `continents`
    for idx, col in enumerate(ax.flat):
        sns.lineplot(ax=col, x=x_values, y=min_cont_percent_changes[:,idx], label="Min Containment Percent Change", color="red")
        sns.lineplot(ax=col, x=x_values, y=max_cont_percent_changes[:,idx], label="Max Containment Percent Change", color="green")
        col.set_title(continents[idx])
        col.set_xlabel("Rolling Window Index")
        # The previous label ("Global Summed Cases (Log10 Scale)") was copied from
        # the cases plot and did not describe the plotted quantity
        col.set_ylabel("Relative Change in Log10 Summed Cases")
    plt.savefig("./continent_percent_change_vs_rolling_window.png", bbox_inches='tight', facecolor='white')
    plt.clf()
    plt.close()

In [None]:
# Save the 2x5 per-continent figure comparing min/max containment against the unperturbed predictions.
continent_cases_percent_change_vs_rolling_windows(feature_perturbation_array)

In [None]:
# One slice per setting along axis 2: 0 = unperturbed, 1 = min containment, 2 = max containment.
# Each slice is (523, 100, 30, 10) per the original shape comment.
unpert_cases_arr, min_containment_cases_arr, max_containment_cases_arr = (
    feature_perturbation_array[:, :, pert_idx, :, :] for pert_idx in range(3)
)

# Collapse the last two axes into one total per (window, model), then average over
# the model axis to get a single real-scale value per rolling window.
unpert_global_cases, min_containment_global_cases, max_containment_global_cases = (
    setting_cases.sum(axis=(2, 3)).mean(axis=1)
    for setting_cases in (unpert_cases_arr, min_containment_cases_arr, max_containment_cases_arr)
)
# unpert_global_cases = np.median(unpert_global_cases, axis=1)
# min_containment_global_cases = np.median(min_containment_global_cases, axis=1)
# max_containment_global_cases = np.median(max_containment_global_cases, axis=1)

In [None]:
# Spot-check one rolling window: unperturbed, min containment, max containment (real scale).
temp_idx = 100
print(*(
    averaged_cases[temp_idx]
    for averaged_cases in (unpert_global_cases, min_containment_global_cases, max_containment_global_cases)
))