# August 29 - Comparing actual vs synthetic samples

In [None]:
# Default imports
import math
import os
import sys
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import matplotlib.ticker as mtick

# Make modules located one directory above the current working directory
# importable from this notebook (appended only once, so re-running the cell
# does not grow sys.path).
working_dir = os.getcwd()
par_dir = os.path.abspath(os.path.join(working_dir, os.pardir))
already_on_path = par_dir in sys.path
if not already_on_path:
    sys.path.append(par_dir)
    
# Import the custom plotting module
from plot_utils import plot_utils
import random
import torch
from plot_utils import notebook_utils_2

## Steps : 

1. Construct a dataset object and randomly sample 65536 events from the actual dataset
2. Load the synthetic samples generated by the model with 128 latent dimensions.
3. Plot the 1D histogram for both the actual randomly sampled events and the synthetically generated events
4. Compare the histograms using the KS test

### 1. Actual dataset input

In [None]:
## Dataset imports
from torch.utils.data import DataLoader
from io_utils.data_handling import WCH5Dataset
from torch.utils.data.sampler import SubsetRandomSampler

In [None]:
# Build the dataset object from the HDF5 file.
# NOTE(review): the four positional fractions and the mode string are passed
# through unchanged; their meaning is defined by WCH5Dataset — confirm there.
dset = WCH5Dataset(
    "/fast_scratch/nuVAE_data/3Mevents_IWCD.h5",
    0.2, 0.1, 0.1, 0.1,
    "train_ae_or_vae_only",
    shuffle=True,
    reduced_dataset_size=2000000,
)

# Draw batches of 1024 events at random from the dataset's test indices.
# shuffle stays False because the sampler already decides the order.
test_sampler = SubsetRandomSampler(dset.test_indices)
test_iter = DataLoader(dset, batch_size=1024, shuffle=False, sampler=test_sampler)

In [None]:
# Collect 64 batches (64 x 1024 = 65536 events) from the test loader.
actual_events = []
test_dset_iter = iter(test_iter)

for _ in range(64):
    data = next(test_dset_iter)
    # First element of the batch, restricted to the first 19 entries of the
    # last axis, cast to float32.
    # NOTE(review): presumably data[0] is the event tensor and the first 19
    # channels are the charges — confirm against WCH5Dataset.
    batch_events = data[0]
    actual_events.append(batch_events[:, :, :, :19].float())

In [None]:
# Sanity check: number of batches collected (the loop above requests 64)
print(len(actual_events))

In [None]:
# Sanity check: shape of the first collected batch tensor
print(actual_events[0].size())

In [None]:
# Replace every batch tensor in the list by its flattened 1D NumPy array.
# Assigning by index keeps the same list object, so this happens in place.
for idx, batch in enumerate(actual_events):
    actual_events[idx] = batch.numpy().ravel()

print(actual_events[0].shape)

In [None]:
# Join the per-batch arrays into one flat 1D array of values.
# np.concatenate is used instead of np.array(...).ravel() so the result is
# a flat numeric array even when the batches do not all have the same
# length (np.array would then produce an object array or raise).
actual_events_np = np.concatenate([np.ravel(batch) for batch in actual_events])
print(actual_events_np.shape)

### 2. Synthetic dataset input

In [None]:
# Locate the .npz archive of samples written by the model run.
dump_dir = "/home/akajal/WatChMaL/VAE/dumps/" + "20190829_224606" + "/"
model_status = "trained"
np_arr_path = os.path.join(dump_dir, "samples", "ENet" + "_" + model_status + ".npz")

# np.load returns an NpzFile for .npz archives, which holds an open file
# handle; the context manager closes it once the arrays have been read.
with np.load(np_arr_path) as np_arr:
    np_samples = np_arr["samples"]
    np_labels = np_arr["predicted_labels"]
    np_energies = np_arr["predicted_energies"]

In [None]:
# Sanity check: shape of the loaded synthetic samples array
print(np_samples.shape)

In [None]:
# Flatten the synthetic samples into a single 1D array of values so they can
# be histogrammed the same way as the actual events.
synthetic_events = np.ravel(np_samples)
print(synthetic_events.shape)

### 3. Plot the histograms for both datasets overlaid

In [None]:
# Number of points handed to np.linspace when building the histogram bin
# edges in the plotting cells (num_bins edges give num_bins - 1 bins).
num_bins = 100

In [None]:
# Overlay the histograms of the actual and synthetic values with a
# log-scaled y axis, save the figure as SVG and display it.
fig, ax = plt.subplots(figsize=(16,9),facecolor="w")
ax.tick_params(axis="both", labelsize=20)

# Shared bin edges so both histograms are directly comparable. The lower edge
# is the smallest of both minima and 1; the upper edge is the larger maximum.
# np.linspace returns num_bins edges, i.e. num_bins - 1 bins.
bins = np.linspace(min(np.amin(actual_events_np),np.amin(synthetic_events),1),
                   max(np.amax(actual_events_np),np.amax(synthetic_events)),
                   num_bins)

# Plot the histograms overlaid (raw counts, semi-transparent)
ax.hist(actual_events_np, bins, density=False,
        label="Actual simulated events", color="red",
        alpha=0.5, stacked=True)

ax.hist(synthetic_events, bins, density=False,
        label="Synthetic generated events", color="blue",
        alpha=0.5, stacked=True)

# Setup the axes
ax.set_xlabel("Charge, c", fontsize=20)
ax.set_ylabel("Number of hits", fontsize=20)

ax.margins(0.2)
ax.grid(True)

ax.set_yscale("log")
ax.legend(loc="upper right", prop={"size":20})
# 64 batches x 1024 events = 65536 samples
ax.set_title(r"Actual vs Synthetic Charge Distributions (65536 samples)",fontsize=20)

# Save BEFORE show(): with the inline / non-interactive backends show()
# releases the figure, so a later savefig would write an empty image.
# NOTE: the other actual-vs-synthetic plot cells write to this same filename,
# so whichever cell runs last determines the file content.
fig.savefig("actual_vs_synthethic_distribution_ld_128.svg", format="svg", dpi=300)

plt.show()

In [None]:
# Overlay the histograms of the actual and synthetic values on linear axes,
# save the figure as SVG and display it.
fig, ax = plt.subplots(figsize=(16,9),facecolor="w")
ax.tick_params(axis="both", labelsize=20)

# Shared bin edges so both histograms are directly comparable. The lower edge
# is the smallest of both minima and 1; the upper edge is the larger maximum.
# np.linspace returns num_bins edges, i.e. num_bins - 1 bins.
bins = np.linspace(min(np.amin(actual_events_np),np.amin(synthetic_events),1),
                   max(np.amax(actual_events_np),np.amax(synthetic_events)),
                   num_bins)

# Plot the histograms overlaid (raw counts, semi-transparent)
ax.hist(actual_events_np, bins, density=False,
        label="Actual simulated events", color="red",
        alpha=0.5, stacked=True)

ax.hist(synthetic_events, bins, density=False,
        label="Synthetic generated events", color="blue",
        alpha=0.5, stacked=True)

# Setup the axes
ax.set_xlabel("Charge, c", fontsize=20)
ax.set_ylabel("Number of hits", fontsize=20)

ax.margins(0.2)
ax.grid(True)

ax.legend(loc="upper right", prop={"size":20})
# 64 batches x 1024 events = 65536 samples
ax.set_title(r"Actual vs Synthetic Charge Distributions (65536 samples)",fontsize=20)

# Save BEFORE show(): with the inline / non-interactive backends show()
# releases the figure, so a later savefig would write an empty image.
# NOTE: the other actual-vs-synthetic plot cells write to this same filename,
# so whichever cell runs last determines the file content.
fig.savefig("actual_vs_synthethic_distribution_ld_128.svg", format="svg", dpi=300)

plt.show()

In [None]:
# Overlay the histograms of the actual and synthetic values with both axes
# log-scaled, save the figure as SVG and display it.
fig, ax = plt.subplots(figsize=(16,9),facecolor="w")
ax.tick_params(axis="both", labelsize=20)

# Shared bin edges so both histograms are directly comparable. The lower edge
# is the smallest of both minima and 1; the upper edge is the larger maximum.
# np.linspace returns num_bins edges, i.e. num_bins - 1 bins.
# NOTE(review): the edges are linearly spaced although the x axis below is
# log-scaled; geometrically spaced edges may be intended — confirm.
bins = np.linspace(min(np.amin(actual_events_np),np.amin(synthetic_events),1),
                   max(np.amax(actual_events_np),np.amax(synthetic_events)),
                   num_bins)

# Plot the histograms overlaid (raw counts, semi-transparent)
ax.hist(actual_events_np, bins, density=False,
        label="Actual simulated events", color="red",
        alpha=0.5, stacked=False)

ax.hist(synthetic_events, bins, density=False,
        label="Synthetic generated events", color="blue",
        alpha=0.5, stacked=False)

# Setup the axes
ax.set_xlabel("Charge, c", fontsize=20)
ax.set_ylabel("Number of hits", fontsize=20)

ax.margins(0.2)
ax.grid(True)

ax.set_yscale("log")
ax.set_xscale("log")
ax.legend(loc="upper right", prop={"size":20})
# 64 batches x 1024 events = 65536 samples
ax.set_title(r"Actual vs Synthetic Charge Distributions (65536 samples)",fontsize=20)

# Save BEFORE show(): with the inline / non-interactive backends show()
# releases the figure, so a later savefig would write an empty image.
# NOTE: the other actual-vs-synthetic plot cells write to this same filename,
# so whichever cell runs last determines the file content.
fig.savefig("actual_vs_synthethic_distribution_ld_128.svg", format="svg", dpi=300)

plt.show()

## Attempting to do the KS test using scipy.stats.kstest

In [None]:
# Draw density-normalised histograms of both samples on a fresh figure and
# keep the returned bin heights and bin edges for inspection below.
fig, ax = plt.subplots(figsize=(16,9),facecolor="w")
ax.tick_params(axis="both", labelsize=20)

# Common bin edges: from the smallest of (both minima, 1) up to the larger
# of the two maxima, with num_bins evenly spaced edges.
lower_edge = min(actual_events_np.min(), synthetic_events.min(), 1)
upper_edge = max(actual_events_np.max(), synthetic_events.max())
bins = np.linspace(lower_edge, upper_edge, num_bins)

# Options shared by both histogram calls
hist_options = dict(density=True, alpha=0.5, stacked=False)

actual_values, actual_bins, _ = ax.hist(
    actual_events_np, bins,
    label="Actual simulated events", color="red",
    **hist_options)

synthetic_values, synthetic_bins, _ = ax.hist(
    synthetic_events, bins,
    label="Synthetic generated events", color="blue",
    **hist_options)

In [None]:
# Per-bin heights of the actual-events histogram (density=True, so these are
# normalised densities rather than raw counts)
print(actual_values)

In [None]:
# Bin edges returned by the histogram call for the actual events
print(actual_bins)

## Simply use `scipy.stats.ks_2samp()` to calculate the KS statistic

In [None]:
import scipy.stats as stats

In [None]:
# Two-sample Kolmogorov-Smirnov test on the raw (unbinned) values of the
# actual and synthetic samples; returns the KS statistic and the p-value.
ks_statistic_1, p_value_1 = stats.ks_2samp(actual_events_np, synthetic_events)

In [None]:
# Report the actual-vs-synthetic KS test result
print("KS statistic for actual vs synthetic datasets : ", ks_statistic_1)
print("p value for actual vs synthetic datasets : ", p_value_1)

### Now compare this ks_statistic with one calculated from two random samples from the actual dataset

In [None]:
# Build a second, independent sample of actual events, drawn from the
# validation indices, to serve as a baseline for the KS statistic.
# Create the dataset
dset=WCH5Dataset("/fast_scratch/nuVAE_data/3Mevents_IWCD.h5",
                 0.2, 0.1, 0.1, 0.1, "train_ae_or_vae_only",
                 shuffle=True, reduced_dataset_size=2000000)

# Draw batches of 1024 events at random from the dataset's validation indices
val_iter=DataLoader(dset, batch_size=1024,
                    shuffle=False,
                    sampler=SubsetRandomSampler(dset.val_indices))

# Collect 64 batches (64 x 1024 = 65536 events); each batch is reduced to the
# first 19 entries of its last axis, cast to float32 and flattened to a 1D
# NumPy array in a single pass.
actual_events_2 = []
val_dset_iter = iter(val_iter)

for _ in range(64):
    data = next(val_dset_iter)
    actual_events_2.append(data[0][:,:,:,:19].float().numpy().ravel())

# np.concatenate (rather than np.array(...).ravel()) yields a flat numeric
# array even when the batches do not all have the same length.
actual_events_np_2 = np.concatenate(actual_events_2)

In [None]:
# Baseline: two-sample KS test between the two samples of actual events
# (test-index sample vs validation-index sample).
ks_statistic_2, p_value_2 = stats.ks_2samp(actual_events_np, actual_events_np_2)

# Report the actual-vs-actual KS test result
print("KS statistic for actual vs actual datasets : ", ks_statistic_2)
print("p value for actual vs actual datasets : ", p_value_2)

In [None]:
# Overlay the histograms of the two samples of actual events with both axes
# log-scaled, save the figure as SVG and display it.
fig, ax = plt.subplots(figsize=(16,9),facecolor="w")
ax.tick_params(axis="both", labelsize=20)

# Shared bin edges so both histograms are directly comparable. The lower edge
# is the smallest of both minima and 1; the upper edge is the larger maximum.
# np.linspace returns num_bins edges, i.e. num_bins - 1 bins.
# NOTE(review): the edges are linearly spaced although the x axis below is
# log-scaled; geometrically spaced edges may be intended — confirm.
bins = np.linspace(min(np.amin(actual_events_np),np.amin(actual_events_np_2),1),
                   max(np.amax(actual_events_np),np.amax(actual_events_np_2)),
                   num_bins)

# Plot the histograms overlaid (raw counts, semi-transparent)
ax.hist(actual_events_np, bins, density=False,
        label="Actual simulated events 1", color="red",
        alpha=0.5, stacked=False)

ax.hist(actual_events_np_2, bins, density=False,
        label="Actual simulated events 2", color="blue",
        alpha=0.5, stacked=False)

# Setup the axes
ax.set_xlabel("Charge, c", fontsize=20)
ax.set_ylabel("Number of hits", fontsize=20)

ax.margins(0.2)
ax.grid(True)

ax.set_yscale("log")
ax.set_xscale("log")
ax.legend(loc="upper right", prop={"size":20})
# 64 batches x 1024 events = 65536 samples
ax.set_title(r"Actual vs Actual Charge Distributions (65536 samples)",fontsize=20)

# Save BEFORE show(): with the inline / non-interactive backends show()
# releases the figure, so a later savefig would write an empty image.
fig.savefig("actual_vs_actual_distribution_ld_128.svg", format="svg", dpi=300)

plt.show()