In [2]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from scipy.stats import ks_2samp
import itertools
from torch.utils.data import DataLoader, TensorDataset



In [9]:
class DataProcessor():
    """Load the samples for one particle and expose them as ``self.dataset``.

    The source is selected by ``datatype``:

    * ``'geant'`` -- rows from the 90 % mark to the end of the HDF5 file
      ``/fast_scratch_1/QVAE/data/calo/<particle>.hdf5``, with the three
      ``layer_*`` datasets flattened per row and concatenated.
    * ``'rbm'`` / ``'qpu'`` -- generated samples read with ``torch.load`` from
      the ``RBM`` / ``DWAVE`` sub-directory of the saved-samples folder,
      returned as a double tensor in random row order.

    Attributes:
        CLASS_ONE: the ``particle`` string given to the constructor.
        datatype: the ``datatype`` string given to the constructor.
        gen_type: ``None`` for ``'geant'``, otherwise the sub-directory name.
        raw: read-only ``h5py.File`` handle on the particle's HDF5 file.
        dataset: the tensor produced for the selected ``datatype``.

    Raises:
        ValueError: if ``datatype`` is not ``'geant'``, ``'rbm'`` or ``'qpu'``.
    """

    # Public ``datatype`` keyword -> sub-directory of the saved samples.
    # ``None`` means the data comes from the HDF5 file instead.
    _GEN_TYPES = {'geant': None, 'rbm': 'RBM', 'qpu': 'DWAVE'}

    # Fraction of the HDF5 rows that is skipped; only the rows after this
    # point are returned by ``get_geant_dataset``.
    _GEANT_CUTOFF_FRACTION = 0.9

    def __init__(self, particle, datatype = 'geant'):
        super().__init__()
        # Validate before touching the filesystem so that a typo fails loudly
        # instead of yielding an object with no ``dataset`` attribute.
        if datatype not in self._GEN_TYPES:
            raise ValueError(
                f"Invalid datatype {datatype!r}; expected one of {sorted(self._GEN_TYPES)}"
            )
        self.CLASS_ONE = particle
        self.datatype = datatype
        # NOTE(review): the handle is opened for every datatype and is never
        # closed explicitly. It is kept because ``raw`` is a public attribute.
        self.raw = h5py.File('/fast_scratch_1/QVAE/data/calo/'+self.CLASS_ONE+'.hdf5', 'r')
        self.gen_type = self._GEN_TYPES[datatype]
        if self.gen_type is None:
            self.dataset = self.get_geant_dataset()
        else:
            self.dataset = self.get_gen_dataset()

    def flattened_class_data(self, class_num):
        """Flatten and concatenate the three layers of ``class_num``.

        Args:
            class_num: mapping with keys ``'layer_0'``, ``'layer_1'`` and
                ``'layer_2'``; each value is converted with ``np.array``.

        Returns:
            A 2-D tensor: every layer is flattened from dim 1 onwards and the
            results are concatenated along dim 1.
        """
        flat_layers = [
            torch.flatten(torch.tensor(np.array(class_num[key])), start_dim=1)
            for key in ('layer_0', 'layer_1', 'layer_2')
        ]
        return torch.cat(flat_layers, dim=1)

    def get_geant_dataset(self):
        """Return the last 10 % of the flattened rows of ``self.raw``."""
        full_dataset = self.flattened_class_data(self.raw)
        # Index of the first row that is kept (rows before it are skipped).
        cutoff = int(full_dataset.size(0) * self._GEANT_CUTOFF_FRACTION)
        return full_dataset[cutoff:]

    def get_gen_dataset(self):
        """Load the generated samples for ``self.gen_type`` in random row order.

        Returns:
            The loaded tensors concatenated along dim 0, cast to double, with
            the rows randomly permuted.
        """
        gen_dir = f'/home/blazerjia/Publication/Saved_samples/{self.gen_type}/{self.CLASS_ONE}/synthetic_images_{self.CLASS_ONE}.pt'
        # NOTE: torch.load unpickles the file; only load sample files that
        # come from a trusted location.
        gen_data = torch.load(gen_dir)
        gen_tensor = torch.cat(gen_data).double()
        # Shuffle rows with a random index permutation instead of pushing
        # the whole tensor through a single-batch DataLoader.
        permutation = torch.randperm(gen_tensor.size(0))
        return gen_tensor[permutation]



In [10]:

def chi2_ndf_histogram_density_1(ground_truth, generated_samples, num_bins):
    """Chi-square per degree of freedom between two density histograms.

    Both samples are histogrammed on ``num_bins`` equal-width bins spanning
    the combined range of the two inputs. Empty bins are given a count of
    0.1, each histogram is normalised to a density, and the squared density
    difference is weighted by the square of the ``ground_truth`` error
    ``sqrt(count)`` (scaled like the density).

    Args:
        ground_truth: 1-D array-like of reference values.
        generated_samples: 1-D array-like of values to compare.
        num_bins: number of histogram bins.

    Returns:
        chi2 divided by ``num_bins - 1``.
    """
    lo = min(np.min(ground_truth), np.min(generated_samples))
    hi = max(np.max(ground_truth), np.max(generated_samples))
    edges = np.linspace(lo, hi, num_bins + 1)
    width = (hi - lo) / num_bins

    # Replace empty bins by a small count so the error term is never zero.
    floor = 0.1
    filled = []
    for sample in (ground_truth, generated_samples):
        counts = np.histogram(sample, bins=edges)[0]
        filled.append(np.where(counts == 0, floor, counts))
    ref_counts, gen_counts = filled

    ref_density = ref_counts / (np.sum(ref_counts) * width)
    gen_density = gen_counts / (np.sum(gen_counts) * width)
    ref_sigma = np.sqrt(ref_counts) / (np.sum(ref_counts) * width)

    residual = ref_density - gen_density
    chi2 = np.sum(residual ** 2 / ref_sigma ** 2)

    # One parameter is subtracted from the bin count.
    ndf = num_bins - 1
    return chi2 / ndf

def chi2_ndf_histogram_density_2(ground_truth, generated_samples, num_bins):
    """Chi-square per degree of freedom between two count histograms.

    Both samples are histogrammed on ``num_bins`` equal-width bins spanning
    the combined range of the two inputs. The ``generated_samples`` counts
    are rescaled to the total of the ``ground_truth`` counts, empty bins are
    then set to 0.1, and the squared count difference is weighted by the
    square of the ``ground_truth`` error ``sqrt(count)``.

    Args:
        ground_truth: 1-D array-like of reference values.
        generated_samples: 1-D array-like of values to compare.
        num_bins: number of histogram bins.

    Returns:
        chi2 divided by ``num_bins - 1``.
    """
    lo = min(np.min(ground_truth), np.min(generated_samples))
    hi = max(np.max(ground_truth), np.max(generated_samples))
    edges = np.linspace(lo, hi, num_bins + 1)

    ref_counts = np.histogram(ground_truth, bins=edges)[0]
    gen_counts = np.histogram(generated_samples, bins=edges)[0]

    # Bring the generated histogram to the same total as the reference.
    scale = np.sum(ref_counts) / np.sum(gen_counts)
    gen_counts = gen_counts * scale

    # Replace empty bins by a small count so the error term is never zero.
    floor = 0.1
    ref_counts = np.where(ref_counts == 0, floor, ref_counts)
    gen_counts = np.where(gen_counts == 0, floor, gen_counts)

    ref_sigma = np.sqrt(ref_counts)
    chi2 = np.sum((ref_counts - gen_counts) ** 2 / ref_sigma ** 2)

    # One parameter is subtracted from the bin count.
    ndf = num_bins - 1
    return chi2 / ndf

In [5]:
# Particle labels; each one is passed to DataProcessor as ``particle``.
particles = [
    'gamma',
    'eplus',
    'piplus',
]
# Data sources understood by DataProcessor. The order matters: pairs are
# built with itertools.combinations, so the earlier entry of each pair is
# passed as the first (reference) sample to the chi2 functions.
datatypes = [
    'geant',
    'qpu',
    'rbm',
]

In [6]:
# The suffixes _1 and _2 select the chi2/NDF implementation that is used:
# chi2_ndf_histogram_density_1 or chi2_ndf_histogram_density_2.

def _row_sum(dataset):
    """Sum each row of ``dataset`` over dim 1 and return a NumPy array."""
    return torch.sum(dataset, dim=1).detach().numpy()


def _nonzero_fraction(dataset):
    """Return, per row, the fraction of entries that are non-zero (NumPy array)."""
    return ((dataset != 0).sum(dim=1).float() / dataset.size(1)).numpy()


def _measure_chi2_ndf(particle, datatype_1, datatype_2, num_bins, observable, chi2_ndf_fn):
    """Load two datasets, reduce each to one value per row, and compare them.

    Args:
        particle: particle label forwarded to ``DataProcessor``.
        datatype_1: datatype of the first (reference) sample.
        datatype_2: datatype of the second sample.
        num_bins: number of histogram bins forwarded to ``chi2_ndf_fn``.
        observable: callable mapping a dataset tensor to a 1-D NumPy array.
        chi2_ndf_fn: callable ``(reference, other, num_bins) -> chi2/NDF``.

    Returns:
        The value returned by ``chi2_ndf_fn``; it is also printed.
    """
    values_1 = observable(DataProcessor(particle, datatype_1).dataset)
    values_2 = observable(DataProcessor(particle, datatype_2).dataset)
    chi2_ndf_value = chi2_ndf_fn(values_1, values_2, num_bins)
    print(f"Chi-square per degree of freedom (χ²/NDF) for {particle} between {datatype_1} and {datatype_2}: {chi2_ndf_value:.3g}")
    return chi2_ndf_value


def measure_chi2_ndf_energy_1(particle, datatype_1, datatype_2, num_bins = 100):
    """chi2/NDF of the per-row sums, using ``chi2_ndf_histogram_density_1``."""
    return _measure_chi2_ndf(particle, datatype_1, datatype_2, num_bins,
                             _row_sum, chi2_ndf_histogram_density_1)


def measure_chi2_ndf_energy_2(particle, datatype_1, datatype_2, num_bins = 100):
    """chi2/NDF of the per-row sums, using ``chi2_ndf_histogram_density_2``."""
    return _measure_chi2_ndf(particle, datatype_1, datatype_2, num_bins,
                             _row_sum, chi2_ndf_histogram_density_2)


def measure_chi2_ndf_sparsity_1(particle, datatype_1, datatype_2, num_bins = 100):
    """chi2/NDF of the per-row non-zero fraction, using ``chi2_ndf_histogram_density_1``."""
    return _measure_chi2_ndf(particle, datatype_1, datatype_2, num_bins,
                             _nonzero_fraction, chi2_ndf_histogram_density_1)


def measure_chi2_ndf_sparsity_2(particle, datatype_1, datatype_2, num_bins = 100):
    """chi2/NDF of the per-row non-zero fraction, using ``chi2_ndf_histogram_density_2``."""
    return _measure_chi2_ndf(particle, datatype_1, datatype_2, num_bins,
                             _nonzero_fraction, chi2_ndf_histogram_density_2)

In [11]:
# Row-sum ("energy") comparison with chi2 variant 1. Results are stored as
# energy_chi2_ndf_dict[particle]['<datatype_1>_<datatype_2>'] for every
# unordered pair of datatypes.
print("Energy: Method 1")
energy_chi2_ndf_dict = {}
for particle in particles:
    energy_chi2_ndf_dict[particle] = {}
    for datatype_1, datatype_2 in itertools.combinations(datatypes, 2):
        chi2_ndf_value = measure_chi2_ndf_energy_1(
            particle, datatype_1, datatype_2, num_bins=50
        )
        energy_chi2_ndf_dict[particle].update(
            {f'{datatype_1}_{datatype_2}': chi2_ndf_value}
        )

Energy: Method 1
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and qpu: 4.94
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and rbm: 9.13
Chi-square per degree of freedom (χ²/NDF) for gamma between qpu and rbm: 1.04
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and qpu: 4.9
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and rbm: 4.11
Chi-square per degree of freedom (χ²/NDF) for eplus between qpu and rbm: 1.42
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and qpu: 9.87
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and rbm: 7.53
Chi-square per degree of freedom (χ²/NDF) for piplus between qpu and rbm: 1.91


In [12]:
# Row-sum ("energy") comparison with chi2 variant 2, stored as
# energy_chi2_ndf_dict[particle]['<datatype_1>_<datatype_2>'].
# NOTE: this rebinds energy_chi2_ndf_dict, the same name that the
# "Energy: Method 1" cell fills, so only one variant's results are kept.
print("Energy: Method 2")
energy_chi2_ndf_dict = {}
for particle in particles:
    energy_chi2_ndf_dict[particle] = {}
    for datatype_1, datatype_2 in itertools.combinations(datatypes, 2):
        chi2_ndf_value = measure_chi2_ndf_energy_2(
            particle, datatype_1, datatype_2, num_bins=50
        )
        energy_chi2_ndf_dict[particle].update(
            {f'{datatype_1}_{datatype_2}': chi2_ndf_value}
        )

Energy: Method 2
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and qpu: 4.94
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and rbm: 9.13
Chi-square per degree of freedom (χ²/NDF) for gamma between qpu and rbm: 1.04
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and qpu: 4.9
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and rbm: 4.11
Chi-square per degree of freedom (χ²/NDF) for eplus between qpu and rbm: 1.42
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and qpu: 9.87
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and rbm: 7.52
Chi-square per degree of freedom (χ²/NDF) for piplus between qpu and rbm: 1.9


In [13]:
# Non-zero-fraction ("sparsity") comparison with chi2 variant 1. Results are
# stored as sparsity_chi2_ndf_dict[particle]['<datatype_1>_<datatype_2>']
# for every unordered pair of datatypes.
print("Sparsity: Method 1")
sparsity_chi2_ndf_dict = {}
for particle in particles:
    sparsity_chi2_ndf_dict[particle] = {}
    for datatype_1, datatype_2 in itertools.combinations(datatypes, 2):
        chi2_ndf_value = measure_chi2_ndf_sparsity_1(
            particle, datatype_1, datatype_2, num_bins=50
        )
        sparsity_chi2_ndf_dict[particle].update(
            {f'{datatype_1}_{datatype_2}': chi2_ndf_value}
        )

Sparsity: Method 1
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and qpu: 28.4
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and rbm: 28.6
Chi-square per degree of freedom (χ²/NDF) for gamma between qpu and rbm: 1.39
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and qpu: 25.1
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and rbm: 104
Chi-square per degree of freedom (χ²/NDF) for eplus between qpu and rbm: 1.14
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and qpu: 63.2
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and rbm: 57.8
Chi-square per degree of freedom (χ²/NDF) for piplus between qpu and rbm: 2.87


In [14]:
# Non-zero-fraction ("sparsity") comparison with chi2 variant 2, stored as
# sparsity_chi2_ndf_dict[particle]['<datatype_1>_<datatype_2>'].
# NOTE: this rebinds sparsity_chi2_ndf_dict, the same name that the
# "Sparsity: Method 1" cell fills, so only one variant's results are kept.
print("Sparsity: Method 2")
sparsity_chi2_ndf_dict = {}
for particle in particles:
    sparsity_chi2_ndf_dict[particle] = {}
    for datatype_1, datatype_2 in itertools.combinations(datatypes, 2):
        chi2_ndf_value = measure_chi2_ndf_sparsity_2(
            particle, datatype_1, datatype_2, num_bins=50
        )
        sparsity_chi2_ndf_dict[particle].update(
            {f'{datatype_1}_{datatype_2}': chi2_ndf_value}
        )


Sparsity: Method 2
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and qpu: 28.4
Chi-square per degree of freedom (χ²/NDF) for gamma between geant and rbm: 28.6
Chi-square per degree of freedom (χ²/NDF) for gamma between qpu and rbm: 1.39
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and qpu: 25.1
Chi-square per degree of freedom (χ²/NDF) for eplus between geant and rbm: 104
Chi-square per degree of freedom (χ²/NDF) for eplus between qpu and rbm: 1.14
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and qpu: 63.2
Chi-square per degree of freedom (χ²/NDF) for piplus between geant and rbm: 57.8
Chi-square per degree of freedom (χ²/NDF) for piplus between qpu and rbm: 2.87
