Normalize the dataset using sklearn standard scaler and save the scaled data to disk

In [1]:
import h5py
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
ef = h5py.File('/fast_scratch/QVAE/data/calo/eplus.hdf5','r')
gf = h5py.File('/fast_scratch/QVAE/data/calo/gamma.hdf5','r')
pf = h5py.File('/fast_scratch/QVAE/data/calo/piplus.hdf5','r')

In [3]:
for key in ef.keys():
    print(key, ef[key].shape, ef[key].dtype)

energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64


In [4]:
hdfs = [ef, gf, pf]

In [5]:
nplcats = []
for i, hdf in enumerate(hdfs):
    npl0 = np.array(hdf['layer_0'])
    npl1 = np.array(hdf['layer_1'])
    npl2 = np.array(hdf['layer_2'])
    
    npl0 = npl0.reshape(npl0.shape[0], -1)
    npl1 = npl1.reshape(npl1.shape[0], -1)
    npl2 = npl2.reshape(npl2.shape[0], -1)
    
    nplcats.append(np.concatenate([npl0, npl1, npl2], axis=1))

In [6]:
nplcatscaled = []
epsilon = 1e-2

for i in range(len(nplcats)):
    nparr = nplcats[i]
    nparr = np.where(nparr > 0., nparr, np.nan)
    transformer = StandardScaler().fit(nparr)
    nparr = transformer.transform(nparr)
    
    nparr = np.where(np.isnan(nparr), np.inf, nparr)
    
    for j in range(nparr.shape[1]):
        arrmin = np.amin(nparr[:, j])
        if arrmin < 0 and not np.isnan(arrmin):
            nparr[:, j] -= arrmin
            nparr[:, j] += epsilon
            
    nparr = np.where(np.isinf(nparr), 0, nparr)
    
    for j in range(nparr.shape[1]):
        arrmin = np.amin(nparr[:, j])
        if arrmin < 0:
            print(j, arrmin)
            
    nplcatscaled.append(nparr)

In [7]:
ef_scaled = h5py.File('/fast_scratch/QVAE/data/calo_scaled/eplus.hdf5','w')
gf_scaled = h5py.File('/fast_scratch/QVAE/data/calo_scaled/gamma.hdf5','w')
pf_scaled = h5py.File('/fast_scratch/QVAE/data/calo_scaled/piplus.hdf5','w')

In [8]:
hdfs_scaled = [ef_scaled, gf_scaled, pf_scaled]

In [9]:
layer_shapes = {}
for key in hdf.keys():
    if key == "energy" or key == "overflow":
        pass
    else:
        layer_shapes[key] = hdf[key].shape

In [10]:
layer_shapes

{'layer_0': (100000, 3, 96),
 'layer_1': (100000, 12, 12),
 'layer_2': (100000, 12, 6)}

In [11]:
for hdf, hdf_scaled, scaled_data in zip(hdfs, hdfs_scaled, nplcatscaled):
    offset = 0
    for key in hdf.keys():
        if key == "energy" or key == "overflow":
            hdf_scaled.create_dataset(key, data=hdf[key])
        else:
            layer_shape = layer_shapes[key]
            print(scaled_data.shape)
            layer_data = scaled_data[:, offset:offset+(layer_shape[1]*layer_shape[2])]
            print(layer_data.shape)
            layer_data = layer_data.reshape(layer_shape)
            hdf_scaled.create_dataset(key, data=layer_data)
            offset += layer_shape[1]*layer_shape[2]

(100000, 504)
(100000, 288)
(100000, 504)
(100000, 144)
(100000, 504)
(100000, 72)
(100000, 504)
(100000, 288)
(100000, 504)
(100000, 144)
(100000, 504)
(100000, 72)
(100000, 504)
(100000, 288)
(100000, 504)
(100000, 144)
(100000, 504)
(100000, 72)


In [12]:
for hdf_scaled in hdfs_scaled:
    for key in hdf_scaled.keys():
        print(key, hdf_scaled[key].shape, hdf_scaled[key].dtype)

energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64
energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64
energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64


In [13]:
for hdf_scaled in hdfs_scaled:
    hdf_scaled.close()

In [14]:
for nplcat in nplcatscaled:
    print(nplcat.shape)

(100000, 504)
(100000, 504)
(100000, 504)


In [15]:
transformers

NameError: name 'transformers' is not defined