In [1]:
import h5py
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Open the eplus / gamma / piplus input files read-only. The handles are kept
# open because the following cells read datasets straight from them.
ef, gf, pf = (
    h5py.File(input_path, 'r')
    for input_path in (
        '/fast_scratch/QVAE/data/calo/eplus.hdf5',
        '/fast_scratch/QVAE/data/calo/gamma.hdf5',
        '/fast_scratch/QVAE/data/calo/piplus.hdf5',
    )
)

In [3]:
# For every input file, load the three layer datasets, flatten each one to 2-D
# (first axis kept, all remaining axes merged) and join them column-wise in
# the order layer_0, layer_1, layer_2. Result: one 2-D array per file.
hdfs = [ef, gf, pf]
nplcats = []

for hdf in hdfs:
    flat_layers = []
    for layer_key in ('layer_0', 'layer_1', 'layer_2'):
        layer = np.array(hdf[layer_key])
        flat_layers.append(layer.reshape(layer.shape[0], -1))

    nplcats.append(np.concatenate(flat_layers, axis=1))

In [4]:
# Standardise every column of every flattened array while ignoring entries
# that are not strictly positive, then shift each column so that its valid
# entries are >= epsilon. After this cell:
#   nplcatscaled[i] - scaled array for file i (ignored entries stored as 0)
#   transformers[i] - the StandardScaler fitted on file i
#   arrmins[i][j]   - the (negative) minimum removed from column j, or 0. if
#                     the column was not shifted
nplcatscaled = []
transformers = []
arrmins = [[] for _ in nplcats]  # one list per input array, however many there are
epsilon = 1e-2

for i, raw in enumerate(nplcats):
    # Non-positive entries become NaN so the scaler leaves them out of its
    # mean / variance estimate and passes them through transform() untouched.
    masked = np.where(raw > 0., raw, np.nan)
    transformer = StandardScaler().fit(masked)
    transformers.append(transformer)
    nparr = transformer.transform(masked)

    # Swap the NaN sentinel for +inf so ignored entries can never be picked
    # as a column minimum and stay put under the shift below.
    nparr = np.where(np.isnan(nparr), np.inf, nparr)

    # Shift every column whose minimum is negative up to epsilon. The minimum
    # can no longer be NaN at this point, so no NaN test is needed.
    # NOTE(review): a column whose valid entries all standardise to exactly 0
    # (e.g. a single hit, zero variance) is not shifted, so those hits end up
    # indistinguishable from ignored entries - confirm whether that matters.
    col_min = nparr.min(axis=0)
    shifted = col_min < 0
    nparr[:, shifted] = (nparr[:, shifted] - col_min[shifted]) + epsilon
    arrmins[i].extend(np.where(shifted, col_min, 0.).tolist())

    # Ignored entries (still +inf) are stored as 0 in the scaled data.
    nparr = np.where(np.isinf(nparr), 0, nparr)

    # Sanity check: report any column that still has a negative value.
    final_min = nparr.min(axis=0)
    for j in np.flatnonzero(final_min < 0):
        print(j, final_min[j])

    nplcatscaled.append(nparr)

In [5]:
# Create (or truncate) one output file per particle type for the scaled data.
ef_scaled, gf_scaled, pf_scaled = (
    h5py.File(output_path, 'w')
    for output_path in (
        '/fast_scratch/QVAE/data/calo_scaled/eplus.hdf5',
        '/fast_scratch/QVAE/data/calo_scaled/gamma.hdf5',
        '/fast_scratch/QVAE/data/calo_scaled/piplus.hdf5',
    )
)

In [6]:
# Collect the output handles and record the stored shape of every layer
# dataset, so the flattened scaled columns can be folded back into that
# layout when they are written out.
hdfs_scaled = [ef_scaled, gf_scaled, pf_scaled]

# Read the shapes from an explicitly chosen file instead of the `hdf` loop
# variable left behind by an earlier cell. The last input file is the one
# that leftover variable referred to, so the resulting dict is unchanged.
reference_hdf = hdfs[-1]
layer_shapes = {
    key: reference_hdf[key].shape
    for key in reference_hdf.keys()
    if key not in ("energy", "overflow")  # these two are not layer datasets
}

In [7]:
# Display the recorded per-layer shapes.
layer_shapes

{'layer_0': (100000, 3, 96),
 'layer_1': (100000, 12, 12),
 'layer_2': (100000, 12, 6)}

In [8]:
# Write every output file: "energy" and "overflow" are copied over unchanged,
# while each layer is cut out of the flattened scaled array and reshaped to
# its recorded shape. The running column offset assumes the layer keys are
# visited in the same order in which the layers were concatenated.
for hdf, hdf_scaled, scaled_data in zip(hdfs, hdfs_scaled, nplcatscaled):
    offset = 0
    for key in hdf.keys():
        if key in ("energy", "overflow"):
            hdf_scaled.create_dataset(key, data=hdf[key])
            continue

        layer_shape = layer_shapes[key]
        n_columns = layer_shape[1] * layer_shape[2]

        print(scaled_data.shape)
        layer_block = scaled_data[:, offset:offset + n_columns]
        print(layer_block.shape)

        hdf_scaled.create_dataset(key, data=layer_block.reshape(layer_shape))
        offset += n_columns

(100000, 504)
(100000, 288)
(100000, 504)
(100000, 144)
(100000, 504)
(100000, 72)
(100000, 504)
(100000, 288)
(100000, 504)
(100000, 144)
(100000, 504)
(100000, 72)
(100000, 504)
(100000, 288)
(100000, 504)
(100000, 144)
(100000, 504)
(100000, 72)


In [9]:
# List name, shape and dtype of every dataset written to each output file.
for hdf_scaled in hdfs_scaled:
    for key in hdf_scaled.keys():
        dataset = hdf_scaled[key]
        print(key, dataset.shape, dataset.dtype)

energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64
energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64
energy (100000, 1) float64
layer_0 (100000, 3, 96) float64
layer_1 (100000, 12, 12) float64
layer_2 (100000, 12, 6) float64
overflow (100000, 3) float64


In [10]:
# Close the scaled output files so their contents are flushed to disk, and
# release the read-only input handles as well: the arrays used by the
# remaining cells (nplcats, nplcatscaled) were already copied into memory.
for handle in hdfs_scaled + hdfs:
    handle.close()

In [11]:
# Show the shape of each scaled array, one line per input file.
for scaled_array in nplcatscaled:
    print(scaled_array.shape)

(100000, 504)
(100000, 504)
(100000, 504)


In [12]:
# Number of recorded column minima for the first array (one per column).
print(len(arrmins[0]))

504


In [None]:
# Undo the scaling with the in-memory scalers: reverse the per-column shift,
# apply the scaler's inverse transform, and store ignored entries as 0 again.
nplcatinv = []

for i in range(len(nplcatscaled)):
    # Entries that are not positive are treated as ignored and marked NaN.
    # np.where returns a new array, so nplcatscaled[i] itself is not modified
    # by the in-place arithmetic below.
    nparr = np.where(nplcatscaled[i] > 0., nplcatscaled[i], np.nan)

    # Reverse the shift, but only for columns that were actually shifted
    # (their recorded minimum is negative; unshifted columns recorded 0.).
    for j, arrmin in enumerate(arrmins[i]):
        if arrmin < 0.:
            nparr[:, j] -= epsilon
            nparr[:, j] += arrmin

    nparr = transformers[i].inverse_transform(nparr)

    # The ignored entries are still NaN after the inverse transform. The
    # sentinel test therefore has to be isnan; an isinf test never matches
    # NaN and would leave them in the result.
    nparr = np.where(np.isnan(nparr), 0, nparr)
    nplcatinv.append(nparr)

In [None]:
# Round-trip check: report every column whose inverse-transformed values
# differ from the original data.
# The absolute differences are summed so positive and negative errors cannot
# cancel, and the test is written as "not within tolerance" so a NaN sum is
# reported too (NaN compares false against everything). A small tolerance
# (the same 1e-4 used by the later check on the reloaded scalers) absorbs
# floating-point round-off.
tolerance = 1e-4

for i in range(len(nplcatinv)):
    nparrorig = nplcats[i]
    nparrinv = nplcatinv[i]

    for j in range(nparrorig.shape[1]):
        diff = np.sum(np.abs(nparrorig[:, j] - nparrinv[:, j]))
        if not diff <= tolerance:
            print(i, j, diff)

In [None]:
# Keep the parameter dict reported by the first scaler's get_params().
params = transformers[0].get_params()

In [None]:
# Display the parameter dict.
params

In [13]:
import joblib

In [None]:
# Trial run: write the first scaler to a file in the working directory and
# read it straight back into `transformer`.
scratch_path = 'scaler.gz'
joblib.dump(transformers[0], scratch_path)
transformer = joblib.load(scratch_path)

In [11]:
# Number of fitted scalers (one per input file).
print(len(transformers))

3


In [14]:
# Persist one fitted scaler per particle type alongside the scaled HDF5 files.
# Indices 0/1/2 follow the eplus / gamma / piplus order in which the input
# files were loaded and the scalers were fitted.
# The last call is deliberately a bare expression: its return value (the list
# of file names written) is what the cell displays.
joblib.dump(transformers[0], '/fast_scratch/QVAE/data/calo_scaled/eplus_scaler.gz')
joblib.dump(transformers[1], '/fast_scratch/QVAE/data/calo_scaled/gamma_scaler.gz')
joblib.dump(transformers[2], '/fast_scratch/QVAE/data/calo_scaled/piplus_scaler.gz')

['/fast_scratch/QVAE/data/calo_scaled/piplus_scaler.gz']

In [None]:
# Reload the three persisted scalers from disk, in eplus / gamma / piplus order.
ld_transformers = [
    joblib.load(scaler_path)
    for scaler_path in (
        '/fast_scratch/QVAE/data/calo_scaled/eplus_scaler.gz',
        '/fast_scratch/QVAE/data/calo_scaled/gamma_scaler.gz',
        '/fast_scratch/QVAE/data/calo_scaled/piplus_scaler.gz',
    )
]

In [None]:
# Undo the scaling with the scalers reloaded from disk: reverse the
# per-column shift, apply the scaler's inverse transform, and store ignored
# entries as 0 again.
nplcatinv = []

for i in range(len(nplcatscaled)):
    # Entries that are not positive are treated as ignored and marked NaN.
    # NaN (not inf) is the sentinel on purpose: the final clean-up below tests
    # with isnan, which an inf sentinel would never satisfy, and infinite
    # input is not something the scaler is expected to accept.
    # np.where returns a new array, so nplcatscaled[i] itself is not modified.
    nparr = np.where(nplcatscaled[i] > 0., nplcatscaled[i], np.nan)

    # Reverse the shift, but only for columns that were actually shifted
    # (their recorded minimum is negative; unshifted columns recorded 0.).
    for j, arrmin in enumerate(arrmins[i]):
        if arrmin < 0.:
            nparr[:, j] -= epsilon
            nparr[:, j] += arrmin

    nparr = ld_transformers[i].inverse_transform(nparr)

    # Ignored entries are still NaN after the inverse transform; store them as 0.
    nparr = np.where(np.isnan(nparr), 0, nparr)
    nplcatinv.append(nparr)

In [None]:
# Round-trip check for the reloaded scalers: report every column whose
# inverse-transformed values differ from the original data by more than the
# tolerance.
# The absolute differences are summed so positive and negative errors cannot
# cancel, and the test is written as "not within tolerance" so a NaN sum is
# reported too (NaN compares false against everything).
tolerance = 1e-4

for i in range(len(nplcatinv)):
    nparrorig = nplcats[i]
    nparrinv = nplcatinv[i]

    for j in range(nparrorig.shape[1]):
        diff = np.sum(np.abs(nparrorig[:, j] - nparrinv[:, j]))
        if not diff <= tolerance:
            print(i, j, diff)

In [None]:
# Number of per-file minima lists.
print(len(arrmins))

In [15]:
# Print every recorded column minimum for the first array.
# NOTE(review): this dumps the whole list; a slice such as arrmins[0][:5]
# would keep the cell output readable.
print(arrmins[0])

[-0.5959064373416986, -0.5622603200408175, -0.37513110935013355, -0.6737464227445097, -0.629446319873555, -0.6720499945434836, -0.7302354625912914, -0.7172861209373235, -0.7028860397764091, -0.7345743016415225, -0.6836477418781669, -0.6973972609377652, -0.7288815012062925, -0.7322435141756855, -0.7567346767560692, -0.7187761469688931, -0.7493992596107317, -0.6886990813275131, -0.5955661933364701, -0.758772870266937, -0.7486757372817129, -0.6744731828368059, -0.7979090057187314, -0.7702256173688452, -0.8121844160731246, -0.7760967778174547, -0.7753829365363561, -0.7408317852620152, -0.7798878372220637, -0.7350784816719097, -0.8244806915064258, -0.837450391153935, -0.8265436305293953, -0.8012540710792301, -0.8441669480976528, -0.8272459138771697, -0.8425398132824912, -0.8498636207731, -0.850767351049863, -0.856530789339261, -0.8758901766232691, -0.8556008494014192, -0.847688302994781, -0.8769419360557953, -0.8651428142733057, -0.897401725331356, -0.8217828948031094, -0.9123910466292217, 

In [16]:
# Save the recorded column minima of each array to its own .npy file, named
# after the particle type at the same position in the list below.
for i, ptype in enumerate(["eplus", "gamma", "piplus"]):
    filepath = "".join(["/fast_scratch/QVAE/data/calo_scaled/", ptype, "_amin.npy"])
    with open(filepath, 'wb') as out_file:
        np.save(out_file, arrmins[i])

In [None]:
def method(idx, a=10):
    """Return ``a`` unchanged (10 when omitted); ``idx`` is accepted but not used."""
    return a

In [None]:
# Call with only the positional argument, so the default for `a` is returned.
method(1)

In [None]:
# Scratch: 50 random draws. No seed is set, so the values differ run to run.
a = np.random.randn(50)

In [None]:
# Length of the first-50 slice of `a`.
len(a[:50])

In [None]:
# Scratch: unpack a two-element list. This rebinds `a`, replacing the array
# assigned in an earlier cell.
a, b = [50, 10]

In [None]:
# Display the current value of `a`.
a

In [None]:
# Display the current value of `b`.
b

In [None]:
# NOTE(review): syntax error as written - the first positional argument of
# np.logspace is missing. Supply it or delete this scratch cell.
np.logspace(, 10, 100)

In [None]:
# Scratch: log10 of zero evaluates to -inf and emits a divide-by-zero RuntimeWarning.
np.log10(0)

In [None]:
# NOTE(review): syntax error as written - `^` has no right-hand operand
# before `/`. The intended expression is unclear; fix or delete this cell.
5^/2

In [None]:
# Scratch: two small lists used by the elementwise-division cell that follows.
# Rebinds `a` and `b` again.
a = [1, 2]
b = [3, 4]

In [None]:
# Divide the elements of `a` by the elements of `b` pair by pair, stopping at
# the shorter of the two.
c = list(map(np.divide, a, b))

In [None]:
# Display the pairwise quotients.
c

In [None]:
# Scratch: rebinds `a` once more, this time to a small dict.
a = {"a":1, "b":2}

In [None]:
# NOTE(review): this raises TypeError - a list is unhashable and cannot be
# used as a dict key. Fix or delete this scratch cell.
a[[a,b]]