In [25]:
import numpy as np
import h5py
import pandas as pd
import os
import itertools

In [26]:
def downsize(df,size,seed,labels):
    jet_dict = {}
    np.random.seed(seed)
    for label in labels:
        jet_dict[label] = np.random.choice(a=df[df[label]==1].j_index, size=size,replace=False )
    mini_df = pd.DataFrame()
    for label in labels:
        mini_df = pd.concat([mini_df,df[df.j_index.isin(jet_dict[label])]],axis=0)
    return mini_df

In [16]:
def downsize_unbalanced(df,max_size,seed,labels,ratio):
    '''max_size is the baseline. It is 1 in ratio.'''
    jet_dict = {}
    np.random.seed(seed)
    for i,label in enumerate(labels):
        jet_dict[label] = np.random.choice(a=df[df[label]==1].j_index, size= int(max_size*ratio[i]),replace=False )
    mini_df = pd.DataFrame()
    
    for label in labels:
        mini_df = pd.concat([mini_df,df[df.j_index.isin(jet_dict[label])]],axis=0)
    return mini_df

In [4]:
def loadNdownsize (filePath,features,labels,size,seed,ratio=None):
    '''
    features: 'j_index' is necessary!
    '''
    with h5py.File(filePath+"processed-pythia82-lhc13-all-pt1-50k-r1_h022_e0175_t220_nonu_withPars_truth_0.z", 'r') as f:
        treeArray = f['t_allpar_new'][()]
    df = pd.DataFrame(treeArray, columns=features+labels) 
    if ratio==None:
        mini_df = downsize(df,size,seed,labels)
    else:
        mini_df = downsize_unbalanced(df,size,seed,labels,ratio=ratio)
    # Add constituents_index
    x = [len(list(y)) for _,y in itertools.groupby(mini_df['j_index'])]
    j_cIndex = np.array([],dtype='int8')
    for i in x:
        new_jet_index = np.arange(i)
        j_cIndex = np.append(j_cIndex, new_jet_index)
    mini_df['constituents_index'] = j_cIndex
    _features = features+['constituents_index']
    return mini_df, _features

In [41]:
###########TEST##############
filePath = './data/'
features = ['j_index','j1_phirel','j1_etarel','j1_phirot','j1_etarot','j1_deltaR','j1_pdgid','j1_ptrel','j1_erel','j_multiplicity'] 
labels = ['j_g','j_q','j_w','j_z','j_t']
with h5py.File(filePath+"processed-pythia82-lhc13-all-pt1-50k-r1_h022_e0175_t220_nonu_withPars_truth_0.z", 'r') as f:
    treeArray = f['t_allpar_new'][()]
    df = pd.DataFrame(treeArray, columns=features+labels)

# jet_dict = {}
# np.random.seed(42)
# for label in labels:
#     jet_dict[label] = np.random.choice(a=df[df[label]==1].j_index, size=size,replace=False )
# mini_df = pd.DataFrame()
# for label in labels:
#     mini_df = pd.concat([mini_df,df[df.j_index.isin(jet_dict[label])]],axis=0)
df[df['j_t']==1].j_index.shape

(1329098,)

In [42]:
###########TEST##############
np.unique(df[df['j_t']==1].j_index).shape


(20136,)

In [39]:
###########TEST##############

# Sort constituents by pt in each jet
df_sort = pd.DataFrame()
cIndex = np.array([],dtype='int8')
for i in np.unique(mini_df['j_index']):
    df_sort = pd.concat([df_sort,mini_df[mini_df['j_index']==i].sort_values(by=['j1_ptrel'],ascending=False)],axis=0)
    new_cIndex = np.arange(mini_df[mini_df['j_index']==i].shape[0])
    cIndex = np.append(cIndex, new_cIndex)
df_sort['constituents_index'] = cIndex

2793564    0.000398
2793565    0.000382
2793566    0.000349
2793567    0.000317
2793568    0.000276
2793569    0.000269
2793570    0.000233
2793571    0.000165
2793572    0.000139
2793573    0.000058
2949397    0.223644
2949398    0.163107
2949399    0.123149
2949400    0.052711
2949401    0.045249
2949402    0.035569
2949403    0.032382
2949404    0.030572
2949405    0.022388
2949406    0.017231
Name: j1_ptrel, dtype: float32

In [17]:
def loadNdownsize (filePath,features,labels,size,seed,ratio=None):
    '''
    features: 'j_index' is necessary!
    '''
    with h5py.File(filePath+"processed-pythia82-lhc13-all-pt1-50k-r1_h022_e0175_t220_nonu_withPars_truth_0.z", 'r') as f:
        treeArray = f['t_allpar_new'][()]
    df = pd.DataFrame(treeArray, columns=features+labels) 
    if ratio==None:
        mini_df = downsize(df,size,seed,labels)
    else:
        mini_df = downsize_unbalanced(df,size,seed,labels,ratio=ratio)
    df_sort = pd.DataFrame()
    for i in np.unique(mini_df['j_index']):
        df_sort = pd.concat([df_sort,mini_df[mini_df['j_index']==i].sort_values(by=['j1_pt'],ascending=False)],axis=0)
    return df_sort

In [18]:
def saveAsH5(filePath,df, size,features,labels,ratio=None):
# Since list is not valid in Dataframe, create a nparray to store the label list
    if ratio==None:
        label_list = np.vstack([df[label] for label in labels]).T
    else:
        target = labels[ratio.index(1)]
        label_list = np.vstack([df[target], np.abs(df[target]-1)]).T
    
    if ratio==None:
        savePath = filePath+"data_%sjets_%dlabels"%(size,len(labels))+'.h5'
    else:
        savePath = filePath+"data_%sjets_%dlabels_unbalanced"%(size,len(labels))+'.h5'
        
#     if os.path.exists(savePath):
#         with h5py.File(savePath, 'r+') as f:
#             for col in feats:
#                 f[col][()] = df[col]
#             f['label'][()] = label_list
#     else:
#         with h5py.File(savePath, 'w') as f:
#             for col in feats:
#                 f.create_dataset(col, data=df[col])
#             f.create_dataset('label', data = label_list)
    with h5py.File(savePath, 'w') as f:
            for col in features:
                f.create_dataset(col, data=df[col])
            f.create_dataset('label', data = label_list)

In [19]:
def LoadTransSave (filePath, features, labels, size, seed,ratio=None):
    mini_df = loadNdownsize(filePath,features, labels,size=size,seed=seed,ratio=ratio)
    saveAsH5(filePath, df = mini_df, size=size, features=features, labels=labels,ratio=ratio)

In [20]:
filePath = './data/'
features = ['index','j1_pt','j1_ptrel','j1_eta','j1_phi','j_mass','j1_pdgid','j1_deltaR','j_multiplicity','j1_etarot','j1_phirot','j_pt','j1_etarel','j1_phirel','j_index']
labels = ['j_g','j_q','j_w','j_z','j_t']
size=20
seed=42
ratio=[.25,.25,.25,.25,1]
LoadTransSave(filePath,features, labels,size=size,seed=seed,ratio=ratio)

In [24]:
with h5py.File('data/data_20jets_5labels_unbalanced.h5') as f:
    x = f['j_index'][()]
    y = f['j1_pt'][()]
# print(np.count_nonzero(x == [1,0]))
# print(np.count_nonzero(x == [0,1]))
print(x[40:50])
print(y[40:50])

[49044 49044 49044 49044 49044 49044 77074 77074 77074 77074]
[9.4040769e-01 8.7528598e-01 8.1013715e-01 5.8639365e-01 5.1699448e-01
 1.8312424e-02 1.3740416e+02 1.1231701e+02 1.0295010e+02 7.2317307e+01]


  """Entry point for launching an IPython kernel.


66