In [1]:
import h5py
import pandas as pd
import numpy as np

In [2]:
# Read the whole 't_allpar_new' dataset into memory. The context manager
# guarantees the HDF5 handle is released even if the read fails.
with h5py.File('./data/processed-pythia82-lhc13-all-pt1-50k-r1_h022_e0175_t220_nonu_withPars_truth_0.z', 'r') as f:
    # [()] materialises the full dataset, so treeArray does not depend on
    # the file staying open.
    treeArray = f['t_allpar_new'][()]

# Column names to extract. 'j_index' is deliberately the LAST entry of both
# lists: downstream code slices it off with [:-1] and groups rows by it.
features = ['j1_px', 'j1_py', 'j1_pz', 'j1_e', 'j1_erel', 'j1_pt', 'j1_ptrel', 'j1_eta', 'j1_etarel', 'j1_etarot', 'j1_phi', 'j1_phirel', 'j1_phirot', 'j1_deltaR', 'j1_costheta', 'j1_costhetarel','j_index']
labels = ['j_g','j_q','j_w','j_z','j_t','j_index']

# Both lists are already duplicate-free, so they are passed as-is; this keeps
# the intermediate column order deterministic.
features_df = pd.DataFrame(treeArray, columns=features)
labels_df = pd.DataFrame(treeArray, columns=labels)

# 'j_index' appears in both frames; keep only the copy from labels_df.
df = pd.concat([features_df.drop(['j_index'], axis=1), labels_df], axis=1)
# Final layout: feature columns, then label columns, then 'j_index'.
df = df.reindex(features[:-1] + labels, axis=1)

In [3]:
def _pad_or_truncate(values, length):
    '''Return the first ``length`` entries of a 1-D array, zero-padded at the end if it is shorter.'''
    values = values[:length]
    missing = length - len(values)
    if missing > 0:
        # Pad with the column's own dtype so every jet gets the same dtype.
        values = np.concatenate([values, np.zeros(missing, dtype=values.dtype)])
    return values


def j_transform(MaxParticles, df, features, labels):
    '''
    Collapse a per-row table into one row per distinct ``j_index`` value.

    Parameters
    ----------
    MaxParticles : int
        Fixed length of every per-jet array. Groups with more rows are
        truncated to the rows with the largest ``j1_ptrel``; groups with
        fewer rows are zero-padded at the end.
    df : pandas.DataFrame
        Must contain the feature columns, the label columns and ``j_index``.
        It is NOT modified.
    features : list of str
        Feature column names; the last entry (``j_index``) is skipped.
    labels : list of str
        Label column names; the last entry (``j_index``) is skipped.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with one row per ``j_index`` value, ordered by
        ascending ``j_index``. Each feature cell holds a 1-D array of length
        ``MaxParticles`` sorted by descending ``j1_ptrel``; ``label`` holds the
        list of label values taken from the group's highest-``j1_ptrel`` row.
    '''
    cols = features[:-1]
    label_cols = labels[:-1]

    # One stable global sort replaces a sort per jet: groupby preserves the
    # row order inside each group, so every group is already ptrel-descending.
    ordered = df.sort_values(by='j1_ptrel', ascending=False, kind='mergesort')

    # Collect plain records and build the frame once; growing a DataFrame
    # cell-by-cell (and re-masking the full table per jet) is quadratic.
    records = []
    for _, jet in ordered.groupby('j_index', sort=True):
        record = {col: _pad_or_truncate(jet[col].to_numpy(), MaxParticles) for col in cols}
        # First row of the group is the one with the largest j1_ptrel.
        record['label'] = jet[label_cols].to_numpy()[0].tolist()
        records.append(record)

    return pd.DataFrame(records, columns=cols + ['label']).astype('object')

In [4]:
# Fixed number of particles kept per jet: longer jets are truncated,
# shorter ones are zero-padded.
MaxParticles = 100

# Collapse the per-particle table into one row per jet, then summarise it.
jet_df = j_transform(
    MaxParticles=MaxParticles,
    df=df,
    features=features,
    labels=labels,
)
jet_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 98769 entries, 0 to 98768
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   j1_px           98769 non-null  object
 1   j1_py           98769 non-null  object
 2   j1_pz           98769 non-null  object
 3   j1_e            98769 non-null  object
 4   j1_erel         98769 non-null  object
 5   j1_pt           98769 non-null  object
 6   j1_ptrel        98769 non-null  object
 7   j1_eta          98769 non-null  object
 8   j1_etarel       98769 non-null  object
 9   j1_etarot       98769 non-null  object
 10  j1_phi          98769 non-null  object
 11  j1_phirel       98769 non-null  object
 12  j1_phirot       98769 non-null  object
 13  j1_deltaR       98769 non-null  object
 14  j1_costheta     98769 non-null  object
 15  j1_costhetarel  98769 non-null  object
 16  label           98769 non-null  object
dtypes: object(17)
memory usage: 16.1+ MB


In [10]:
import os
def saveDf(df, MaxParticles, feature_num, label_num, destdir):
    '''
    Write ``df`` to ``<destdir>/dataset_<MaxParticles>p_<feature_num>f_<label_num>l.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write; the index is not written.
    MaxParticles, feature_num, label_num : int
        Only used to build the output file name.
    destdir : str
        Target directory, created (with parents) if missing. An empty
        string means the current working directory.
    '''
    # exist_ok avoids the check-then-create race; the guard is needed because
    # os.makedirs('') raises even though '' is a valid "current dir" prefix.
    if destdir:
        os.makedirs(destdir, exist_ok=True)
    output = os.path.join(destdir, 'dataset_%dp_%df_%dl.csv'%(MaxParticles,feature_num,label_num))
    # NOTE(review): cells that hold arrays/lists are presumably serialised via
    # their string form by to_csv - confirm the file can be parsed back as needed.
    df.to_csv(output, index=False)

In [None]:
# Persist the per-jet table; the trailing 'j_index' entry of each list is not
# a feature/label column, hence the -1 in both counts.
saveDf(
    df=jet_df,
    MaxParticles=MaxParticles,
    feature_num=len(features) - 1,
    label_num=len(labels) - 1,
    destdir='data',
)