In [107]:
## open the root file 
import uproot

key = "deepntuplizer/tree;42"

def list_trees_and_branches(file_path,key):
    """
    Lists all trees and branches in the given ROOT file.
    
    Parameters:
    - file_path: Path to the ROOT file.
    """
    # Open the ROOT file
    file = uproot.open(file_path)

    obj = file[key]
    print ("key: ", key)
    #Check if the object is a tree
    if isinstance(obj, uproot.TTree):
        print(f"Tree name: {obj.name}")
        print("Branches:")
        # Print the structure of the tree
        print(obj.show())
        print()


# Example usage
file_path = "try0.root"
list_trees_and_branches(file_path,key)

key:  deepntuplizer/tree;42
Tree name: tree
Branches:
name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
Delta_gen_pt         | float                    | AsDtype('>f4')
event_no             | uint32_t                 | AsDtype('>u4')
gen_pt               | float                    | AsDtype('>f4')
isB                  | int32_t                  | AsDtype('>i4')
isBB                 | int32_t                  | AsDtype('>i4')
isC                  | int32_t                  | AsDtype('>i4')
isG                  | int32_t                  | AsDtype('>i4')
isLeptonicB          | int32_t                  | AsDtype('>i4')
isLeptonicB_C        | int32_t                  | AsDtype('>i4')
isS                  | int32_t                  | AsDtype('>i4')
isUD                 | int32_t                  | AsDtype('>i4')
isUndefined          | int32_t                  | AsDtype('>i4')
jet_

In [108]:
import uproot

def extract_fj_label(file_path, tree_name):
    """
    Extracts the fj_label branch from the given ROOT file and tree name.
    
    Parameters:
    - file_path: Path to the ROOT file.
    - tree_name: Name of the tree containing the fj_label branch.
    
    Returns:
    - fj_label: Numpy array containing the values of the fj_label branch, or None if not found.
    """
    # Open the ROOT file
    file = uproot.open(file_path)
    
    # Get the tree object
    tree = file[tree_name]
    
    # Load only the necessary branches
    try:
        branches = tree.arrays(["fj_label"])
        fj_label = branches["fj_label"]
        return fj_label
    except KeyError:
        print("fj_label branch not found in the tree.")
        return None

# Example usage
file_path = "try0.root"
tree_name = "deepntuplizer/tree;42"  # Use the correct tree name from the output
fj_label = extract_fj_label(file_path, tree_name)
if fj_label is not None:
    print(fj_label)

[51, 55, 55, 41, 55, 55, 41, 55, 55, ..., 55, 53, 52, 41, 41, 41, 51, 55, 55]


In [109]:
import uproot
import numpy as np

def extract_branches(file_path, tree_name, branch_names):
    """
    Extracts the specified branches from the given ROOT file and tree name, and combines them into a single array.
    
    Parameters:
    - file_path: Path to the ROOT file.
    - tree_name: Name of the tree containing the branches.
    - branch_names: List of branch names to extract.
    
    Returns:
    - combined_array: Numpy array containing the values of the specified branches combined into one array, or None if not found.
    """
    # Open the ROOT file
    file = uproot.open(file_path)
    
    # Get the tree object
    tree = file[tree_name]
    
    # Load only the necessary branches
    try:
        branches = tree.arrays(branch_names)
        combined_array = np.column_stack([branches[name] for name in branch_names])
        return np.array(combined_array)
    except KeyError:
        print("One or more branches not found in the tree.")
        return None


In [110]:

def is_regular(array):
    """
    Check if the array is regular, meaning all its elements have the same shape.
    
    Parameters:
    - array: Nested array to check for regularity.
    
    Returns:
    - regular: Boolean indicating whether the array is regular or not.
    """
    first_shape = np.shape(array[0])
    for element in array:
        if np.shape(element) != first_shape:
            return False
    return True

In [111]:
import uproot
import numpy as np
import awkward as ak

def extract_branches(file_path, tree_name, branch_names):
    """
    Extracts the specified branches from the given ROOT file and tree name, and combines them into a single array.
    
    Parameters:
    - file_path: Path to the ROOT file.
    - tree_name: Name of the tree containing the branches.
    - branch_names: List of branch names to extract.
    
    Returns:
    - combined_array: Numpy array containing the values of the specified branches combined into one array, or None if not found.
    """
    # Open the ROOT file
    file = uproot.open(file_path)
    
    # Get the tree object
    tree = file[tree_name]
    
    # Load only the necessary branches
    try:
        branches = tree.arrays(branch_names)

        # Initialize combined_array with the first branch array 
        combined_array = branches[branch_names[0]]
        print ("here")

        # if not regular size, fill the blank with 999 (just a number, can be modified)
        if not is_regular(combined_array):

            print ("in side else ")

            combined_list = combined_array.tolist()

            max_length = max(len(sublist) for sublist in combined_list)

            # Iterate through each sublist and append 999 until it reaches the maximum length
            for sublist in combined_list:
                while len(sublist) < max_length:
                    sublist.append(999)

            # Convert the list of lists to a numpy array
            result_array = np.array(combined_list)
            combined_array = result_array

        # convert the array to norml np.array 
        combined_array = np.array(combined_array)

        # check the rest branch (starting from index 1)
        for name in branch_names[1:]:
            branch_array = branches[name]
            check_branch = branch_array
            
            print(name)

            # check under this name, if the array is regular or not 
            if not is_regular(check_branch):

                print ("in side forrrr else ")

                checked_list = check_branch.tolist()

                max_length = max(len(sublist) for sublist in checked_list)

                # Iterate through each sublist and append 999 until it reaches the maximum length
                for sublist in checked_list:
                    while len(sublist) < max_length:
                        sublist.append(999)

                # Convert the list of lists to a numpy array
                check_branch = np.array(checked_list)
                
            combined_array = np.column_stack((combined_array, check_branch))
            combined_array = np.array(combined_array)

        return combined_array
    
    except KeyError:
        print("One or more branches not found in the tree.")
        return None




In [112]:
params_1 = ['pfcand_ptrel',
          'pfcand_erel',
          'pfcand_phirel',
          'pfcand_etarel',
          'pfcand_deltaR',
          'pfcand_puppiw',
          'pfcand_drminsv',
          'pfcand_drsubjet1',
          'pfcand_drsubjet2',
          'pfcand_hcalFrac'
         ]

params_2 = ['track_ptrel',     
          'track_erel',     
          'track_phirel',     
          'track_etarel',     
          'track_deltaR',
          'track_drminsv',     
          'track_drsubjet1',     
          'track_drsubjet2',
          'track_dz',     
          'track_dzsig',     
          'track_dxy',     
          'track_dxysig',     
          'track_normchi2',     
          'track_quality',     
          'track_dptdpt',     
          'track_detadeta',     
          'track_dphidphi',     
          'track_dxydxy',     
          'track_dzdz',     
          'track_dxydz',     
          'track_dphidxy',     
          'track_dlambdadz',     
          'trackBTag_EtaRel',     
          'trackBTag_PtRatio',     
          'trackBTag_PParRatio',     
          'trackBTag_Sip2dVal',     
          'trackBTag_Sip2dSig',     
          'trackBTag_Sip3dVal',     
          'trackBTag_Sip3dSig',     
          'trackBTag_JetDistVal'
         ]

params_3 = ['sv_ptrel',
          'sv_erel',
          'sv_phirel',
          'sv_etarel',
          'sv_deltaR',
          'sv_pt',
          'sv_mass',
          'sv_ntracks',
          'sv_normchi2',
          'sv_dxy',
          'sv_dxysig',
          'sv_d3d',
          'sv_d3dsig',
          'sv_costhetasvpv'
         ]



In [113]:
# Example usage

# access data from tree_name in file_path
file_path = "try0.root" # to be modified base on the file you have downloaded 
tree_name = "deepntuplizer/tree;42"  # Use the correct tree name from the output

# features want, for formal use, can replace with params_1,2,3 (this is the features used in the paper)
branch_names = ["fj_label", "pfcand_ptrel","Delta_gen_pt", "jet_pt","pfcand_phirel"]
combined_array = extract_branches(file_path, tree_name, branch_names)

### Thus, all needed feature are stored in the combined array

# print relevent information 
if combined_array is not None:
    print( combined_array)
    print("Shape of combined array:", combined_array.shape)

here
pfcand_ptrel
in side forrrr else 
Delta_gen_pt
jet_pt
pfcand_phirel
in side forrrr else 
[[5.10000000e+01 2.25779369e-01 8.13291296e-02 ... 9.99000000e+02
  9.99000000e+02 9.99000000e+02]
 [5.50000000e+01 5.62919796e-01 1.93665072e-01 ... 9.99000000e+02
  9.99000000e+02 9.99000000e+02]
 [5.50000000e+01 1.57369405e-01 1.29131347e-01 ... 9.99000000e+02
  9.99000000e+02 9.99000000e+02]
 ...
 [5.10000000e+01 1.54905006e-01 1.21462427e-01 ... 9.99000000e+02
  9.99000000e+02 9.99000000e+02]
 [5.50000000e+01 2.43539378e-01 1.63723961e-01 ... 9.99000000e+02
  9.99000000e+02 9.99000000e+02]
 [5.50000000e+01 4.65071291e-01 2.88986385e-01 ... 9.99000000e+02
  9.99000000e+02 9.99000000e+02]]
Shape of combined array: (200000, 397)


In [106]:
## example of the method use to deal with not regular array 

import numpy as np

# Given array with varying sublist lengths
array_with_none = [[1, 2],
                   [3, 4, 5],
                   [5,]]

# Find the maximum length of sublists
max_length = max(len(sublist) for sublist in array_with_none)

# Iterate through each sublist and append 999 until it reaches the maximum length
for sublist in array_with_none:
    while len(sublist) < max_length:
        sublist.append(999)

# Convert the list of lists to a numpy array
result_array = np.array(array_with_none)

print(result_array)
print(result_array.shape)


[[  1   2 999]
 [  3   4   5]
 [  5 999 999]]
(3, 3)
