# Start

## Some Notes
- Replace `file_path` with the path to the ROOT file you have downloaded.
- This notebook shows two different methods for extracting the data: one without dummy entries and one with dummy entries.
- The return type can differ: sometimes a function returns an `awkward.highlevel.Array`, and sometimes it returns a regular `np.array`. This depends on the shape of the data and on how the function is used. The two can be converted to each other if the shape of the array is regular (i.e. rectangular).

# Structure of Root File

In [1]:
## open the root file 
import uproot

# Name of the object to look up inside the ROOT file.
# NOTE(review): the ";42" suffix looks like a ROOT cycle number — confirm it matches your file.
key = "deepntuplizer/tree;42"

def list_trees_and_branches(file_path, key):
    """
    Print the branch listing of one tree stored in a ROOT file.

    Parameters:
    - file_path: Path to the ROOT file.
    - key: Name of the object to look up in the file (e.g. "deepntuplizer/tree;42").

    Returns:
    - None. The key is always printed; the tree name and branch table are
      printed only when the object found under `key` is a TTree.
    """
    # The context manager closes the file handle once the listing is printed
    with uproot.open(file_path) as file:
        obj = file[key]
        print ("key: ", key)
        # Check if the object is a tree
        if isinstance(obj, uproot.TTree):
            print(f"Tree name: {obj.name}")
            print("Branches:")
            # show() writes the table itself and returns None, so wrapping it
            # in print() would add a stray "None" line to the output
            obj.show()
            print()


# Example usage: point file_path at your local ROOT file, then print the
# branch listing of the object named by `key` (assigned earlier in this cell)
file_path = "try0.root"
list_trees_and_branches(file_path,key)

key:  deepntuplizer/tree;42
Tree name: tree
Branches:
name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
Delta_gen_pt         | float                    | AsDtype('>f4')
event_no             | uint32_t                 | AsDtype('>u4')
gen_pt               | float                    | AsDtype('>f4')
isB                  | int32_t                  | AsDtype('>i4')
isBB                 | int32_t                  | AsDtype('>i4')
isC                  | int32_t                  | AsDtype('>i4')
isG                  | int32_t                  | AsDtype('>i4')
isLeptonicB          | int32_t                  | AsDtype('>i4')
isLeptonicB_C        | int32_t                  | AsDtype('>i4')
isS                  | int32_t                  | AsDtype('>i4')
isUD                 | int32_t                  | AsDtype('>i4')
isUndefined          | int32_t                  | AsDtype('>i4')
jet_

## label extraction

In [5]:
import numpy as np
import uproot

def extract_label(feature,file_path, tree_name):
    """
    Reads a single branch from the given ROOT file and tree name.

    Parameters:
    - feature: Name of the branch to read (e.g. "fj_label").
    - file_path: Path to the ROOT file.
    - tree_name: Name of the tree containing the branch.

    Returns:
    - label: the branch contents as loaded by uproot (an awkward.highlevel.Array
      in this notebook's example, free to convert to np.array), or None if the
      branch is not found.
    """
    # The context manager closes the file handle once the branch has been read
    with uproot.open(file_path) as file:
        # Get the tree object
        tree = file[tree_name]

        # Load only the requested branch
        try:
            branches = tree.arrays([feature])
            return branches[feature]
        except KeyError:
            # Name the branch that was actually requested
            print(f"{feature} branch not found in the tree.")
            return None



### example

In [4]:
# Example usage: read the "fj_label" branch, then show its type and contents
file_path = "ntuple_merged_10.root"
tree_name = "deepntuplizer/tree;42" 
label = "fj_label"
fj_label = extract_label(label,file_path, tree_name)
print(type(fj_label))
print(fj_label)

<class 'awkward.highlevel.Array'>
[55, 55, 55, 55, 54, 52, 55, 55, 55, ..., 55, 41, 55, 55, 41, 55, 55, 41, 55]


## Feature lists (branch names)

In [7]:
# Branch names that carry the "pfcand_" prefix.
params_1 = [
    "pfcand_ptrel",
    "pfcand_erel",
    "pfcand_phirel",
    "pfcand_etarel",
    "pfcand_deltaR",
    "pfcand_puppiw",
    "pfcand_drminsv",
    "pfcand_drsubjet1",
    "pfcand_drsubjet2",
    "pfcand_hcalFrac",
]

# Branch names that carry the "track_" or "trackBTag_" prefix.
params_2 = [
    "track_ptrel",
    "track_erel",
    "track_phirel",
    "track_etarel",
    "track_deltaR",
    "track_drminsv",
    "track_drsubjet1",
    "track_drsubjet2",
    "track_dz",
    "track_dzsig",
    "track_dxy",
    "track_dxysig",
    "track_normchi2",
    "track_quality",
    "track_dptdpt",
    "track_detadeta",
    "track_dphidphi",
    "track_dxydxy",
    "track_dzdz",
    "track_dxydz",
    "track_dphidxy",
    "track_dlambdadz",
    "trackBTag_EtaRel",
    "trackBTag_PtRatio",
    "trackBTag_PParRatio",
    "trackBTag_Sip2dVal",
    "trackBTag_Sip2dSig",
    "trackBTag_Sip3dVal",
    "trackBTag_Sip3dSig",
    "trackBTag_JetDistVal",
]

# Branch names that carry the "sv_" prefix.
params_3 = [
    "sv_ptrel",
    "sv_erel",
    "sv_phirel",
    "sv_etarel",
    "sv_deltaR",
    "sv_pt",
    "sv_mass",
    "sv_ntracks",
    "sv_normchi2",
    "sv_dxy",
    "sv_dxysig",
    "sv_d3d",
    "sv_d3dsig",
    "sv_costhetasvpv",
]

## feature vector extraction (without dummy entries)

In [3]:
import uproot
import awkward as ak

def extract_branches(file_path, tree_name, branch_names):
    """
    Reads the given branches from a tree and regroups their values entry by entry.

    Parameters:
    - file_path: Path to the ROOT file.
    - tree_name: Name of the tree containing the branches.
    - branch_names: List of branch names to extract.

    Returns:
    - combined_list: Python list with one item per entry of the zipped array.
      Each item is a list holding that entry's values for every requested
      branch, in the order given by branch_names.

    NOTE(review): ak.zip is applied to all requested branches together, so they
    presumably need compatible jagged structure (e.g. all "pfcand_" branches) —
    confirm before mixing branch families.
    """
    # Read every requested branch; the context manager closes the file afterwards
    with uproot.open(file_path) as file:
        tree = file[tree_name]
        columns = {name: tree[name].array() for name in branch_names}

    # Zip the branches into one array, then pull the fields back out per entry
    # in the requested order
    events = ak.zip(columns)
    return [[event[name] for name in branch_names] for event in events]




### Example

In [5]:
# Example usage: extract two branches and regroup their values entry by entry
file_path = "ntuple_merged_10.root"
tree_name = "deepntuplizer/tree;42"
branch_names = ['pfcand_ptrel',
          'pfcand_erel']
combined_list=extract_branches(file_path, tree_name, branch_names)
# Uncomment to print every entry of the result:
# print("Combined list:", combined_list)

In [7]:
# Show the first entry: a list with one array per requested branch
print(combined_list[0])

[<Array [0.25, 0.142, 0.0723, ..., 0.00383, 0.00381] type='38 * float32'>, <Array [0.24, 0.137, 0.0697, ..., 0.00389, 0.00245] type='38 * float32'>]


In [3]:
# Cross-check: read the same two branches one at a time with extract_label so
# their values can be compared with the output of extract_branches
file_path = "ntuple_merged_10.root"
tree_name = "deepntuplizer/tree;42" 
label1 = "pfcand_ptrel"
label2 = "pfcand_erel"
pfcand_ptrel = extract_label(label1,file_path, tree_name)
pfcand_erel = extract_label(label2,file_path, tree_name)
print('pfcand_ptrel',pfcand_ptrel)
print('pfcand_erel',pfcand_erel)


pfcand_ptrel [[0.25, 0.142, 0.0723, 0.0562, ..., 0.00392, 0.00387, 0.00383, 0.00381], ...]
pfcand_erel [[0.24, 0.137, 0.0697, 0.0547, ..., 0.0048, 0.00308, 0.00389, 0.00245], ...]


## feature vector extraction (with dummy entries)

In [None]:
def is_regular(array):
    """
    Check if the array is regular, meaning all its elements have the same shape.

    Parameters:
    - array: Nested array to check for regularity. It must support len(),
      indexing and iteration.

    Returns:
    - regular: Boolean indicating whether the array is regular or not. An empty
      array is reported as regular, since it has no elements that could differ.
    """
    # Guard the array[0] lookup below, which would fail on empty input
    if len(array) == 0:
        return True

    first_shape = np.shape(array[0])
    return all(np.shape(element) == first_shape for element in array)

In [2]:
import uproot
import numpy as np
import awkward as ak

def _pad_to_rectangular(branch, fill_value):
    """Return `branch` as a numpy array, padding short rows with `fill_value` if it is not regular."""
    if is_regular(branch):
        return np.array(branch)

    rows = branch.tolist()
    width = max(len(row) for row in rows)
    # Extend every row to the longest row's length with the dummy value
    return np.array([row + [fill_value] * (width - len(row)) for row in rows])


def extract_branches_with_dummy_feature(file_path, tree_name, branch_names, fill_value=999):
    """
    Extracts the specified branches from the given ROOT file and tree name, and combines them into a single array.

    Branches that are not regular (as judged by `is_regular`, defined in another
    cell of this notebook) are padded with `fill_value` up to the length of
    their longest row. The branches are then joined with np.column_stack in
    the order given by branch_names.

    Parameters:
    - file_path: Path to the ROOT file.
    - tree_name: Name of the tree containing the branches.
    - branch_names: List of branch names to extract (at least one).
    - fill_value: Dummy value used for padding. Defaults to 999, the value this
      function has always used; pick one that cannot be mistaken for real data.

    Returns:
    - combined_array: Numpy array containing the values of the specified branches combined into one array, or None if not found.
    """
    # The context manager closes the file handle once the branches are loaded
    with uproot.open(file_path) as file:
        # Get the tree object
        tree = file[tree_name]

        # Load only the necessary branches
        try:
            branches = tree.arrays(branch_names)
        except KeyError:
            print("One or more branches not found in the tree.")
            return None

    # Start from the first branch, then append the remaining ones column-wise
    combined_array = _pad_to_rectangular(branches[branch_names[0]], fill_value)
    for name in branch_names[1:]:
        next_block = _pad_to_rectangular(branches[name], fill_value)
        combined_array = np.column_stack((combined_array, next_block))

    return combined_array




In [None]:
# Example usage

# Read the tree `tree_name` from the ROOT file at `file_path`
file_path = "try0.root" # change this to the file you have downloaded
tree_name = "deepntuplizer/tree;42"  # Use the correct tree name from the output

# Features to extract. params_1, params_2 and params_3 (defined earlier in this
# notebook) hold the feature names used in the paper.
branch_names = params_1
combined_array = extract_branches_with_dummy_feature(file_path, tree_name, branch_names)

### All requested features are now stored in combined_array

# Print the array and its shape (skipped when the extraction returned None)
if combined_array is not None:
    print( combined_array)
    print("Shape of combined array:", combined_array.shape)