In [1]:
import numpy as np
import os
import uproot
import awkward as ak
import ROOT
from cfg.hnl_mva_tools import read_json_file

from data_tools.load_data import (
    get_categorized_data,
    categorize_data,
    read_files_and_open_trees
)
from mva_tools.mva_training_tools import (
    train_one_signal_all_methods,
    load_model,
)

training_vars = ["C_Ds_pt"]
category_list = [1, 2, 3, 4, 5, 6]
category_var = "C_category"

def fix_pred_shape(y_pred, mask):
    """
    This function takes the y_pred 1D array and reshapes it to match the true
    entries of the mask array. False entries are filled with np.nan.
    Args:
        y_pred: the array of predictions
        mask: the array of mask values
    Returns:
        y_pred_shaped: the reshaped array of predictions
    Example:
        y_pred = [1,2,3]
        mask = [[False, True], [True, False, True]]
        y_pred_reshaped = [[nan, 1],[2, nan, 3]]
    """
    assert len(y_pred) == ak.sum(mask)

    y_pred_flat = np.full(ak.sum(ak.num(mask)), np.nan)
    y_pred_flat[ak.flatten(mask)] = y_pred

    y_pred_shaped = ak.unflatten(y_pred_flat, ak.num(mask))
    return y_pred_shaped

def get_bdt_output(data_dict, training_vars, category_list, xgboost_models):
    assert len(category_list) == len(xgboost_models)

    # categorize the data, this adds the C_category column
    categorize_data(
        data_dict,
        category_list,
        category_var=category_var,
        default_category=0,
    )

    # make sure the default category is empty
    assert np.sum(data_dict[category_var] == 0) == 0
    # input_dict[var] is an awkward array
    # make out an awkward array that copies
    # the shape of input_dict[var]
    bdt_output = ak.ones_like(data_dict[training_vars[0]]) * np.nan

    for category,model in zip(category_list,xgboost_models):
        mask = data_dict[category_var] == category
        # x_cat[i] is ith event
        # x_cat[i][j] is the jth variable for the ith event
        x_cat = np.array([ak.flatten(data_dict[var][mask]) for var in training_vars]).T
        y_score_cat = model.predict(x_cat, output_margin=True)
        y_score_cat_shaped = fix_pred_shape(y_score_cat, mask)

        #ak.where(condition, x, y) does the same as
        # output[i] = x[i] if condition[i] else y[i]     so this fills the 
        #bdt_output array with the predictions for the current category and
        #leaves the rest as they are
        bdt_output = ak.where(mask, y_score_cat_shaped, bdt_output)

    #check that there are no np.nan values left in the bdt_output array
    assert np.sum(np.isnan(bdt_output)) == 0

    #check again that bdt_output has the same shape as the input data
    assert ak.all(ak.num(bdt_output) == ak.num(data_dict[training_vars[0]]))

    return bdt_output


def rewrite_root_file(input_file, tree_name, bdt_output):  # array_of_pNN2):
    myfile = ROOT.TFile(input_file, "update")
    mytree = myfile.Get(tree_name)
    n_events = mytree.GetEntries()
    assert len(bdt_output) == n_events
    print("Save new branch in original ROOT file")

    cand_arr = ak.num(bdt_output, axis=1)
    max_cand = max(cand_arr)
    #give size based on max number of candidates in an event
    C_bdtscore = np.zeros(max_cand, dtype=np.float64)
    nCand = np.ones_like(np.array([[1]*max_cand]*n_events), dtype=np.int32)
    nCand_branch = mytree.Branch("nCand", nCand, "nCand/I")
    #set branch address to array
    nCand_branch.SetAddress(nCand)
    print("nCand", nCand)
    new_branch = mytree.Branch("C_bdtscore", C_bdtscore, "C_bdtscore[nCand]/D")
    return
    for idex,n_cand in enumerate(cand_arr):
        C_bdtscore[:n_cand] = bdt_output[idex]
        mytree.GetEntry(idex)
        new_branch.Fill()
        


    mytree.Write("", ROOT.TFile.kOverwrite)
    myfile.Close()  

Welcome to JupyROOT 6.28/04


2024-01-18 18:00:07.544610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
ntuples_json = "cfg/ntuples.json"
vars_json = "cfg/vars_new.json"
my_method = "XGBoost"

# open trees from root files
(
    sig_trees,
    bkg_trees,
    weight_name,
    sig_labels,
    bkg_labels,
) = read_files_and_open_trees(ntuples_json, vars_json)


In [3]:

ntuples = read_json_file(ntuples_json)
signal_file_names = ntuples["signal"]
background_file_names = ntuples["background"]
treename = ntuples["treename"]


good_vars = read_json_file(vars_json)["vars"]
training_vars = read_json_file(vars_json)["training_vars"]

In [4]:
my_sig_tree = sig_trees[2]
my_bkg_tree = bkg_trees[2]

In [5]:
data_dict_sig = uproot.concatenate(my_sig_tree,expressions=good_vars, how=dict)
#remove "C_pass_gen_matching" from the list of variables
good_vars.remove("C_pass_gen_matching")
data_dict_bkg = uproot.concatenate(my_bkg_tree,expressions=good_vars,how=dict)

In [6]:
trained_model_dir = "../results_categories/myMVA/mN1p0_ctau10"
#check that dir exists
assert os.path.isdir(trained_model_dir)
xgboost_models = []
for category in category_list:
    category_dir = f"{trained_model_dir}/cat_{category}"
    model = load_model(f"{category_dir}/{my_method}_model", my_method)
    xgboost_models.append(model)

In [7]:
bdt_output_sig = get_bdt_output(data_dict_sig, training_vars, category_list, xgboost_models)

In [8]:
bdt_output_bkg = get_bdt_output(data_dict_bkg, training_vars, category_list, xgboost_models)

In [9]:
bdt_output_sig

In [14]:
def rewrite_root_file(input_file, tree_name, bdt_output): 
    myfile = ROOT.TFile(input_file, "update")
    mytree = myfile.Get(tree_name)
    n_events = mytree.GetEntries()
    assert len(bdt_output) == n_events
    print("Save new branch in original ROOT file")

    nCand = ak.num(bdt_output, axis=1)
    max_cand = max(nCand)
    #give size based on max number of candidates in an event
    C_bdtscore = np.zeros(max_cand, dtype=np.float64)
    new_branch = mytree.Branch("C_bdtscore_b", C_bdtscore, f"C_bdtscore_b[{max_cand}]/D")
    #print info about the new branch
    new_branch.Print()
    for idex,n_cand in enumerate(nCand):
        C_bdtscore[:n_cand] = bdt_output[idex]
        mytree.GetEntry(idex)
        new_branch.Fill()
        


    new_branch.Print()
    mytree.Write("", ROOT.TFile.kOverwrite)
    myfile.Close()  

In [15]:
rewrite_root_file(signal_file_names[2], treename, bdt_output_sig)

Save new branch in original ROOT file
*Br    3 :C_bdtscore_b : C_bdtscore_b[14]/D                                  *
*Entries :        0 : Total  Size=        529 bytes  One basket in memory    *
*Baskets :        0 : Basket Size=      32000 bytes  Compression=   1.00     *
*............................................................................*
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore_b, len=1 and max=0
ERROR leaf:C_bdtscore_b, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtscore, len=1 and max=0
ERROR leaf:C_bdtsc

In [34]:
import uproot 

my_data_dict = {"C_pt": [[1.2, 2.3, 3.4], [4.5, 5.6, 6.7, 7.8]],
                "C_eta": [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6, 0.7]]}
zipped = ak.zip(my_data_dict)
with uproot.recreate("test.root") as f:
    f["mytree"] = zipped


In [35]:
my_root_file = ROOT.TFile("test.root")
my_tree = my_root_file.mytree
for entry in my_tree:
    C_pt = np.array(entry.C_pt)
    C_eta = np.array(entry.C_eta)
    n = np.array(entry.n)
    print(f"C_pt: {C_pt}, C_eta: {C_eta}, n: {n}")

dummy_bdt_score = ak.Array([[1.1, 2.2, 3.3], [4.4, 5.5, 6.6, 7.7]])


C_pt: [1.2 2.3 3.4], C_eta: [0.1 0.2 0.3], n: 3
C_pt: [4.5 5.6 6.7 7.8], C_eta: [0.4 0.5 0.6 0.7], n: 4


In [36]:
def rewrite_root_file(input_file, tree_name, bdt_output): 
    myfile = ROOT.TFile(input_file, "update")
    mytree = myfile.Get(tree_name)
    n_events = mytree.GetEntries()
    assert len(bdt_output) == n_events
    print("Save new branch in original ROOT file")

    nCand = ak.num(bdt_output, axis=1)
    max_cand = max(nCand)
    #give size based on max number of candidates in an event
    C_bdtscore = np.zeros(max_cand, dtype=np.float64)
    new_branch = mytree.Branch("C_bdtscore_b", C_bdtscore, "C_bdtscore_b[n]/D")
    #print info about the new branch
    new_branch.Print()
    for idex,n_cand in enumerate(nCand):
        C_bdtscore[:n_cand] = bdt_output[idex]
        mytree.GetEntry(idex)
        new_branch.Fill()
        


    new_branch.Print()
    mytree.Write("", ROOT.TFile.kOverwrite)
    myfile.Close()  

In [37]:
rewrite_root_file("test.root", "mytree", dummy_bdt_score)

Save new branch in original ROOT file
*Br    7 :C_bdtscore_b : C_bdtscore_b[n]/D                                   *
*Entries :        0 : Total  Size=        596 bytes  One basket in memory    *
*Baskets :        0 : Basket Size=      32000 bytes  Compression=   1.00     *
*............................................................................*
*Br    8 :C_bdtscore_b : C_bdtscore_b[n]/D                                   *
*Entries :        2 : Total  Size=        842 bytes  One basket in memory    *
*Baskets :        0 : Basket Size=      32000 bytes  Compression=   1.00     *
*............................................................................*


In [38]:
my_root_file = ROOT.TFile("test.root")
my_tree = my_root_file.mytree
for entry in my_tree:
    C_pt = np.array(entry.C_pt)
    C_eta = np.array(entry.C_eta)
    n = np.array(entry.n)
    C_bdtscore_b = np.array(entry.C_bdtscore_b)
    print(f"C_pt: {C_pt}, C_eta: {C_eta}, n: {n}, C_bdtscore_b: {C_bdtscore_b}")

C_pt: [1.2 2.3 3.4], C_eta: [0.1 0.2 0.3], n: 3, C_bdtscore_b: [1.1 2.2 3.3]
C_pt: [4.5 5.6 6.7 7.8], C_eta: [0.4 0.5 0.6 0.7], n: 4, C_bdtscore_b: [4.4 5.5 6.6 7.7]
