In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import ROOT
from ROOT import TFile, TTree, TH1F, TCanvas, TAxis, TLegend, TTreeReader, TTreeReaderValue
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
import multiprocessing
import time
import concurrent.futures

Welcome to JupyROOT 6.22/06


In [2]:
from joblib import dump, load
clf = load('BDT_model_QCD_sig0p3_matching3.joblib') 

In [3]:
def BDT_sort(event_record, evtnum, index, ismatched_arr):
    if not sum(evtnum) / 3 == evtnum[0]:
        print("Check if 3 candidates are coming from the same event")
        print(evtnum)
    score_BDT_temp = clf.predict(event_record)
#     print(score_BDT_temp)
#     print(index)
#     print(ismatched_arr)
    sum_score = sum(score_BDT_temp.ravel())
    sum_scale = 1.0/sum_score
    weights_list = [sum_scale*x for x in score_BDT_temp.ravel()]
    return weights_list, score_BDT_temp.ravel()

def select_trijet(sample, batch_size, ibatch, isSig):
    inFile = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
    inTree = inFile.Get("Events")
    variable2use = ['dijet_eta','dijet_phi','dR_jj','dEta_jj','dPhi_jj','dR_j0j2','dEta_j0j2','dPhi_j0j2','dR_j1j2','dEta_j1j2','dPhi_j1j2',
                    'jet_eta_0','jet_phi_0','jet_ptoverm_0','jet_eta_1','jet_phi_1','jet_ptoverm_1','jet_eta_2',
                    'jet_phi_2','jet_ptoverm_2','dR_jj_j','dEta_jj_j','dPhi_jj_j','jet_ptoverM_0','jet_ptoverM_1','jet_ptoverM_2',
                    'dijet_ptoverM']
    outFile = TFile(f"{sample}_BDT_Weighting_{ibatch}.root","RECREATE")
    outTree = TTree("Events","Events")
    
    dijet_eta = np.empty((1), dtype="float32")
    dijet_phi = np.empty((1), dtype="float32")
    dR_jj = np.empty((1), dtype="float32")
    dEta_jj = np.empty((1), dtype="float32")
    dPhi_jj = np.empty((1), dtype="float32")
    jet_eta_0 = np.empty((1), dtype="float32")
    jet_phi_0 = np.empty((1), dtype="float32")
    jet_ptoverm_0 = np.empty((1), dtype="float32")
    jet_eta_1 = np.empty((1), dtype="float32")
    jet_phi_1 = np.empty((1), dtype="float32")
    jet_ptoverm_1 = np.empty((1), dtype="float32")
    jet_eta_2 = np.empty((1), dtype="float32")
    jet_phi_2 = np.empty((1), dtype="float32")
    jet_ptoverm_2 = np.empty((1), dtype="float32")
    dR_jj_j = np.empty((1), dtype="float32")
    dEta_jj_j = np.empty((1), dtype="float32")
    dPhi_jj_j = np.empty((1), dtype="float32")
    jet_ptoverM_0 = np.empty((1), dtype="float32")
    jet_ptoverM_1 = np.empty((1), dtype="float32")
    jet_ptoverM_2 = np.empty((1), dtype="float32")
    dijet_ptoverM = np.empty((1), dtype="float32")
    M_jjj = np.empty((1), dtype="float32")
    m_jj = np.empty((1), dtype="float32")
    score_BDT = np.empty((1), dtype="float32")
    weight_BDT = np.empty((1), dtype="float32")
    isMatched = np.empty((1), dtype="int32")

    outTree.Branch("dijet_eta", dijet_eta, "dijet_eta/F")
    outTree.Branch("dijet_phi", dijet_phi, "dijet_phi/F")
    outTree.Branch("dR_jj", dR_jj, "dR_jj/F")
    outTree.Branch("dEta_jj", dEta_jj, "dEta_jj/F")
    outTree.Branch("dPhi_jj", dPhi_jj, "dPhi_jj/F")
    outTree.Branch("jet_eta_0", jet_eta_0, "jet_eta_0/F")
    outTree.Branch("jet_phi_0", jet_phi_0, "jet_phi_0/F")
    outTree.Branch("jet_ptoverm_0", jet_ptoverm_0, "jet_ptoverm_0/F")
    outTree.Branch("jet_eta_1", jet_eta_1, "jet_eta_1/F")
    outTree.Branch("jet_phi_1", jet_phi_1, "jet_phi_1/F")
    outTree.Branch("jet_ptoverm_1", jet_ptoverm_1, "jet_ptoverm_1/F")
    outTree.Branch("jet_eta_2", jet_eta_2, "jet_eta_2/F")
    outTree.Branch("jet_phi_2", jet_phi_2, "jet_phi_2/F")
    outTree.Branch("jet_ptoverm_2", jet_ptoverm_2, "jet_ptoverm_2/F")
    outTree.Branch("dR_jj_j", dR_jj_j, "dR_jj_j/F")
    outTree.Branch("dEta_jj_j", dEta_jj_j, "dEta_jj_j/F")
    outTree.Branch("dPhi_jj_j", dPhi_jj_j, "dPhi_jj_j/F")
    outTree.Branch("jet_ptoverM_0", jet_ptoverM_0, "jet_ptoverM_0/F")
    outTree.Branch("jet_ptoverM_1", jet_ptoverM_1, "jet_ptoverM_1/F")
    outTree.Branch("jet_ptoverM_2", jet_ptoverM_2, "jet_ptoverM_2/F")
    outTree.Branch("dijet_ptoverM", dijet_ptoverM, "dijet_ptoverM/F")
    outTree.Branch("M_jjj", M_jjj, "M_jjj/F")
    outTree.Branch("m_jj", m_jj, "m_jj/F")
    outTree.Branch("score_BDT", score_BDT, "score_BDT/F")
    outTree.Branch("weight_BDT", weight_BDT, "weight_BDT/F")
    outTree.Branch("isMatched", isMatched, "isMatched/I")
    
    t_start = time.time()
    evtnum = []
    index = []
    event_record = []
    ismatched_arr = []
    evt_start = ibatch*batch_size
    evt_end = (ibatch+1)*batch_size
    if((ibatch+1)*batch_size > inTree.GetEntries()):
        evt_end = inTree.GetEntries()
    for ievt in range(evt_start, evt_end):
        inTree.GetEntry(ievt)
#         if ievt%1000 == 0:
#             print("Processing: ",ievt)
#             # time check:
#             t_end = time.time()
#             speed = (t_end - t_start)/(ievt+1)*1000
#             print(f"Avg. speed: {speed}s/1k candidates".format(speed))
#         if ievt > 100000: break

        if (ievt - evt_start)%100000 == 0:
            # time check:
            t_end = time.time()
            speed = (t_end - t_start)/(ievt - evt_start+1)*1000
            t_remain = (evt_end - ievt) / 1000 * speed / 60
            print(f"Batch #{ibatch} >> Avg. speed: {speed}s/1k candidates, time remaining: {t_remain}mins\n".format(ibatch, speed, t_remain))

        # make event record for BDT classification
        trijet_record = []
        for var in variable2use:
            trijet_record.append(getattr(inTree, var))
        if len(event_record) < 3:
            event_record.append(trijet_record)
            evtnum.append(inTree.evt_num)
            index.append(ievt)
            if isSig == 1:
                ismatched_arr.append(inTree.gen_dijet_matched)
            else:
                ismatched_arr.append(-99)
        if(len(event_record) == 3):
            weight_list, score_list = BDT_sort(event_record, evtnum, index, ismatched_arr)
            for iweight, jevt in enumerate(index):
                inTree.GetEntry(jevt)
                dijet_eta[0] = inTree.dijet_eta
                dijet_phi[0] = inTree.dijet_phi
                dR_jj[0] = inTree.dR_jj
                dEta_jj[0] = inTree.dEta_jj
                dPhi_jj[0] = inTree.dPhi_jj
                jet_eta_0[0] = inTree.jet_eta_0
                jet_phi_0[0] = inTree.jet_phi_0
                jet_ptoverm_0[0] = inTree.jet_ptoverm_0
                jet_eta_1[0] = inTree.jet_eta_1
                jet_phi_1[0] = inTree.jet_phi_1
                jet_ptoverm_1[0] = inTree.jet_ptoverm_1
                jet_eta_2[0] = inTree.jet_eta_2
                jet_phi_2[0] = inTree.jet_phi_2
                jet_ptoverm_2[0] = inTree.jet_ptoverm_2
                dR_jj_j[0] = inTree.dR_jj_j
                dEta_jj_j[0] = inTree.dEta_jj_j
                dPhi_jj_j[0] = inTree.dPhi_jj_j
                jet_ptoverM_0[0] = inTree.jet_ptoverM_0
                jet_ptoverM_1[0] = inTree.jet_ptoverM_1
                jet_ptoverM_2[0] = inTree.jet_ptoverM_2
                dijet_ptoverM[0] = inTree.dijet_ptoverM
                M_jjj[0] = inTree.M_jjj
                m_jj[0] = inTree.m_jj
                score_BDT[0] = score_list[iweight]
                weight_BDT[0] = weight_list[iweight]
                if isSig == 1:
                    isMatched[0] = inTree.gen_dijet_matched
#                     print(isMatched[0])
                else:
                    isMatched[0] = -99
                outTree.Fill()
            event_record.clear()
            evtnum.clear()
            index.clear()
            ismatched_arr.clear()
#     print("Number of trijet candidates selected: ", outTree.GetEntries())
    outFile.cd()
    print(outTree.GetEntries())
    outTree.Write()
    outFile.Write()
    outFile.Close()
    print(f"Finished processing of batch {ibatch}/n")
    return 0

In [5]:
# QCD processing
if __name__ == '__main__':
    
    for sample in ["QCD_Pt_300to470","QCD_Pt_470to600","QCD_Pt_600to800"]:
#     for sample in ["QCD_Pt_800to1000","QCD_Pt_1000to1400","QCD_Pt_1400to1800",
#                    "QCD_Pt_1800to2400","QCD_Pt_2400to3200","QCD_Pt_3200toInf"]:
        isSig = 0
        temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
        temp_tree = temp_file.Get("Events")
        tot_evts = temp_tree.GetEntries()
        print(tot_evts)
        expect_time = 0.3 # in hrs
        known_speed = 2 # sec per 1k candidates
        evt_batch = int(expect_time * 3600 / known_speed * 1000)
        if(evt_batch%3 != 0):
            print("!!!!")
        num_batch = math.ceil(tot_evts / evt_batch)
        print(f"Number of Candidates to be processed: {tot_evts}")
        print(f"Candidates to be processed per batch: {evt_batch}")
        print(f"Number of batches to be processed: {num_batch}")

        main_start = time.time()

    #     multiprocessing way
    #     p1 = multiprocessing.Process(target=select_trijet, args=(sample, ibatch_1))
    #     p2 = multiprocessing.Process(target=select_trijet, args=(sample, ibatch_2))
    #     p1.start()
    #     p1.join()
    #     p2.start()
    #     p2.join()
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = [executor.submit(select_trijet, sample, evt_batch, ibatch, isSig) for ibatch in range(num_batch)]
            status = [r.result() for r in results]
            print(status)

        print(f"Time used: {round(time.time() - main_start, 2)}")

13502853
Number of Candidates to be processed: 13502853
Candidates to be processed per batch: 540000
Number of batches to be processed: 26
Batch #0 >> Avg. speed: 243.34073066711426s/1k candidates, time remaining: 2190.0665760040283mins

Batch #3 >> Avg. speed: 689.0835762023926s/1k candidates, time remaining: 6201.752185821533mins
Batch #4 >> Avg. speed: 695.3849792480469s/1k candidates, time remaining: 6258.464813232422mins
Batch #11 >> Avg. speed: 698.7600326538086s/1k candidates, time remaining: 6288.840293884277mins
Batch #7 >> Avg. speed: 703.5379409790039s/1k candidates, time remaining: 6331.841468811035mins




Batch #14 >> Avg. speed: 738.3480072021484s/1k candidates, time remaining: 6645.132064819336mins

Batch #10 >> Avg. speed: 751.80983543396s/1k candidates, time remaining: 6766.28851890564mins
Batch #1 >> Avg. speed: 743.7582015991211s/1k candidates, time remaining: 6693.82381439209mins
Batch #12 >> Avg. speed: 748.7142086029053s/1k candidates, time remaining: 6738.427877


Batch #7 >> Avg. speed: 2.229188805153121s/1k candidates, time remaining: 1.4861258701020807mins

Batch #11 >> Avg. speed: 2.229573987717106s/1k candidates, time remaining: 1.4863826584780708mins

Batch #8 >> Avg. speed: 2.231305673706859s/1k candidates, time remaining: 1.487537115804573mins

Batch #14 >> Avg. speed: 2.2456560592822115s/1k candidates, time remaining: 1.4971040395214743mins

Batch #13 >> Avg. speed: 2.253762429254164s/1k candidates, time remaining: 1.5025082861694423mins

Batch #1 >> Avg. speed: 2.2670318281075166s/1k candidates, time remaining: 1.5113545520716776mins

Batch #10 >> Avg. speed: 2.2717426617720924s/1k candidates, time remaining: 1.5144951078480617mins

Batch #9 >> Avg. speed: 2.2838030700853578s/1k candidates, time remaining: 1.5225353800569053mins

Batch #15 >> Avg. speed: 2.2899933934552905s/1k candidates, time remaining: 1.526662262303527mins

Batch #12 >> Avg. speed: 2.298949679139216s/1k candidates, time remaining: 1.532633119426144mins

Batch #0 >>


Batch #6 >> Avg. speed: 477.7388572692871s/1k candidates, time remaining: 4299.649715423584mins
Batch #3 >> Avg. speed: 511.4452838897705s/1k candidates, time remaining: 4603.007555007935mins


Batch #9 >> Avg. speed: 488.33203315734863s/1k candidates, time remaining: 4394.988298416138mins
Batch #1 >> Avg. speed: 526.1373519897461s/1k candidates, time remaining: 4735.236167907715mins
Batch #8 >> Avg. speed: 523.834228515625s/1k candidates, time remaining: 4714.508056640625mins
Batch #12 >> Avg. speed: 524.0256786346436s/1k candidates, time remaining: 4716.231107711792mins

Batch #13 >> Avg. speed: 519.8960304260254s/1k candidates, time remaining: 4679.0642738342285mins




Batch #15 >> Avg. speed: 497.52092361450195s/1k candidates, time remaining: 4477.688312530518mins
Batch #5 >> Avg. speed: 514.4901275634766s/1k candidates, time remaining: 4630.411148071289mins

Batch #4 >> Avg. speed: 527.6689529418945s/1k candidates, time remaining: 4749.020576477051mins

Batch #11 >> Avg. speed: 


Batch #2 >> Avg. speed: 3.3774270262929504s/1k candidates, time remaining: 2.2516180175286338mins

Batch #15 >> Avg. speed: 3.3777874715483174s/1k candidates, time remaining: 2.2518583143655446mins

Batch #12 >> Avg. speed: 3.3786700085787142s/1k candidates, time remaining: 2.2524466723858096mins

Batch #7 >> Avg. speed: 3.38060989079209s/1k candidates, time remaining: 2.2537399271947267mins

Batch #10 >> Avg. speed: 3.3853111810568675s/1k candidates, time remaining: 2.2568741207045786mins

Batch #9 >> Avg. speed: 3.3945033728699325s/1k candidates, time remaining: 2.2630022485799555mins

Batch #14 >> Avg. speed: 3.3986756243330167s/1k candidates, time remaining: 2.265783749555345mins

Batch #6 >> Avg. speed: 3.3992846237978482s/1k candidates, time remaining: 2.2661897491985656mins

Batch #11 >> Avg. speed: 3.404813376034564s/1k candidates, time remaining: 2.2698755840230422mins

Batch #0 >> Avg. speed: 3.4053562035087843s/1k candidates, time remaining: 2.270237469005856mins

540000
Fi


Batch #17 >> Avg. speed: 3.4304096036072242s/1k candidates, time remaining: 8.004289075083523mins

Batch #20 >> Avg. speed: 3.435927523777153s/1k candidates, time remaining: 8.017164222146691mins

Batch #23 >> Avg. speed: 3.424010478651019s/1k candidates, time remaining: 7.9893577835190435mins

Batch #24 >> Avg. speed: 3.4296379894349944s/1k candidates, time remaining: 8.002488642014987mins

Batch #22 >> Avg. speed: 3.4335263250749115s/1k candidates, time remaining: 8.011561425174794mins

Batch #27 >> Avg. speed: 3.4173074297748673s/1k candidates, time remaining: 7.973717336141357mins

Batch #29 >> Avg. speed: 3.41377580677657s/1k candidates, time remaining: 7.965476882478664mins

Batch #26 >> Avg. speed: 3.4367290880164143s/1k candidates, time remaining: 8.019034538704966mins

Batch #25 >> Avg. speed: 3.4444672774416927s/1k candidates, time remaining: 8.037090314030616mins

Batch #31 >> Avg. speed: 3.438706658493821s/1k candidates, time remaining: 8.023648869818915mins

Batch #21 >> 


Batch #32 >> Avg. speed: 3.3860050948979747s/1k candidates, time remaining: 13.544020379591899mins

Batch #33 >> Avg. speed: 3.4175562649807034s/1k candidates, time remaining: 13.670225059922814mins

Batch #34 >> Avg. speed: 3.432629077839953s/1k candidates, time remaining: 13.730516311359812mins

Batch #37 >> Avg. speed: 3.3921982702328575s/1k candidates, time remaining: 13.56879308093143mins

Batch #35 >> Avg. speed: 3.4125568331871228s/1k candidates, time remaining: 13.650227332748491mins

Batch #36 >> Avg. speed: 3.410654588284348s/1k candidates, time remaining: 13.642618353137392mins

Batch #39 >> Avg. speed: 3.4023015337888936s/1k candidates, time remaining: 13.609206135155574mins

Batch #46 >> Avg. speed: 3.3818892060347996s/1k candidates, time remaining: 13.527556824139198mins

Batch #38 >> Avg. speed: 3.425303784397949s/1k candidates, time remaining: 13.701215137591795mins

Batch #42 >> Avg. speed: 3.41189000884381s/1k candidates, time remaining: 13.64756003537524mins

Batch 


Batch #5 >> Avg. speed: 528.3379554748535s/1k candidates, time remaining: 4755.041599273682mins
Batch #6 >> Avg. speed: 500.3945827484131s/1k candidates, time remaining: 4503.551244735718mins

Batch #13 >> Avg. speed: 498.51489067077637s/1k candidates, time remaining: 4486.634016036987mins
Batch #12 >> Avg. speed: 518.7926292419434s/1k candidates, time remaining: 4669.13366317749mins

Batch #2 >> Avg. speed: 518.8570022583008s/1k candidates, time remaining: 4669.713020324707mins


Batch #9 >> Avg. speed: 484.6994876861572s/1k candidates, time remaining: 4362.295389175415mins


Batch #14 >> Avg. speed: 486.7250919342041s/1k candidates, time remaining: 4380.525827407837mins
Batch #1 >> Avg. speed: 515.5000686645508s/1k candidates, time remaining: 4639.500617980957mins

Batch #11 >> Avg. speed: 487.52379417419434s/1k candidates, time remaining: 4387.714147567749mins

Batch #4 >> Avg. speed: 499.76658821105957s/1k candidates, time remaining: 4497.899293899536mins


Batch #3 >> Avg. speed:


Batch #8 >> Avg. speed: 3.4278278635253336s/1k candidates, time remaining: 2.285218575683556mins

Batch #10 >> Avg. speed: 3.428008852428798s/1k candidates, time remaining: 2.2853392349525317mins

Batch #9 >> Avg. speed: 3.4298550172545346s/1k candidates, time remaining: 2.286570011503023mins

Batch #1 >> Avg. speed: 3.4314371807619604s/1k candidates, time remaining: 2.2876247871746402mins

Batch #2 >> Avg. speed: 3.433087531369798s/1k candidates, time remaining: 2.2887250209131986mins

Batch #6 >> Avg. speed: 3.433729501673992s/1k candidates, time remaining: 2.2891530011159946mins

Batch #3 >> Avg. speed: 3.442720424262978s/1k candidates, time remaining: 2.2951469495086525mins

540000
Finished processing of batch 15/n
Batch #16 >> Avg. speed: 668.7061786651611s/1k candidates, time remaining: 6018.35560798645mins

540000
540000
Finished processing of batch 0/n
Finished processing of batch 4/n
Batch #17 >> Avg. speed: 514.4989490509033s/1k candidates, time remaining: 4630.49054145813mi


Batch #19 >> Avg. speed: 3.4452330318959437s/1k candidates, time remaining: 8.03887707442387mins

Batch #24 >> Avg. speed: 3.423047537623656s/1k candidates, time remaining: 7.9871109211218645mins

Batch #23 >> Avg. speed: 3.4311261646829863s/1k candidates, time remaining: 8.005961050926969mins

Batch #21 >> Avg. speed: 3.4375091177474006s/1k candidates, time remaining: 8.020854608077268mins

Batch #25 >> Avg. speed: 3.4513834803393264s/1k candidates, time remaining: 8.053228120791761mins

Batch #30 >> Avg. speed: 3.4432072324407885s/1k candidates, time remaining: 8.034150209028507mins

Batch #29 >> Avg. speed: 3.4484353901229197s/1k candidates, time remaining: 8.046349243620146mins

Batch #27 >> Avg. speed: 3.4605532660410367s/1k candidates, time remaining: 8.074624287429085mins

Batch #28 >> Avg. speed: 3.466898646219456s/1k candidates, time remaining: 8.089430174512065mins

Batch #31 >> Avg. speed: 3.44923211388108s/1k candidates, time remaining: 8.04820826572252mins

Batch #16 >> A

In [None]:
# Signal processing
if __name__ == '__main__':
    
    for sample in ["Res1ToRes2GluTo3Glu_M1-3000_R-0p7","Res1ToRes2GluTo3Glu_M1-5000_R-0p7","Res1ToRes2GluTo3Glu_M1-7000_R-0p7"]:
        isSig = 1
        temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
        temp_tree = temp_file.Get("Events")
        tot_evts = temp_tree.GetEntries()

        expect_time = 2 # in mins
        known_speed = 1.8 # sec per 1k candidates
        evt_batch = int(expect_time * 60 / known_speed * 1000)
        num_batch = math.ceil(tot_evts / evt_batch)
        print(f"Number of Candidates to be processed: {tot_evts}")
        print(f"Candidates to be processed per batch: {evt_batch}")
        print(f"Number of batches to be processed: {num_batch}")

        main_start = time.time()

        batch_size = evt_batch

    #     multiprocessing way
    #     p1 = multiprocessing.Process(target=select_trijet, args=(sample, ibatch_1))
    #     p2 = multiprocessing.Process(target=select_trijet, args=(sample, ibatch_2))
    #     p1.start()
    #     p1.join()
    #     p2.start()
    #     p2.join()

        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = [executor.submit(select_trijet, sample, batch_size, ibatch, isSig) for ibatch in range(num_batch)]
            status = [r.result() for r in results]
            print(status)

        print(f"Time used: {round(time.time() - main_start, 2)}")

In [None]:
# Dry-run, speed calculation
sample = "QCD_Pt_300to470"
isSig = 0
temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
temp_tree = temp_file.Get("Events")
tot_evts = temp_tree.GetEntries()

main_start = time.time()

select_trijet(sample, 99999, 0, isSig)

print(f"Time used: {round(time.time() - main_start, 2)}")