In [36]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import ROOT
from ROOT import TFile, TTree, TH1F, TCanvas, TAxis, TLegend, TTreeReader, TTreeReaderValue, TRandom3
import multiprocessing
import time
import concurrent.futures

In [None]:
# inFile = TFile("/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/QCD_Pt_300to470_ML_study.root")
# inTree = inFile.Get("Events")
# variable2use = [i.GetName() for i in inTree.GetListOfBranches()]
# display([f"{x}[0] = inTree.{x}" for x in variable2use])

In [37]:
def get_weight(sample):
    lumi = 100
    weight_xsec = {
        'QCD_Pt_300to470': lumi*6830/111229780,
        'QCD_Pt_470to600': lumi*552.1/27881028,
        'QCD_Pt_600to800': lumi*156.5/12807188,
        'QCD_Pt_800to1000': lumi*26.28/1906000,
        'QCD_Pt_1000to1400': lumi*7.47/1517308,
        'QCD_Pt_1400to1800': lumi*0.6484/776000,
        'QCD_Pt_1800to2400': lumi*0.08743/856734,
        'QCD_Pt_2400to3200': lumi*0.005236/1485988,
        'QCD_Pt_3200toInf': lumi*0.0001357/757837,
    }
    return weight_xsec[sample]

In [38]:
def select_trijet(sample, batch_size, ibatch, weight):
    inFile = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
    inTree = inFile.Get("Events")
    outFile = TFile(f"{sample}_filtered_2fb_{ibatch}.root","RECREATE")
    outTree = TTree("Events","Events")
    
    dijet_pt = np.empty((1), dtype="float32")
    dijet_eta = np.empty((1), dtype="float32")
    dijet_phi = np.empty((1), dtype="float32")
    dR_jj = np.empty((1), dtype="float32")
    dEta_jj = np.empty((1), dtype="float32")
    dPhi_jj = np.empty((1), dtype="float32")
    m_jj = np.empty((1), dtype="float32")
    jet_pt_0 = np.empty((1), dtype="float32")
    jet_eta_0 = np.empty((1), dtype="float32")
    jet_phi_0 = np.empty((1), dtype="float32")
    dR_j0j2 = np.empty((1), dtype="float32")
    dEta_j0j2 = np.empty((1), dtype="float32")
    dPhi_j0j2 = np.empty((1), dtype="float32")
    dR_j1j2 = np.empty((1), dtype="float32")
    dEta_j1j2 = np.empty((1), dtype="float32")
    dPhi_j1j2 = np.empty((1), dtype="float32")
    jet_m_0 = np.empty((1), dtype="float32")
    jet_ptoverm_0 = np.empty((1), dtype="float32")
    jet_pt_1 = np.empty((1), dtype="float32")
    jet_eta_1 = np.empty((1), dtype="float32")
    jet_phi_1 = np.empty((1), dtype="float32")
    jet_m_1 = np.empty((1), dtype="float32")
    jet_ptoverm_1 = np.empty((1), dtype="float32")
    jet_pt_2 = np.empty((1), dtype="float32")
    jet_eta_2 = np.empty((1), dtype="float32")
    jet_phi_2 = np.empty((1), dtype="float32")
    jet_m_2 = np.empty((1), dtype="float32")
    jet_ptoverm_2 = np.empty((1), dtype="float32")
    dR_jj_j = np.empty((1), dtype="float32")
    dEta_jj_j = np.empty((1), dtype="float32")
    dPhi_jj_j = np.empty((1), dtype="float32")
    M_jjj = np.empty((1), dtype="float32")
    jet_ptoverM_0 = np.empty((1), dtype="float32")
    jet_ptoverM_1 = np.empty((1), dtype="float32")
    jet_ptoverM_2 = np.empty((1), dtype="float32")
    dijet_ptoverM = np.empty((1), dtype="float32")
    dijet_res_dPt = np.empty((1), dtype="float32")
    dijet_res_dEta = np.empty((1), dtype="float32")
    dijet_res_dPhi = np.empty((1), dtype="float32")
    run_num = np.empty((1), dtype="int32")
    evt_num = np.empty((1), dtype="int32")
    lumi_block = np.empty((1), dtype="int32")

    outTree.Branch("dijet_pt", dijet_pt, "dijet_pt/F")
    outTree.Branch("dijet_eta", dijet_eta, "dijet_eta/F")
    outTree.Branch("dijet_phi", dijet_phi, "dijet_phi/F")
    outTree.Branch("dR_jj", dR_jj, "dR_jj/F")
    outTree.Branch("dEta_jj", dEta_jj, "dEta_jj/F")
    outTree.Branch("dPhi_jj", dPhi_jj, "dPhi_jj/F")
    outTree.Branch("m_jj", m_jj, "m_jj/F")
    outTree.Branch("jet_pt_0", jet_pt_0, "jet_pt_0/F")
    outTree.Branch("jet_eta_0", jet_eta_0, "jet_eta_0/F")
    outTree.Branch("jet_phi_0", jet_phi_0, "jet_phi_0/F")
    outTree.Branch("dR_j0j2", dR_j0j2, "dR_j0j2/F")
    outTree.Branch("dEta_j0j2", dEta_j0j2, "dEta_j0j2/F")
    outTree.Branch("dPhi_j0j2", dPhi_j0j2, "dPhi_j0j2/F")
    outTree.Branch("dR_j1j2", dR_j1j2, "dR_j1j2/F")
    outTree.Branch("dEta_j1j2", dEta_j1j2, "dEta_j1j2/F")
    outTree.Branch("dPhi_j1j2", dPhi_j1j2, "dPhi_j1j2/F")
    outTree.Branch("jet_m_0", jet_m_0, "jet_m_0/F")
    outTree.Branch("jet_ptoverm_0", jet_ptoverm_0, "jet_ptoverm_0/F")
    outTree.Branch("jet_pt_1", jet_pt_1, "jet_pt_1/F")
    outTree.Branch("jet_eta_1", jet_eta_1, "jet_eta_1/F")
    outTree.Branch("jet_phi_1", jet_phi_1, "jet_phi_1/F")
    outTree.Branch("jet_m_1", jet_m_1, "jet_m_1/F")
    outTree.Branch("jet_ptoverm_1", jet_ptoverm_1, "jet_ptoverm_1/F")
    outTree.Branch("jet_pt_2", jet_pt_2, "jet_pt_2/F")
    outTree.Branch("jet_eta_2", jet_eta_2, "jet_eta_2/F")
    outTree.Branch("jet_phi_2", jet_phi_2, "jet_phi_2/F")
    outTree.Branch("jet_m_2", jet_m_2, "jet_m_2/F")
    outTree.Branch("jet_ptoverm_2", jet_ptoverm_2, "jet_ptoverm_2/F")
    outTree.Branch("dR_jj_j", dR_jj_j, "dR_jj_j/F")
    outTree.Branch("dEta_jj_j", dEta_jj_j, "dEta_jj_j/F")
    outTree.Branch("dPhi_jj_j", dPhi_jj_j, "dPhi_jj_j/F")
    outTree.Branch("M_jjj", M_jjj, "M_jjj/F")
    outTree.Branch("jet_ptoverM_0", jet_ptoverM_0, "jet_ptoverM_0/F")
    outTree.Branch("jet_ptoverM_1", jet_ptoverM_1, "jet_ptoverM_1/F")
    outTree.Branch("jet_ptoverM_2", jet_ptoverM_2, "jet_ptoverM_2/F")
    outTree.Branch("dijet_ptoverM", dijet_ptoverM, "dijet_ptoverM/F")
    outTree.Branch("dijet_res_dPt", dijet_res_dPt, "dijet_res_dPt/F")
    outTree.Branch("dijet_res_dEta", dijet_res_dEta, "dijet_res_dEta/F")
    outTree.Branch("dijet_res_dPhi", dijet_res_dPhi, "dijet_res_dPhi/F")
    outTree.Branch("run_num", run_num, "run_num/I")
    outTree.Branch("evt_num", evt_num, "evt_num/I")
    outTree.Branch("lumi_block", lumi_block, "lumi_block/I")

    randgen = TRandom3()
    t_start = time.time()
    evt_start = ibatch*batch_size
    evt_end = (ibatch+1)*batch_size
    print(evt_start, evt_end)
    if((ibatch+1)*batch_size > inTree.GetEntries()):
        evt_end = inTree.GetEntries()
    for ievt in range(evt_start, evt_end):
        inTree.GetEntry(ievt)
#         if ievt%10000 == 0:
#             print("Processing: ",ievt)
#             # time check:
#             t_end = time.time()
#             speed = (t_end - t_start)/(ievt+1)*10000
#             print(f"Avg. speed: {speed}s/10k candidates".format(speed))
#         if ievt > 100000: break

        if (ievt - evt_start)%1000000 == 0:
            # time check:
            t_end = time.time()
            speed = (t_end - t_start)/(ievt - evt_start+1)*10000
            t_remain = (evt_end - ievt) / 10000 * speed / 60
            print(f"{sample}: Batch #{ibatch} >> Avg. speed: {speed}s/10k candidates, time remaining: {t_remain}mins\n".format(ibatch, speed, t_remain))
        
        rand = randgen.Uniform()
        if(rand > weight):
            continue
        dijet_pt[0] = inTree.dijet_pt
        dijet_eta[0] = inTree.dijet_eta
        dijet_phi[0] = inTree.dijet_phi
        dR_jj[0] = inTree.dR_jj
        dEta_jj[0] = inTree.dEta_jj
        dPhi_jj[0] = inTree.dPhi_jj
        m_jj[0] = inTree.m_jj
        jet_pt_0[0] = inTree.jet_pt_0
        jet_eta_0[0] = inTree.jet_eta_0
        jet_phi_0[0] = inTree.jet_phi_0
        dR_j0j2[0] = inTree.dR_j0j2
        dEta_j0j2[0] = inTree.dEta_j0j2
        dPhi_j0j2[0] = inTree.dPhi_j0j2
        dR_j1j2[0] = inTree.dR_j1j2
        dEta_j1j2[0] = inTree.dEta_j1j2
        dPhi_j1j2[0] = inTree.dPhi_j1j2
        jet_m_0[0] = inTree.jet_m_0
        jet_ptoverm_0[0] = inTree.jet_ptoverm_0
        jet_pt_1[0] = inTree.jet_pt_1
        jet_eta_1[0] = inTree.jet_eta_1
        jet_phi_1[0] = inTree.jet_phi_1
        jet_m_1[0] = inTree.jet_m_1
        jet_ptoverm_1[0] = inTree.jet_ptoverm_1
        jet_pt_2[0] = inTree.jet_pt_2
        jet_eta_2[0] = inTree.jet_eta_2
        jet_phi_2[0] = inTree.jet_phi_2
        jet_m_2[0] = inTree.jet_m_2
        jet_ptoverm_2[0] = inTree.jet_ptoverm_2
        dR_jj_j[0] = inTree.dR_jj_j
        dEta_jj_j[0] = inTree.dEta_jj_j
        dPhi_jj_j[0] = inTree.dPhi_jj_j
        M_jjj[0] = inTree.M_jjj
        jet_ptoverM_0[0] = inTree.jet_ptoverM_0
        jet_ptoverM_1[0] = inTree.jet_ptoverM_1
        jet_ptoverM_2[0] = inTree.jet_ptoverM_2
        dijet_ptoverM[0] = inTree.dijet_ptoverM
        dijet_res_dPt[0] = inTree.dijet_res_dPt
        dijet_res_dEta[0] = inTree.dijet_res_dEta
        dijet_res_dPhi[0] = inTree.dijet_res_dPhi
        run_num[0] = inTree.run_num
        evt_num[0] = inTree.evt_num
        lumi_block[0] = inTree.lumi_block
        outTree.Fill()
    print(outTree.GetEntries())
    outFile.cd()
    outTree.Write()
    outFile.Write()
    outFile.Close()
    print(f"Finished processing of batch {ibatch}/n")
    return 0

In [39]:
# QCD processing
if __name__ == '__main__':
    
    for sample in ["QCD_Pt_300to470","QCD_Pt_470to600","QCD_Pt_600to800","QCD_Pt_800to1000","QCD_Pt_1000to1400","QCD_Pt_1400to1800",
                   "QCD_Pt_1800to2400","QCD_Pt_2400to3200","QCD_Pt_3200toInf"]:
        temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
        temp_tree = temp_file.Get("Events")
        tot_evts = temp_tree.GetEntries()
        print(tot_evts)
        weight = get_weight(sample)
        
        expect_time = 0.1 # in hrs
        known_speed = 0.05 # sec per 10k candidates
        evt_batch = int(expect_time * 3600 / known_speed * 10000)
        num_batch = math.ceil(tot_evts / evt_batch)
        print(f"Number of Candidates to be processed: {tot_evts}")
        print(f"Candidates to be processed per batch: {evt_batch}")
        print(f"Number of batches to be processed: {num_batch}")

        main_start = time.time()
        with concurrent.futures.ProcessPoolExecutor() as executor:
            results = [executor.submit(select_trijet, sample, evt_batch, ibatch, weight) for ibatch in range(num_batch)]
            status = [r.result() for r in results]
            print(status)

        print(f"Time used: {round(time.time() - main_start, 2)}")

13502853
Number of Candidates to be processed: 13502853
Candidates to be processed per batch: 72000000
Number of batches to be processed: 1
0 72000000
QCD_Pt_300to470: Batch #0 >> Avg. speed: 289.461612701416s/10k candidates, time remaining: 6514.262675750256mins

QCD_Pt_300to470: Batch #0 >> Avg. speed: 0.06906864173534386s/10k candidates, time remaining: 1.4392584575444485mins

QCD_Pt_300to470: Batch #0 >> Avg. speed: 0.06725575541364145s/10k candidates, time remaining: 1.28938844654512mins

QCD_Pt_300to470: Batch #0 >> Avg. speed: 0.0674199841197404s/10k candidates, time remaining: 1.1801703041199465mins

QCD_Pt_300to470: Batch #0 >> Avg. speed: 0.06698286766132654s/10k candidates, time remaining: 1.0608805748400665mins

QCD_Pt_300to470: Batch #0 >> Avg. speed: 0.06712881605345432s/10k candidates, time remaining: 0.9513107582776037mins

QCD_Pt_300to470: Batch #0 >> Avg. speed: 0.06727483979180934s/10k candidates, time remaining: 0.8412553892608269mins

QCD_Pt_300to470: Batch #0 >> A


3915
Finished processing of batch 0/n
[0]
Time used: 12.45
2369250
Number of Candidates to be processed: 2369250
Candidates to be processed per batch: 72000000
Number of batches to be processed: 1
0 72000000
QCD_Pt_1000to1400: Batch #0 >> Avg. speed: 211.5154266357422s/10k candidates, time remaining: 835.221540927887mins

QCD_Pt_1000to1400: Batch #0 >> Avg. speed: 0.04069263442104356s/10k candidates, time remaining: 0.09286398280168982mins

QCD_Pt_1000to1400: Batch #0 >> Avg. speed: 0.04063548678903761s/10k candidates, time remaining: 0.025007755828086894mins

1216
Finished processing of batch 0/n
[0]
Time used: 9.67
1234140
Number of Candidates to be processed: 1234140
Candidates to be processed per batch: 72000000
Number of batches to be processed: 1
0 72000000
QCD_Pt_1400to1800: Batch #0 >> Avg. speed: 202.53419876098633s/10k candidates, time remaining: 416.5925934314728mins

QCD_Pt_1400to1800: Batch #0 >> Avg. speed: 0.0407220671652009s/10k candidates, time remaining: 0.0158911080

In [None]:
# Dry-run, speed calculation
sample = "QCD_Pt_300to470"
temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs_3_jets/{sample}_ML_study.root","READ")
temp_tree = temp_file.Get("Events")
tot_evts = temp_tree.GetEntries()
weight = get_weight(sample)

main_start = time.time()

select_trijet(sample, 50000, 0, weight)

print(f"Time used: {round(time.time() - main_start, 2)}")