In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
import ROOT
from ROOT import TFile, TTree, TH1F, TCanvas, TAxis, TLegend, TTreeReader, TTreeReaderValue
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
import multiprocessing
import time
import concurrent.futures

Welcome to JupyROOT 6.22/06


In [2]:
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
from joblib import dump, load
# Module-level regressor whose predict() is called by BDT_sort. The relative
# path is resolved against the kernel's current working directory.
# Presumably the AdaBoost BDT trained elsewhere -- TODO confirm the model type.
clf = load('BDT_model_1.joblib') 

In [3]:
def BDT_sort(event_record, evtnum, index, model=None):
    """Return the best-scoring trijet candidate of one event.

    Parameters
    ----------
    event_record : sequence of feature vectors
        One feature vector per trijet candidate, in the column order the
        model expects.
    evtnum : sequence
        Event number of each candidate. Only used for a sanity check that
        prints a warning; it never changes the result.
    index : sequence
        Identifier (tree entry number in the caller) of each candidate,
        parallel to ``event_record``.
    model : object with a ``predict`` method, optional
        Regressor used for scoring. Defaults to the module-level ``clf``.

    Returns
    -------
    (best_score, best_index)
        ``best_score`` is a Python float and ``best_index`` is the element of
        ``index`` belonging to the highest-scoring candidate (the first one on
        ties). For an empty ``event_record`` the sentinel ``(0, -99)`` is
        returned.
    """
    if model is None:
        model = clf

    # Warn when the group is not exactly 12 candidates of a single event.
    # Comparing every entry (instead of the mean) also catches mixed groups
    # whose event numbers happen to average to the first one.
    if len(evtnum) != 12 or any(num != evtnum[0] for num in evtnum):
        print("Check if 12 candidates are coming from the same event")
        print(evtnum)

    if len(event_record) == 0:
        return 0, -99

    # Score all candidates in a single predict() call.
    scores = np.asarray(model.predict(event_record), dtype=float).ravel()

    # argmax has no lower threshold, so a group whose scores are all <= 0
    # still yields a valid candidate instead of the -99 sentinel.
    best = int(np.argmax(scores))
    return float(scores[best]), index[best]

def select_trijet(sample, batch_size, ibatch, isSig):
    """Select the best BDT-ranked trijet candidate per event for one batch.

    Reads entries ``[ibatch*batch_size, (ibatch+1)*batch_size)`` (clamped to
    the tree size) of the ``Events`` tree in ``{sample}_ML_study.root``,
    groups consecutive entries into blocks of 12 candidates, asks
    ``BDT_sort`` for the best candidate of each block and writes that
    candidate to ``{sample}_selection_{ibatch}.root``.

    Parameters
    ----------
    sample : str
        Sample name used to build the input and output file names.
    batch_size : int
        Number of tree entries (candidates) per batch. Groups are formed
        from the first entry of the batch, so this should be a multiple of
        12; a trailing incomplete group is not written.
    ibatch : int
        Zero-based batch number.
    isSig : int
        If 1, ``isMatched`` is copied from ``gen_dijet_matched``; otherwise
        it is set to -99.

    Returns
    -------
    int
        Always 0.

    Raises
    ------
    OSError
        If the input or output file cannot be opened.
    RuntimeError
        If the input file has no ``Events`` tree.
    """
    n_candidates = 12  # group size; BDT_sort warns on any other group size

    # Input features of the BDT, in the order they are passed to the model.
    variable2use = ['dijet_eta', 'dijet_phi', 'dR_jj', 'dEta_jj', 'dPhi_jj', 'jet_eta_0', 'jet_phi_0', 'jet_ptoverm_0', 'jet_eta_1',
                    'jet_phi_1', 'jet_ptoverm_1', 'jet_eta_2', 'jet_phi_2', 'jet_ptoverm_2', 'dR_jj_j', 'dEta_jj_j',
                    'dPhi_jj_j', 'jet_ptoverM_0', 'jet_ptoverM_1', 'jet_ptoverM_2', 'dijet_ptoverM']
    # Float branches copied unchanged from the selected input entry.
    copied_vars = variable2use + ['M_jjj', 'm_jj']

    inFile = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs/{sample}_ML_study.root","READ")
    if not inFile or inFile.IsZombie():
        raise OSError(f"Cannot open input file for sample {sample}")

    outFile = None
    try:
        inTree = inFile.Get("Events")
        if not inTree:
            raise RuntimeError(f"No 'Events' tree in input file for sample {sample}")

        # The output file must be opened right before the TTree is created
        # so that the new tree is attached to it.
        outFile = TFile(f"{sample}_selection_{ibatch}.root","RECREATE")
        if outFile.IsZombie():
            raise OSError(f"Cannot create output file {sample}_selection_{ibatch}.root")
        outTree = TTree("Events","Events")

        # One single-element buffer per output branch; the dict keeps the
        # arrays alive for as long as the tree points at them.
        float_buffers = {}
        for name in copied_vars + ['score_BDT']:
            float_buffers[name] = np.empty((1), dtype="float32")
            outTree.Branch(name, float_buffers[name], f"{name}/F")
        isMatched = np.empty((1), dtype="int32")
        outTree.Branch("isMatched", isMatched, "isMatched/I")

        evtnum = []
        index = []
        event_record = []
        n_entries = inTree.GetEntries()
        evt_start = ibatch*batch_size
        evt_end = min((ibatch+1)*batch_size, n_entries)

        for ievt in range(evt_start, evt_end):
            inTree.GetEntry(ievt)
            # Accumulate candidates until one full group is available.
            event_record.append([getattr(inTree, var) for var in variable2use])
            evtnum.append(inTree.evt_num)
            index.append(ievt)
            if len(event_record) < n_candidates:
                continue

            best_score, best_index = BDT_sort(event_record, evtnum, index)
            # Re-read the winning entry so all of its branches can be copied.
            inTree.GetEntry(best_index)
            for name in copied_vars:
                float_buffers[name][0] = getattr(inTree, name)
            # BDT_sort may hand back a scalar or a 1-element array.
            float_buffers['score_BDT'][0] = np.ravel(best_score)[0]
            isMatched[0] = inTree.gen_dijet_matched if isSig == 1 else -99
            outTree.Fill()

            event_record.clear()
            evtnum.clear()
            index.clear()

        # TFile.Write() writes every object attached to the file, including
        # the tree; an extra outTree.Write() would store the tree twice.
        outFile.cd()
        outFile.Write()
    finally:
        # Close both files even when processing fails part-way.
        if outFile is not None:
            outFile.Close()
        inFile.Close()

    print(f"Finished processing of batch {ibatch}")
    return 0

In [None]:
if __name__ == '__main__':
    # Driver for the QCD background sample. The batch plan for a full run is
    # computed and printed, but only a short trial (2 batches of 24
    # candidates) is actually submitted.

    sample = "QCD_Pt_470to600"
    isSig = 0

    # Open the input only to count candidates, then close it so the handle is
    # not inherited by the worker processes.
    temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs/{sample}_ML_study.root","READ")
    temp_tree = temp_file.Get("Events")
    if not temp_tree:
        temp_file.Close()
        raise RuntimeError(f"No 'Events' tree found for sample {sample}")
    tot_evts = temp_tree.GetEntries()
    temp_file.Close()

    expect_time = 2 # in hrs
    known_speed = 5 # sec per 1k candidates
    evt_batch = int(expect_time * 3600 / known_speed * 1000)
    num_batch = math.ceil(tot_evts / evt_batch)
    print(f"Number of Candidates to be processed: {tot_evts}")
    print(f"Candidates to be processed per batch: {evt_batch}")
    print(f"Number of batches to be processed: {num_batch}")

    # Trial configuration: 24 candidates = 2 groups of 12 per batch.
    batch_size = 24
    n_trial_batches = 2
    print(f"Submitting {n_trial_batches} batches of {batch_size} candidates")

    main_start = time.time()

    # Kept inside the __main__ guard because every name used here is defined
    # inside it.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [executor.submit(select_trijet, sample, batch_size, ibatch, isSig) for ibatch in range(n_trial_batches)]
        # result() re-raises any exception that occurred in a worker.
        status = [r.result() for r in results]
        print(status)

    print(f"Time used: {round(time.time() - main_start, 2)}")

In [4]:
if __name__ == '__main__':
    # Driver for the signal sample: split all candidates into batches and
    # process every batch in a separate worker process.

    sample = "Res1ToRes2GluTo3Glu_M1-3000_R-0p3"
    isSig = 1

    # Open the input only to count candidates, then close it so the handle is
    # not inherited by the worker processes.
    temp_file = TFile(f"/home/xyan13/Trijet/TrijetAna/TrijetAna/outputs/{sample}_ML_study.root","READ")
    temp_tree = temp_file.Get("Events")
    if not temp_tree:
        temp_file.Close()
        raise RuntimeError(f"No 'Events' tree found for sample {sample}")
    tot_evts = temp_tree.GetEntries()
    temp_file.Close()

    expect_time = 2 # in mins
    known_speed = 5 # sec per 1k candidates
    evt_batch = int(expect_time * 60 / known_speed * 1000)
    # select_trijet forms groups of 12 consecutive candidates starting at the
    # first entry of each batch, so the batch size must be a multiple of 12 or
    # groups would straddle batch boundaries.
    evt_batch = max(12, evt_batch - evt_batch % 12)
    num_batch = math.ceil(tot_evts / evt_batch)
    print(f"Number of Candidates to be processed: {tot_evts}")
    print(f"Candidates to be processed per batch: {evt_batch}")
    print(f"Number of batches to be processed: {num_batch}")

    main_start = time.time()

    batch_size = evt_batch

    # Kept inside the __main__ guard because every name used here is defined
    # inside it.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = [executor.submit(select_trijet, sample, batch_size, ibatch, isSig) for ibatch in range(num_batch)]
        # result() re-raises any exception that occurred in a worker.
        status = [r.result() for r in results]
        print(status)

    print(f"Time used: {round(time.time() - main_start, 2)}")

Number of Candidates to be processed: 402156
Candidates to be processed per batch: 24000
Number of batches to be processed: 17
Finished processing of batch 14Finished processing of batch 3

Finished processing of batch 11
Finished processing of batch 6
Finished processing of batch 8
Finished processing of batch 13
Finished processing of batch 10
Finished processing of batch 0
Finished processing of batch 1
Finished processing of batch 12Finished processing of batch 15

Finished processing of batch 9
Finished processing of batch 2
Finished processing of batch 5
Finished processing of batch 4
Finished processing of batch 7
Finished processing of batch 16
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Time used: 78.12
