In [6]:
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
from coffea import hist, processor
import numpy as np
import awkward as ak
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from pprint import pprint
import numba
from numba import njit
from numba.typed import List
import correctionlib, rich
import correctionlib.convert

In [2]:
filename = "root://ingrid-se03.cism.ucl.ac.be:1094//store/mc/RunIISummer20UL18NanoAODv9/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v2/100000/19922B78-283B-CD48-8A3D-0308D48A824A.root"
#events = NanoEventsFactory.from_root(filename, schemaclass=NanoAODSchema).events()
events = NanoEventsFactory.from_root(filename, schemaclass=NanoAODSchema, entry_stop=20000).events()

dataset = "ttHTobb"

In [3]:
from hist import Hist

In [None]:
btagged = (Hist.new.Reg(20, 0, 400,overflow=True, underflow=False, name="pt" )
                .Reg(10, -5, 5, overflow=False, underflow=False, name="eta")
                .Int(0,3, name="flavour")
                .Weight())
non_btagged = (Hist.new.Reg(20, 0, 400,overflow=True, underflow=False, name="pt" )
                .Reg(10, -5, 5, overflow=False, underflow=False, name="eta")
                .Int(0,3, name="flavour")
                .Weight())

In [14]:
bJets = events.Jet[events.Jet.btagDeepFlavB > 0.277]
bJets_L = bJets[bJets.hadronFlavour==0]
bJets_C = bJets[bJets.hadronFlavour==4]
bJets_B = bJets[bJets.hadronFlavour==5]

bJets_flavour = ak.where(bJets.hadronFlavour!=0, bJets.hadronFlavour-3, 0.)

jets_L = events.Jet[events.Jet.hadronFlavour==0]
jets_C = events.Jet[events.Jet.hadronFlavour==4]
jets_B = events.Jet[events.Jet.hadronFlavour==5]

jets_flavour = ak.where(events.Jet.hadronFlavour!=0, events.Jet.hadronFlavour-3, 0.)


In [8]:
print(ak.num(bJets_L))
print(ak.num(bJets_C))
print(ak.num(bJets_B))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ... 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 1, 0, 0, ... 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[4, 3, 3, 2, 1, 3, 2, 4, 2, 4, 3, 1, 3, 4, ... 2, 4, 3, 2, 3, 2, 1, 2, 3, 3, 2, 4, 3]


In [9]:
print(ak.num(jets_L))
print(ak.num(jets_C))
print(ak.num(jets_B))

[4, 4, 6, 2, 5, 4, 8, 2, 7, 3, 4, 8, 5, ... 2, 7, 7, 5, 6, 5, 5, 6, 7, 7, 10, 2, 7]
[2, 1, 1, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, ... 2, 0, 0, 1, 1, 1, 1, 0, 0, 0, 2, 0, 0]
[4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 4, ... 3, 4, 4, 4, 4, 4, 3, 4, 3, 3, 3, 4, 3]


In [17]:
btagged.fill(ak.flatten(bJets.pt),
             ak.flatten(bJets.eta),
             ak.flatten(bJets_flavour))

Hist(
  Regular(20, 0, 400, underflow=False, name='pt'),
  Regular(10, -5, 5, underflow=False, overflow=False, name='eta'),
  Integer(0, 3, name='flavour'),
  storage=Weight()) # Sum: WeightedSum(value=121984, variance=121984) (WeightedSum(value=122962, variance=122962) with flow)

In [23]:
non_btagged.fill(ak.flatten(events.Jet.pt),
             ak.flatten(events.Jet.eta),
             ak.flatten(jets_flavour))

Hist(
  Regular(20, 0, 400, underflow=False, name='pt'),
  Regular(10, -5, 5, underflow=False, overflow=False, name='eta'),
  Integer(0, 3, name='flavour'),
  storage=Weight()) # Sum: WeightedSum(value=202993, variance=202993) (WeightedSum(value=204136, variance=204136) with flow)

In [24]:
btagged[{"flavour":2}]

In [25]:
non_btagged[{"flavour":2}]

### Let's save the efficiency in correctionlib format

In [28]:
w_num,  x, y, z = btagged.to_numpy()
num_var = btagged.variances()
w_denom,  x, y, z  = non_btagged.to_numpy()
denom_var = non_btagged.variances()

ratio= np.where( (w_denom>0)&(w_num>0), w_num/w_denom, 1.) 
ratio_err =  np.where( (w_denom>0)&(w_num>0),
                      np.sqrt((1/w_denom)**2 * num_var + (w_num/w_denom**2)**2 * denom_var),
                      0.)

#recreate the histo
sfhist = ( Hist.new.Reg(20, 0, 400,overflow=True, underflow=False, name="pt" )
                .Reg(10, -5, 5, overflow=False, underflow=False, name="eta")
                .Int(0,3, name="flavour")
           .Double(data=ratio))



  ratio= np.where( (w_denom>0)&(w_num>0), w_num/w_denom, 1.)
  np.sqrt((1/w_denom)**2 * num_var + (w_num/w_denom**2)**2 * denom_var),
  np.sqrt((1/w_denom)**2 * num_var + (w_num/w_denom**2)**2 * denom_var),
  np.sqrt((1/w_denom)**2 * num_var + (w_num/w_denom**2)**2 * denom_var),


In [7]:
# without a name, the resulting object will fail validation
sfhist.name = "TRF_efficiencies"
sfhist.label = "out"
clibcorr = correctionlib.convert.from_histogram(sfhist)
clibcorr.description = "Btagging efficiency by pt, eta, flavour"

# set overflow bins behavior (default is to raise an error when out of bounds)
clibcorr.data.flow = "clamp"

cset = correctionlib.schemav2.CorrectionSet(
    schema_version=2,
    description="trf efficiency",
    corrections=[clibcorr],
)
rich.print(cset)

with open("test_trf.json", "w") as fout:
    fout.write(cset.json(exclude_unset=True))

NameError: name 'sfhist' is not defined

In [8]:
cset = correctionlib.CorrectionSet.from_file("test_trf.json")
corr = cset["TRF_efficiencies"]

In [9]:
corr.evaluate(ak.flatten(bJets.pt), ak.flatten(bJets.eta), ak.flatten(bJets_flavour ))

NameError: name 'bJets' is not defined

# Compute the TRF method

In [10]:
jet_pt = ak.flatten(events.Jet.pt)
jet_eta = ak.flatten(events.Jet.eta)
jet_flavour = ak.where(events.Jet.hadronFlavour!=0, events.Jet.hadronFlavour-3, 0.)
jet_fl = ak.flatten(jet_flavour)

num_jets = ak.num(events.Jet)

In [11]:
eff = corr.evaluate(jet_pt, jet_eta, jet_fl)

In [12]:
eff_j = ak.unflatten(eff, num_jets)

In [13]:
eff_j

<Array [[1.7, 1.71, ... 0.0808, 0.0793]] type='20000 * var * float64'>

## Computing combinations without repetitions

In [35]:
from itertools import combinations, chain
from scipy.special import comb

In [26]:
comb(15, 3, exact=True)

455

In [20]:
list(combinations([0,1,2,3,4,5], 3))

[(0, 1, 2),
 (0, 1, 3),
 (0, 1, 4),
 (0, 1, 5),
 (0, 2, 3),
 (0, 2, 4),
 (0, 2, 5),
 (0, 3, 4),
 (0, 3, 5),
 (0, 4, 5),
 (1, 2, 3),
 (1, 2, 4),
 (1, 2, 5),
 (1, 3, 4),
 (1, 3, 5),
 (1, 4, 5),
 (2, 3, 4),
 (2, 3, 5),
 (2, 4, 5),
 (3, 4, 5)]

In [44]:
@numba.jit(nopython=False)
def comb_index(n, k):
    count = comb(n,k, exact=True)
    index = np.fromiter(chain.from_iterable(combinations(range(n), k)), 
                        int, count=count*k)
    return index.reshape(-1, k)

  @numba.jit(nopython=False)


In [54]:
%%timeit
comb_index(15, 4)

219 µs ± 27.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [52]:
out = comb_index(10, 3)