<!-- <img  src="https://swan.web.cern.ch/sites/swan.web.cern.ch/files/pictures/logo_swan_letters.png" alt="SWAN" style="float: left; width: 15%; margin-right: 5%; margin-left: 17%; margin-top: 1.0em; margin-bottom: 2.0em;">
<img src="https://spark.apache.org/images/spark-logo-trademark.png" alt="EP-SFT" style="float: left; width: 25%; margin-right: 0%; margin-left: 0%; margin-bottom: 2.0em;">
<img src="https://cms-docdb.cern.ch/cgi-bin/PublicDocDB/RetrieveFile?docid=3045&filename=CMSlogo_color_label_1024_May2014.png&version=3" alt="CMS" style="float: left; width: 12%; margin-left: 5%; margin-right: 5%; margin-bottom: 2.0em;"> -->
<p style="clear: both;">
<div style="text-align:center"><h1>CMS H&#8594;µµ analysis with Apache Spark  
     <br> using Coffea and Laurelin packages from Fermilab</h1></div>
<div style="text-align:center"><i>Author: Dmitry Kondratyev, based on example code by Lindsey Gray</i></div>
<hr style="border-top-width: 4px; border-top-color: #34609b;">

# Search for Higgs boson decaying into two muons

This code uses the awkward-array toolset and Coffea [histograms](https://coffeateam.github.io/coffea/modules/coffea.hist.html).
It also shows the analysis object syntax implemented by Coffea [JaggedCandidateArray](https://coffeateam.github.io/coffea/api/coffea.analysis_objects.JaggedCandidateMethods.html), and the usage of custom [accumulators](https://coffeateam.github.io/coffea/api/coffea.processor.AccumulatorABC.html) other than histograms.  Further, it introduces the [processor](https://coffeateam.github.io/coffea/api/coffea.processor.ProcessorABC.html) concept and the interface to Apache Spark.

#### SWAN env: LCG96 Python3 stack and Cloud Containers cluster

In [None]:
# Run this cell if you do not have coffea installed (e.g. on SWAN with the LCG 96 Python3 stack)
# NOTE(review): the install is not pinned to a version, so it pulls whatever release is
# newest at run time -- confirm that release still provides the coffea APIs imported
# further down (coffea.hist, JaggedCandidateArray), or pin the version.
!pip install --user --upgrade coffea

# spark.jars.packages doesn't work with Spark 2.4 with Kubernetes, so the jars below are
# fetched into the working directory instead (wget -N only re-downloads newer files)
!wget -N https://repo1.maven.org/maven2/edu/vanderbilt/accre/laurelin/0.5.1/laurelin-0.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-api/2.11.2/log4j-api-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-core/2.11.2/log4j-core-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/lz4/lz4-java/1.5.1/lz4-java-1.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/tukaani/xz/1.2/xz-1.2.jar

In [None]:
# Run this cell before establishing spark connection

import os

# Append the extra site-packages directory to PYTHONPATH.
# PYTHONPATH may be unset on a fresh kernel, so fall back to an empty value instead of
# raising KeyError, and skip the append when the entry is already present so that
# re-running the cell does not keep growing the variable.
_site_packages = '/usr/local/lib/python3.6/site-packages'
_pythonpath = os.environ.get('PYTHONPATH', '')
if _site_packages not in _pythonpath.split(':'):
    if _pythonpath:
        os.environ['PYTHONPATH'] = _pythonpath + ':' + _site_packages
    else:
        os.environ['PYTHONPATH'] = _site_packages

In [None]:
import time
import coffea
print("Coffea version: ", coffea.__version__)

from coffea import hist, util
from coffea.analysis_objects import JaggedCandidateArray
import coffea.processor as processor
from coffea.lookup_tools import extractor, dense_lookup
from coffea.lumi_tools import LumiMask

import awkward
import uproot
import numpy as np
import numba

In [None]:
import glob

# Glob patterns for the local /eos directories that hold the NanoAOD ROOT files of each
# sample. The sample name is the dictionary key; names containing 'data' are treated as
# collision data by the file-discovery loop and by the processor, everything else as
# simulation. Commented-out keys are placeholders for samples that are not wired in yet.
paths = {
    'data_B': '/eos/cms/store/data/Run2016B_ver1/SingleMuon/NANOAOD/Nano25Oct2019_ver1-v1/*/',
    'data_C': '/eos/cms/store/data/Run2016C/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
    'data_D': '/eos/cms/store/data/Run2016D/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
    'data_E': '/eos/cms/store/data/Run2016E/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
    'data_F': '/eos/cms/store/data/Run2016F/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
    'data_G': '/eos/cms/store/data/Run2016G/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
    'data_H': '/eos/cms/store/data/Run2016H/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
#     'ggh':,
#     'vbf':,
#     '':,
#     '':,
    'dy': '/eos/cms/store/mc/RunIISummer16NanoAODv6/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/*/*/',
#     'dy_m105_160_amc': '/eos/cms/store/mc/RunIISummer16NanoAODv6/DYJetsToLL_M-105To160_TuneCP5_PSweights_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/*/*/'
    'ttjets_dl': '/eos/cms/store/mc/RunIISummer16NanoAODv6/TTJets_DiLept_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/NANOAODSIM/*/*/',
#     'st_t_top':,
#     'st_t_antitop':,
    'st_tw_top': '/eos/cms/store/mc/RunIISummer16NanoAODv6/ST_tW_top_5f_NoFullyHadronicDecays_13TeV-powheg_TuneCUETP8M1/NANOAODSIM/*/*/',
    'st_tw_antitop': '/eos/cms/store/mc/RunIISummer16NanoAODv6/ST_tW_antitop_5f_NoFullyHadronicDecays_13TeV-powheg_TuneCUETP8M1/NANOAODSIM/*/*/',

    'ww_2l2nu': '/eos/cms/store/mc/RunIISummer16NanoAODv6/WWTo2L2Nu_13TeV-powheg/NANOAODSIM/*/*/',
    'wz_3lnu': '/eos/cms/store/mc/RunIISummer16NanoAODv6/WZTo3LNu_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/*/*/',

    'www': '/eos/cms/store/mc/RunIISummer16NanoAODv6/WWW_4F_DiLeptonFilter_TuneCUETP8M1_13TeV-amcatnlo-pythia8/NANOAODSIM/*/*/',
    'wwz': '/eos/cms/store/mc/RunIISummer16NanoAODv6/WWZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8/NANOAODSIM/*/*/',
    'wzz': '/eos/cms/store/mc/RunIISummer16NanoAODv6/WZZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8/NANOAODSIM/*/*/',
    'zzz': '/eos/cms/store/mc/RunIISummer16NanoAODv6/ZZZ_TuneCUETP8M1_13TeV-amcatnlo-pythia8/NANOAODSIM/*/*/',
}

# Samples that go into the reduced `fileset_debug` (used by the iterative executor).
for_debug = ['data_G', 'dy', 'ttjets_dl']

# TODO: add all processes
# TODO: add the possibility to load files through XRootD

samples = paths.keys()

# When True, only the first file found for each sample is used.
debug = True

fileset = {}
fileset_debug = {}

data_entries = 0

# Prefix that turns a local /eos path into an XRootD URL.
xrootd_prefix = 'root://eoscms.cern.ch/'

# Reference values for the luminosity estimate: the integrated luminosity (in /pb) that
# corresponds to `total_data_entries_2016` data entries.
full_lumi_2016 = 35860.
total_data_entries_2016 = 645880988.

# Find ROOT files in local directories
for sample, path in paths.items():
    all_files = glob.glob(path+'*root')

    if not all_files:
        # Make a missing/unmounted directory visible instead of failing later on.
        print(f"WARNING: no ROOT files found for sample '{sample}' in {path}")

    if debug:
        # Slicing (instead of indexing [0]) keeps an empty match from raising IndexError.
        all_files = all_files[:1]

    file_urls = [xrootd_prefix + f for f in all_files]

    if 'data' in sample:
        # Count the data entries that will be processed; used for the luminosity estimate.
        for url in file_urls:
            data_entries += uproot.open(url)['Events'].numentries

    fileset[sample] = {
        'files': file_urls,
        'treename': 'Events'
    }

    if sample in for_debug:
        # Separate list object so the two filesets never share mutable state.
        fileset_debug[sample] = {
            'files': list(file_urls),
            'treename': 'Events'
        }

# Scale the full luminosity by the fraction of data entries actually loaded.
lumi = full_lumi_2016*data_entries/total_data_entries_2016
print(f"Loading {data_entries/total_data_entries_2016*100}% of 2016 data.")
print(f"Integrated luminosity {lumi}/pb")


In [None]:
# Lumi-mask JSON files keyed by year; the processor passes the "2016" entry to LumiMask.
lumimasks = {
    "2016": "data/lumimasks/Cert_271036-284044_13TeV_23Sep2016ReReco_Collisions16_JSON.txt",
    "2017": "data/lumimasks/Cert_294927-306462_13TeV_EOY2017ReReco_Collisions17_JSON_v1.txt",
    "2018": "data/lumimasks/Cert_314472-325175_13TeV_17SeptEarlyReReco2018ABC_PromptEraD_Collisions18_JSON.txt",
}

import python.RoccoR as roccor
# Input text files for roccor.RoccoR keyed by year; the processor uses the "2016" entry.
roccor_files = {
    "2016": "data/roch_corr/RoccoR2016.txt",
    "2017": "data/roch_corr/RoccoR2017.txt",
    "2018": "data/roch_corr/RoccoR2018.txt",
}
# To generate RoccoR libraries:
# cd plugin
# make
# cd ../

import warnings
# NOTE(review): this silences every warning category for the rest of the kernel session,
# including ones that could point at real problems -- confirm this is intended.
warnings.filterwarnings('ignore')

@numba.jit(parallel=True)
def apply_roccor(roch_corrector, muons, nmuons=2):
    """Compute per-muon correction factors with ``roch_corrector.kScaleDT``.

    Parameters
    ----------
    roch_corrector
        Object exposing ``kScaleDT(charge, pt, eta, phi)``.
    muons
        Per-event muon collection. It must provide ``shape[0]`` (number of events),
        ``pt.ones_like()``, and ``muons[iev][imu]`` items with ``charge``, ``pt``,
        ``eta`` and ``phi`` attributes.
    nmuons
        Number of muons per event to correct (default 2). Every event is indexed up to
        ``nmuons - 1``, so each event must contain at least that many muons.

    Returns
    -------
    The array created by ``muons.pt.ones_like()``, with the first ``nmuons`` entries of
    every event replaced by the corresponding ``kScaleDT`` value.
    """
    # NOTE(review): the arguments are Python objects, so numba presumably cannot compile
    # this in nopython mode -- confirm the decorator actually brings a speed-up.
    # NOTE(review): kScaleDT looks like the scale correction meant for data, while the
    # processor calls this function for every dataset -- confirm the treatment of MC.
    corrections = muons.pt.ones_like()
    for iev in range(muons.shape[0]):
        for imu in range(nmuons):
            mu = muons[iev][imu]
            corrections[iev][imu] = roch_corrector.kScaleDT(mu.charge, mu.pt, mu.eta, mu.phi)
    return corrections


# Lookup loaded from a coffea file; the processor calls it as puLookup(dataset, nTrueInt)
# to obtain the per-event pileup weight for simulation.
puLookup = util.load('data/pileup/puLookup.coffea')
# One entry per group of runs (B-F and G-H). 'id', 'iso' and 'trig' are
# (ROOT file, histogram name) pairs; 'scale' is the weight given to that entry when the
# processor sums the scale factors of all entries. The two weights add up to 1
# (19.656062760 + 16.226452636 = 35.882515396) -- presumably luminosity fractions.
muSFFileList = [{'id'   : ("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunBCDEF_SF_ID.root", "NUM_TightID_DEN_genTracks_eta_pt"),
                 'iso'   : ("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunBCDEF_SF_ISO.root", "NUM_TightRelIso_DEN_TightIDandIPCut_eta_pt"),
                 'trig'  : ("data/muon_sf/EfficienciesStudies_2016_trigger_EfficienciesAndSF_RunBtoF.root", "IsoMu24_OR_IsoTkMu24_PtEtaBins/abseta_pt_ratio"),
                 'scale' : 19.656062760/35.882515396},
                {'id'     : ("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunGH_SF_ID.root", "NUM_TightID_DEN_genTracks_eta_pt"),
                 'iso'   : ("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunGH_SF_ISO.root", "NUM_TightRelIso_DEN_TightIDandIPCut_eta_pt"),
                 'trig'  : ("data/muon_sf/EfficienciesStudies_2016_trigger_EfficienciesAndSF_RunGtoH.root", "IsoMu24_OR_IsoTkMu24_PtEtaBins/abseta_pt_ratio"),
                 'scale' : 16.226452636/35.882515396}]
# TODO: check scale
# TODO: generate SF for other years

In [None]:
# Look at ProcessorABC documentation to see the expected methods and what they are supposed to do
# https://coffeateam.github.io/coffea/api/coffea.processor.ProcessorABC.html
class DimuonProcessor(processor.ProcessorABC):
    """Coffea processor that selects opposite-charge dimuon events and fills histograms.

    The class relies on names defined at notebook level: ``muSFFileList`` (muon scale
    factor inputs), ``roccor`` / ``roccor_files`` (muon momentum corrections),
    ``lumimasks`` (data certification), ``puLookup`` (pileup weights) and
    ``apply_roccor``.

    Parameters
    ----------
    mass_window : sequence of two numbers
        Lower and upper bound of the accepted dimuon mass; also used as the range of
        the ``dimuon_mass`` histogram.
    do_roccor : bool
        When True the muon pT is multiplied by the factors from ``apply_roccor``.
    """

    def __init__(self, mass_window=[76,106], do_roccor=True):
        # Copy the window so that neither the shared default list nor a caller-owned
        # list is aliased by the instance.
        self.mass_window = list(mass_window)
        self.do_roccor = do_roccor

        # Branches read by process(). Executors that pre-select columns (see the Spark
        # cell) need every branch used below to be listed here.
        event_branches = ['run', 'luminosityBlock', 'genWeight']
        muon_branches = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge',
                         'Muon_pfRelIso04_all', 'Muon_mediumId']
        electron_branches = ['nElectron', 'Electron_pt', 'Electron_eta', 'Electron_mvaFall17V2Iso_WP90']
        jet_branches = ['nJet', 'Jet_pt', 'Jet_eta', 'Jet_phi', 'Jet_mass', 'Jet_qgl', 'Jet_jetId', 'Jet_puId']
        vtx_branches = ['Pileup_nTrueInt', 'PV_npvsGood']
        other_branches = ['MET_pt']
        event_flags = ['Flag_BadPFMuonFilter','Flag_EcalDeadCellTriggerPrimitiveFilter',
                        'Flag_HBHENoiseFilter','Flag_HBHENoiseIsoFilter',
                        'Flag_globalSuperTightHalo2016Filter','Flag_goodVertices','Flag_BadChargedCandidateFilter']
        hlt_branches = ['HLT_IsoMu24', 'HLT_IsoTkMu24']
        self._columns = event_branches + muon_branches + electron_branches + jet_branches +\
                        vtx_branches + other_branches + event_flags + hlt_branches

        dataset_axis = hist.Cat("dataset", "")

        # One binned axis per plotted variable. Only momentum/mass/energy axes carry a
        # GeV unit; eta, phi and QGL are dimensionless.
        axes = {}
        axes['dimuon_mass'] =  hist.Bin("dimuon_mass", r"$m_{\mu\mu}$ [GeV]", 100, self.mass_window[0], self.mass_window[1])
        axes['dimuon_pt'] = hist.Bin("dimuon_pt", r"$p_{T}(\mu\mu)$ [GeV]", 100, 0, 400)
        axes['dimuon_eta'] = hist.Bin("dimuon_eta", r"$\eta (\mu\mu)$", 100, -5, 5)
        axes['dimuon_phi'] = hist.Bin("dimuon_phi", r"$\phi (\mu\mu)$", 100, -3.2, 3.2)
        axes['dimuon_dEta'] = hist.Bin("dimuon_dEta", r"$\Delta\eta (\mu\mu)$", 100, 0, 5)
        axes['dimuon_dPhi'] = hist.Bin("dimuon_dPhi", r"$\Delta\phi (\mu\mu)$", 100, 0, 6.4)

        axes['mu1_pt'] = hist.Bin("mu1_pt", r"$p_{T}(\mu_{1})$ [GeV]", 100, 0, 250)
        axes['mu1_pt_nosf'] = hist.Bin("mu1_pt_nosf", r"$p_{T}(\mu_{1}) no SF$ [GeV]", 100, 0, 250)
        axes['mu1_eta'] = hist.Bin("mu1_eta", r"$\eta (\mu_{1})$", 100, -2.4, 2.4)
        axes['mu1_phi'] = hist.Bin("mu1_phi", r"$\phi (\mu_{1})$", 100, -3.2, 3.2)
        axes['mu2_pt'] = hist.Bin("mu2_pt", r"$p_{T}(\mu_{2})$ [GeV]", 100, 0, 150)
        axes['mu2_eta'] = hist.Bin("mu2_eta", r"$\eta (\mu_{2})$", 100, -2.4, 2.4)
        axes['mu2_phi'] = hist.Bin("mu2_phi", r"$\phi (\mu_{2})$", 100, -3.2, 3.2)

        axes['jet1_pt'] = hist.Bin("jet1_pt", r"$p_{T}(jet1)$ [GeV]", 100, 0, 400)
        axes['jet1_eta'] = hist.Bin("jet1_eta", r"$\eta (jet1)$", 100, -4.7, 4.7)
        axes['jet1_phi'] = hist.Bin("jet1_phi", r"$\phi (jet1)$", 100, -3.2, 3.2)
        axes['jet1_qgl'] = hist.Bin("jet1_qgl", r"$QGL (jet1)$", 10, 0, 1)

        axes['jet2_pt'] = hist.Bin("jet2_pt", r"$p_{T}(jet2)$ [GeV]", 100, 0, 300)
        axes['jet2_eta'] = hist.Bin("jet2_eta", r"$\eta (jet2)$", 100, -4.7, 4.7)
        axes['jet2_phi'] = hist.Bin("jet2_phi", r"$\phi (jet2)$", 100, -3.2, 3.2)
        axes['jet2_qgl'] = hist.Bin("jet2_qgl", r"$QGL (jet2)$", 10, 0, 1)

        axes['njets'] = hist.Bin("njets", "njets", 10, 0, 10)
        axes['npv'] = hist.Bin("npv", "npv", 50, 0, 50)
        axes['npv_prePU'] = hist.Bin("npv_prePU", "npv_prePU", 50, 0, 50)
        axes['met'] = hist.Bin("met", r"$E_{T}^{miss.}$ [GeV]", 100, 0, 200)

        axes['genweight'] = hist.Bin("genweight", "genweight", 50, 0, 50)

        accumulators = {}
        for v in axes.keys():
            accumulators[v] = hist.Hist("Counts", dataset_axis, axes[v])
            # TODO: add category axis and systematics axis

        accumulators['sumGenWeights'] = hist.Hist("sumGenWeights", dataset_axis)
        accumulators['entries'] = hist.Hist("Entries", dataset_axis)
        accumulators['cutflow'] = processor.defaultdict_accumulator(int)

        # np.array([]) is a genuinely empty 1-D column; np.ndarray([]) would create an
        # uninitialised zero-dimensional array instead.
        accumulators['dimuon_mass_unbinned'] = processor.column_accumulator(np.array([]))
        # request: possibility to add another axis to column_accumulator (e.g. 'dataset')

        self._accumulator = processor.dict_accumulator(accumulators)

        # Muon scale factors: weighted sum over the entries of muSFFileList, using each
        # entry's 'scale' as weight. The bin edges are taken from the last entry read.
        mu_id_vals = 0
        mu_id_err = 0
        mu_iso_vals = 0
        mu_iso_err = 0
        mu_trig_vals = 0
        mu_trig_err = 0

        for scaleFactors in muSFFileList:
            id_file = uproot.open(scaleFactors['id'][0])
            iso_file = uproot.open(scaleFactors['iso'][0])
            trig_file = uproot.open(scaleFactors['trig'][0])

            mu_id_vals += id_file[scaleFactors['id'][1]].values * scaleFactors['scale']
            mu_id_err += id_file[scaleFactors['id'][1]].variances**0.5 * scaleFactors['scale']
            mu_id_edges = id_file[scaleFactors['id'][1]].edges

            mu_iso_vals += iso_file[scaleFactors['iso'][1]].values * scaleFactors['scale']
            mu_iso_err += iso_file[scaleFactors['iso'][1]].variances**0.5 * scaleFactors['scale']
            mu_iso_edges = iso_file[scaleFactors['iso'][1]].edges

            mu_trig_vals += trig_file[scaleFactors['trig'][1]].values * scaleFactors['scale']
            mu_trig_err += trig_file[scaleFactors['trig'][1]].variances**0.5 * scaleFactors['scale']
            mu_trig_edges = trig_file[scaleFactors['trig'][1]].edges

        self.mu_id_sf = dense_lookup.dense_lookup(mu_id_vals, mu_id_edges)
        self.mu_id_err = dense_lookup.dense_lookup(mu_id_err, mu_id_edges)
        self.mu_iso_sf = dense_lookup.dense_lookup(mu_iso_vals, mu_iso_edges)
        self.mu_iso_err = dense_lookup.dense_lookup(mu_iso_err, mu_iso_edges)
        self.mu_trig_sf = dense_lookup.dense_lookup(mu_trig_vals, mu_trig_edges)
        self.mu_trig_err = dense_lookup.dense_lookup(mu_trig_err, mu_trig_edges)

#         self.extractor = extractor()
#         self.extractor.add_weight_sets([""])
#         self.extractor.finalize()
#         self.evaluator = self.extractor.make_evaluator()
#         print(self.evaluator)

        self.roch_corrector = roccor.RoccoR(roccor_files["2016"].encode('utf-8'))

    @property
    def accumulator(self):
        """Accumulator template (histograms, cutflow counter and unbinned mass column)."""
        return self._accumulator

    @property
    def columns(self):
        """Names of the input branches this processor reads."""
        return self._columns

    def process(self, df):
        """Select dimuon events in one chunk and fill a fresh copy of the accumulator.

        Parameters
        ----------
        df
            Chunk of events; ``df.metadata['dataset']`` names the sample. Samples whose
            name contains 'data' are treated as collision data (lumi mask, unit
            weights), all others as simulation (generator, scale-factor and pileup
            weights).

        Returns
        -------
        The filled accumulator for this chunk.
        """
        output = self.accumulator.identity()
        dataset = df.metadata['dataset']
        isData = 'data' in dataset

        nEvts = df.shape[0]

        if isData:
            lumi_info = LumiMask(lumimasks["2016"])
            lumimask = lumi_info(df.run.flatten(), df.luminosityBlock.flatten())
            genweight = np.ones(nEvts)
            event_weight = np.ones(nEvts)
            event_weight_nosf = np.ones(nEvts)

        else:
            # NOTE(review): the scale factors are multiplied over every muon in the
            # event, not only over the two selected below -- confirm this is intended.
            muID = self.mu_id_sf(df.Muon.eta, df.Muon.pt)
            muIso = self.mu_iso_sf(df.Muon.eta, df.Muon.pt)
            # The trigger scale factor must come from the trigger lookup (binned in
            # |eta|), not from the isolation one.
            muTrig = self.mu_trig_sf(abs(df.Muon.eta), df.Muon.pt)
            muSF = (muID*muIso*muTrig).prod()

#             muIDerr = self.mu_id_err(muons.eta, muons.pt)
#             muIsoerr = self.mu_iso_err(muons.eta, muons.pt)
#             muTrigerr = self.mu_trig_err(abs(muons.eta), muons.pt)
#             muSF_up = ((muID + muIDerr) * (muIso + muIsoerr) * (muTrig + muTrigerr)).prod()
#             muSF_down = ((muID - muIDerr) * (muIso - muIsoerr) * (muTrig - muTrigerr)).prod()

            lumimask = np.ones(nEvts, dtype=bool)
            genweight = df.genWeight.array
            pu_weight = puLookup(dataset, df.Pileup.nTrueInt)
            event_weight = genweight*muSF*pu_weight
            event_weight_nosf = genweight*pu_weight

        # Summed before any selection: used later to normalise simulation.
        output['sumGenWeights'].fill(dataset=dataset, weight=genweight.sum())

        # TODO: move all parameters to external config
        # TODO: Add FSR recovery
        # TODO: trigger matching for muons

        muons = df.Muon
        muons = muons[(muons.pt > 20) & (abs(muons.eta) < 2.4) & (muons.pfRelIso04_all<0.25) & muons.mediumId]

        event_flags = df.Flag.BadPFMuonFilter & \
                        df.Flag.EcalDeadCellTriggerPrimitiveFilter &\
                        df.Flag.HBHENoiseFilter &\
                        df.Flag.HBHENoiseIsoFilter &\
                        df.Flag.globalSuperTightHalo2016Filter &\
                        df.Flag.goodVertices &\
                        df.Flag.BadChargedCandidateFilter

        hlt = df.HLT.IsoMu24 | df.HLT.IsoTkMu24
        electron_veto = (df.Electron[(df.Electron.pt>20) & (abs(df.Electron.eta)<2.5) & (df.Electron.mvaFall17V2Iso_WP90==1)].counts == 0)
        # Exactly two selected muons of opposite charge, no selected electron, at least
        # one good primary vertex, plus lumi mask, event flags and trigger.
        event_filter = (lumimask & event_flags & hlt & (muons.counts == 2) & (muons['charge'].prod() == -1) &\
                        electron_veto & (df.PV.npvsGood > 0)).flatten()

        df = df[event_filter]
        muons = muons[event_filter] # 'muons' only stores two muons per event
        event_weight = event_weight[event_filter]
        event_weight_nosf = event_weight_nosf[event_filter]

        if self.do_roccor:
            roccor_factors = apply_roccor(self.roch_corrector, muons)
        else:
            roccor_factors = muons.pt.ones_like()

        # for fast calculation of dimuon kinematics
        muons_jca = JaggedCandidateArray.candidatesfromcounts(
            muons.counts,
            pt=muons.pt.content*roccor_factors.content,
            eta=muons.eta.content,
            phi=muons.phi.content,
            mass=muons.mass.content,
        )

        dimuons = muons_jca.distincts()
        # Order the two muons of each pair by (corrected) pT.
        leading_mu = (dimuons.i0.pt.content > dimuons.i1.pt.content)

        mu1_pt = np.where(leading_mu, dimuons.i0.pt.content, dimuons.i1.pt.content)
        mu1_eta= np.where(leading_mu, dimuons.i0.eta.content, dimuons.i1.eta.content)
        mu1_phi= np.where(leading_mu, dimuons.i0.phi.content, dimuons.i1.phi.content)

        mu2_pt = np.where(~leading_mu, dimuons.i0.pt.content, dimuons.i1.pt.content)
        mu2_eta= np.where(~leading_mu, dimuons.i0.eta.content, dimuons.i1.eta.content)
        mu2_phi= np.where(~leading_mu, dimuons.i0.phi.content, dimuons.i1.phi.content)

        dimuon_filter = ((mu1_pt > 26) & (dimuons.mass > self.mass_window[0]) & (dimuons.mass < self.mass_window[1])).flatten()

        df = df[dimuon_filter]

        # this is messy
        mu1_pt = mu1_pt[dimuon_filter]
        mu1_eta = mu1_eta[dimuon_filter]
        mu1_phi = mu1_phi[dimuon_filter]
        mu2_pt = mu2_pt[dimuon_filter]
        mu2_eta = mu2_eta[dimuon_filter]
        mu2_phi = mu2_phi[dimuon_filter]
        dimuons = dimuons[dimuon_filter]

        event_weight = event_weight[dimuon_filter]
        event_weight_nosf = event_weight_nosf[dimuon_filter]

        # TODO: dimuon_costhetaCS, dimuon_phiCS

        # Select and fill jets and jet pairs
        # TODO: nsoftjets, dijet mass, eta, phi, deta, dphi

#         print(df.Jet.columns)
#         print(df.Jet.btagDeepB)

        jets = df.Jet

        # 2016: loose jetId, loose puId
        jet_id = (jets.jetId >= 1)
        # The pileup-ID requirement only applies below 50 GeV; '>=' keeps jets with pT
        # of exactly 50 GeV from being dropped by both branches of the condition.
        jet_puid = (((jets.puId >= 4) & (jets.pt < 50)) | (jets.pt >= 50))

        jet_selection = ((jets.pt > 25) & (abs(jets.eta) < 4.7) & jet_id & jet_puid & (jets.qgl > -2))

        jets = jets[jet_selection]
        # TODO: DeltaR(j, mu) > 0.4
        # TODO: JEC, JER

        one_jet = (jet_selection.any() & (jets.counts>0))
        two_jets = (jet_selection.any() & (jets.counts>1))

        event_weight_jet1 = event_weight[one_jet]
        event_weight_jet2 = event_weight[two_jets]

        # temporary - pT sorting will not stay preserved after jet corrections
        jet1 = jets[one_jet,0]
        jet2 = jets[two_jets,1]

        # this would be more accurate:
#         jet1 = jets[jets.pt.argmax()]
        # but I couldn't get jet2 the same way (argsort didn't work with jagged arrays)

        # TODO: kinematic variables of multimuon-multijet system
        # >> many of them...

        # TODO: event-by-event mass resolution and calibration

        # Other variables
        npv = df.PV.npvsGood
        met = df.MET.pt

        # TODO: Add initial categorization (ggH vs. VBF)

        # Fill event weights
        # TODO: NNLOPS reweighting (ggH), Zpt reweighting, LHE wgts

        # TODO: Add systematic uncertainties

        # TODO: Evaluate DNN

        output['cutflow']['all events'] += nEvts
        output['cutflow']['event_filter'] += event_filter.sum()
        output['cutflow']['dimuon_filter'] += dimuon_filter.sum()

        ### Fill muons ###
        output['dimuon_mass'].fill(dataset=dataset, dimuon_mass=dimuons.mass.flatten(), weight=event_weight)
        if isData:
            output['dimuon_mass_unbinned'] += processor.column_accumulator(dimuons.mass.flatten())
        output['dimuon_pt'].fill(dataset=dataset, dimuon_pt=dimuons.pt.flatten(), weight=event_weight)
        output['dimuon_eta'].fill(dataset=dataset, dimuon_eta=dimuons.eta.flatten(), weight=event_weight)
        output['dimuon_phi'].fill(dataset=dataset, dimuon_phi=dimuons.phi.flatten(), weight=event_weight)

        output['dimuon_dEta'].fill(dataset=dataset, dimuon_dEta=abs(mu1_eta.flatten() - mu2_eta.flatten()), weight=event_weight)

#         output['dimuon_dPhi'].fill(dataset=dataset, dimuon_dPhi=abs(mu1.delta_phi(mu2)).flatten(), weight=event_weight)
#         TODO: make sure delta_phi is accessible from uproot_methods/classes/TLorentzVector.py

        output['mu1_pt'].fill(dataset=dataset, mu1_pt=mu1_pt.flatten(), weight=event_weight)
        output['mu1_pt_nosf'].fill(dataset=dataset, mu1_pt_nosf=mu1_pt.flatten(), weight=event_weight_nosf)
        output['mu1_eta'].fill(dataset=dataset, mu1_eta=mu1_eta.flatten(), weight=event_weight)
        output['mu1_phi'].fill(dataset=dataset, mu1_phi=mu1_phi.flatten(), weight=event_weight)

        output['mu2_pt'].fill(dataset=dataset, mu2_pt=mu2_pt.flatten(), weight=event_weight)
        output['mu2_eta'].fill(dataset=dataset, mu2_eta=mu2_eta.flatten(), weight=event_weight)
        output['mu2_phi'].fill(dataset=dataset, mu2_phi=mu2_phi.flatten(), weight=event_weight)

        ### Fill jets ###
        output['jet1_pt'].fill(dataset=dataset, jet1_pt=jet1.pt.flatten(), weight=event_weight_jet1)
        output['jet1_eta'].fill(dataset=dataset, jet1_eta=jet1.eta.flatten(), weight=event_weight_jet1)
        output['jet1_phi'].fill(dataset=dataset, jet1_phi=jet1.phi.flatten(), weight=event_weight_jet1)
        output['jet1_qgl'].fill(dataset=dataset, jet1_qgl=jet1.qgl.flatten(), weight=event_weight_jet1)

        output['jet2_pt'].fill(dataset=dataset, jet2_pt=jet2.pt.flatten(), weight=event_weight_jet2)
        output['jet2_eta'].fill(dataset=dataset, jet2_eta=jet2.eta.flatten(), weight=event_weight_jet2)
        output['jet2_phi'].fill(dataset=dataset, jet2_phi=jet2.phi.flatten(), weight=event_weight_jet2)
        output['jet2_qgl'].fill(dataset=dataset, jet2_qgl=jet2.qgl.flatten(), weight=event_weight_jet2)

        output['njets'].fill(dataset=dataset, njets=jets.counts, weight=event_weight)

        ### Fill other variables ###
        output['npv'].fill(dataset=dataset, npv=npv, weight=event_weight)
        output['met'].fill(dataset=dataset, met=met, weight=event_weight)

        output['entries'].fill(dataset=dataset, weight=nEvts) # temporary

        return output

    def postprocess(self, accumulator):
        """Return the merged accumulator unchanged."""
        return accumulator

Option 1: Iterative executor
===


In [None]:
# DEBUG: iterative processing 

from coffea.processor.executor import iterative_executor

# The timed section covers both the construction of the processor and the job itself.
tstart = time.time()
output = processor.run_uproot_job(
    fileset_debug,
    'Events',
    DimuonProcessor(),
    iterative_executor,
    executor_args={'nano': True},
)
elapsed = time.time() - tstart

# Report the throughput based on the event counter filled by the processor.
for summary_line in (
    f"Processed {output['cutflow']['all events']} events",
    f"Total time: {elapsed} s",
    f"Rate: {output['cutflow']['all events']/elapsed} events/s",
):
    print(summary_line)

Option 2: Dask executor
===


In [None]:
from coffea.processor.executor import dask_executor

# Import `distributed` directly. pytest.importorskip is a test-suite helper: outside a
# pytest run a missing package surfaces as pytest's "Skipped" outcome instead of a plain
# ImportError, and the notebook would depend on pytest for no reason.
# The original call asked for distributed >= 1.28.1.
import distributed

# In-process (threaded) Dask cluster without a dashboard.
client = distributed.Client(processes=False, dashboard_address=None)

tstart = time.time() 
output = processor.run_uproot_job(fileset, 'Events', DimuonProcessor(), dask_executor, executor_args={'nano': True, 'client': client})
elapsed = time.time() - tstart

print(f"Processed {output['cutflow']['all events']} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {output['cutflow']['all events']/elapsed} events/s")

Option 3: Apache Spark
===


NOW IT IS TIME TO START SPARK CLUSTER CONNECTION
---

When using SWAN, click on the 5-point star icon in the Jupyter notebook toolbar.

In [None]:
import pyspark.sql
from pyarrow.compat import guid
from coffea.processor.spark.detail import _spark_initialize, _spark_stop
from coffea.processor.spark.spark_executor import spark_executor

# NOT needed on SWAN, spark config is offloaded to spark connector.
# `spark` is not defined anywhere in this notebook; presumably the connector injects it.
# To run without the connector, enable the snippet below (note that the builder object
# `spark_config` is what has to be passed to _spark_initialize):
#
# spark_config = pyspark.sql.SparkSession.builder \
#     .appName('spark-executor-test-%s' % guid()) \
#     .master('local[*]') \
#     .config('spark.driver.memory', '4g') \
#     .config('spark.executor.memory', '4g') \
#     .config('spark.sql.execution.arrow.enabled','true') \
#     .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)
#
# spark = _spark_initialize(config=spark_config, log_level='WARN',
#                           spark_progress=False, laurelin_version='0.5.1')

partitionsize = 200000
thread_workers = 2

tstart = time.time() 
# if jobs fail, it might be because some columns are missing from processor._columns
output = processor.run_spark_job(fileset, DimuonProcessor(), spark_executor, 
                                 spark=spark, partitionsize=partitionsize, thread_workers=thread_workers,
                                 executor_args={'file_type': 'edu.vanderbilt.accre.laurelin.Root', 'cache': False}
                                )

elapsed = time.time() - tstart

print(f"Processed {output['cutflow']['all events']} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {output['cutflow']['all events']/elapsed} events/s")

Plotting
===

In [None]:
def plot_variable(fig, var, gs, weights):
    """Draw the data vs. stacked-simulation plot of one variable with a ratio panel.

    Reads the notebook-level ``output`` accumulator and the ``hep`` (mplhep) module, so
    both must exist when the function is called.

    Parameters
    ----------
    fig : matplotlib figure to which the two panels are added.
    var : key of the histogram in ``output`` to plot.
    gs : two-element grid spec; ``gs[0]`` hosts the main panel, ``gs[1]`` the ratio panel.
    weights : mapping from dataset name to scale factor, applied along the 'dataset' axis.
    """
    output_copy = output[var].copy() # copy to prevent scaling multiple times
    output_copy.scale(weights, axis='dataset')

    # Group the samples for plotting
    data_sources = {
        'data': ['data_B','data_C','data_D','data_E','data_F','data_G','data_H']
    }
    data = output_copy.group('dataset', hist.Cat("dataset", "Dataset"), data_sources)

    bkg_sources = {
        'dy': ['dy'],
#         'dy': ['dy_m105_160_amc'],
        'ttbar':['ttjets_dl'],
        'single top': ['st_tw_top', 'st_tw_antitop'],
        'vv': ['ww_2l2nu', 'wz_3lnu'],
        'vvv': ['www','wwz','wzz','zzz']
    }

    bkg = output_copy.group('dataset', hist.Cat("dataset", "Dataset"), bkg_sources)
    bkg.axis('dataset').sorting = 'integral' # sort backgrounds by event yields

    data_opts = {'color': 'k', 'marker': '.', 'markersize':15}
    stack_fill_opts = {'alpha': 0.8, 'edgecolor':(0,0,0)}
    stack_error_opts = {'label':'Stat. unc.','facecolor':(0,0,0,.4), 'hatch':'', 'linewidth': 0}

    # Top panel: Data vs. MC plot
    plt1 = fig.add_subplot(gs[0])
    hist.plot1d(bkg, ax=plt1, overlay='dataset', overflow='all', stack=True, fill_opts=stack_fill_opts, error_opts=stack_error_opts)
    # Draw the data on the same axes as the stack: pass the axes explicitly and do not
    # clear them, otherwise the result depends on which axes/figure happens to be current.
    hist.plot1d(data, ax=plt1, clear=False, overlay='dataset', overflow='all', line_opts=None, error_opts=data_opts)
    plt1.set_yscale('log')
    plt1.set_ylim(0.1, 1e7)
    hep.cms.cmslabel(plt1, data=True, paper=False, year='2016')
    plt1.set_xlabel('')
    plt1.tick_params(axis='x', labelbottom=False)
    plt1.legend(prop={'size': 'xx-small'})

    # Bottom panel: Data/MC ratio plot
    plt2 = fig.add_subplot(gs[1], sharex=plt1)
    num = data.sum('dataset')
    denom = bkg.sum('dataset')
    hist.plotratio(num=num, ax=plt2,
                    denom=denom,
                    error_opts=data_opts, denom_fill_opts={}, guide_opts={},
                    unc='num')

    plt2.axhline(1, ls='--')
    plt2.set_ylim([0,2])
    plt2.set_ylabel('Data/MC')
    

In [None]:
# Prepare things to plot
from parameters import cross_sections

mc_datasets = [s for s in samples if 'data' not in s]

# Sum of generator weights per processed dataset, keyed by a one-element tuple.
sum_gen_weights = output['sumGenWeights'].values()

# Per-dataset scale: cross section * luminosity / sum of generator weights.
lumi_weights = {'data':1}
for mc in mc_datasets:
    # Not every sample is present in `output` (e.g. after running on fileset_debug), and
    # an empty sample has a zero weight sum: skip those instead of raising KeyError or
    # dividing by zero.
    N = sum_gen_weights.get((mc,), 0)
    if N == 0:
        print(f"Skipping '{mc}': no generator weights found in the output")
        continue
    lumi_weights[mc] = cross_sections[mc]*lumi / N

# print(output['dimuon_mass_unbinned'].value)    
    
vars_to_plot = []
vars_to_plot += ['dimuon_mass', 'dimuon_pt', 'dimuon_eta', 'dimuon_phi']
vars_to_plot += ['dimuon_dEta']#, 'dimuon_dPhi']
vars_to_plot += ['mu1_pt','mu2_pt', 'mu1_eta','mu2_eta', 'mu1_phi','mu2_phi']
vars_to_plot += ['jet1_pt', 'jet1_eta', 'jet1_phi', 'jet1_qgl']
vars_to_plot += ['jet2_pt', 'jet2_eta', 'jet2_phi', 'jet2_qgl']
# vars_to_plot += ['mu1_pt_nosf']
vars_to_plot += ['njets', 'npv', 'met']

Make plots iteratively and put into a grid
---

Slower, but output looks nicer.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.cms.style.ROOT)
from matplotlib import gridspec
import math
    
fig = plt.figure()
    
nplots_x = 4 # number of plots in one row
nplots_y = math.ceil(len(vars_to_plot) / nplots_x) # number of rows

plotsize=10
ratio_plot_size = 0.25
fig.set_size_inches(nplots_x*plotsize,nplots_y*plotsize*(1+ratio_plot_size))
outer_grid = gridspec.GridSpec(nplots_y, nplots_x, hspace = .3) 
for i, var in enumerate(vars_to_plot):
    gs = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec = outer_grid[i], height_ratios=[(1-ratio_plot_size),ratio_plot_size], hspace = .05)
    plot_variable(fig, var, gs, lumi_weights)

Make plots in parallel
---

Faster, but the plots cannot be displayed in a grid. This approach may later be used to save the plots as png/pdf files.

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib import gridspec
import mplhep as hep
plt.style.use(hep.cms.style.ROOT)
import multiprocessing as mp

plt.rcParams['figure.dpi'] = 50
    
figs = {}
def plot_var(i):
    fig = plt.figure()
    plotsize=8
    ratio_plot_size = 0.24
    fig.set_size_inches(plotsize,plotsize*(1+ratio_plot_size))
    gs = gridspec.GridSpec(2, 1, height_ratios=[(1-ratio_plot_size),ratio_plot_size], hspace = .05)
    plot_variable(fig, vars_to_plot[i], gs, lumi_weights)
    return fig

pool = mp.Pool(mp.cpu_count() - 1)
results = [pool.apply(plot_var, args=(x,)) for x in range(len(vars_to_plot))]    
