<!-- <img  src="https://swan.web.cern.ch/sites/swan.web.cern.ch/files/pictures/logo_swan_letters.png" alt="SWAN" style="float: left; width: 15%; margin-right: 5%; margin-left: 17%; margin-top: 1.0em; margin-bottom: 2.0em;">
<img src="https://spark.apache.org/images/spark-logo-trademark.png" alt="EP-SFT" style="float: left; width: 25%; margin-right: 0%; margin-left: 0%; margin-bottom: 2.0em;">
<img src="https://cms-docdb.cern.ch/cgi-bin/PublicDocDB/RetrieveFile?docid=3045&filename=CMSlogo_color_label_1024_May2014.png&version=3" alt="CMS" style="float: left; width: 12%; margin-left: 5%; margin-right: 5%; margin-bottom: 2.0em;"> -->
<p style="clear: both;">
<div style="text-align:center"><h1>CMS H&#8594;µµ analysis with Apache Spark  
     <br> using Coffea and Laurelin packages from Fermilab</h1></div>
<div style="text-align:center"><i>Author: Dmitry Kondratyev, based on example code by Lindsey Gray</i></div>
<hr style="border-top-width: 4px; border-top-color: #34609b;">

# Search for Higgs boson decaying into two muons

This code uses the awkward-array toolset and Coffea [histograms](https://coffeateam.github.io/coffea/modules/coffea.hist.html).
It also shows the analysis object syntax implemented by Coffea [JaggedCandidateArray](https://coffeateam.github.io/coffea/api/coffea.analysis_objects.JaggedCandidateMethods.html), and the usage of custom [accumulators](https://coffeateam.github.io/coffea/api/coffea.processor.AccumulatorABC.html) other than histograms.  Further, it introduces the [processor](https://coffeateam.github.io/coffea/api/coffea.processor.ProcessorABC.html) concept and the interface to Apache Spark.

#### SWAN env: LCG96 Python3 stack and Cloud Containers cluster

In [None]:
# Run this cell if you do not have coffea installed (e.g. on SWAN with the LCG 96 Python3 stack).
# Installs (or upgrades) coffea into the user site-packages directory.
!pip install --user --upgrade coffea

# spark.jars.packages doesn't work with Spark 2.4 on Kubernetes, so the Laurelin jar and the
# log4j, lz4-java and xz jars are downloaded into the working directory instead.
# `wget -N` only re-downloads a file when the remote copy is newer than the local one.
!wget -N https://repo1.maven.org/maven2/edu/vanderbilt/accre/laurelin/0.5.1/laurelin-0.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-api/2.11.2/log4j-api-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-core/2.11.2/log4j-core-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/lz4/lz4-java/1.5.1/lz4-java-1.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/tukaani/xz/1.2/xz-1.2.jar

In [None]:
# Run this cell before establishing the Spark connection.
# It appends an extra site-packages directory to the PYTHONPATH environment variable.

import os

_SITE_PACKAGES = '/usr/local/lib/python3.6/site-packages'

# PYTHONPATH may be unset in a fresh environment: fall back to an empty value
# instead of raising KeyError.
_current_pythonpath = os.environ.get('PYTHONPATH', '')

# Only append once, so that re-running this cell does not keep growing the variable.
if _SITE_PACKAGES not in _current_pythonpath.split(':'):
    os.environ['PYTHONPATH'] = (
        (_current_pythonpath + ':' + _SITE_PACKAGES) if _current_pythonpath else _SITE_PACKAGES
    )

In [None]:
import time
import coffea
print("Coffea version: ", coffea.__version__)

from coffea import hist, util
from coffea.analysis_objects import JaggedCandidateArray
import coffea.processor as processor
from coffea.lookup_tools import extractor, dense_lookup

import uproot
import numpy as np

In [None]:
import glob

# Glob patterns (on the local EOS mount) of the directories holding each sample's ROOT files.
paths = {
'data': '/eos/cms/store/data/Run2016*/SingleMuon/NANOAOD/Nano25Oct2019-v1/*/',
'dy': '/eos/cms/store/mc/RunIISummer16NanoAODv6/DYJetsToLL_M-50_TuneCUETP8M1_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/PUMoriond17_Nano25Oct2019_102X_mcRun2_asymptotic_v7_ext2-v1/*/',
'ttjets_dl': '/eos/cms/store/mc/RunIISummer16NanoAODv6/TTJets_DiLept_TuneCUETP8M1_13TeV-madgraphMLM-pythia8/NANOAODSIM/PUMoriond17_Nano25Oct2019_102X_mcRun2_asymptotic_v7-v1/*/'
}

# TODO: add the possibility to load files through XRootD
# TODO: add Golden JSON lumi-section filter

samples = paths.keys()

# When True, only the first file of every sample is used (quick test run).
debug = True

# Maps sample name -> {'files': [XRootD URLs], 'treename': 'Events'}.
fileset = {}

# Integrated luminosity in /pb; recomputed below from the amount of data actually loaded.
lumi = 1

# Prefix that turns a local EOS path into an XRootD URL.
xrootd_prefix = 'root://eoscms.cern.ch/'

# Find ROOT files in local directories
for sample, path in paths.items():
    # Sort so that the file picked in debug mode is the same on every run.
    all_files = sorted(glob.glob(path+'*root'))

    if not all_files:
        print(f"WARNING: no ROOT files found for sample '{sample}' in {path}")

    if debug:
        # Slicing (rather than indexing [0]) keeps an empty list empty instead of raising IndexError.
        all_files = all_files[:1]

    if 'data' in sample:
        entries = 0
        for f in all_files:
            entries += uproot.open(xrootd_prefix+f)['Events'].numentries
        # NOTE(review): 645880988 is presumably the total number of events in the full 2016
        # dataset and 35860 the corresponding integrated luminosity in /pb -- confirm.
        lumi = 35860.*entries/645880988.
        # The fraction is multiplied by 100 so that the printed number really is a percentage.
        print(f"Loading {100.*entries/645880988.}% of 2016 data.")
        print(f"Integrated luminosity {lumi}/pb")

    fileset[sample] = {
        'files': [xrootd_prefix+f for f in all_files],
        'treename': 'Events'
    }
    

    


In [None]:
# Pileup-weight lookup, loaded from a file saved with coffea.util.
puLookup = util.load('data/pileup/puLookup.coffea')

# Muon scale-factor inputs, one entry per group of run periods.
# 'id' / 'iso' / 'trig' are (ROOT file, histogram name) pairs; 'scale' is the weight
# with which the entry enters the combined scale factor.
muSFFileList = [
    dict(
        id=("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunBCDEF_SF_ID.root",
            "NUM_TightID_DEN_genTracks_eta_pt"),
        iso=("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunBCDEF_SF_ISO.root",
             "NUM_TightRelIso_DEN_TightIDandIPCut_eta_pt"),
        trig=("data/muon_sf/EfficienciesStudies_2016_trigger_EfficienciesAndSF_RunBtoF.root",
              "IsoMu24_OR_IsoTkMu24_PtEtaBins/abseta_pt_ratio"),
        scale=19.656062760/35.882515396,
    ),
    dict(
        id=("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunGH_SF_ID.root",
            "NUM_TightID_DEN_genTracks_eta_pt"),
        iso=("data/muon_sf/EfficienciesStudies_2016_legacy_rereco_rootfiles_RunGH_SF_ISO.root",
             "NUM_TightRelIso_DEN_TightIDandIPCut_eta_pt"),
        trig=("data/muon_sf/EfficienciesStudies_2016_trigger_EfficienciesAndSF_RunGtoH.root",
              "IsoMu24_OR_IsoTkMu24_PtEtaBins/abseta_pt_ratio"),
        scale=16.226452636/35.882515396,
    ),
]

In [None]:
# Look at ProcessorABC documentation to see the expected methods and what they are supposed to do
# https://coffeateam.github.io/coffea/api/coffea.processor.ProcessorABC.html
class DimuonProcessor(processor.ProcessorABC):
    """Select opposite-charge dimuon events and fill per-dataset histograms.

    The processor reads the module-level ``muSFFileList`` (muon scale-factor
    inputs) when it is constructed and the module-level ``puLookup`` (pileup
    weights) while processing non-data samples.

    Parameters
    ----------
    mass_window : sequence of two numbers, optional
        Lower and upper bound of the accepted dimuon invariant mass. The same
        two numbers define the range of the ``dimuon_mass`` histogram axis.
    """

    def __init__(self, mass_window=(76, 106)):
        # Copy into a fresh list: the default is immutable and an instance never
        # shares (or mutates) the sequence passed by the caller.
        self.mass_window = list(mass_window)
        # NOTE(review): process() also reads df.Pileup.nTrueInt for simulated samples,
        # which is not listed here -- confirm whether the Spark executor needs it.
        self._columns = ['nMuon', 'Muon_pt', 'Muon_eta', 'Muon_phi', 'Muon_mass', 'Muon_charge', 'Muon_pfRelIso04_all',
                         'nJet', 'Jet_pt', 'Jet_eta', 'Jet_phi', 'Jet_mass',
                         'PV_npvsGood', 'MET_pt', 'genWeight', 'HLT_IsoMu24', 'HLT_IsoTkMu24']
        dataset_axis = hist.Cat("dataset", "")

        # One binned axis per histogrammed variable. Angular variables (eta, phi and
        # their differences) are dimensionless, so their labels carry no unit.
        axes = {}
        axes['dimuon_mass'] = hist.Bin("dimuon_mass", r"$m_{\mu\mu}$ [GeV]", 100, self.mass_window[0], self.mass_window[1])
        axes['dimuon_pt'] = hist.Bin("dimuon_pt", r"$p_{T}(\mu\mu)$ [GeV]", 100, 0, 400)
        axes['dimuon_eta'] = hist.Bin("dimuon_eta", r"$\eta (\mu\mu)$", 100, -5, 5)
        axes['dimuon_phi'] = hist.Bin("dimuon_phi", r"$\phi (\mu\mu)$", 100, -3.2, 3.2)
        axes['dimuon_dEta'] = hist.Bin("dimuon_dEta", r"$\Delta\eta (\mu\mu)$", 100, 0, 5)
        axes['dimuon_dPhi'] = hist.Bin("dimuon_dPhi", r"$\Delta\phi (\mu\mu)$", 100, 0, 6.4)

        axes['mu1_pt'] = hist.Bin("mu1_pt", r"$p_{T}(\mu_{1})$ [GeV]", 100, 0, 400)
        axes['mu1_pt_nosf'] = hist.Bin("mu1_pt_nosf", r"$p_{T}(\mu_{1}) no SF$ [GeV]", 100, 0, 400)
        axes['mu1_eta'] = hist.Bin("mu1_eta", r"$\eta (\mu_{1})$", 100, -2.4, 2.4)
        axes['mu1_phi'] = hist.Bin("mu1_phi", r"$\phi (\mu_{1})$", 100, -3.2, 3.2)
        axes['mu2_pt'] = hist.Bin("mu2_pt", r"$p_{T}(\mu_{2})$ [GeV]", 100, 0, 300)
        axes['mu2_eta'] = hist.Bin("mu2_eta", r"$\eta (\mu_{2})$", 100, -2.4, 2.4)
        axes['mu2_phi'] = hist.Bin("mu2_phi", r"$\phi (\mu_{2})$", 100, -3.2, 3.2)

        axes['jet1_pt'] = hist.Bin("jet1_pt", r"$p_{T}(jet1)$ [GeV]", 100, 0, 400)
        axes['jet1_eta'] = hist.Bin("jet1_eta", r"$\eta (jet1)$", 100, -4.7, 4.7)
        axes['jet1_phi'] = hist.Bin("jet1_phi", r"$\phi (jet1)$", 100, -3.2, 3.2)
        axes['jet2_pt'] = hist.Bin("jet2_pt", r"$p_{T}(jet2)$ [GeV]", 100, 0, 300)
        axes['jet2_eta'] = hist.Bin("jet2_eta", r"$\eta (jet2)$", 100, -4.7, 4.7)
        axes['jet2_phi'] = hist.Bin("jet2_phi", r"$\phi (jet2)$", 100, -3.2, 3.2)

        axes['njets'] = hist.Bin("njets", "njets", 10, 0, 10)
        axes['npv'] = hist.Bin("npv", "npv", 50, 0, 50)
        axes['npv_prePU'] = hist.Bin("npv_prePU", "npv_prePU", 50, 0, 50)
        axes['met'] = hist.Bin("met", r"$E_{T}^{miss.}$ [GeV]", 100, 0, 200)

        axes['genweight'] = hist.Bin("genweight", "genweight", 50, 0, 50)

        # One (dataset x variable) histogram per axis defined above.
        # TODO: add category axis and systematics axis
        accumulators = {v: hist.Hist("Counts", dataset_axis, axes[v]) for v in axes}

        accumulators['sumGenWeights'] = hist.Hist("sumGenWeights", dataset_axis)
        accumulators['entries'] = hist.Hist("Entries", dataset_axis)
        accumulators['cutflow'] = processor.defaultdict_accumulator(int)

        # np.array([]) is an empty 1-d array; np.ndarray([]) would instead allocate an
        # uninitialised 0-d array holding one arbitrary value.
        accumulators['dimuon_mass_unbinned'] = processor.column_accumulator(np.array([]))
        # request: possibility to add another axis to column_accumulator (e.g. 'dataset')

        self._accumulator = processor.dict_accumulator(accumulators)

        # Combine the per-run-period scale factors into one lookup per source.
        mu_id_vals, mu_id_err, mu_id_edges = self._combine_scale_factors('id')
        mu_iso_vals, mu_iso_err, mu_iso_edges = self._combine_scale_factors('iso')
        mu_trig_vals, mu_trig_err, mu_trig_edges = self._combine_scale_factors('trig')

        self.mu_id_sf = dense_lookup.dense_lookup(mu_id_vals, mu_id_edges)
        self.mu_id_err = dense_lookup.dense_lookup(mu_id_err, mu_id_edges)
        self.mu_iso_sf = dense_lookup.dense_lookup(mu_iso_vals, mu_iso_edges)
        self.mu_iso_err = dense_lookup.dense_lookup(mu_iso_err, mu_iso_edges)
        self.mu_trig_sf = dense_lookup.dense_lookup(mu_trig_vals, mu_trig_edges)
        self.mu_trig_err = dense_lookup.dense_lookup(mu_trig_err, mu_trig_edges)

        # Alternative (disabled): take PU weights from https://github.com/UFLX2MuMu/Ntupliser
        # via the coffea extractor.
#         self.extractor = extractor()
#         self.extractor.add_weight_sets(['PU_wgt PU_wgt data/pileup/PU_wgt_2016_Summer16_v0.root'])
#         self.extractor.finalize()
#         self.evaluator = self.extractor.make_evaluator()

    @staticmethod
    def _combine_scale_factors(key):
        """Sum the `key` scale-factor histograms of all `muSFFileList` entries, weighted by 'scale'.

        Parameters
        ----------
        key : str
            Which (file, histogram) pair of each entry to read: 'id', 'iso' or 'trig'.

        Returns
        -------
        tuple
            ``(values, errors, edges)``: the weighted sums of the histogram values and of
            the square roots of the variances, and the bin edges of the last histogram read.
        """
        values = 0
        errors = 0
        edges = None
        for period in muSFFileList:
            file_name, hist_name = period[key]
            sf_hist = uproot.open(file_name)[hist_name]
            values += sf_hist.values * period['scale']
            errors += sf_hist.variances**0.5 * period['scale']
            edges = sf_hist.edges
        return values, errors, edges

    @property
    def accumulator(self):
        """The dict accumulator that serves as a template for the output of process()."""
        return self._accumulator

    @property
    def columns(self):
        """Names of the input columns listed in __init__."""
        return self._columns

    def process(self, df):
        """Apply the event selection to one chunk of events and fill the accumulators.

        Parameters
        ----------
        df
            Chunk of events handed over by the executor. It is accessed through
            ``df.metadata['dataset']``, ``df.shape`` and the collections ``df.Muon``,
            ``df.Jet``, ``df.HLT``, ``df.PV``, ``df.MET``, and, for non-data samples,
            ``df.genWeight`` and ``df.Pileup``.

        Returns
        -------
        A filled copy of ``self.accumulator``.
        """
        output = self.accumulator.identity()
        dataset = df.metadata['dataset']
        isData = 'data' in dataset

        if not isData:
            # NOTE(review): the scale factors are evaluated for every muon of the event,
            # i.e. before the muon selection below -- confirm that this is intended.
            muID = self.mu_id_sf(df.Muon.eta, df.Muon.pt)
            muIso = self.mu_iso_sf(df.Muon.eta, df.Muon.pt)
            # The trigger scale factor has its own lookup (binned in |eta|, pt); the isolation
            # lookup was used here before by mistake.
            muTrig = self.mu_trig_sf(abs(df.Muon.eta), df.Muon.pt)
            muSF = (muID*muIso*muTrig).prod()

            # Disabled: up/down variations of the scale factors.
#             muIDerr = self.mu_id_err(muons.eta, muons.pt)
#             muIsoerr = self.mu_iso_err(muons.eta, muons.pt)
#             muTrigerr = self.mu_trig_err(abs(muons.eta), muons.pt)
#             muSF_up = ((muID + muIDerr) * (muIso + muIsoerr) * (muTrig + muTrigerr)).prod()
#             muSF_down = ((muID - muIDerr) * (muIso - muIsoerr) * (muTrig - muTrigerr)).prod()

        nEvts = df.shape[0]

        if isData:
            # Data events are not reweighted.
            genweight = np.ones(nEvts)
            event_weight = np.ones(nEvts)
            event_weight_nosf = np.ones(nEvts)
        else:
            genweight = df.genWeight.array
#             pu_weight = self.evaluator['PU_wgt'](df.Pileup.nTrueInt)
            pu_weight = puLookup(dataset, df.Pileup.nTrueInt)
            event_weight = genweight*muSF*pu_weight
            event_weight_nosf = genweight*pu_weight

        # Filled before any selection: used later to normalise the simulated samples.
        output['sumGenWeights'].fill(dataset=dataset, weight=genweight.sum())

        # Select and fill general event info

        # TODO: electron veto
        # TODO: Add FSR recovery
        # TODO: Add Rochester correction

        muons = df.Muon
        muons = muons[(muons.pt > 20) & (abs(muons.eta) < 2.4) & (muons.pfRelIso04_all<0.25)]

        # TODO: require at least one muon to be matched with an L3 object

        # Keep events that fired either trigger and contain exactly two selected muons
        # whose charges multiply to -1.
        hlt = df.HLT.IsoMu24 | df.HLT.IsoTkMu24
        event_filter = (hlt & (muons.counts == 2) & (muons['charge'].prod() == -1)).flatten()

        df = df[event_filter]
        muons = muons[event_filter] # 'muons' only stores two muons per event
        event_weight = event_weight[event_filter]
        event_weight_nosf = event_weight_nosf[event_filter]
        # maybe will move event filter somewhere just before filling histograms

        # for fast calculation of dimuon kinematics
        muons_jca = JaggedCandidateArray.candidatesfromcounts(
            muons.counts,
            pt=muons.pt.content,
            eta=muons.eta.content,
            phi=muons.phi.content,
            mass=muons.mass.content,
        )
        dimuons = muons_jca.distincts()
        # Leading muon pT above 26 and dimuon mass inside the configured window.
        dimuon_filter = ((muons[muons.pt.argmax()].pt > 26) & (dimuons.mass > self.mass_window[0]) & (dimuons.mass < self.mass_window[1])).flatten()

        df = df[dimuon_filter]
        muons = muons[dimuon_filter]
        mu1 = muons[muons.pt.argmax()]  # higher-pT muon
        mu2 = muons[muons.pt.argmin()]  # lower-pT muon
        dimuons = dimuons[dimuon_filter]
        event_weight = event_weight[dimuon_filter]
        event_weight_nosf = event_weight_nosf[dimuon_filter]

        # TODO: dimuon_costhetaCS, dimuon_phiCS

        # Select and fill jets and jet pairs
        # TODO: nsoftjets, jet pT, eta, phi, QGL; dijet mass, eta, phi, deta, dphi

        jets = df.Jet
        jet_selection = ((jets.pt > 25) & (abs(jets.eta) < 4.7))
        jets = jets[jet_selection]

        # TODO: DeltaR(j, mu) > 0.4
        # TODO: jetID, PU jetID

        # Not filled yet: only used by the disabled jet histograms below.
        jet1 = jets[jets.pt.argmax()]
        # TODO: get jet2 as well

        # TODO: kinematic variables of multimuon-multijet system
        # >> many of them...

        # Other variables
        npv = df.PV.npvsGood
        met = df.MET.pt

        # TODO: Add initial categorization (ggH vs. VBF)

        # Fill event weights
        # TODO: PU weight, NNLOPS reweighting, Zpt reweighting

        # TODO: Add systematic uncertainties

        # TODO: Evaluate DNN

        output['cutflow']['all events'] += nEvts
        output['cutflow']['event_filter'] += event_filter.sum()
        output['cutflow']['dimuon_filter'] += dimuon_filter.sum()

        ### Fill muons ###
        output['dimuon_mass'].fill(dataset=dataset, dimuon_mass=dimuons.mass.flatten(), weight=event_weight)
        if isData:
            # Unbinned dimuon masses are kept for data only.
            output['dimuon_mass_unbinned'] += processor.column_accumulator(dimuons.mass.flatten())
        output['dimuon_pt'].fill(dataset=dataset, dimuon_pt=dimuons.pt.flatten(), weight=event_weight)
        output['dimuon_eta'].fill(dataset=dataset, dimuon_eta=dimuons.eta.flatten(), weight=event_weight)
        output['dimuon_phi'].fill(dataset=dataset, dimuon_phi=dimuons.phi.flatten(), weight=event_weight)
        output['dimuon_dEta'].fill(dataset=dataset, dimuon_dEta=abs(mu1.eta.flatten() - mu2.eta.flatten()), weight=event_weight)

        output['dimuon_dPhi'].fill(dataset=dataset, dimuon_dPhi=abs(mu1.delta_phi(mu2)).flatten(), weight=event_weight)

        output['mu1_pt'].fill(dataset=dataset, mu1_pt=mu1.pt.flatten(), weight=event_weight)
        output['mu1_pt_nosf'].fill(dataset=dataset, mu1_pt_nosf=mu1.pt.flatten(), weight=event_weight_nosf)
        output['mu1_eta'].fill(dataset=dataset, mu1_eta=mu1.eta.flatten(), weight=event_weight)
        output['mu1_phi'].fill(dataset=dataset, mu1_phi=mu1.phi.flatten(), weight=event_weight)

        output['mu2_pt'].fill(dataset=dataset, mu2_pt=mu2.pt.flatten(), weight=event_weight)
        output['mu2_eta'].fill(dataset=dataset, mu2_eta=mu2.eta.flatten(), weight=event_weight)
        output['mu2_phi'].fill(dataset=dataset, mu2_phi=mu2.phi.flatten(), weight=event_weight)

        ### Fill jets ###
#         output['jet1_pt'].fill(dataset=dataset, jet1_pt=jet1.pt.flatten(), weight=event_weight)
#         output['jet1_eta'].fill(dataset=dataset, jet1_eta=jet1.eta.flatten(), weight=event_weight)
#         output['jet1_phi'].fill(dataset=dataset, jet1_phi=jet1.phi.flatten(), weight=event_weight)
#         output['njets'].fill(dataset=dataset, njets=jets.counts, weight=event_weight)

        ### Fill other variables ###
        output['npv'].fill(dataset=dataset, npv=npv, weight=event_weight)
        output['met'].fill(dataset=dataset, met=met, weight=event_weight)

        output['entries'].fill(dataset=dataset, weight=nEvts) # temporary

        return output

    def postprocess(self, accumulator):
        """Return the merged accumulator unchanged."""
        return accumulator

Option 1: Iterative executor
===


In [None]:
# DEBUG: iterative processing
# Runs the processor over the whole fileset in this process, one chunk after another.

from coffea.processor.executor import iterative_executor

tstart = time.time()
output = processor.run_uproot_job(
    fileset,
    'Events',
    DimuonProcessor(),
    iterative_executor,
    executor_args={'nano': True},
)
elapsed = time.time() - tstart

# Summary: number of processed events, wall-clock time and throughput.
print(
    f"Processed {output['cutflow']['all events']} events",
    f"Total time: {elapsed} s",
    f"Rate: {output['cutflow']['all events']/elapsed} events/s",
    sep='\n',
)

Option 2: Dask executor
===


In [None]:
import re

import distributed
from coffea.processor.executor import dask_executor

# Same minimum version as before (1.28.1), but checked without pytest: pytest.importorskip is
# meant for test modules and signals a missing module with a test "skip" instead of ImportError.
_version_match = re.match(r"(\d+)\.(\d+)\.(\d+)", distributed.__version__)
if _version_match and tuple(int(part) for part in _version_match.groups()) < (1, 28, 1):
    raise ImportError(
        f"distributed >= 1.28.1 is required, found {distributed.__version__}"
    )

# Dask client running its workers as threads of this process, without a dashboard.
client = distributed.Client(processes=False, dashboard_address=None)

tstart = time.time() 
output = processor.run_uproot_job(fileset, 'Events', DimuonProcessor(mass_window=[115,150]), dask_executor, executor_args={'nano': True, 'client': client})
elapsed = time.time() - tstart

print(f"Processed {output['cutflow']['all events']} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {output['cutflow']['all events']/elapsed} events/s")

Option 3: Apache Spark
===


NOW IT IS TIME TO START SPARK CLUSTER CONNECTION
---

When using SWAN, click on the 5-point star icon in the Jupyter notebook

In [None]:
import pyspark.sql
from coffea.processor.spark.detail import _spark_initialize, _spark_stop
from coffea.processor.spark.spark_executor import spark_executor

# `guid` is only needed by the optional local-Spark setup shown below, so a pyarrow
# installation that does not provide pyarrow.compat must not break this cell.
try:
    from pyarrow.compat import guid
except ImportError:
    guid = None

# NOT needed on SWAN, where the Spark configuration is offloaded to the Spark connector.
# To run with a local Spark session instead, uncomment:
#
# spark_config = pyspark.sql.SparkSession.builder \
#     .appName('spark-executor-test-%s' % guid()) \
#     .master('local[*]') \
#     .config('spark.driver.memory', '4g') \
#     .config('spark.executor.memory', '4g') \
#     .config('spark.sql.execution.arrow.enabled','true') \
#     .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)
#
# spark = _spark_initialize(config=spark_config, log_level='WARN',
#                           spark_progress=False, laurelin_version='0.5.1')

# Fail early with an actionable message when no Spark session has been created yet.
try:
    spark
except NameError:
    raise NameError(
        "No Spark session named 'spark' is defined: start the Spark cluster connection "
        "(or uncomment the local setup above) before running this cell."
    ) from None

partitionsize = 200000
thread_workers = 2

tstart = time.time() 

output = processor.run_spark_job(fileset, DimuonProcessor(), spark_executor, 
                                 spark=spark, partitionsize=partitionsize, thread_workers=thread_workers,
                                 executor_args={'file_type': 'edu.vanderbilt.accre.laurelin.Root', 'cache': False}
                                )

elapsed = time.time() - tstart

print(f"Processed {output['cutflow']['all events']} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {output['cutflow']['all events']/elapsed} events/s")

Plotting
===

In [None]:
def plot_variable(fig, var, gs, weights):
    """Draw the data vs. stacked-background plot of one variable with a ratio panel below it.

    Reads the histogram ``output[var]`` from the module-level ``output`` and uses the
    module-level ``hist`` (coffea) and ``hep`` (mplhep) modules.

    Parameters
    ----------
    fig : matplotlib figure
        Figure the two panels are added to.
    var : str
        Key of the histogram in ``output``.
    gs : grid spec with (at least) two cells
        ``gs[0]`` hosts the main panel, ``gs[1]`` the Data/MC ratio panel.
    weights : dict
        Scale factor per dataset name, applied along the 'dataset' axis.
    """
    output_copy = output[var].copy() # copy to prevent scaling multiple times
    output_copy.scale(weights, axis='dataset')

    # Group the samples for plotting (will be important when we unite low-yield samples like VV)

    data = output_copy['data']

    bkg_sources = {
        'dy': ['dy'],
        'ttbar':['ttjets_dl'],
    }
    bkg = output_copy.group('dataset', hist.Cat("dataset", "Dataset"), bkg_sources)
    bkg.axis('dataset').sorting = 'integral' # sort backgrounds by event yields

    data_opts = {'color': 'k', 'marker': '.', 'markersize':15}
    stack_fill_opts = {'alpha': 0.8, 'edgecolor':(0,0,0)}
    stack_error_opts = {'label':'Stat. unc.','facecolor':(0,0,0,.4), 'hatch':'', 'linewidth': 0}

    # Top panel: Data vs. MC plot
    plt1 = fig.add_subplot(gs[0])
    hist.plot1d(bkg, ax=plt1, overlay='dataset', overflow='all', stack=True, fill_opts=stack_fill_opts, error_opts=stack_error_opts)
    # Draw the data on the SAME axes: without `ax` plot1d opens a new figure, and without
    # clear=False it would wipe the background stack drawn just above.
    hist.plot1d(data, ax=plt1, clear=False, overlay='dataset', overflow='all', line_opts=None, error_opts=data_opts)
    plt1.set_yscale('log')
    plt1.set_ylim(0.1, 1e7)
    hep.cms.cmslabel(plt1, data=True, paper=False, year='2016')
    # The x axis is shared with the ratio panel, which carries the label and tick labels.
    plt1.set_xlabel('')
    plt1.tick_params(axis='x', labelbottom=False)

    # Bottom panel: Data/MC ratio plot
    plt2 = fig.add_subplot(gs[1], sharex=plt1)
    num = data.sum('dataset')
    denom = bkg.sum('dataset')
    hist.plotratio(num=num, ax=plt2,
                    denom=denom,
                    error_opts=data_opts, denom_fill_opts={}, guide_opts={},
                    unc='num')

    plt2.axhline(1, ls='--')
    plt2.set_ylim([0,2])
    plt2.set_ylabel('Data/MC')
    

In [None]:
# Prepare things to plot
from parameters import cross_sections

# Every sample whose name does not contain 'data' is treated as simulation.
mc_datasets = [s for s in samples if 'data' not in s]

# Per-dataset scale applied before plotting: 1 for data and
# cross section * luminosity / sum of generator weights for each simulated sample.
sum_gen_weights = output['sumGenWeights'].values()
lumi_weights = {'data':1}
for mc in mc_datasets:
    N = sum_gen_weights[(mc,)]
#     print(N)
    lumi_weights[mc] = cross_sections[mc]*lumi / N

# print(output['dimuon_mass_unbinned'].value)    

# Variables to plot; uncomment further groups as needed.
vars_to_plot = [
    'dimuon_mass', 'dimuon_pt', 'dimuon_eta', 'dimuon_phi',
    # 'dimuon_dEta', 'dimuon_dPhi',
    # 'mu1_pt','mu2_pt', 'mu1_eta','mu2_eta', 'mu1_phi','mu2_phi',
    # 'jet1_pt', 'jet1_eta', 'jet1_phi', 'njets',
    'mu1_pt', 'mu1_pt_nosf',
    # 'npv', 'npv_prePU', 'met',
]

Make plots iteratively and put into a grid
---

Slower, but output looks nicer.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import mplhep as hep
plt.style.use(hep.cms.style.ROOT)
from matplotlib import gridspec
import math
    
fig = plt.figure()
    
nplots_x = 4 # number of plots in one row
nplots_y = math.ceil(len(vars_to_plot) / nplots_x) # number of rows

plotsize=10
ratio_plot_size = 0.25
fig.set_size_inches(nplots_x*plotsize,nplots_y*plotsize*(1+ratio_plot_size))
outer_grid = gridspec.GridSpec(nplots_y, nplots_x, hspace = .3) 
for i, var in enumerate(vars_to_plot):
    gs = gridspec.GridSpecFromSubplotSpec(2, 1, subplot_spec = outer_grid[i], height_ratios=[(1-ratio_plot_size),ratio_plot_size], hspace = .05)
    plot_variable(fig, var, gs, lumi_weights)

Make plots in parallel
---

Faster, but can't display the plots in a grid.

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt
from matplotlib import gridspec
import mplhep as hep
plt.style.use(hep.cms.style.ROOT)
import multiprocessing as mp

plt.rcParams['figure.dpi'] = 50
    
figs = {}
def plot_var(i):
    fig = plt.figure()
    plotsize=8
    ratio_plot_size = 0.24
    fig.set_size_inches(plotsize,plotsize*(1+ratio_plot_size))
    gs = gridspec.GridSpec(2, 1, height_ratios=[(1-ratio_plot_size),ratio_plot_size], hspace = .05)
    plot_variable(fig, vars_to_plot[i], gs, lumi_weights)
    return fig

pool = mp.Pool(mp.cpu_count() - 1)
results = [pool.apply(plot_var, args=(x,)) for x in range(len(vars_to_plot))]    
