<!-- <img  src="https://swan.web.cern.ch/sites/swan.web.cern.ch/files/pictures/logo_swan_letters.png" alt="SWAN" style="float: left; width: 15%; margin-right: 5%; margin-left: 17%; margin-top: 1.0em; margin-bottom: 2.0em;">
<img src="https://spark.apache.org/images/spark-logo-trademark.png" alt="EP-SFT" style="float: left; width: 25%; margin-right: 0%; margin-left: 0%; margin-bottom: 2.0em;">
<img src="https://cms-docdb.cern.ch/cgi-bin/PublicDocDB/RetrieveFile?docid=3045&filename=CMSlogo_color_label_1024_May2014.png&version=3" alt="CMS" style="float: left; width: 12%; margin-left: 5%; margin-right: 5%; margin-bottom: 2.0em;"> -->
<p style="clear: both;">
<div style="text-align:center"><h1>CMS H&#8594;µµ analysis  
     <br> with Coffea package from Fermilab</h1></div>
<div style="text-align:center"><i>Author: Dmitry Kondratyev, based on example code by Lindsey Gray</i></div>
<hr style="border-top-width: 4px; border-top-color: #34609b;">

# Search for Higgs boson decaying into two muons

This code uses the awkward-array toolset and Coffea [histograms](https://coffeateam.github.io/coffea/modules/coffea.hist.html).
It also shows the analysis-object syntax implemented by Coffea's [JaggedCandidateArray](https://coffeateam.github.io/coffea/api/coffea.analysis_objects.JaggedCandidateMethods.html), and the usage of custom [accumulators](https://coffeateam.github.io/coffea/api/coffea.processor.AccumulatorABC.html) other than histograms.  Further, it introduces the [processor](https://coffeateam.github.io/coffea/api/coffea.processor.ProcessorABC.html) concept and the interface to Apache Spark.


Instructions to run at SWAN:
===

#### Load SWAN environment: LCG96 Python3 stack and Cloud Containers cluster

Then run the next two cells.


In [None]:
# Run this cell if you do not have coffea installed (e.g. on SWAN with LCG 96Python3 stack)
!pip install --user --upgrade coffea

# spark.jars.packages doesnt work with Spark 2.4 with kubernetes
!wget -N https://repo1.maven.org/maven2/edu/vanderbilt/accre/laurelin/0.5.1/laurelin-0.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-api/2.11.2/log4j-api-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-core/2.11.2/log4j-core-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/lz4/lz4-java/1.5.1/lz4-java-1.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/tukaani/xz/1.2/xz-1.2.jar
                    
!mkdir output

In [None]:
# Run this cell before establishing spark connection

import os

# Directory that must be visible on PYTHONPATH.
site_packages_dir = '/usr/local/lib/python3.6/site-packages'

# PYTHONPATH may be unset in a fresh session, so fall back to an empty string
# instead of raising KeyError.
current_pythonpath = os.environ.get('PYTHONPATH', '')

# Only append when the entry is missing, so re-running the cell does not keep
# growing the variable with duplicates.
if site_packages_dir not in current_pythonpath.split(':'):
    if current_pythonpath:
        os.environ['PYTHONPATH'] = current_pythonpath + ':' + site_packages_dir
    else:
        os.environ['PYTHONPATH'] = site_packages_dir

Instructions to run at Purdue Jupyter hub:
===

- Log in to an interactive node on a cluster (e.g. `hammer.rcac.purdue.edu`)
- Activate local conda environment (create a new directory for the environment, if needed):
```
   module load anaconda/5.3.1-py37 
   source activate /home/dkondra/conda_tests/
```
- Once the environment is activated, `conda install` will automatically install packages to that environment
- Install missing packages like this: `conda install [-c conda-forge] <package name>`
- `coffea` can be installed like this (it will use conda's `pip`): 
```
    pip install --upgrade coffea
```
- In order for conda to work with notebooks, install `nb_conda`: 
```
    conda install nb_conda
```
- After that, in Jupyter notebook there will be an option in **Kernel -> Change Kernel** to run the notebook using desired conda environment
- Set up VOMS proxy:
```
    . setup_proxy.sh
```

In [None]:
# Export the location of the X509 user proxy for this kernel session.
# NOTE(review): the path is hard-coded to one user's home directory; presumably it is the
# proxy produced by `setup_proxy.sh` - adjust it to your own proxy file before running.
%env X509_USER_PROXY=/home/dkondra/x509up_u616617

In [None]:
# --- standard library ---
import socket
import time

# --- third-party ---
import coffea
import coffea.processor as processor
import uproot
from coffea import util

print("Coffea version: ", coffea.__version__)

# True when the host name contains 'hammer' (the Purdue cluster named in the
# instructions above); used further down to pick site-specific settings.
at_purdue = 'hammer' in socket.gethostname()


In [None]:
class SamplesInfo(object):
    """Bookkeeping for the samples processed in this notebook.

    The object resolves the file list of every requested sample, collects
    per-sample metadata (sum of generator weights and generated event count
    for non-data samples), counts data events to scale the integrated
    luminosity, and computes per-sample luminosity weights.

    Parameters
    ----------
    year : str
        Key used to index the ``datasets`` and ``lumi_data`` dictionaries.
    at_purdue : bool, optional
        If True, dataset paths come from ``config.datasets`` and files are
        listed through ``read_via_xrootd``; otherwise paths come from
        ``config.datasets_eos`` and files are found with ``glob``.
    debug : bool, optional
        If True, only the first file of every sample is kept.
    """

    def __init__(self, year, at_purdue=True, debug=False):
        self.year = year
        self.at_purdue = at_purdue
        self.debug = debug

        if self.at_purdue:
            from config.datasets import datasets, lumi_data
            self.server = 'root://xrootd.rcac.purdue.edu/'
        else:
            from config.datasets_eos import datasets, lumi_data
            self.server = ''

        self.paths = datasets[self.year]
        self.datasets = datasets
        self.lumi_data = lumi_data

        self.lumi = 40000 # default value (reported in /pb by load())
        self.data_entries = 0

        self.samples = []
        self.missing_samples = []

        self.filesets = {}
        self.full_fileset = {}
        self.metadata = {}

        #--- Define regions and channels used in the analysis ---#
        self.regions = ['z-peak', 'h-sidebands', 'h-peak']
        self.channels = ['ggh_01j', 'ggh_2j', 'vbf']

        #--- Select samples for which unbinned data will be saved ---#
        self.signal_samples = ['ggh_amcPS', 'vbf_amcPS']
        self.main_bkg_samples = ['dy_m105_160_amc', 'dy_m105_160_vbf_amc', 'ttjets_dl', 'ewk_lljj_mll105_160', "ewk_lljj_mll105_160_ptj0"]
        # No sample is loaded yet, so the data/MC lists start out empty;
        # load() refreshes them once the samples are known.
        self._classify_samples()

        #--- Take overlapping samples and assign them to different regions ---#
        all_dy = ["dy", "dy_0j", "dy_1j", "dy_2j", "dy_m105_160_amc", "dy_m105_160_vbf_amc","dy_m105_160_mg", "dy_m105_160_vbf_mg"]
        all_ewk = ["ewk_lljj_mll50_mjj120", "ewk_lljj_mll105_160", "ewk_lljj_mll105_160_ptj0"]

        self.overlapping_samples = all_dy + all_ewk
        self.specific_samples = {
                'z-peak': {
                    'ggh_01j' : ["dy_0j", "dy_1j", "dy_2j", 'ewk_lljj_mll50_mjj120', 'ggh_amcPS', 'vbf_amcPS'],
                    'ggh_2j' : ["dy_0j", "dy_1j", "dy_2j", 'ewk_lljj_mll50_mjj120', 'ggh_amcPS', 'vbf_amcPS'],
                    'vbf' : ["dy_0j", "dy_1j", "dy_2j", 'ewk_lljj_mll50_mjj120', 'ggh_amcPS', 'vbf_amcPS']
                },
                'h-sidebands': {
                    'ggh_01j' : ['dy_m105_160_amc', 'ewk_lljj_mll105_160', 'ggh_amcPS', 'vbf_amcPS'],
                    'ggh_2j' : ['dy_m105_160_amc', 'ewk_lljj_mll105_160', 'ggh_amcPS', 'vbf_amcPS'],
                    'vbf' : ['dy_m105_160_vbf_amc', 'ewk_lljj_mll105_160', 'ggh_amcPS', 'vbf_amcPS'],
                },
                'h-peak': {
                    'ggh_01j' : ['dy_m105_160_amc', 'ewk_lljj_mll105_160'],
                    'ggh_2j' : ['dy_m105_160_amc', 'ewk_lljj_mll105_160'],
                    'vbf' : ['dy_m105_160_vbf_amc', 'ewk_lljj_mll105_160'],
                }
            }

        self.lumi_weights = {}

    def _classify_samples(self):
        """Rebuild the data/MC sample lists from the currently loaded samples."""
        self.data_samples = [s for s in self.samples if 'data' in s]
        self.mc_samples = [s for s in self.samples if 'data' not in s]
        self.datasets_to_save_unbin = self.data_samples + self.signal_samples + self.main_bkg_samples

    def load(self, samples):
        """Load the given samples in parallel and register the results.

        Each sample is handled by ``load_sample`` in a worker process. Samples
        that are found are added to ``samples``, ``filesets``, ``full_fileset``
        and ``metadata``; the others are added to ``missing_samples``. If any
        data events were loaded, ``lumi`` is rescaled by the fraction of the
        year's data events that was loaded.

        Parameters
        ----------
        samples : iterable of str
            Names of the samples to load.
        """
        import multiprocessing as mp
        t0 = time.time()

        samples = list(samples)
        results = []
        if samples:
            # No point in starting more workers than there are samples.
            n_workers = min(mp.cpu_count(), len(samples))
            # The context manager shuts the pool down even if a worker raises.
            with mp.Pool(n_workers) as pool:
                results = pool.map(self.load_sample, samples)

        for res in results:
            sample = res['sample']
            if res['is_missing']:
                self.missing_samples.append(sample)
                continue
            self.samples.append(sample)
            self.filesets[sample] = {sample: res['files']}
            self.full_fileset[sample] = res['files']
            self.metadata[sample] = res['metadata']
            self.data_entries = self.data_entries + res['data_entries']

        # The data/MC lists built in __init__ predate any loading; refresh
        # them so compute_lumi_weights() actually sees the MC samples.
        self._classify_samples()

        if self.data_entries:
            year_lumi = self.lumi_data[self.year]
            print()
            print(f"Loaded {self.data_entries} of {self.year} data events")
            self.lumi = year_lumi['lumi']*self.data_entries/year_lumi['events']
            prc = round(self.data_entries/year_lumi['events']*100, 2)
            print(f"This is ~ {prc}% of {self.year} data.")
            print(f"Integrated luminosity {self.lumi}/pb")
            print()
        if self.missing_samples:
            print(f"Missing samples: {self.missing_samples}")

        t1 = time.time()
        dt = round(t1-t0, 2)
        print(f"Loading took {dt} s")

    def load_sample(self, sample):
        """Resolve the files of one sample and read its bookkeeping numbers.

        Parameters
        ----------
        sample : str
            Sample name; must be a key of ``self.paths`` to be loaded.

        Returns
        -------
        dict
            Keys: ``'sample'`` (name), ``'metadata'`` (``sumGenWgts`` and
            ``nGenEvts`` for non-data samples, empty otherwise), ``'files'``
            (dict with ``'files'`` and ``'treename'``; empty if missing),
            ``'data_entries'`` (entries of the ``Events`` tree summed over
            files for data samples, 0 otherwise) and ``'is_missing'``.
        """
        import glob
        print("Loading", sample)

        if sample not in self.paths:
            print(f"Couldn't load {sample}! Skipping.")
            return {'sample': sample, 'metadata': {}, 'files': {}, 'data_entries': 0, 'is_missing': True}

        metadata = {}
        data_entries = 0

        if self.at_purdue:
            from python.utils import read_via_xrootd
            all_files = read_via_xrootd(self.paths[sample], self.server)
        else:
            all_files = [self.server + f for f in glob.glob(self.paths[sample]+'*root')]

        if self.debug:
            # Slicing (rather than indexing [0]) tolerates an empty file list.
            all_files = list(all_files[:1])

        if 'data' in sample:
            for f in all_files:
                tree = uproot.open(f)['Events']
                data_entries += tree.numentries
        else:
            # Paths containing 'NanoAODv6' use branch names with a trailing underscore.
            is_v6 = 'NanoAODv6' in self.paths[sample]
            sumw_branch = 'genEventSumw_' if is_v6 else 'genEventSumw'
            count_branch = 'genEventCount_' if is_v6 else 'genEventCount'

            sumGenWgts = 0
            nGenEvts = 0
            for f in all_files:
                tree = uproot.open(f)['Runs']
                sumGenWgts += tree.array(sumw_branch)[0]
                nGenEvts += tree.array(count_branch)[0]
            metadata['sumGenWgts'] = sumGenWgts
            metadata['nGenEvts'] = nGenEvts

        files = {
            'files': all_files,
            'treename': 'Events'
        }
        return {'sample': sample, 'metadata': metadata, 'files': files, 'data_entries': data_entries, 'is_missing': False}

    def compute_lumi_weights(self):
        """Fill ``self.lumi_weights`` with ``xsec * lumi / sumGenWgts`` per MC sample.

        Data gets weight 1 under the key ``'data'``. Must be called after
        ``load`` so that ``mc_samples``, ``metadata`` and ``lumi`` are filled.
        """
        from config.cross_sections import cross_sections
        self.lumi_weights = {'data': 1}
        for sample in self.mc_samples:
            N = self.metadata[sample]['sumGenWgts']
            if 'ewk_lljj_mll50_mjj120' in sample:
                # NOTE(review): the year key is hard-coded to '2016' regardless
                # of self.year - confirm this is intended.
                xsec = cross_sections[sample]['2016']
            elif 'ewk_lljj_mll105_160_ptj0' in sample:
                # This sample reuses the cross section stored under 'ewk_lljj_mll105_160'.
                xsec = cross_sections['ewk_lljj_mll105_160']
            else:
                xsec = cross_sections[sample]
            self.lumi_weights[sample] = xsec*self.lumi / N


In [None]:

# Samples to process in this session. Commented-out entries are kept as a
# menu of the other samples that can be enabled.
samples = [
#     'data_G',
#     'data_B','data_C','data_D','data_E','data_F','data_G','data_H',
#     'dy_0j', 'dy_1j', 'dy_2j',
#     'dy_m105_160_amc',
#     'dy_m105_160_vbf_amc',
    # The trailing comma matters: without it, uncommenting the next entry would
    # silently concatenate two adjacent string literals into one bogus name.
    'ewk_lljj_mll50_mjj120', 'ewk_lljj_mll105_160', "ewk_lljj_mll105_160_ptj0",
#     'ttjets_dl', 
#     'ggh_amcPS', 'vbf_amcPS',

    
#    'ttjets_sl', 'ttz', 'ttw',
#     'st_tw_top','st_tw_antitop',
#     'ww_2l2nu','wz_2l2q','wz_3lnu','wz_1l1nu2q','zz',
#     'www','wwz','wzz','zzz',

]

# Forward the site detected in the import cell; otherwise SamplesInfo always
# falls back to its Purdue default, even when running elsewhere.
samp_info = SamplesInfo("2016", at_purdue=at_purdue)
samp_info.load(samples)
samp_info.compute_lumi_weights()

Option 1: Iterative executor
===


In [None]:
from coffea.processor.executor import iterative_executor
from python.timer import Timer
from python.dimuon_processor import DimuonProcessor

# include this to make it work at Purdue:
from ipywidgets import IntProgress, HBox, HTML
# however, still doesn't show a nice progress bar widget

# Processor options for this run, collected in one place.
iterative_processor_options = {
    'samp_info': samp_info,
    'do_roccor': True,
    'evaluate_dnn': False,
    'do_timer': True,
}

# Run all loaded samples sequentially in this process and time the whole job.
tstart = time.time()
output = processor.run_uproot_job(
    samp_info.full_fileset,
    'Events',
    DimuonProcessor(**iterative_processor_options),
    iterative_executor,
    executor_args={'nano': True},
)
elapsed = time.time() - tstart

n_events_processed = output['cutflow']['all events']
print(f"Processed {n_events_processed} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {n_events_processed/elapsed} events/s")

--------------------------------------------------
Summary of global timer:
--------------------------------------------------
                     Action  Time (s)   % CPU
0  Applied HLT and lumimask  10.53784  34.880
1      Applied preselection   8.63608  28.585
2         Applied Rochester   4.09531  13.555
3       Applied dimuon cuts   2.55628   8.461
4          Applied jet cuts   2.69480   8.920
5    Computed jet variables   0.38160   1.263
6            Filled outputs   1.30978   4.335
--------------------------------------------------
Total time: 30.2117 s

--------------------------------------------------
Summary of preselection timer:
--------------------------------------------------
                  Action  Time (s)   % CPU
0         Flags computed   2.46162  30.073
1         Muons selected   1.73152  21.153
2     Electrons selected   3.62032  44.228
3            df filtered   0.36982   4.518
4         muons filtered   0.00190   0.023
5  event_weight filtered   0.00036   0.0

Option 2: Futures executor
===

In [None]:
import os

from coffea.processor.executor import futures_executor
# Imported here as well so this option runs without executing the Option 1 cell first.
from python.dimuon_processor import DimuonProcessor

# Run the job on a pool of 8 local workers and time it.
tstart = time.time()
output = processor.run_uproot_job(
    # `fileset` was never defined in this notebook; use the fileset built by
    # SamplesInfo.load(), as the iterative option does.
    samp_info.full_fileset,
    'Events',
    # Pass the sample bookkeeping object, consistent with the other options.
    DimuonProcessor(samp_info=samp_info),
    futures_executor,
    executor_args={'nano': True, 'workers': 8},
)
elapsed = time.time() - tstart

n_events_processed = output['cutflow']['all events']
print(f"Processed {n_events_processed} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {n_events_processed/elapsed} events/s")

out_path = "output/test_futures.coffea"
# The directory is otherwise only created by the SWAN setup cell.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
util.save(output, out_path)
print(f"Saved output to {out_path}")

Option 3: Dask executor
===


In [None]:
import dask
import distributed
from coffea.processor.executor import dask_executor
# Imported here as well so this option runs without executing the Option 1 cell first.
from python.dimuon_processor import DimuonProcessor

# Number of local worker processes, depending on the site detected in the import cell.
n_workers = 18 if at_purdue else 4

# Disable the worker-memory "terminate" setting through dask's public
# configuration API (instead of mutating the config dictionary in place).
# This has to happen before the client is created.
dask.config.set({'distributed.worker.memory.terminate': False})
client = distributed.Client(processes=True, dashboard_address=None, n_workers=n_workers, threads_per_worker=1)

tstart = time.time()

try:
    # One job per sample group; each group's output is written to its own file.
    for group, fileset_ in samp_info.filesets.items():
        print(f"Processing {group}...")
        output = processor.run_uproot_job(
            fileset_,
            'Events',
            DimuonProcessor(samp_info=samp_info, evaluate_dnn=False),
            dask_executor,
            executor_args={'nano': True, 'client': client, 'retries': 20},
        )

        out_path = f"/depot/cms/hmm/coffea/ewk_test/test_dask_{group}.coffea"
        util.save(output, out_path)
        print(f"Saved output to {out_path}")
finally:
    # Release the client when done (also on failure), so that re-running the
    # cell does not pile up clients.
    client.close()

elapsed = time.time() - tstart

print(f"Total time: {elapsed} s")


Option 4: Parsl executor
===

In [None]:
import parsl
from coffea.processor.executor import parsl_executor
from coffea.processor.parsl.detail import (_parsl_initialize, _parsl_stop, _default_cfg)
# Imported here as well so this option runs without executing the Option 1 cell first.
from python.dimuon_processor import DimuonProcessor

_parsl_initialize(config=_default_cfg)

# NOTE: the author marked this option as "Doesn't work". The undefined
# `fileset` name is fixed below; other problems may remain.

tstart = time.time()
output = processor.run_uproot_job(
    # `fileset` was never defined in this notebook; use the fileset built by
    # SamplesInfo.load().
    samp_info.full_fileset,
    'Events',
    # Pass the sample bookkeeping object, consistent with the other options.
    DimuonProcessor(samp_info=samp_info),
    parsl_executor,
    executor_args={'nano': True},
)
elapsed = time.time() - tstart

n_events_processed = output['cutflow']['all events']
print(f"Processed {n_events_processed} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {n_events_processed/elapsed} events/s")

Option 5: Apache Spark
===


NOW IT IS TIME TO START SPARK CLUSTER CONNECTION
---

When using SWAN, click on the 5-point star icon in the Jupyter notebook

In [None]:
import pyspark.sql
from pyarrow.compat import guid
from coffea.processor.spark.detail import _spark_initialize, _spark_stop
from coffea.processor.spark.spark_executor import spark_executor
# Imported here as well so this option runs without executing the Option 1 cell first.
from python.dimuon_processor import DimuonProcessor

# NOT needed on SWAN, spark config is offloaded to spark connector.
# This cell therefore expects a `spark` session to already exist when it runs.
# To create a local session instead, uncomment:
#
# spark_config = pyspark.sql.SparkSession.builder \
#     .appName('spark-executor-test-%s' % guid()) \
#     .master('local[*]') \
#     .config('spark.driver.memory', '4g') \
#     .config('spark.executor.memory', '4g') \
#     .config('spark.sql.execution.arrow.enabled','true') \
#     .config('spark.sql.execution.arrow.maxRecordsPerBatch', 200000)
#
# spark = _spark_initialize(config=spark_config, log_level='WARN',
#                           spark_progress=False, laurelin_version='0.5.1')

partitionsize = 200000
thread_workers = 2

# NOTE: the author marked this option as "Doesn't work (no full NanoEvents support)".
# The undefined `fileset` name is fixed below; other problems may remain.

tstart = time.time()
# if jobs fail, it might be because some columns are missing from processor._columns
output = processor.run_spark_job(
    # `fileset` was never defined in this notebook; use the fileset built by
    # SamplesInfo.load().
    samp_info.full_fileset,
    # Pass the sample bookkeeping object, consistent with the other options.
    DimuonProcessor(samp_info=samp_info),
    spark_executor,
    spark=spark, partitionsize=partitionsize, thread_workers=thread_workers,
    executor_args={'file_type': 'edu.vanderbilt.accre.laurelin.Root', 'cache': False, 'nano': True, 'retries': 5},
)

elapsed = time.time() - tstart

n_events_processed = output['cutflow']['all events']
print(f"Processed {n_events_processed} events")
print(f"Total time: {elapsed} s")
print(f"Rate: {n_events_processed/elapsed} events/s")

Plotting
===

Make plots and put them into a grid


In [None]:
%matplotlib inline
from python.plotting import Plotter
    
# vars_to_plot = []

# vars_to_plot += ['dimuon_mass']
# vars_to_plot += ['dimuon_pt', 'dimuon_eta', 'dimuon_phi']
# vars_to_plot += ['dimuon_dEta', 'dimuon_dPhi']
# vars_to_plot += ['mu1_pt', 'mu1_eta', 'mu1_phi',  'mu2_pt', 'mu2_eta', 'mu2_phi']
# vars_to_plot += ['jet1_pt', 'jet1_eta', 'jet1_phi', 'jet1_qgl']
# vars_to_plot += ['jet2_pt', 'jet2_eta', 'jet2_phi', 'jet2_qgl']
# vars_to_plot += ['jj_mass', 'jj_pt', 'jj_eta', 'jj_phi']
# vars_to_plot += ['jj_dEta', 'jj_dPhi']
# vars_to_plot += ['njets', 'npv', 'met']
# vars_to_plot += ['dnn_score']

all_plots_2016_pars = {
    'processor': DimuonProcessor(),
    'path': '/depot/cms/hmm/coffea/',
    'samples': [
        'data_B','data_C','data_D','data_E','data_F','data_G','data_H',
        'dy_0j', 'dy_1j', 'dy_2j',
        'dy_m105_160_amc',
        'dy_m105_160_vbf_amc',
        'ewk_lljj_mll50_mjj120', 'ewk_lljj_mll105_160',
        'ttjets_dl', 
        'ggh_amcPS', 'vbf_amcPS',
        'ttjets_sl', 'ttz', 'ttw',
        'st_tw_top','st_tw_antitop',
        'ww_2l2nu','wz_2l2q','wz_3lnu','wz_1l1nu2q','zz',
        'www','wwz','wzz','zzz',
        ],
    'vars': ['dimuon_mass'],
    'year': '2016',
    'regions' : ["z-peak", "h-sidebands", "h-peak"],
    'channels': ["ggh_01j", "ggh_2j", "vbf"], 
}

all_plots = Plotter(**all_plots_2016_pars)
all_plots.make_datamc_comparison(do_inclusive=True, do_exclusive=False)


In [None]:
%matplotlib inline
from python.plotting import Plotter

ewz_study_pars = {
    'processor': DimuonProcessor(),
    'path': '/depot/cms/hmm/coffea/ewk_test/',
    'samples': ['ewk_lljj_mll105_160', 'ewk_lljj_mll105_160_ptj0'],
    'vars': ['jet1_pt', 'jet1_eta', 'jet2_pt', 'jet2_eta'],
    'year': '2017',
    'regions' : ["h-sidebands", "h-peak"],
    'channels': ["vbf"], 
}

ewz_study_plots = Plotter(**ewz_study_pars)
ewz_study_plots.make_shape_comparison(do_inclusive=False, do_exclusive=True)
