<!-- <img  src="https://swan.web.cern.ch/sites/swan.web.cern.ch/files/pictures/logo_swan_letters.png" alt="SWAN" style="float: left; width: 15%; margin-right: 5%; margin-left: 17%; margin-top: 1.0em; margin-bottom: 2.0em;">
<img src="https://spark.apache.org/images/spark-logo-trademark.png" alt="EP-SFT" style="float: left; width: 25%; margin-right: 0%; margin-left: 0%; margin-bottom: 2.0em;">
<img src="https://cms-docdb.cern.ch/cgi-bin/PublicDocDB/RetrieveFile?docid=3045&filename=CMSlogo_color_label_1024_May2014.png&version=3" alt="CMS" style="float: left; width: 12%; margin-left: 5%; margin-right: 5%; margin-bottom: 2.0em;"> -->
<p style="clear: both;">
<div style="text-align:center"><h1>CMS H&#8594;µµ analysis  
     <br> with Coffea package from Fermilab</h1></div>
<div style="text-align:center"><i>Author: Dmitry Kondratyev, based on example code by Lindsey Gray</i></div>
<hr style="border-top-width: 4px; border-top-color: #34609b;">

# Search for Higgs boson decaying into two muons

This code uses the awkward-array toolset and Coffea [histograms](https://coffeateam.github.io/coffea/modules/coffea.hist.html).
This also shows the analysis object syntax implemented by Coffea [JaggedCandidateArray](https://coffeateam.github.io/coffea/api/coffea.analysis_objects.JaggedCandidateMethods.html), and the usage of custom [accumulators](https://coffeateam.github.io/coffea/api/coffea.processor.AccumulatorABC.html) other than histograms.  Further, it introduces the [processor](https://coffeateam.github.io/coffea/api/coffea.processor.ProcessorABC.html) concept and the interface to Apache Spark.


Instructions to run at SWAN:
===

#### Load SWAN environment: LCG96 Python3 stack and Cloud Containers cluster

Then run the next two cells.


In [None]:
# Run this cell if you do not have coffea installed (e.g. on SWAN with LCG 96Python3 stack)
!pip install --user --upgrade coffea

# spark.jars.packages doesnt work with Spark 2.4 with kubernetes
!wget -N https://repo1.maven.org/maven2/edu/vanderbilt/accre/laurelin/0.5.1/laurelin-0.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-api/2.11.2/log4j-api-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/apache/logging/log4j/log4j-core/2.11.2/log4j-core-2.11.2.jar && \
wget -N https://repo1.maven.org/maven2/org/lz4/lz4-java/1.5.1/lz4-java-1.5.1.jar && \
wget -N https://repo1.maven.org/maven2/org/tukaani/xz/1.2/xz-1.2.jar
                    
!mkdir output

In [None]:
# Run this cell before establishing spark connection

import os

# Append the system site-packages directory to the PYTHONPATH environment
# variable. The variable may be unset (os.environ[...] would raise KeyError),
# and the cell may be executed more than once, so only add the entry when it
# is not already present.
extra_site_packages = '/usr/local/lib/python3.6/site-packages'
current_pythonpath = os.environ.get('PYTHONPATH', '')
if extra_site_packages not in current_pythonpath.split(os.pathsep):
    if current_pythonpath:
        os.environ['PYTHONPATH'] = current_pythonpath + os.pathsep + extra_site_packages
    else:
        # Avoid a leading separator: an empty PYTHONPATH entry would add the
        # current working directory to the search path.
        os.environ['PYTHONPATH'] = extra_site_packages

Instructions to run at Purdue Jupyter hub:
===

- Log in to an interactive node on a cluster (e.g. `hammer.rcac.purdue.edu`)
- Activate local conda environment (create a new directory for the environment, if needed):
```
   module load anaconda/5.3.1-py37 
   source activate /home/dkondra/conda_tests/
```
- Once the environment is activated, `conda install` will automatically install packages to that environment
- Install missing packages like this: `conda install [-c conda-forge] <package name>`
- `coffea` can be installed like this (it will use conda's `pip`): 
```
    pip install --upgrade coffea
```
- In order for conda to work with notebooks, install `nb_conda`: 
```
    conda install nb_conda
```
- After that, in Jupyter notebook there will be an option in **Kernel -> Change Kernel** to run the notebook using the desired conda environment
- Set up VOMS proxy:
```
    . setup_proxy.sh
```

In [None]:
# Export the location of the X.509 proxy file through the X509_USER_PROXY environment variable.
# NOTE(review): the path is user-specific (presumably the file produced by setup_proxy.sh) -
# replace it with the path of your own proxy before running.
%env X509_USER_PROXY=/home/dkondra/x509up_u616617

In [None]:
# Standard library
import multiprocessing as mp
import socket
import time

# Coffea framework
import coffea
import coffea.processor as processor
from coffea import util

# Report the coffea version, the host this kernel runs on and its CPU count.
print("Coffea version: ", coffea.__version__)
print(socket.gethostname())
print(f"{mp.cpu_count()} CPUs")

In [None]:
from python.samples_info import SamplesInfo

# Datasets to process in this session: uncomment an entry to include it.
samples = [
    # --- Data ---
    # 'data_A',
    # 'data_B',
    # 'data_C',
    # 'data_D',
    # 'data_E',
    # 'data_F',
    # 'data_G',
    # 'data_H',

    # --- Essential MC ---
    'dy_m105_160_amc',
    # 'dy_m105_160_vbf_amc',
    # 'ggh_amcPS',
    # 'vbf_powhegPS',
    # 'ttjets_dl',
    # 'ewk_lljj_mll105_160_ptj0',

    # --- Non-essential MC ---
    # 'ttjets_sl',
    # 'ttz',
    # 'ttw',
    # 'st_tw_top',
    # 'st_tw_antitop',
    # 'ww_2l2nu',
    # 'wz_2l2q',
    # 'wz_3lnu',
    # 'wz_1l1nu2q',
    # 'zz',
]

# XRootD endpoint handed to SamplesInfo as the file server.
purdue = 'root://xrootd.rcac.purdue.edu/'

year = '2016'
label = 'test'  # change this to save to other directory

samp_info = SamplesInfo(
    year=year,
    out_path=f'test_{year}_{label}',
    server=purdue,
    debug=True,
)

# 'outer' refers to parallelization by sample, 'inner' - by ROOT file in each sample
samp_info.load(
    samples,
    nchunks=1,
    parallelize_outer=1,
    parallelize_inner=10,
)
samp_info.compute_lumi_weights()


Iterative executor
===


In [None]:
from coffea.processor.executor import iterative_executor
from python.dimuon_processor import DimuonProcessor
from ipywidgets import IntProgress, HBox, HTML

# Run DimuonProcessor over the complete fileset with coffea's
# iterative_executor and report the wall-clock time of the job.
tstart = time.time()
output = processor.run_uproot_job(
    samp_info.full_fileset,
    'Events',
    DimuonProcessor(samp_info=samp_info, do_jecunc=False),
    iterative_executor,
    executor_args={'nano': True},
)
elapsed = time.time() - tstart
print(f"Total time: {elapsed} s")


Dask executor
===


In [None]:
# `os` is otherwise only imported in the SWAN-specific setup cell, so import it
# here to keep this cell runnable on a fresh kernel.
import os

import pytest
from coffea.processor.executor import dask_executor
import dask
from python.dimuon_processor import DimuonProcessor

n_workers = 16

# importorskip returns the imported module; it raises pytest's "skipped"
# outcome when `distributed` is missing or older than the requested version.
distributed = pytest.importorskip("distributed", minversion="1.28.1")
# NOTE(review): presumably this stops workers from being killed when they
# exceed the memory 'terminate' threshold - confirm against the dask docs.
distributed.config['distributed']['worker']['memory']['terminate'] = False
# One single-threaded worker process per slot, no dashboard.
client = distributed.Client(processes=True, dashboard_address=None, n_workers=n_workers, threads_per_worker=1)

# Base output directory; it does not depend on the dataset or chunk, so it is
# created once up front. makedirs also creates missing parent directories and,
# unlike a bare `except: pass`, lets real errors (e.g. permissions) surface
# before any processing time is spent.
out_dir = f"/depot/cms/hmm/coffea/{samp_info.out_path}/"
os.makedirs(out_dir, exist_ok=True)

tstart = time.time()

for ds_name, fileset_ in samp_info.filesets_chunked.items():
    for ichunk, ifileset in enumerate(fileset_):
        print(f"Processing {ds_name}, chunk {ichunk+1}/{samp_info.nchunks} ...")
        output = processor.run_uproot_job(
            ifileset,
            'Events',
            DimuonProcessor(samp_info=samp_info, do_jecunc=False),
            dask_executor,
            executor_args={'nano': True, 'client': client},
        )

        # Each top-level key of the output is written to its own
        # sub-directory, one .coffea file per dataset chunk.
        for mode, mode_output in output.items():
            out_dir_ = f"{out_dir}/{mode}/"
            out_path_ = f"{out_dir_}/{ds_name}_{ichunk}.coffea"
            os.makedirs(out_dir_, exist_ok=True)
            util.save(mode_output, out_path_)

        # Drop the chunk's results before processing the next chunk.
        output.clear()
        print(f"Saved output to {out_dir}")

elapsed = time.time() - tstart

print(f"Total time: {elapsed} s")


Plot Data/MC comparison
---

In [None]:
import glob
import os
import argparse

import pandas as pd

from python.postprocessing import postprocess, plot, save_shapes
from config.variables import variables
from config.datasets import datasets

year = '2016'
label = 'test'

# Names of the variables to draw; the matching variable objects are looked up
# in config.variables by their `name` attribute.
to_plot = ['dimuon_mass']
vars_to_plot = {v.name: v for v in variables if v.name in to_plot}

# Datasets entering the Data/MC comparison.
samples = [
    'data_A',
    'data_B',
    'data_C',
    'data_D',
    'data_E',
    'data_F',
    'data_G',
    'data_H',
    'dy_m105_160_amc',
    'dy_m105_160_vbf_amc',
    'ewk_lljj_mll105_160_ptj0',
    'ttjets_dl',
    'ttjets_sl',
    'ttz',
    'ttw',
    'st_tw_top',
    'st_tw_antitop',
    'ww_2l2nu',
    'wz_2l2q',
    'wz_3lnu',
    'zz',
    'ggh_amcPS',
    'vbf_powhegPS',
]

# NOTE(review): in_path points at 'all_{year}_{label}', whereas the processing
# cells of this notebook write to 'test_{year}_{label}' - confirm that reading
# from a different production is intended.
postproc_args = {
    'modules': ['to_pandas',  'get_hists'],
    'year': year,
    'label': label,
    'in_path': f'/depot/cms/hmm/coffea/all_{year}_{label}/',
    'syst_variations': ['nominal'],
    'samples': samples,
    'channels': ['vbf'],
    'regions': ['h-peak', 'h-sidebands'],
    'vars_to_plot': list(vars_to_plot.values()),
    'wgt_variations': False,
}

dfs, hist_dfs, edges = postprocess(postproc_args)

# Merge the histogram frames of each variable into a single DataFrame.
hist = {
    var: pd.concat(hists, ignore_index=True)
    for var, hists in hist_dfs.items()
}

# One plot per (variable, region) pair, shown inline and not saved to disk.
for vname, var in vars_to_plot.items():
    for r in postproc_args['regions']:
        plot(var, hist, 'wgt_nominal', edges[vname], postproc_args, r, save=False, show=True, plotsize=8)