Before running:
---

- Activate a Conda environment with the following packages installed: `xrootd`, `dask`, `pytest`, `nb_conda`
- Make sure that the current Jupyter kernel uses that environment
- Initialize a VOMS proxy (you might need to explicitly set its path in a Jupyter cell: `%env X509_USER_PROXY=<path>`)



In [None]:
import time
import coffea
from coffea import util
import coffea.processor as processor

from python.samples_info import SamplesInfo

# Directory where the output files are saved (they may take up to several GB).
out_path = '/depot/cms/hmm/coffea/'

# XRootD server and dataset path: replace both with whatever works on your machine.
server = 'root://xrootd.rcac.purdue.edu/'
example_datasets = {
    "2016": {
        "dy_m105_160_amc": "/store/mc/RunIISummer16NanoAODv6/DYJetsToLL_M-105To160_TuneCP5_PSweights_13TeV-amcatnloFXFX-pythia8/",
    },
}

# Names of the samples to load.
# NOTE(review): presumably these must match keys of example_datasets[<year>] -- confirm in SamplesInfo.
samples = ['dy_m105_160_amc']

samp_info = SamplesInfo(
    year="2016",
    out_path=out_path,
    server=server,
    example=True,
    example_datasets=example_datasets,
)

# load() fetches the list of all ROOT files in <server>/<dataset_path> via xrootd
# and splits that list into <nchunks> chunks (or fewer, if the dataset has fewer files).
samp_info.load(samples, nchunks=5)
samp_info.compute_lumi_weights()


Dask executor
===


In [None]:
import pytest
from coffea.processor.executor import dask_executor
import dask
from python.dimuon_processor import DimuonProcessor

# Number of local Dask worker processes; each runs a single thread (see Client below).
n_workers = 12

# importorskip doubles as a version-checked import here: it returns the module,
# or raises pytest's skip exception if `distributed` is missing or older than minversion.
distributed = pytest.importorskip("distributed", minversion="1.28.1")
# NOTE(review): presumably disables killing of workers that exceed the memory
# "terminate" threshold -- confirm against the distributed configuration reference.
distributed.config['distributed']['worker']['memory']['terminate'] = False
# NOTE(review): the client is never closed in this cell; re-running the cell
# creates another Client -- consider calling client.close() when done.
client = distributed.Client(
    processes=True,
    dashboard_address=None,
    n_workers=n_workers,
    threads_per_worker=1,
)

tstart = time.time()

# Optional tag prepended to every output file name (empty by default).
prefix = ""

# The chunked lists of files are stored in the dictionary samp_info.filesets_chunked,
# which has the format {dataset_name: [list of filesets]}; each of the filesets is in
# a format that can be passed directly to a Coffea processor.
for sample, filesets in samp_info.filesets_chunked.items():
    # Use the actual number of chunks for this sample: it can be smaller than the
    # requested number when the dataset has few files.
    n_filesets = len(filesets)
    for ichunk, fileset in enumerate(filesets):
        print(f"Processing {sample}, chunk {ichunk+1}/{n_filesets} ...")
        output = processor.run_uproot_job(
            fileset,
            'Events',
            DimuonProcessor(samp_info=samp_info),
            dask_executor,
            executor_args={'nano': True, 'client': client},
        )

        # Named out_file (not out_path) so that the notebook-level `out_path`
        # output directory defined in the first cell is not overwritten.
        out_file = f"{samp_info.out_path}/{prefix}{sample}_{ichunk}.coffea"
        util.save(output, out_file)
        print(f"Saved output to {out_file}")

elapsed = time.time() - tstart

print(f"Total time: {elapsed} s")
