# Dask through Jupyter Notebooks

This notebook runs a simple study on SUEP data using Dask, creating an output coffea file that can be analyzed in Dask_analysis.ipynb.

In [1]:
import os
import sys
import json
import time
import random
import numpy as np
import fastjet
import awkward as ak
import vector
vector.register_awkward()
import coffea
coffea.deprecations_as_errors = False #Get rid of warning for now
from coffea import hist, processor
from hist import Hist
import matplotlib

sys.path.append("..")
from dask_jobqueue import SLURMCluster
from distributed import Client
from dask.distributed import performance_report
from workflows.math_utils import *
from workflows.SUEP_coffea import SUEP_cluster
from plotting.plot_utils import *
from dask_utils import *

(Set coffea.deprecations_as_errors = True to get a stack trace now.)
ImportError: coffea.hist is deprecated


We set up a coffea ABC Processor to analyze the ROOT files.

In [2]:
class Simple_Process(processor.ProcessorABC):
    def __init__(self, isMC: int, era: int, sample: str, scouting: int) -> None:
        
#         import os
#         import sys
#         import json
#         import time
#         import random
#         import numpy as np
#         import fastjet
#         import awkward as ak
#         import vector
#         vector.register_awkward()

#         sys.path.append("..")
#         from workflows.math_utils import sphericity
        
        self.gensumweight = 1.0
        self.era = era
        self.isMC = isMC
        self.sample = sample
        self.scouting = scouting

        self._accumulator = processor.dict_accumulator(
            {
                "sumw": processor.defaultdict_accumulator(float),
                "total": processor.defaultdict_accumulator(float),
                "cut1": processor.defaultdict_accumulator(float),
                "cut2": processor.defaultdict_accumulator(float),
                "cut3": processor.defaultdict_accumulator(float),
                "cut4": processor.defaultdict_accumulator(float),
                "cut5": processor.defaultdict_accumulator(float)
            }
        )
        
    @property
    def accumulator(self):
        return self._accumulator
    
    # this is modified from the normal SUEP_cluster one,
    # since we need to count events after each selection
    def eventSelection(self, events, output, dataset):
        if self.scouting == 1:
            Jets = ak.zip({
                "pt": events.Jet.pt,
                "eta": events.Jet.eta,
                "phi": events.Jet.phi,
                "mass": events.Jet.mass,
            })
        else:
            Jets = ak.zip({
                "pt": events.Jet.pt,
                "eta": events.Jet.eta,
                "phi": events.Jet.phi,
                "mass": events.Jet.mass,
                "jetId": events.Jet.jetId
            })
        jetCut = (Jets.pt > 30) & (abs(Jets.eta)<2.4)
        ak4jets = Jets[jetCut]
        ht = ak.sum(ak4jets.pt,axis=-1)
        
        # apply trigger selection
        if self.scouting == 1:
            events = events[(ht > 600)]
            ak4jets = ak4jets[(ht > 600)]
        else:
            events = events[(ht > 1200)]
            ak4jets = ak4jets[(ht > 1200)]
            
            output['cut1'][dataset] += len(events)
            
            if self.era == 2016:
                trigger = (events.HLT.PFHT900 == 1)
            else:
                trigger = (events.HLT.PFHT1050 == 1)
            
            events = events[(trigger)]
            ak4jets = ak4jets[(trigger)]
            
            output['cut2'][dataset] += len(events)
            
        return events, ak4jets

    def process(self, events):
        output = self.accumulator
        dataset = events.metadata['dataset']
        
        os.system('sleep $[ ( $RANDOM % 100 )  + 1 ]s')
        
        # this needs to be here!
        # FIXME: why isn't doing this in __init__ enough?
        vector.register_awkward()

        if self.isMC: self.gensumweight = ak.sum(events.genWeight)
        
        output["sumw"][dataset] += ak.sum(events.genWeight)
        output["total"][dataset] += len(events)
  
        events, ak4jets = self.eventSelection(events, output, dataset)
    
        if len(events) == 0:
            return output
    
        tracks, Cleaned_cands = SUEP_cluster.getTracks(self, events)
        
        # events = events[ak.num(tracks) > 0]
        # tracks = tracks[ak.num(tracks) > 0]
        indices = np.arange(0,len(tracks))
        
        if len(tracks) == 0:
            return output
        
        ak_inclusive_jets, ak_inclusive_cluster = SUEP_cluster.FastJetReclustering(self, tracks, r=1.5, minPt=150)
                
        # remove events with at least 2 clusters (i.e. need at least SUEP and ISR jets for IRM)
        clusterCut = (ak.num(ak_inclusive_jets, axis=1)>1)
        ak_inclusive_cluster = ak_inclusive_cluster[clusterCut]
        ak_inclusive_jets = ak_inclusive_jets[clusterCut]
        tracks = tracks[clusterCut]
        indices = indices[clusterCut]   
        output['cut3'][dataset] += len(tracks)
        
        if len(tracks) == 0:
            return output
        
        tracks, indices, topTwoJets = SUEP_cluster.getTopTwoJets(self, tracks, indices, ak_inclusive_jets, ak_inclusive_cluster)
        SUEP_cand, ISR_cand, SUEP_cluster_tracks, ISR_cluster_tracks = topTwoJets
        
        # boost into frame of SUEP
        boost_SUEP = ak.zip({
            "px": SUEP_cand.px*-1,
            "py": SUEP_cand.py*-1,
            "pz": SUEP_cand.pz*-1,
            "mass": SUEP_cand.mass
        }, with_name="Momentum4D")        
        
        # SUEP tracks for this method are defined to be the ones from the cluster
        # that was picked to be the SUEP jet
        SUEP_tracks_b = SUEP_cluster_tracks.boost_p4(boost_SUEP)        
        
        # SUEP jet variables
        eigs = sphericity(SUEP_tracks_b,1.0) #Set r=1.0 for IRC safe
        S1 = 1.5 * (eigs[:,1]+eigs[:,0])
        nconst = ak.num(SUEP_tracks_b)
        
        tracks = tracks[(nconst > 80)]
        S1 = S1[(nconst > 80)]
        output['cut4'][dataset] += len(tracks)
        
        tracks = tracks[(S1 > 0.5)]
        output['cut5'][dataset] += len(tracks)
                
        return output
        
    def postprocess(self, accumulator):
        return accumulator

The following section defines additional parts of the slurm Dask job. Here we source the bashrc to prepare Conda. We also pass in the x509 proxy. In order to share the proxy across the SubMIT machines you should move your proxy to your HOME directory.

In [3]:
slurm_env = [
     'export XRD_RUNFORKHANDLER=1',
     'export XRD_STREAMTIMEOUT=10',
     f'source {os.environ["HOME"]}/.bashrc',
     f'conda activate SUEP',
     f'export X509_USER_PROXY={os.environ["HOME"]}/x509up_u210253',
     'export PYTHONPATH=/home/submit/lavezzo/SUEP/SUEPCoffea_dask/:/home/submit/lavezzo/SUEP/SUEPCoffea_dask/workflows/:$PYTHONPATH'
     # 'sleep $[ ( $RANDOM % 1000 )  + 1 ]s'
]

extra_args=[
     "--output=logs/dask_job_output_%j.out",
     "--error=logs/dask_job_output_%j.err",
     "--partition=submit",
     "--clusters=submit",
]

In [4]:
n_port       = 6820                   # might need to change this if re running
w_port       = 9765
cores        = 2
processes    = 1
memory       = "10 GB"

The following sets up the processor and json file. If you want to change files you can simply modify the json file

In [5]:
# load samples
file = "../filelist/list_2018_SUEP_A01.txt"
samples = []
with open(file, 'r') as stream:
    for sample in stream.read().split('\n'):
        if '#' in sample: continue
        if len(sample.split('/')) <= 1: continue
        sample_name = sample.split("/")[-1]
        samples.append(sample_name)

In [6]:
# load file names
samples_dict = {}
for sample_name in samples:
    
    input_list = "/home/tier3/cmsprod/catalog/t2mit/nanosu/A01/{}/RawFiles.00".format(sample_name)
    
    files = []
    Raw_list = open(input_list, "r")
    for i in Raw_list:
        file = i.split(" ")[0]
        files.append(file)
    
    samples_dict[sample_name] = files
process_dict = samples_dict

In [7]:
# cross section
xsections = {}
for sample in list(process_dict.keys()):
    xsection = getXSection(sample, '2018')
    xsections.update({sample:xsection})



The next section forms the Slurm Cluster. You can set up various parameters of the cluster here.

In [8]:
while not check_port(n_port):
    time.sleep(5)

import socket
cluster = SLURMCluster(
        queue='all',
        project="SUEP_Slurm",
        cores=cores,
        processes=processes,
        memory=memory,
        #retries=10,
        walltime='00:30:00',
        scheduler_options={
              'port': n_port,
              'dashboard_address': 8000,
              'host': socket.gethostname()
        },
    
        job_extra=extra_args,
        env_extra=slurm_env,
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 21170 instead


In [9]:
cluster.adapt(minimum=1, maximum=250)
client = Client(cluster)
print(client)

<Client: 'tcp://18.12.2.18:6820' processes=0 threads=0, memory=0 B>


## Running the processor
Now we will run the code with a performance report. This will analyze all of the input ROOT files and will store the histograms in output. Then we can analyze the output and make plots.

In [10]:
key = list(process_dict.keys())[0]
debug_dict = {key: process_dict[key][:10]}

In [None]:
processor_instance = Simple_Process(isMC=1, era='2018', sample='test', scouting=0)
with performance_report(filename="dask-report.html"):
    output = processor.run_uproot_job(debug_dict,
             treename='Events',
             processor_instance=processor_instance,
             executor=processor.dask_executor,
             executor_args={
                           'client': client,
                           'skipbadfiles': True,
                           'schema': processor.NanoAODSchema,
                           'xrootdtimeout': 10,
                           'retries': 3,
                           },
             chunksize=50)

[#############################           ] | 72% Completed |  1min  8.1s

In [19]:
client.cancel(cluster)

In [15]:
output

{'sumw': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 99.96999740600586}),
 'total': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 100.0}),
 'cut1': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 6.0}),
 'cut2': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 6.0}),
 'cut3': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 6.0}),
 'cut4': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 6.0}),
 'cut5': defaultdict_accumulator(float,
                         {'SUEP-m1000-darkPhoHad+RunIIAutumn18-private+MINIAODSIM': 6.0})}

In [14]:
# just in case this isn't working as expected
coffea.util.save(output, "unscaled_output.coffea")

# calculate normalization
scales = {} 
for dataset in output["sumw"]:
    xsec = xsections[dataset]
    scale = xsec / output["sumw"][dataset]
    scales.update({dataset: scale})

# apply normalization to all histograms
for key in list(output.keys()):
    if key.lower() == 'sumw': continue
    print(key)
    output[key].scale(scales, axis='dataset')

coffea.util.save(output, "output.coffea")

total


AttributeError: 'defaultdict_accumulator' object has no attribute 'scale'

### We can make some plots here too! But most of the analysis is in Dask_analysis.ipynb