# SWIFT-HEP / GridPP Workshop - April 2025

## Caching

`DiracClient` is introduced here.
Functionally it works the same as `dask.distributed.Client`, but it additionally supports persistent caching.

The following cache locations are supported:
- `local`: to set the directory use `file:///path/to/cache`

Caching options in the works:
- `rucio`: to set the directory use `rucio:///path/to/cache`
- `dirac`: to set the directory use `dirac:///path/to/cache`

In [None]:
from dask_dirac import DiracClient, DiracCluster
from dask.distributed import LocalCluster, Client
import dask.array as da

In [None]:
# Create a DiracCluster, passing port 8786 through the scheduler options.
cluster = DiracCluster(
    scheduler_options={"port": 8786},
)

In [None]:
# Connect a plain dask.distributed Client to the cluster created in the previous cell.
client = Client(cluster)

In [None]:
# Ask the scheduler for its current state; as the last expression it is shown as the cell output.
client.scheduler_info()

In [None]:
# Rebind `cluster` to a local cluster with a single worker.
cluster = LocalCluster(n_workers=1)

In [None]:
# Client with a cache location; the file:// scheme selects a local cache directory.
client = DiracClient(cluster, cache_location="file:///tmp/dask-cache_05022025")
# Alternative: the plain dask.distributed client (no cache_location argument).
# client = Client(cluster)

In [None]:
# Check the cache location and show what files are there
print(client.cache_location)
# The [7:] slice drops the 7-character "file://" prefix so `ls` receives a plain path.
!ls {client.cache_location[7:]} # remove file:// at the beginning

In [None]:
# Display the client as the cell output.
client

In [None]:
# Create a Dask array (not a DataFrame) of ones with shape (10000, 1), split
# into chunks of size 1 along every axis, and add a constant offset to it.
# The shape uses an integer literal: array dimensions must be integers, and
# the float 1e4 only works where the library happens to coerce it.
dask_array = da.ones((10_000, 1), chunks=1) + 20231
# Uncomment to render the task graph:
# dask_array.visualize()
dask_array

In [None]:
# Submit the array to the cluster; the returned object is resolved later with `.result()`.
result = client.compute(dask_array)

In [None]:
# Wait for the computation to finish and show the computed values as the cell output.
result.result()

In [None]:
# Check the cache location and show what files are there
# (run again after the computation so the listing can be compared with the earlier one)
print(client.cache_location)
# The [7:] slice drops the 7-character "file://" prefix so `ls` receives a plain path.
!ls {client.cache_location[7:]} # remove file:// at the beginning

## GPU vs CPU

This is a LUX-ZEPLIN analysis which builds a model of multi-scatter-single-ionisation (MSSI) events from simulated events.
These simulated events originate from detector components.
In this analysis, the simulations (ROOT files) are read using `uproot`, and then events are looped over, selecting MSSI events.
The simulated events here have already gone through a pre-processing so only events classified as single-scatter events are considered.

A more detailed step-by-step description of the analysis is as follows:
1. Simulations of detector components are stored as ROOT files.
2. These files are read using `uproot` into `awkward` arrays.
3. A selection is applied to the data to select MSSI events.
4. A normalization is applied to get the expected rate of these events.
5. The model of MSSI events is built from the selected and normalized events.


In addition to the above, this analysis also highlights decorating functions with `numba` for CPU and GPU acceleration.

In [None]:
import awkward as ak
import numpy as np
import numba as nb
from dask.distributed import LocalCluster, Client, progress
import glob
import pandas as pd
import uproot as up
import numba
import dask

Define the processing

In [None]:
@numba.njit
def evaluate_poly(coeffs, x):
    """Evaluate a polynomial at ``x`` using Horner's scheme.

    ``coeffs`` is read from the highest-degree coefficient down to the
    constant term. With no coefficients the result is ``0.0``.
    """
    total = 0.0
    for coefficient in coeffs:
        # Shift the running value up one power of x, then add the next term.
        total = coefficient + total * x
    return total

@numba.njit
def loop_over_events(ss, mc):
    """Flag each event as MSSI and/or as passing the FV and ROI selections.

    ``ss`` and ``mc`` are indexed first by branch name and then by event
    (``mc`` vertex branches additionally by vertex).

    An event is flagged MSSI when it has more vertices with detected S1
    photons than vertices with detected S2 photons, ignoring vertices whose
    volume name contains 'Skin' or 'Scint'.

    Returns five arrays with one entry per event, each holding 0 or 1:
    ``is_mssi``, ``is_FV_mssi``, ``is_FV_ROI_mssi``, ``is_FV_ss``,
    ``is_FV_ROI_ss``. The ``FV`` flags require the FV cut, the ``FV_ROI``
    flags require the FV cut and the ROI cut; the ``ss`` flags are set
    regardless of the MSSI decision.
    """
    n_events = len(ss['ss.correctedS1Area_phd'])
    is_mssi = np.zeros(n_events)
    is_FV_mssi = np.zeros(n_events)
    is_FV_ROI_mssi = np.zeros(n_events)
    is_FV_ss = np.zeros(n_events)
    is_FV_ROI_ss = np.zeros(n_events)

    # Polynomial in drift time used for the radial wall position
    # (highest-degree coefficient first).
    wall_poly_coeffs = np.array([-8.14589334e-14, 2.09181587e-10, -2.06758029e-07,
                                  1.01366014e-04, -2.69048354e-02, 7.24276394e+01])

    for i in range(n_events):
        radius = np.sqrt(ss['ss.x_cm'][i] ** 2 + ss['ss.y_cm'][i] ** 2)
        # driftTime_ns / 1000 -- presumably microseconds; confirm against the RQ definition.
        drift_time = ss['ss.driftTime_ns'][i] / 1000.
        # Radial limit: wall polynomial minus a fixed offset of 3.
        boundary_r = evaluate_poly(wall_poly_coeffs, drift_time) - 3

        # Count the truth vertices that produced detected S1 / S2 photons.
        n_s1 = 0
        n_s2 = 0
        for j in range(mc['mcTruthVertices.nRQMCTruthVertices'][i]):
            volume = str(mc['mcTruthVertices.volumeName'][i][j])
            if 'Skin' in volume or 'Scint' in volume:
                continue
            if mc['mcTruthVertices.detectedS1Photons'][i][j] > 0.:
                n_s1 += 1
            if mc['mcTruthVertices.detectedS2Photons'][i][j] > 0.:
                n_s2 += 1

        # FV cut: inside the radial limit and inside the drift-time window.
        in_fv = radius < boundary_r and drift_time < 1030. and drift_time > 71.

        # ROI cut, only evaluated for events that already pass the FV cut.
        in_roi = False
        if in_fv:
            in_roi = (
                ss['ss.correctedS1Area_phd'][i] < 600
                and ss['ss.correctedS1Area_phd'][i] > 3
                and np.log10(ss['ss.correctedS2Area_phd'][i]) < 4.5
                and ss['ss.s2Area_phd'][i] > 14.5 * 44.5
            )

        # MSSI flags
        if n_s1 > n_s2:
            is_mssi[i] = 1
            if in_fv:
                is_FV_mssi[i] = 1
                if in_roi:
                    is_FV_ROI_mssi[i] = 1

        # single scatter rate
        if in_fv:
            is_FV_ss[i] = 1
            if in_roi:
                is_FV_ROI_ss[i] = 1

    return is_mssi, is_FV_mssi, is_FV_ROI_mssi, is_FV_ss, is_FV_ROI_ss

In [None]:
def process_file(file):
    """Count single-scatter and MSSI events in one ROOT file.

    Parameters
    ----------
    file : str
        Path to the ROOT file. The name is expected to contain ``/SS_skim_``
        and to end in ``.root``.

    Returns
    -------
    tuple
        ``(name, nSS, nSS_FV, nSS_FV_ROI, nMSSI, nMSSI_FV, nMSSI_FV_ROI,
        eventWeight)``. ``name`` is the part of the file name after
        ``/SS_skim_`` without the ``.root`` suffix. If processing fails,
        ``name`` is the unmodified ``file`` argument and all other entries
        are 0, so every returned row has the same width.
    """
    branches = ['ss.correctedS1Area_phd', 'ss.correctedS2Area_phd', 'ss.s1Area_phd', 'ss.s2Area_phd', 'ss.x_cm', 'ss.y_cm', 'ss.driftTime_ns']
    mc_branches = ['mcTruthVertices.nRQMCTruthVertices', 'mcTruthVertices.volumeName', 'mcTruthVertices.detectedS1Photons', 'mcTruthVertices.detectedS2Photons', 'mcTruthEvent.eventWeight']

    # The context manager closes the file handle on every exit path.
    with up.open(file) as tfile:
        try:  # this is to account for empty files where the simulation was empty
            ss = tfile['Scatters'].arrays(branches)
            mc = tfile['RQMCTruth'].arrays(mc_branches)

            # Now calculate the number of MSSI events
            is_mssi, is_FV_mssi, is_FV_ROI_mssi, is_FV_ss, is_FV_ROI_ss = loop_over_events(ss, mc)
            # The weight of the first event is used for the whole file.
            event_weight = mc['mcTruthEvent.eventWeight'][0]

            f_name = file.split('/SS_skim_')[1][:-5]  # remove .root from the end of the file name

            # ndarray.sum() avoids iterating the flag arrays element by element in Python.
            return (
                f_name,
                len(ss['ss.s1Area_phd']),
                is_FV_ss.sum(),
                is_FV_ROI_ss.sum(),
                is_mssi.sum(),
                is_FV_mssi.sum(),
                is_FV_ROI_mssi.sum(),
                event_weight,
            )
        except Exception:
            # Best effort is kept on purpose (one unreadable/empty file must not
            # abort the batch), but KeyboardInterrupt/SystemExit are no longer
            # swallowed. NOTE(review): this still hides genuine processing
            # errors; narrow it once the empty-file failure modes are known.
            # The row has the same 8 entries as the success path.
            return file, 0, 0, 0, 0, 0, 0, 0

Set up the Dask cluster

In [None]:
# Local cluster with two workers, and a client connected to it.
cluster = LocalCluster(n_workers=2)
client = Client(cluster)

Select the files to be used. 
In this example, the files are stored locally under `/shared/scratch/ak18773/lz/mssi/`. 
Each file is a ROOT file containing the output of an `LZLAMA` simulation (the `NEST` handler); more details can be found in [arXiv:2001.09363](https://arxiv.org/abs/2001.09363).

In [None]:
# Collect every .root file in the input directory (glob does not guarantee any ordering).
files = glob.glob("/shared/scratch/ak18773/lz/mssi/*.root")
print(f'N. files to process: {len(files)}')

In [None]:
# Build one lazy process_file task per input file, then submit them all to the cluster.
delayed_results = [dask.delayed(process_file)(file) for file in files]
futures = client.compute(delayed_results)

In [None]:
# Monitor the progress of the submitted futures
progress(futures)

In [None]:
# Once complete, retrieve the results: one tuple per file, as returned by process_file
results = client.gather(futures)

In [None]:
# One row per file. The column names follow the order of the tuple returned by
# process_file, which has eight entries; 'nMSSI FV ROI' was missing, leaving
# seven names for eight values.
result_columns = [
    'File',
    'nSS',
    'nSS FV',
    'nSS FV ROI',
    'nMSSI',
    'nMSSI FV',
    'nMSSI FV ROI',
    'eventWeight',
]
df = pd.DataFrame(results, columns=result_columns)
df

### Post processing
Now that we have the fraction of events in each region, we can calculate the rates using the known `decays/day`

In [None]:
# Known rates in decays/day, one entry per simulated source.
# NOTE(review): keys look like "<isotope>_<detector component>" and presumably
# match the names extracted from the file names -- confirm before joining on them.
rates = {
    "Co60_CalibrationSourceTubes": 4690.57902,
    "Co60_DomePMTs": 3885.410702,
    "K40_BottomTruss": 28927.99798,
    "K40_DomePMTs": 88935.50817,
    "Th232-early_BottomTPCPMTBodies": 38003.65201,
    "Th232-late_BottomTPCPMTBases": 20626.61384,
    "Th232-late_BottomTPCPMTBodies": 51716.2229,
    "Th232-late_ForwardFieldResistors": 77545.76613,
    "Th232-late_HVInnerCone": 363483.6619,
    "U238-late_AnodeGridWires": 4316.423461
}

In [10]:
# Multiply the weights by the rates and the number of events to get the events per day that are SS, MSSI, etc.