# Binning demonstration on locally generated fake data
In this example, we generate a table with random data simulating a single event dataset.
We showcase the binning method, first on a simple single table using the bin_partition method and then in its distributed form, bin_dataframe, using dask dataframes.
The first method is never really called directly, as it is simply the function that bin_dataframe applies to each partition of the dask dataframe.

In [None]:
import sys

import numpy as np
import pandas as pd
import dask.dataframe

import matplotlib.pyplot as plt

import timeit

from sed.binning import bin_partition, bin_dataframe

# Generate Fake Data

In [None]:
# One billion events in four columns, roughly 30 GByte of memory.
# For a quick trial run, lower n_pts (for instance to 1_000_000).
n_pts = 1_000_000_000
cols = ["posx", "posy", "energy", "delay"]
# Normally distributed random samples, one column per axis name.
df = pd.DataFrame(data=np.random.randn(n_pts, len(cols)), columns=cols)
# Wrap the table in a dask dataframe split into 100 partitions.
ddf = dask.dataframe.from_pandas(df, npartitions=100)
ddf

In [None]:
# Histogramming back-ends to compare; each gets its own empty dict that the
# benchmark cells fill with timing results.
hist_modes = ["numba", "numpy"]
bench_fake = {mode: {} for mode in hist_modes}

In [None]:
# benchmark 1D binning
axes = ["posx"]
ranges = [(-2, 2)]
# Untimed call ahead of the measurements.
# NOTE(review): presumably this keeps numba's one-off compilation out of the timings -- confirm.
bin_dataframe(df=ddf, bins=[100], axes=axes, ranges=ranges, hist_mode="numba", pbar=False, n_cores=20)
# timeit evaluates these statements in the global namespace, so on every run
# they see the current values of bins, coords and hist_mode set by the loops below.
stmt_bins = 'bin_dataframe(df=ddf, bins=bins, axes=axes, ranges=ranges, hist_mode=hist_mode, pbar=False, n_cores=20)'
stmt_arrays = 'bin_dataframe(df=ddf, bins=coords, hist_mode=hist_mode, pbar=False, n_cores=20)'
# Bin counts from 1e2 up to 1e8; keep only the first few entries for a quick run.
bin_counts = [[100], [1000], [10000], [100000], [1000000], [10000000], [100000000]]
for bins in bin_counts:
    for hist_mode in hist_modes:
        print(f"hist_mode: {hist_mode}, bins: {bins}")
        # One array of n evenly spaced points per axis, spanning that axis' range.
        coords = {ax: np.linspace(lo, hi, n) for ax, (lo, hi), n in zip(axes, ranges, bins)}
        timings = bench_fake[hist_mode]
        # Three repeats of one execution each, for both ways of specifying the bins.
        timings[f"{bins}"] = timeit.repeat(stmt_bins, repeat=3, number=1, globals=globals())
        timings[f"array_{bins}"] = timeit.repeat(stmt_arrays, repeat=3, number=1, globals=globals())

In [None]:
# benchmark 4D binning
axes = ["posx", "posy", "energy", "delay"]
ranges = [(-2, 2), (-2, 2), (-2, 2), (-2, 2)]
# timeit evaluates these statements in the global namespace, so on every run
# they see the current values of bins, coords and hist_mode set by the loops below.
stmt_bins = 'bin_dataframe(df=ddf, bins=bins, axes=axes, ranges=ranges, hist_mode=hist_mode, pbar=False, n_cores=20)'
stmt_arrays = 'bin_dataframe(df=ddf, bins=coords, hist_mode=hist_mode, pbar=False, n_cores=20)'
# Bin shapes to time; use small shapes such as [10, 10, 10, 10] and [11, 11, 11, 11] for a quick run.
bin_shapes = [[10, 10, 10, 10], [50, 50, 50, 50], [100, 100, 100, 100]]
for bins in bin_shapes:
    for hist_mode in hist_modes:
        print(f"hist_mode: {hist_mode}, bins: {bins}")
        # One array of n evenly spaced points per axis, spanning that axis' range.
        coords = {ax: np.linspace(lo, hi, n) for ax, (lo, hi), n in zip(axes, ranges, bins)}
        timings = bench_fake[hist_mode]
        # Three repeats of one execution each, for both ways of specifying the bins.
        timings[f"{bins}"] = timeit.repeat(stmt_bins, repeat=3, number=1, globals=globals())
        timings[f"array_{bins}"] = timeit.repeat(stmt_arrays, repeat=3, number=1, globals=globals())

In [None]:
# Display the raw fake-data timings: per histogram mode, one list of repeat times for each bin setting.
bench_fake

In [None]:
# Plot the fake-data timings: the mean over the repeats is the data point,
# the standard deviation is the error bar.
fig, axs = plt.subplots(2, 1, figsize=(8, 5), constrained_layout=True)
# (key prefix used in the benchmark dict, legend suffix) for the two ways the bins were specified.
variants = [("", "bins"), ("array_", "arrays")]

# Upper panel: 1D binning time versus number of bins.
# These must be the bin settings that were benchmarked, otherwise the lookups raise KeyError.
bins = [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
axs[0].set_title("Fake data 1D binning")
axs[0].set_xscale("log")
axs[0].set_yscale("log")
for prefix, kind in variants:
    for mode in ("numba", "numpy"):
        runs = [bench_fake[mode][f"{prefix}{[n_bins]}"] for n_bins in bins]
        axs[0].errorbar(
            bins,
            [np.mean(run) for run in runs],
            [np.std(run) for run in runs],
            label=f"{mode} {kind}",
        )
axs[0].set_xlabel('# Bins')
axs[0].set_ylabel('Time (s)')
axs[0].legend()

# Lower panel: 4D binning time per bin shape; the shapes are used as categorical x labels.
bins = [[10, 10, 10, 10], [50, 50, 50, 50], [100, 100, 100, 100]]
bins_s = [str(shape) for shape in bins]
axs[1].set_title("Fake data 4D binning")
axs[1].set_yscale("log")
for prefix, kind in variants:
    for mode in ("numba", "numpy"):
        runs = [bench_fake[mode][f"{prefix}{shape}"] for shape in bins]
        axs[1].errorbar(
            bins_s,
            [np.mean(run) for run in runs],
            [np.std(run) for run in runs],
            label=f"{mode} {kind}",
        )
axs[1].set_xlabel('Bin shape')
axs[1].set_ylabel('Time (s)')
axs[1].legend()


In [None]:
import sed
import os

In [None]:
import subprocess

# Put in the path to a storage location with at least 20 GByte of free space.
data_path = '../../'
# Build every path with os.path.join so the checks work whether or not
# data_path ends with a path separator.
archive = os.path.join(data_path, "WSe2.zip")

# Download the example dataset only if the archive is not already present.
if not os.path.exists(archive):
    # Argument list instead of a shell string: paths containing spaces stay intact.
    # check=True surfaces a failed download here instead of in a later cell.
    subprocess.run(
        ["curl", "--output", archive, "https://zenodo.org/record/6369728/files/WSe2.zip"],
        check=True,
    )

# Unpack (overwriting existing files) if either expected folder is missing.
scan_dir = os.path.join(data_path, "Scan049_1")
energycal_dir = os.path.join(data_path, "energycal_2019_01_08")
if not (os.path.isdir(scan_dir) and os.path.isdir(energycal_dir)):
    # The exit status is deliberately not enforced here.
    subprocess.run(["unzip", "-d", data_path, "-o", archive])

In [None]:
# Folder that holds the example scan inside the data location.
fdir = f"{data_path}/Scan049_1"
# Set up the sed processor on that folder, configured from the local test config file.
sp = sed.SedProcessor(
    folder=fdir,
    config="../tests/data/config/config_local.yaml",
)

In [None]:
# Histogramming back-ends to compare; each gets its own empty dict that the
# benchmark cells fill with timing results for the real data.
hist_modes = ["numba", "numpy"]
bench_real = {mode: {} for mode in hist_modes}

In [None]:
# benchmark 1D binning
axes = ["X"]
ranges = [(0, 2024)]
# Untimed call ahead of the measurements.
# NOTE(review): presumably this keeps numba's one-off compilation out of the timings -- confirm.
bin_dataframe(df=sp._dataframe, bins=[100], axes=axes, ranges=ranges, hist_mode="numba", pbar=False, n_cores=20)
# timeit evaluates these statements in the global namespace, so on every run
# they see the current values of bins, coords and hist_mode set by the loops below.
stmt_bins = 'bin_dataframe(df=sp._dataframe, bins=bins, axes=axes, ranges=ranges, hist_mode=hist_mode, pbar=False, n_cores=20)'
stmt_arrays = 'bin_dataframe(df=sp._dataframe, bins=coords, hist_mode=hist_mode, pbar=False, n_cores=20)'
# Bin counts from 1e2 up to 1e8; keep only the first few entries for a quick run.
bin_counts = [[100], [1000], [10000], [100000], [1000000], [10000000], [100000000]]
for bins in bin_counts:
    for hist_mode in hist_modes:
        print(f"hist_mode: {hist_mode}, bins: {bins}")
        # One array of n evenly spaced points per axis, spanning that axis' range.
        coords = {ax: np.linspace(lo, hi, n) for ax, (lo, hi), n in zip(axes, ranges, bins)}
        timings = bench_real[hist_mode]
        # Three repeats of one execution each, for both ways of specifying the bins.
        timings[f"{bins}"] = timeit.repeat(stmt_bins, repeat=3, number=1, globals=globals())
        timings[f"array_{bins}"] = timeit.repeat(stmt_arrays, repeat=3, number=1, globals=globals())

In [None]:
# benchmark 4D binning
axes = ["X", "Y", "t", "ADC"]
ranges = [(0, 2024), (0, 2024), (65000, 70000), (0, 6500)]
# timeit evaluates these statements in the global namespace, so on every run
# they see the current values of bins, coords and hist_mode set by the loops below.
stmt_bins = 'bin_dataframe(df=sp._dataframe, bins=bins, axes=axes, ranges=ranges, hist_mode=hist_mode, pbar=False, n_cores=20)'
stmt_arrays = 'bin_dataframe(df=sp._dataframe, bins=coords, hist_mode=hist_mode, pbar=False, n_cores=20)'
# Bin shapes to time; use small shapes such as [10, 10, 10, 10] and [11, 11, 11, 11] for a quick run.
bin_shapes = [[10, 10, 10, 10], [50, 50, 50, 50], [100, 100, 100, 100]]
for bins in bin_shapes:
    for hist_mode in hist_modes:
        print(f"hist_mode: {hist_mode}, bins: {bins}")
        # One array of n evenly spaced points per axis, spanning that axis' range.
        coords = {ax: np.linspace(lo, hi, n) for ax, (lo, hi), n in zip(axes, ranges, bins)}
        timings = bench_real[hist_mode]
        # Three repeats of one execution each, for both ways of specifying the bins.
        timings[f"{bins}"] = timeit.repeat(stmt_bins, repeat=3, number=1, globals=globals())
        timings[f"array_{bins}"] = timeit.repeat(stmt_arrays, repeat=3, number=1, globals=globals())

In [None]:
# Display the raw real-data timings: per histogram mode, one list of repeat times for each bin setting.
bench_real

In [None]:
# Plot the real-data timings: the mean over the repeats is the data point,
# the standard deviation is the error bar.
fig, axs = plt.subplots(2, 1, figsize=(8, 5), constrained_layout=True)
# (key prefix used in the benchmark dict, legend suffix) for the two ways the bins were specified.
variants = [("", "bins"), ("array_", "arrays")]

# Upper panel: 1D binning time versus number of bins.
# These must be the bin settings that were benchmarked, otherwise the lookups raise KeyError.
bins = [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
axs[0].set_title("Real data 1D binning")
axs[0].set_xscale("log")
axs[0].set_yscale("log")
for prefix, kind in variants:
    for mode in ("numba", "numpy"):
        runs = [bench_real[mode][f"{prefix}{[n_bins]}"] for n_bins in bins]
        axs[0].errorbar(
            bins,
            [np.mean(run) for run in runs],
            [np.std(run) for run in runs],
            label=f"{mode} {kind}",
        )
axs[0].set_xlabel('# Bins')
axs[0].set_ylabel('Time (s)')
axs[0].legend()

# Lower panel: 4D binning time per bin shape; the shapes are used as categorical x labels.
bins = [[10, 10, 10, 10], [50, 50, 50, 50], [100, 100, 100, 100]]
bins_s = [str(shape) for shape in bins]
axs[1].set_title("Real data 4D binning")
axs[1].set_yscale("log")
for prefix, kind in variants:
    for mode in ("numba", "numpy"):
        runs = [bench_real[mode][f"{prefix}{shape}"] for shape in bins]
        axs[1].errorbar(
            bins_s,
            [np.mean(run) for run in runs],
            [np.std(run) for run in runs],
            label=f"{mode} {kind}",
        )
axs[1].set_xlabel('Bin shape')
axs[1].set_ylabel('Time (s)')
axs[1].legend()

In [None]:
# Processing steps applied to the loaded scan before the next set of benchmarks.
# NOTE(review): the benchmark cells that follow bin over 'kx', 'ky', 'E' and 'delay';
# presumably these calls are what add those columns to sp._dataframe -- confirm
# against the sed documentation. The call order is kept exactly as written.
sp.add_jitter()
sp.apply_momentum_calibration()
sp.apply_energy_correction()
sp.append_energy_axis()
# Range handed to the delay-axis calibration (its units are not visible in this file).
delay_range = (-500, 1500)
sp.calibrate_delay_axis(delay_range=delay_range)

In [None]:
# Histogramming back-ends to compare; each gets its own empty dict that the
# benchmark cells fill with timing results for the calibrated data.
hist_modes = ["numba", "numpy"]
bench_converted = {mode: {} for mode in hist_modes}

In [None]:
# benchmark 1D binning
axes = ['kx']
ranges = [(-2, 2)]
# Untimed call ahead of the measurements.
# NOTE(review): presumably this keeps numba's one-off compilation out of the timings -- confirm.
bin_dataframe(df=sp._dataframe, bins=[100], axes=axes, ranges=ranges, hist_mode="numba", pbar=False, n_cores=20)
# timeit evaluates these statements in the global namespace, so on every run
# they see the current values of bins, coords and hist_mode set by the loops below.
stmt_bins = 'bin_dataframe(df=sp._dataframe, bins=bins, axes=axes, ranges=ranges, hist_mode=hist_mode, pbar=False, n_cores=20)'
stmt_arrays = 'bin_dataframe(df=sp._dataframe, bins=coords, hist_mode=hist_mode, pbar=False, n_cores=20)'
# Bin counts from 1e2 up to 1e8; keep only the first few entries for a quick run.
bin_counts = [[100], [1000], [10000], [100000], [1000000], [10000000], [100000000]]
for bins in bin_counts:
    for hist_mode in hist_modes:
        print(f"hist_mode: {hist_mode}, bins: {bins}")
        # One array of n evenly spaced points per axis, spanning that axis' range.
        coords = {ax: np.linspace(lo, hi, n) for ax, (lo, hi), n in zip(axes, ranges, bins)}
        timings = bench_converted[hist_mode]
        # Three repeats of one execution each, for both ways of specifying the bins.
        timings[f"{bins}"] = timeit.repeat(stmt_bins, repeat=3, number=1, globals=globals())
        timings[f"array_{bins}"] = timeit.repeat(stmt_arrays, repeat=3, number=1, globals=globals())

In [None]:
# benchmark 4D binning
axes = ['kx', 'ky', 'E', 'delay']
ranges = [(-2, 2), (-2, 2), (-4, 2), (-600, 1600)]
# timeit evaluates these statements in the global namespace, so on every run
# they see the current values of bins, coords and hist_mode set by the loops below.
stmt_bins = 'bin_dataframe(df=sp._dataframe, bins=bins, axes=axes, ranges=ranges, hist_mode=hist_mode, pbar=False, n_cores=20)'
stmt_arrays = 'bin_dataframe(df=sp._dataframe, bins=coords, hist_mode=hist_mode, pbar=False, n_cores=20)'
# Bin shapes to time; use small shapes such as [10, 10, 10, 10] and [11, 11, 11, 11] for a quick run.
bin_shapes = [[10, 10, 10, 10], [50, 50, 50, 50], [100, 100, 100, 100]]
for bins in bin_shapes:
    for hist_mode in hist_modes:
        print(f"hist_mode: {hist_mode}, bins: {bins}")
        # One array of n evenly spaced points per axis, spanning that axis' range.
        coords = {ax: np.linspace(lo, hi, n) for ax, (lo, hi), n in zip(axes, ranges, bins)}
        timings = bench_converted[hist_mode]
        # Three repeats of one execution each, for both ways of specifying the bins.
        timings[f"{bins}"] = timeit.repeat(stmt_bins, repeat=3, number=1, globals=globals())
        timings[f"array_{bins}"] = timeit.repeat(stmt_arrays, repeat=3, number=1, globals=globals())

In [None]:
# Display the raw calibrated-data timings: per histogram mode, one list of repeat times for each bin setting.
bench_converted

In [None]:
# Plot the calibrated-data timings: the mean over the repeats is the data point,
# the standard deviation is the error bar.
fig, axs = plt.subplots(2, 1, figsize=(8, 5), constrained_layout=True)
# (key prefix used in the benchmark dict, legend suffix) for the two ways the bins were specified.
variants = [("", "bins"), ("array_", "arrays")]

# Upper panel: 1D binning time versus number of bins.
# These must be the bin settings that were benchmarked, otherwise the lookups raise KeyError.
bins = [100, 1000, 10000, 100000, 1000000, 10000000, 100000000]
axs[0].set_title("Calibrated data 1D binning")
axs[0].set_xscale("log")
axs[0].set_yscale("log")
for prefix, kind in variants:
    for mode in ("numba", "numpy"):
        runs = [bench_converted[mode][f"{prefix}{[n_bins]}"] for n_bins in bins]
        axs[0].errorbar(
            bins,
            [np.mean(run) for run in runs],
            [np.std(run) for run in runs],
            label=f"{mode} {kind}",
        )
axs[0].set_xlabel('# Bins')
axs[0].set_ylabel('Time (s)')
axs[0].legend()

# Lower panel: 4D binning time per bin shape; the shapes are used as categorical x labels.
bins = [[10, 10, 10, 10], [50, 50, 50, 50], [100, 100, 100, 100]]
bins_s = [str(shape) for shape in bins]
axs[1].set_title("Calibrated data 4D binning")
axs[1].set_yscale("log")
for prefix, kind in variants:
    for mode in ("numba", "numpy"):
        runs = [bench_converted[mode][f"{prefix}{shape}"] for shape in bins]
        axs[1].errorbar(
            bins_s,
            [np.mean(run) for run in runs],
            [np.std(run) for run in runs],
            label=f"{mode} {kind}",
        )
axs[1].set_xlabel('Bin shape')
axs[1].set_ylabel('Time (s)')
axs[1].legend()