In [None]:
from utils import get_list_of_files
from datasets.singlemuon_at_purdue import datasets

In [None]:
# Collect every ROOT file belonging to the configured datasets and report how many there are
files = get_list_of_files(datasets)
print(f"{len(files)} files")

In [None]:
def xrootd_process(file):
    """Read one file in full over XRootD and measure how long it takes.

    Parameters
    ----------
    file : str
        URL/path of the file, passed unchanged to ``XRootD.client.File.open``.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with columns ``server`` (the value of the file's
        ``DataServer`` property) and ``time`` (wall-clock seconds spent on
        open + read, measured with ``time.time()``).

    Raises
    ------
    OSError
        If the open or the read reports a non-OK status. Without this check a
        failed transfer would be recorded as a (very fast) successful one.
    """
    # All imports are local so the function is self-contained: it does not rely
    # on names that happen to exist in the notebook's global namespace.
    import time

    import pandas as pd
    from XRootD import client
    from XRootD.client.flags import OpenFlags

    tick = time.time()
    with client.File() as f:
        status, _ = f.open(file, OpenFlags.READ)
        if not status.ok:
            raise OSError(f"XRootD open failed for {file}: {status.message}")
        status, _ = f.read()  # no offset/size given: reads the whole file
        if not status.ok:
            raise OSError(f"XRootD read failed for {file}: {status.message}")
        server = f.get_property('DataServer')
    tock = time.time()
    elapsed = tock - tick
    return pd.DataFrame([{"server": server, "time": elapsed}])

In [None]:
from dask.distributed import Client, as_completed, TimeoutError, KilledWorker
# Measure time for a list of files
def run_benchmark_xrootd(process, files, use_dask=False, client=None):
    """Apply ``process`` to every file and collect the per-file results.

    Parameters
    ----------
    process : callable
        Function taking one file and returning a DataFrame with ``server``
        and ``time`` columns.
    files : iterable
        Files to process.
    use_dask : bool, default False
        If True, submit the work through ``client.map``; otherwise process the
        files one by one in this process with a tqdm progress bar.
    client : object, optional
        Dask client providing ``map`` and ``gather``. Required when
        ``use_dask`` is True.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all successful results after ``reset_index()``, i.e.
        with columns ``index``, ``server`` and ``time``. Empty (but with the
        same columns) when there are no successful results.

    Raises
    ------
    ValueError
        If ``use_dask`` is True and no client was given.

    Notes
    -----
    Prints the number of failed tasks (Dask mode only) and the total elapsed
    wall-clock time in seconds. In sequential mode exceptions raised by
    ``process`` propagate to the caller.
    """
    # Local imports: the notebook does not import these at top level.
    import time

    import pandas as pd

    tick = time.time()
    results = []

    if use_dask:
        if client is None:
            raise ValueError("Dask client is missing!")
        failed = 0
        # Gather each future on its own so that one failed task is counted
        # instead of aborting the collection of all the others.
        for future in client.map(process, files):
            try:
                results.append(client.gather(future))
            except Exception:
                failed += 1
        print(failed, "failed")
    else:
        import tqdm  # only needed for the sequential progress bar

        for file in tqdm.tqdm(files):
            results.append(process(file))

    # pd.concat refuses an empty list, so fall back to an empty frame with the
    # expected columns; concatenating once also avoids quadratic growth.
    if results:
        xrootd_df = pd.concat(results)
    else:
        xrootd_df = pd.DataFrame(columns=["server", "time"])

    elapsed = time.time() - tick
    print(round(elapsed, 3), "s")

    # Keep the old per-result index as an "index" column, as before.
    return xrootd_df.reset_index()


In [None]:
# Sequential processing
# run_benchmark_xrootd(xrootd_process, files)

In [None]:
# Create a Dask Gateway cluster whose workers will run the benchmark.
from dask_gateway import Gateway
gateway = Gateway()
cluster = gateway.new_cluster(
    # NOTE(review): units are not stated here (presumably GB) — confirm against the gateway's cluster options
    worker_memory = 8,
    queue = "cms-express",
    # reservation = "DASKTEST",
    # Passed as the `env` cluster option — presumably environment variables set on the workers.
    # NOTE(review): both paths are hard-coded, user-specific absolute paths; anyone
    # else running this notebook has to edit them.
    env={
        "PYTHONPATH": "/depot/cms/private/users/dkondra/af-benchmark",
        "X509_USER_PROXY": "/depot/cms/private/users/dkondra/x509up_u616617"
    }
)
# Last expression of the cell: displays the cluster object
cluster

In [None]:
# Process all files via the Dask Gateway cluster; `df` receives the DataFrame
# returned by run_benchmark_xrootd.
# NOTE(review): the client returned by cluster.get_client() is not kept in a
# variable, so it is never closed explicitly.
df = run_benchmark_xrootd(xrootd_process, files, use_dask=True, client=cluster.get_client())

In [None]:
# Shut the cluster down now that the benchmark call has returned
cluster.shutdown()

In [None]:
# List the clusters the gateway reports — presumably to confirm the shutdown took effect
gateway.list_clusters()

In [None]:
# Display the benchmark results DataFrame
df

In [None]:
# Mean of the remaining columns per server, slowest servers first
(
    df.groupby("server")
    .mean()
    .sort_values("time", ascending=False)
    .drop(columns="index")
)

In [None]:
# Number of rows per server, most frequent servers first
(
    df.groupby("server")
    .count()
    .sort_values("time", ascending=False)
    .drop(columns="index")
)

In [None]:
# Histogram of the "time" values recorded for one specific server
df.loc[
    df["server"] == "eos-a01.cms.rcac.purdue.edu:1103",
    "time",
].plot.hist()