In [1]:
from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from dask import array as da
import numpy as np
from scipy import ndimage
import os
import time

In [2]:
def process_with_retry(client, dask_data, save_path, max_retries=3):
    for attempt in range(max_retries):
        try:
            # Check if we still have workers
            if len(client.scheduler_info()['workers']) == 0:
                print("No workers available, waiting for 30 seconds...")
                client.wait_for_workers(1, timeout=30)

            print("Workers detected. Proceeding...")

            # Process the data
            high_pass_filtered = dask_data.map_overlap(
                ndimage.gaussian_filter, sigma=3, order=0, mode="nearest", depth=40)

            low_pass_filtered = dask_data.map_overlap(
                ndimage.gaussian_filter, sigma=10, order=0, mode="nearest", depth=40)

            dog_filtered = da.map_blocks(
                np.subtract, high_pass_filtered, low_pass_filtered)

            # Save with progress monitoring
            future = client.compute(dog_filtered.to_zarr(save_path, overwrite=True))
            _ = client.gather(future)

            print("Filtering complete and saved to Zarr file.")
            return True

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print("Retrying in 30 seconds...")
                time.sleep(30)
            else:
                print("All retry attempts failed")
                raise

    return False

In [7]:
import subprocess
result = subprocess.run("umask", shell=True, capture_output=True, text=True)
print("Subprocess umask:", result.stdout.strip())

Subprocess umask: 0022


In [8]:

# Usage
local_directory="/project/bioinformatics/Danuser_lab/Dean/dean/dask_temp"
subprocess.run(f"mkdir -p {local_directory} && chmod -R 777 {local_directory}", shell=True)


CompletedProcess(args='mkdir -p /project/bioinformatics/Danuser_lab/Dean/dean/dask_temp && chmod -R 777 /project/bioinformatics/Danuser_lab/Dean/dean/dask_temp', returncode=0)

In [9]:
cluster_kwargs = {
    'cores': 40, # Number of threads per worker (utilizing cores within each process)
    'processes': 1, # Number of Python processes/worker.
    'memory': '220GB',
    'local_directory': local_directory,
    'interface': 'ib0',
    'walltime': "36:00:00",
    'job_name': "multinode_warp",
    'queue': "256GB",
    'death_timeout': "600s",
    'job_extra_directives': [
        "--nodes=1",
        "--ntasks=1",
        "--mail-type=FAIL",
        "--mail-user=kevin.dean@utsouthwestern.edu",
        "-o job_%j.out",
        "-e job_%j.err",
    ],
    'scheduler_options': {
        "dashboard_address": ":8788",
        # "heartbeat-interval": "5s",
        # "worker-timeout": "120s",
    },
    'worker_extra_args': ['--tcp-timeout=30s', '--worker-heartbeat-timeout=120s']
}

cluster = SLURMCluster(**cluster_kwargs)
cluster.scale(4)
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41517 instead


In [10]:
# Location of the data.
base_path = "/archive/bioinformatics/Danuser_lab/Dean/dean/2024-05-21-tiling"
data_path = os.path.join(base_path, "cell5_fused_tp_0_ch_0.zarr")
save_path = os.path.join(base_path, 'example_4.zarr')

# Load the Zarr file with Dask
dask_data = da.from_zarr(data_path, component='0/0')
data_shape = dask_data.shape

# Eliminate singleton dimensions, and rechunk the data.
dask_data = dask_data.squeeze()
dask_data = dask_data.rechunk((32, 64, 64))

# Process the data with retries
process_with_retry(client, dask_data, save_path)

# Close the client and cluster
client.close()
cluster.close()
print("Client and cluster closed.")

No workers available, waiting for 30 seconds...
Attempt 1 failed: Only 0/1 workers arrived after 30
Retrying in 30 seconds...
No workers available, waiting for 30 seconds...
Attempt 2 failed: Only 0/1 workers arrived after 30
Retrying in 30 seconds...
No workers available, waiting for 30 seconds...
Attempt 3 failed: Only 0/1 workers arrived after 30
All retry attempts failed


TimeoutError: Only 0/1 workers arrived after 30

In [None]:
# ssh -N -L 8788:localhost:8788 your-cluster-login

        # "dashboard": {
        #     "session_token_expiration": 3600,
        #     "dashboard_address": ":8788",
        #     "host": "10.100.161.251",
        # }