In [1]:
import datetime
import pathlib
import sys

import dask
import dask.distributed
import xarray as xr

from cluster import PerlmutterSLURMCluster
from process_files import (
    glob_nc_files,
    process_cases,
    memory,
    open_compress_and_save_file,
    set_coords,
    add_additional_coords,
    expand_ensemble_dims,
    set_encoding,
    save_to_netcdf,
    get_case_metadata,
)
from data_config import (
    get_scratch_dir,
    get_dask_log_dir,
    get_dask_local_dir,
    get_compressed_data_dir,
    get_data_archive_dir,
)

In [2]:
# Resolve working locations from the project's data_config helpers.
# `scratch` is reused below as the root for test outputs; the two Dask
# directories are handed to the Slurm cluster configuration.
scratch = get_scratch_dir()
dask_log_directory = get_dask_log_dir()
dask_local_directory = get_dask_local_dir()

In [3]:
# Sizing of each Slurm job that backs the Dask workers.
# NOTE(review): n_workers is not referenced by any visible cell — confirm it is still needed.
n_workers = 1  # Number of Slurm jobs to launch in parallel
n_nodes_per_calc = 1  # Number of nodes to reserve for each Slurm job
n_cores_per_node = 48  # Number of CPU cores per node
mem_per_node = "512 GB"  # Total memory per node
cluster_kwargs = {
    # Dask worker options
    # processes == cores, i.e. each job is split into one worker process per core
    "processes": n_cores_per_node,
    "cores": n_cores_per_node,  # total number of cores (per Slurm job) for Dask worker
    "memory": mem_per_node,  # total memory (per Slurm job) for Dask worker
    # SLURM options
    "job_name": "dor-dataset-compression",
    "shebang": "#!/bin/bash",
    "walltime": "00:30:00",  # HH:MM:SS (30 minutes)
    "job_mem": "0",  # all memory on node
    "job_script_prologue": [
        "source ~/.bashrc"
    ],  # commands to run before calculation, including exports
    "job_directives_skip": ["-n", "--cpus-per-task"],  # Slurm directives we can skip
    "job_extra_directives": [
        f"-N {n_nodes_per_calc}",
        "-q debug",
        "-C cpu",
    ],  # num. of nodes for calc (-N), queue (-q), and constraints (-C)
    "log_directory": str(dask_log_directory),
    "local_directory": str(dask_local_directory),
    # Workers retire after 25 min (staggered by up to 4 min), i.e. before the
    # 30 min walltime above is reached.
    "worker_extra_args": ["--lifetime", "25m", "--lifetime-stagger", "4m"],
}

cluster = PerlmutterSLURMCluster(**cluster_kwargs)

# Use the public `dask.distributed.Client` entry point rather than reaching
# into the `client` submodule.
client = dask.distributed.Client(cluster)
cluster

0,1
Dashboard: https://jupyter.nersc.gov/user/abanihi/perlmutter-login-node-base/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.55.64.47:37739,Workers: 0
Dashboard: https://jupyter.nersc.gov/user/abanihi/perlmutter-login-node-base/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [4]:
# Let the cluster scale adaptively between 2 and 3 Slurm jobs.
# Fixed-size alternative, kept for reference:
# cluster.scale(20*n_cores_per_node)
cluster.adapt(minimum_jobs=2, maximum_jobs=3)

<distributed.deploy.adaptive.Adaptive at 0x7f7c0695fe90>

2025-02-20 10:23:56,865 - tornado.application - ERROR - Exception in callback functools.partial(<bound method IOLoop._discard_future_result of <tornado.platform.asyncio.AsyncIOMainLoop object at 0x7f7c0ecb3470>>, <Task finished name='Task-112' coro=<SpecCluster._correct_state_internal() done, defined at /global/homes/a/abanihi/.conda/envs/dor/lib/python3.12/site-packages/distributed/deploy/spec.py:346> exception=RuntimeError('Command exited with non-zero exit code.\nExit code: 1\nCommand:\nsbatch /tmp/tmpvg69vnc_.sh\nstdout:\n\nstderr:\nsbatch: error: Batch job submission failed: Required partition not available (inactive or drain)\n\n')>)
Traceback (most recent call last):
  File "/global/homes/a/abanihi/.conda/envs/dor/lib/python3.12/site-packages/tornado/ioloop.py", line 750, in _run_callback
    ret = callback()
          ^^^^^^^^^^
  File "/global/homes/a/abanihi/.conda/envs/dor/lib/python3.12/site-packages/tornado/ioloop.py", line 774, in _discard_future_result
    future.result(

In [5]:
# print(cluster.job_script())

In [6]:
# cluster.scale(0)

In [7]:
# Make the directory one level above the notebook's working directory
# importable (presumably where the `atlas` package lives — confirm).
parent_dir = pathlib.Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(parent_dir) not in sys.path:
    sys.path.append(str(parent_dir))

In [None]:
import atlas

In [None]:
@memory.cache
def _load_done_cases_df(today):
    """Build the table of archived DOR cases (cached per calendar date).

    Parameters
    ----------
    today : datetime.date
        Not read by the body. It only takes part in the cache key, so a
        result cached under an earlier date is not reused (assumes `memory`
        is a joblib ``Memory`` that hashes call arguments — confirm).

    Returns
    -------
    tuple
        ``(df, done_cases)``: the rows of ``calc.df`` for the archived cases,
        and the sorted list of their case names.
    """
    control_case = "smyle.cdr-atlas-v0.control.001"
    calc = atlas.global_irf_map(cdr_forcing="DOR", vintage="001")

    status = calc.df_case_status
    archived = status.loc[status.archive]

    # Filter instead of list.remove(): the control run is excluded when
    # present, and its absence no longer raises ValueError.
    done_cases = sorted(
        name for name in archived.index.to_list() if name != control_case
    )

    return calc.df.loc[done_cases], done_cases


def get_done_cases_df(today=None):
    """Return ``(df, done_cases)`` for all archived DOR cases except the control run.

    Parameters
    ----------
    today : datetime.date, optional
        Date used as the cache key. Defaults to the current date, resolved at
        call time. A default evaluated in the signature would be frozen when
        the cell defining the function ran, so a long-lived kernel would keep
        reusing the same cache entry on later days.

    Returns
    -------
    tuple
        ``(df, done_cases)`` as produced by ``_load_done_cases_df``.
    """
    if today is None:
        today = datetime.date.today()
    return _load_done_cases_df(today)

In [None]:
%%time

# Fetch the table of finished cases and the sorted list of their names
# (served from the on-disk cache when available), then display the table.
df, done_cases = get_done_cases_df()
df

In [None]:
# Root of the data archive; used below as the base path when globbing a
# case's NetCDF files and as the input directory for the full run.
base_directory = get_data_archive_dir()

In [None]:
# Prototype on the first finished case (alphabetically, since done_cases is sorted)
# and count how many NetCDF files it has.
case = done_cases[0]
nc_files = glob_nc_files(base_path=base_directory, case=case)
len(nc_files)

## Add coordinate information and expand dimensions

In [None]:
# Single file used for the step-by-step walkthrough below.
# NOTE(review): index 146 looks like an arbitrary sample — confirm it has no special meaning.
path = nc_files[146]
path

In [None]:
# Metadata for the selected case, looked up from the finished-cases table.
case_metadata = get_case_metadata(case, df=df)

# Apply the coordinate helpers one stage at a time so each intermediate
# dataset can be inspected if needed.
raw_ds = xr.open_dataset(path, engine="netcdf4")
with_coords = set_coords(raw_ds)
with_case_coords = add_additional_coords(with_coords, case, case_metadata)
expanded = expand_ensemble_dims(with_case_coords)
expanded

## Compute anomalies

In [None]:
# anomalies = compute_anomalies(expanded)
# anomalies

## Quick check of the polygon's correctness

In [None]:
# Difference between ALK and ALK_ALT_CO2, drawn for every fifth z_t index
# (0, 5, ..., 55) as a grid of panels, four per row.
alk_difference = expanded["ALK"] - expanded["ALK_ALT_CO2"]
sampled_levels = alk_difference.isel(z_t=range(0, 60, 5))
sampled_levels.plot(col="z_t", col_wrap=4, robust=True);

## Fix encoding

In [None]:
# Attach the on-disk encoding settings to the expanded dataset.
encoded = set_encoding(expanded)
encoded

In [None]:
# Spot-check the encoding assigned to two of the variables.
encoded["ALK_ALT_CO2"].encoding, encoded["ALK"].encoding

In [None]:
# Write the encoded dataset to a throwaway file under scratch so its size
# can be compared with the source file.
save_to_netcdf(encoded, out_filepath=f"{scratch}/compressed-data-test.nc")

In [None]:
# Disk usage of the test file written under scratch.
!du -ch {scratch}/compressed-data-test.nc

In [None]:
# Disk usage of the original source file, for comparison.
!du -ch {path}

## Whole pipeline for any task

In [None]:
%%time
# Exercise the single-file helper end to end, with outputs rooted at the
# scratch directory.
test_out_path_prefix = get_compressed_data_dir(scratch)

# NOTE(review): index 53 looks like an arbitrary sample file — confirm.
ds = open_compress_and_save_file(
    nc_files[53],
    out_path_prefix=test_out_path_prefix,
    case=case,
    case_metadata=case_metadata,
)
ds

In [None]:
# Output prefix for the full run, derived from the hard-coded project
# archive location (as opposed to the scratch-based prefix used for the test).
out_path_prefix = get_compressed_data_dir(
    "/global/cfs/projectdirs/m4746/Projects/Ocean-CDR-Atlas-v0/data/research-grade-archive"
)
out_path_prefix

In [None]:
%%time

# Full run over every finished case: inputs are read from the data archive
# and results go under out_path_prefix.
# NOTE(review): presumably the work is dispatched to the Dask cluster
# created above — confirm in process_files.process_cases.
process_cases(
    data_dir_path=base_directory,
    out_path_prefix=out_path_prefix,
    done_cases=done_cases,
    df=df,
)

In [None]:
# Shut down in dependency order: disconnect the client first, then tear down
# the cluster it was attached to. Closing the cluster first leaves the client
# connected to a scheduler that no longer exists.
client.close()
cluster.close()

In [3]:
# Lazily open one analysis Zarr store (dask-backed arrays) to inspect its contents.
analysis_store = (
    "/global/cfs/projectdirs/m4746/Projects/Ocean-CDR-Atlas-v0/data/analysis/"
    "smyle.cdr-atlas-v0.glb-dor_North_Pacific_basin_130_1999-07-01_01122.001.analysis.zarr"
)
ds = xr.open_dataset(analysis_store, engine="zarr", chunks={})
ds

  ds = xr.open_dataset("/global/cfs/projectdirs/m4746/Projects/Ocean-CDR-Atlas-v0/data/analysis/smyle.cdr-atlas-v0.glb-dor_North_Pacific_basin_130_1999-07-01_01122.001.analysis.zarr",


Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 0.94 MiB 240.00 kiB Shape (384, 320) (192, 160) Dask graph 4 chunks in 2 graph layers Data type float64 numpy.ndarray",320  384,

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 0.94 MiB 240.00 kiB Shape (384, 320) (192, 160) Dask graph 4 chunks in 2 graph layers Data type float64 numpy.ndarray",320  384,

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 0.94 MiB 240.00 kiB Shape (384, 320) (192, 160) Dask graph 4 chunks in 2 graph layers Data type float64 numpy.ndarray",320  384,

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 0.94 MiB 240.00 kiB Shape (384, 320) (192, 160) Dask graph 4 chunks in 2 graph layers Data type float64 numpy.ndarray",320  384,

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,timedelta64[ns] numpy.ndarray,timedelta64[ns] numpy.ndarray
"Array Chunk Bytes 1.41 kiB 1.41 kiB Shape (180,) (180,) Dask graph 1 chunks in 2 graph layers Data type timedelta64[ns] numpy.ndarray",180  1,

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,timedelta64[ns] numpy.ndarray,timedelta64[ns] numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.41 kiB 1.41 kiB Shape (180,) (180,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",180  1,

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 0.94 MiB 240.00 kiB Shape (384, 320) (192, 160) Dask graph 4 chunks in 2 graph layers Data type float64 numpy.ndarray",320  384,

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.41 kiB 1.41 kiB Shape (180,) (180,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",180  1,

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.41 kiB 1.41 kiB Shape (180,) (180,) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",180  1,

Unnamed: 0,Array,Chunk
Bytes,1.41 kiB,1.41 kiB
Shape,"(180,)","(180,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 0.94 MiB 240.00 kiB Shape (384, 320) (192, 160) Dask graph 4 chunks in 2 graph layers Data type float64 numpy.ndarray",320  384,

Unnamed: 0,Array,Chunk
Bytes,0.94 MiB,240.00 kiB
Shape,"(384, 320)","(192, 160)"
Dask graph,4 chunks in 2 graph layers,4 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 84.38 MiB 1.32 MiB Shape (180, 384, 320) (45, 96, 80) Dask graph 64 chunks in 2 graph layers Data type float32 numpy.ndarray",320  384  180,

Unnamed: 0,Array,Chunk
Bytes,84.38 MiB,1.32 MiB
Shape,"(180, 384, 320)","(45, 96, 80)"
Dask graph,64 chunks in 2 graph layers,64 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
