# Convert B-grid variable to C-grid

## 1. Load packages

In [1]:
# Ignore warnings
# NOTE(review): PYTHONWARNINGS is normally read when an interpreter starts, so setting it
# here presumably only affects processes launched afterwards (e.g. dask workers), not this
# kernel — confirm, or filter warnings in the kernel explicitly.
from os import environ
environ["PYTHONWARNINGS"] = "ignore"
# Deactivate SSL verification (not sure if/why SSL verification is an issue in my case)
# NOTE(review): WDM_SSL_VERIFY looks like a webdriver-manager setting — confirm that it has
# any effect on the catalog/data requests made in this notebook.
environ['WDM_SSL_VERIFY'] = '0'

In [2]:
# Import makedirs to create directories where I write new files
from os import makedirs

In [3]:
# Load dask
from dask.distributed import Client

# Load intake to open the CMIP6 catalog
import intake

# Load xarray for N-dimensional arrays
import xarray as xr

# Load xgcm for interpolation between Arakawa grids
import xgcm

# Load datetime to deal with time formats
# import datetime
import cftime

# Load numpy for numbers!
import numpy as np

# Load xmip for preprocessing (trying to get consistent metadata for making matrices down the road)
from xmip.preprocessing import combined_preprocessing

# Load pandas for DataFrame manipulations
import pandas as pd

## 2. Define some functions

(to avoid too much boilerplate code)

In [4]:
########## functions ##########
print("Defining functions")

def time_window_strings(year_start, num_years, timetype=cftime.DatetimeNoLeap):
    """
    Return the first and last second of a window of `num_years` whole years.

    Despite the function name (kept for backward compatibility), the bounds are
    returned as `timetype` instances, not as strings.

    So if you give it `year_start = 1850` and `num_years = 30`,
    it will return `start_time` as the first second of Jan 1 1850
    and `end_time` as the last second of the year 1879.

    Parameters
    ----------
    year_start : int
        First year of the window.
    num_years : int
        Number of years in the window.
    timetype : callable
        Date-time constructor called as `timetype(year, month, day, hour, minute, second)`.
        The objects it returns must support subtracting a `datetime.timedelta`.

    Returns
    -------
    (start_time, end_time) : tuple of `timetype` instances
    """
    # Imported here because the notebook does not import `datetime` at the top
    from datetime import timedelta

    # start_time is the first second of year_start
    start_time = timetype(year_start, 1, 1, 0, 0, 0)
    # end_time is one second before the first instant of the year that follows the window.
    # Building "December 31" directly fails for calendars where that date does not exist
    # (e.g. 360-day calendars, whose months all have 30 days).
    end_time = timetype(year_start + num_years, 1, 1, 0, 0, 0) - timedelta(seconds=1)

    return start_time, end_time

def find_latest_version(cat):
    """
    Return the latest version found in the `version` column of `cat.df`.

    The versions are sorted in ascending order and the last one is returned.
    """
    return sorted(cat.df.version.to_list())[-1]

def select_latest_cat(cat, **kwargs):
    """
    Search `cat` with `kwargs`, then keep only the latest version of the matches.

    The result of the first search is searched again with `version` set to the
    value returned by `find_latest_version`.
    """
    matches = cat.search(**kwargs)
    newest_version = find_latest_version(matches)
    return matches.search(version=newest_version)

def select_latest_data(cat, xarray_open_kwargs, **kwargs):
    """
    Open the latest version of the catalog entries matching `kwargs`.

    The catalog is narrowed with `select_latest_cat` and opened with `to_dask`,
    forwarding `xarray_open_kwargs` as given and applying `combined_preprocessing`
    as the preprocessing step. The object returned by `to_dask` is returned as is.
    """
    # Options forwarded to `to_dask` as `xarray_combine_by_coords_kwargs`
    combine_options = {
        "compat": "override",
        "data_vars": "minimal",
        "coords": "minimal",
    }
    return select_latest_cat(cat, **kwargs).to_dask(
        xarray_open_kwargs=xarray_open_kwargs,
        xarray_combine_by_coords_kwargs=combine_options,
        parallel=True,
        preprocess=combined_preprocessing,
    )



Defining functions


## 3. Load the Pangeo Forge catalog



In [8]:
# The catalog
# copied from the ReadMe at https://github.com/leap-stc/cmip6-leap-feedstock
# NOTE(review): the saved output of this cell is a FileNotFoundError for this URL —
# confirm the catalog location (and network access) before relying on a fresh run.
url = "https://storage.googleapis.com/cmip6/cmip6-pgf-ingestion-test/catalog/catalog.json" # Only stores that pass current tests
cat = intake.open_esm_datastore(url)
cat

FileNotFoundError: https://storage.googleapis.com/cmip6/cmip6-pgf-ingestion-test/catalog/catalog.json

## 4. Select the model, experiment, ensemble, and time window

A little detour to list all the models in the catalog that have monthly ocean variables:

In [6]:
# Sorted names (source_id) of the models with entries in the monthly ocean table ("Omon")
omon_source_ids = cat.search(table_id='Omon').df.source_id.unique()
models = np.sort(omon_source_ids)
print(*models, sep="\n")

ACCESS-CM2
ACCESS-ESM1-5
AWI-CM-1-1-MR
AWI-ESM-1-1-LR
BCC-CSM2-MR
BCC-ESM1
CAMS-CSM1-0
CAS-ESM2-0
CESM2
CESM2-FV2
CESM2-WACCM
CESM2-WACCM-FV2
CIESM
CMCC-CM2-HR4
CMCC-CM2-SR5
CMCC-ESM2
CNRM-CM6-1
CNRM-CM6-1-HR
CNRM-ESM2-1
CanESM5
CanESM5-1
CanESM5-CanOE
E3SM-1-0
E3SM-1-1
E3SM-1-1-ECA
E3SM-2-0
E3SM-2-0-NARRM
EC-Earth3
EC-Earth3-AerChem
EC-Earth3-CC
EC-Earth3-LR
EC-Earth3-Veg
EC-Earth3-Veg-LR
FGOALS-f3-L
FGOALS-g3
FIO-ESM-2-0
GFDL-CM4
GFDL-ESM2M
GFDL-ESM4
GFDL-OM4p5B
GISS-E2-1-G
GISS-E2-1-G-CC
GISS-E2-1-H
GISS-E2-2-G
GISS-E2-2-H
HadGEM3-GC31-HH
HadGEM3-GC31-LL
HadGEM3-GC31-MM
ICON-ESM-LR
IITM-ESM
INM-CM4-8
INM-CM5-0
IPSL-CM5A2-INCA
IPSL-CM6A-LR
IPSL-CM6A-LR-INCA
KACE-1-0-G
KIOST-ESM
MCM-UA-1-0
MIROC-ES2H
MIROC-ES2L
MPI-ESM-1-2-HAM
MPI-ESM1-2-HR
MPI-ESM1-2-LR
NESM3
NorCPM1
NorESM1-F
NorESM2-LM
NorESM2-MM
SAM0-UNICON
TaiESM1
UKESM1-0-LL
UKESM1-1-LL


The creation of a matrix can only work if the following set of variables is available:
- mass transports (`umo` and `vmo`)
- mixed-layer depth (`mlotst`)

Alternatively, `umo` and `vmo` (in kg/s) can be replaced by `uo` and `vo` (in m/s). However, the conversion to mass transport requires the grid-cell volume (`volcello`), grid-cell areas (from vertices and `thkcello`), and density (no variable so I guess constant will do).

So the notebook here will create all the files for transport, if available: `umo`, `vmo`, `uo`, `vo`, `mlotst`.

Although they can vary with time, I think the output from other variables (volumes, areas, thicknesses, vertices) is stored as constant (`table_id = Ofx` if I understand correctly).

So let's list the models and check which ones have ((`umo` and `vmo`) or (`uo` and `vo`)) and `mlotst`.

In [7]:
def summary_variable_availability(df):
    """
    Summarise which ensemble members provide a complete set of variables.

    Two variable sets are checked: (`umo`, `vmo`, `mlotst`) and (`uo`, `vo`, `mlotst`).
    `df` must have the columns `experiment_id`, `source_id`, `member_id` and `variable_id`.

    Returns a dataframe with one row per (`experiment_id`, `source_id`), sorted by
    `source_id`, with the columns `member_id_with_umo_vmo` and `member_id_with_uo_vo`
    holding the list of members that have every variable of the corresponding set
    (missing where no member qualifies for that set only).
    """
    run_keys = ['experiment_id', 'source_id', 'member_id']
    model_keys = ['experiment_id', 'source_id']

    def members_with_all(variables):
        # Rows for the variables of interest only
        candidates = df[df['variable_id'].isin(variables)]
        # Keep the runs (experiment, model, member) that have every variable of the set
        complete_runs = candidates.groupby(run_keys).filter(
            lambda run: set(variables).issubset(set(run['variable_id']))
        )
        # One row per qualifying run
        runs = complete_runs[run_keys].drop_duplicates().reset_index(drop=True)
        # Collect the members of each (experiment, model) into a list
        return runs.groupby(model_keys)['member_id'].apply(list).reset_index()

    with_mass_transport = members_with_all(['umo', 'vmo', 'mlotst'])
    with_velocity = members_with_all(['uo', 'vo', 'mlotst'])

    # Outer merge so that models qualifying for only one set are kept
    summary = pd.merge(
        with_mass_transport,
        with_velocity,
        on=model_keys,
        how='outer',
        suffixes=('_with_umo_vmo', '_with_uo_vo'),
    )

    return summary.sort_values(by='source_id')

In [8]:
df_vars_avail = summary_variable_availability(cat.df)
df_vars_avail

Unnamed: 0,experiment_id,source_id,member_id_with_umo_vmo,member_id_with_uo_vo
0,1pctCO2,ACCESS-CM2,[r1i1p1f1],[r1i1p1f1]
205,ssp585,ACCESS-CM2,,[r1i1p1f1]
176,ssp245,ACCESS-CM2,"[r3i1p1f1, r2i1p1f1, r1i1p1f1]","[r3i1p1f1, r2i1p1f1, r1i1p1f1]"
155,ssp126,ACCESS-CM2,[r1i1p1f1],[r1i1p1f1]
112,piControl,ACCESS-CM2,[r1i1p1f1],[r1i1p1f1]
...,...,...,...,...
62,esm-piControl,UKESM1-0-LL,[r1i1p1f2],
34,1pctCO2-bgc,UKESM1-0-LL,[r1i1p1f2],
25,1pctCO2,UKESM1-0-LL,"[r1i1p1f2, r3i1p1f2, r4i1p1f2, r2i1p1f2]",[r4i1p1f2]
54,esm-hist,UKESM1-0-LL,[r1i1p1f2],[r1i1p1f2]


Let's list those models with the right variables in the historical run to start with. These are the candidates for a historical TMIP!

In [9]:
# Keep only the rows of the historical experiment, ordered by model name (source_id)
is_historical = df_vars_avail.experiment_id == 'historical'
historical_models = df_vars_avail.loc[is_historical].sort_values(by='source_id')
historical_models

Unnamed: 0,experiment_id,source_id,member_id_with_umo_vmo,member_id_with_uo_vo
63,historical,ACCESS-CM2,"[r2i1p1f1, r1i1p1f1]","[r3i1p1f1, r2i1p1f1, r1i1p1f1]"
64,historical,ACCESS-ESM1-5,"[r2i1p1f1, r14i1p1f1, r23i1p1f1, r17i1p1f1, r2...","[r2i1p1f1, r3i1p1f1, r10i1p1f1, r4i1p1f1, r8i1..."
65,historical,AWI-CM-1-1-MR,,"[r3i1p1f1, r1i1p1f1, r4i1p1f1, r5i1p1f1, r2i1p..."
66,historical,BCC-CSM2-MR,"[r1i1p1f1, r3i1p1f1, r2i1p1f1]","[r2i1p1f1, r1i1p1f1, r3i1p1f1]"
67,historical,BCC-ESM1,"[r2i1p1f1, r3i1p1f1, r1i1p1f1]","[r3i1p1f1, r1i1p1f1, r2i1p1f1]"
68,historical,CAMS-CSM1-0,,"[r2i1p1f1, r1i1p1f1]"
69,historical,CAS-ESM2-0,,[r1i1p1f1]
70,historical,CESM2,,"[r2i1p1f1, r8i1p1f1, r3i1p1f1, r7i1p1f1, r10i1..."
71,historical,CESM2-FV2,,[r1i1p1f1]
72,historical,CESM2-WACCM,,"[r2i1p1f1, r3i1p1f1, r1i1p1f1]"


In [10]:
experiment = "historical"

In [11]:
# Model (source_id) to process.
# ACCESS-ESM1-5 is the model that the saved outputs of this notebook were produced with
# (see the displayed `model, experiment, ensemble` tuple and the output directory).
model = "ACCESS-ESM1-5"
# model = "GFDL-CM4"

In [12]:
# Check which members are available for the model
model_runs = historical_models[historical_models.source_id == model]
if model_runs.empty:
    raise ValueError(f"{model} is not listed in historical_models")
members = model_runs.member_id_with_umo_vmo.values[0]
# The outer merge in summary_variable_availability leaves a missing value (not a list)
# when no member of the model provides umo, vmo and mlotst together
if not isinstance(members, list):
    raise ValueError(f"{model} has no historical member with umo, vmo and mlotst")
# parse numbers in members list of strings, which are formatted as "r%di%dp%df%d"
import re
def extract_first_number(s):
    """Return, as an int, the first run of digits found in the string `s`."""
    return int(re.findall(r'\d+', s)[0])
# sort members by the first number in the string
members = sorted(members, key=extract_first_number)

print(*members, sep = "\n")

r1i1p1f1
r2i1p1f1
r3i1p1f1
r4i1p1f1
r5i1p1f1
r6i1p1f1
r7i1p1f1
r8i1p1f1
r9i1p1f1
r11i1p1f1
r12i1p1f1
r13i1p1f1
r14i1p1f1
r15i1p1f1
r16i1p1f1
r17i1p1f1
r18i1p1f1
r19i1p1f1
r20i1p1f1
r21i1p1f1
r22i1p1f1
r23i1p1f1
r24i1p1f1
r25i1p1f1
r26i1p1f1
r27i1p1f1
r28i1p1f1
r29i1p1f1
r30i1p1f1


In [13]:
ensemble = "r1i1p1f1"

In [14]:
year_start = 1870
num_years = 30

In [15]:
model, experiment, ensemble

('ACCESS-ESM1-5', 'historical', 'r1i1p1f1')

In [16]:
# Search the catalog once for the selected run, so that later cells can reuse the result
search_query = dict(
    experiment_id=experiment,
    source_id=model,
    member_id=ensemble,
    table_id='Omon',
    grid_label='gn',
    variable_id=['umo', 'vmo', 'uo', 'vo', 'mlotst'],
)
searched_cat = cat.search(**search_query)
searched_cat.df

Unnamed: 0,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,sub_experiment_id,variant_label,version,zstore
0,CMIP,CSIRO,ACCESS-ESM1-5,historical,r1i1p1f1,Omon,mlotst,gn,none,r1i1p1f1,v20191115,gs://cmip6/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/hist...
1,CMIP,CSIRO,ACCESS-ESM1-5,historical,r1i1p1f1,Omon,uo,gn,none,r1i1p1f1,v20191115,gs://cmip6/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/hist...
2,CMIP,CSIRO,ACCESS-ESM1-5,historical,r1i1p1f1,Omon,umo,gn,none,r1i1p1f1,v20191115,gs://cmip6/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/hist...
3,CMIP,CSIRO,ACCESS-ESM1-5,historical,r1i1p1f1,Omon,vo,gn,none,r1i1p1f1,v20191115,gs://cmip6/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/hist...
4,CMIP,CSIRO,ACCESS-ESM1-5,historical,r1i1p1f1,Omon,vmo,gn,none,r1i1p1f1,v20191115,gs://cmip6/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/hist...


In [17]:
# Root directory under which the output files are written
# (absolute local path: adapt it to the machine running the notebook)
datadir = '/Users/benoitpasquier/Data'
# Bounds of the time window, built with the default `timetype` of time_window_strings
start_time, end_time = time_window_strings(year_start, num_years)
start_time, end_time

(cftime.DatetimeNoLeap(1870, 1, 1, 0, 0, 0, 0, has_year_zero=True),
 cftime.DatetimeNoLeap(1899, 12, 31, 23, 59, 59, 0, has_year_zero=True))

In [18]:
# Label the window bounds as abbreviated month name followed by the year ("%b%Y")
start_time_str = start_time.strftime("%b%Y")
end_time_str = end_time.strftime("%b%Y")

# One output directory per model / experiment / ensemble member / time window
outputdir = f'{datadir}/{model}/{experiment}/{ensemble}/{start_time_str}-{end_time_str}'
print(outputdir)

/Users/benoitpasquier/Data/ACCESS-ESM1-5/historical/r1i1p1f1/Jan1870-Dec1899


In [19]:
makedirs(outputdir, exist_ok=True)

In [20]:
########## Start the client and make the `.nc` files ##########
print("Starting client")
client = Client(n_workers=4)#, threads_per_worker=1, memory_limit='16GB') # Note: with 1thread/worker cannot plot thetao. Maybe I need to understand why?
client

Starting client


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 12,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:53213,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 12
Started: Just now,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:53225,Total threads: 3
Dashboard: http://127.0.0.1:53226/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:53216,
Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-2gkcahmx,Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-2gkcahmx

0,1
Comm: tcp://127.0.0.1:53228,Total threads: 3
Dashboard: http://127.0.0.1:53229/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:53218,
Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-9ieqqee8,Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-9ieqqee8

0,1
Comm: tcp://127.0.0.1:53231,Total threads: 3
Dashboard: http://127.0.0.1:53232/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:53220,
Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-wmavmj77,Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-wmavmj77

0,1
Comm: tcp://127.0.0.1:53234,Total threads: 3
Dashboard: http://127.0.0.1:53235/status,Memory: 8.00 GiB
Nanny: tcp://127.0.0.1:53222,
Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-dikbx0mj,Local directory: /var/folders/75/bdw1v8j10w9dbmvjmc13fkj80000gn/T/dask-scratch-space/worker-dikbx0mj


In [21]:
# umo dataset
if (searched_cat.df.variable_id == 'umo').any():
    print("Loading umo data")
    umo_datadask = select_latest_data(searched_cat,
        dict(
            use_cftime=True,
            chunks={'i': 60, 'j': 60, 'time': -1, 'lev':50}
        ),
        variable_id = "umo",
    )
else:
    # Later cells read the calendar type of the time axis from umo_datadask, so stop here
    # with a clear message rather than a NameError on the last line of this cell
    raise ValueError(f"No umo data found in the catalog search for {model} / {experiment} / {ensemble}")
umo_datadask

Loading umo data


Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 5 graph layers,30 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 843.75 kiB 28.12 kiB Shape (300, 360) (60, 60) Dask graph 30 chunks in 5 graph layers Data type float64 numpy.ndarray",360  300,

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 5 graph layers,30 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 8 graph layers,30 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 843.75 kiB 28.12 kiB Shape (300, 360) (60, 60) Dask graph 30 chunks in 8 graph layers Data type float64 numpy.ndarray",360  300,

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 8 graph layers,30 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.30 MiB,112.50 kiB
Shape,"(300, 360, 4)","(60, 60, 4)"
Dask graph,30 chunks in 3 graph layers,30 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 3.30 MiB 112.50 kiB Shape (300, 360, 4) (60, 60, 4) Dask graph 30 chunks in 3 graph layers Data type float64 numpy.ndarray",4  360  300,

Unnamed: 0,Array,Chunk
Bytes,3.30 MiB,112.50 kiB
Shape,"(300, 360, 4)","(60, 60, 4)"
Dask graph,30 chunks in 3 graph layers,30 chunks in 3 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,3.30 MiB,112.50 kiB
Shape,"(300, 360, 4)","(60, 60, 4)"
Dask graph,30 chunks in 6 graph layers,30 chunks in 6 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 3.30 MiB 112.50 kiB Shape (300, 360, 4) (60, 60, 4) Dask graph 30 chunks in 6 graph layers Data type float64 numpy.ndarray",4  360  300,

Unnamed: 0,Array,Chunk
Bytes,3.30 MiB,112.50 kiB
Shape,"(300, 360, 4)","(60, 60, 4)"
Dask graph,30 chunks in 6 graph layers,30 chunks in 6 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,30.94 kiB,30.94 kiB
Shape,"(1980, 2)","(1980, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 30.94 kiB 30.94 kiB Shape (1980, 2) (1980, 2) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",2  1980,

Unnamed: 0,Array,Chunk
Bytes,30.94 kiB,30.94 kiB
Shape,"(1980, 2)","(1980, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,800 B,800 B
Shape,"(50, 2)","(50, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 800 B 800 B Shape (50, 2) (50, 2) Dask graph 1 chunks in 2 graph layers Data type float64 numpy.ndarray",2  50,

Unnamed: 0,Array,Chunk
Bytes,800 B,800 B
Shape,"(50, 2)","(50, 2)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.65 MiB,28.12 kiB
Shape,"(2, 300, 360)","(1, 60, 60)"
Dask graph,60 chunks in 15 graph layers,60 chunks in 15 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.65 MiB 28.12 kiB Shape (2, 300, 360) (1, 60, 60) Dask graph 60 chunks in 15 graph layers Data type float64 numpy.ndarray",360  300  2,

Unnamed: 0,Array,Chunk
Bytes,1.65 MiB,28.12 kiB
Shape,"(2, 300, 360)","(1, 60, 60)"
Dask graph,60 chunks in 15 graph layers,60 chunks in 15 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.65 MiB,28.12 kiB
Shape,"(2, 300, 360)","(1, 60, 60)"
Dask graph,60 chunks in 12 graph layers,60 chunks in 12 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 1.65 MiB 28.12 kiB Shape (2, 300, 360) (1, 60, 60) Dask graph 60 chunks in 12 graph layers Data type float64 numpy.ndarray",360  300  2,

Unnamed: 0,Array,Chunk
Bytes,1.65 MiB,28.12 kiB
Shape,"(2, 300, 360)","(1, 60, 60)"
Dask graph,60 chunks in 12 graph layers,60 chunks in 12 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,39.83 GiB,1.33 GiB
Shape,"(1, 1, 1980, 50, 300, 360)","(1, 1, 1980, 50, 60, 60)"
Dask graph,30 chunks in 3 graph layers,30 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 39.83 GiB 1.33 GiB Shape (1, 1, 1980, 50, 300, 360) (1, 1, 1980, 50, 60, 60) Dask graph 30 chunks in 3 graph layers Data type float32 numpy.ndarray",1980  1  1  360  300  50,

Unnamed: 0,Array,Chunk
Bytes,39.83 GiB,1.33 GiB
Shape,"(1, 1, 1980, 50, 300, 360)","(1, 1, 1980, 50, 60, 60)"
Dask graph,30 chunks in 3 graph layers,30 chunks in 3 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [22]:
# Get the time type of umo_datadask
umo_time_type = type(umo_datadask.time.values[0])
umo_time_type


cftime._cftime.DatetimeProlepticGregorian

In [23]:
# Redefine start_time and end_time to match the time type of umo_datadask
start_time, end_time = time_window_strings(year_start, num_years, umo_time_type)
start_time, end_time

(cftime.DatetimeProlepticGregorian(1870, 1, 1, 0, 0, 0, 0, has_year_zero=True),
 cftime.DatetimeProlepticGregorian(1899, 12, 31, 23, 59, 59, 0, has_year_zero=True))

In [24]:

# Slice umo dataset for the time period
umo_datadask_sel = umo_datadask.sel(time=slice(start_time, end_time))
# Take the time average of the monthly umo (using month length as weights)
umo = umo_datadask_sel["umo"].weighted(umo_datadask_sel.time.dt.days_in_month).mean(dim="time")
umo

Unnamed: 0,Array,Chunk
Bytes,41.20 MiB,1.37 MiB
Shape,"(1, 1, 50, 300, 360)","(1, 1, 50, 60, 60)"
Dask graph,30 chunks in 18 graph layers,30 chunks in 18 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 41.20 MiB 1.37 MiB Shape (1, 1, 50, 300, 360) (1, 1, 50, 60, 60) Dask graph 30 chunks in 18 graph layers Data type float64 numpy.ndarray",1  1  360  300  50,

Unnamed: 0,Array,Chunk
Bytes,41.20 MiB,1.37 MiB
Shape,"(1, 1, 50, 300, 360)","(1, 1, 50, 60, 60)"
Dask graph,30 chunks in 18 graph layers,30 chunks in 18 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 5 graph layers,30 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 843.75 kiB 28.12 kiB Shape (300, 360) (60, 60) Dask graph 30 chunks in 5 graph layers Data type float64 numpy.ndarray",360  300,

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 5 graph layers,30 chunks in 5 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 8 graph layers,30 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 843.75 kiB 28.12 kiB Shape (300, 360) (60, 60) Dask graph 30 chunks in 8 graph layers Data type float64 numpy.ndarray",360  300,

Unnamed: 0,Array,Chunk
Bytes,843.75 kiB,28.12 kiB
Shape,"(300, 360)","(60, 60)"
Dask graph,30 chunks in 8 graph layers,30 chunks in 8 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [25]:
# Save to netcdfs (and compute!)
umo.to_netcdf(f'{outputdir}/umo.nc', compute=True)

Task exception was never retrieved
future: <Task finished name='Task-16467' coro=<Client._gather.<locals>.wait() done, defined at /Users/benoitpasquier/Projects/TMIP/notebooks/.venv/lib/python3.12/site-packages/distributed/client.py:2382> exception=AllExit()>
Traceback (most recent call last):
  File "/Users/benoitpasquier/Projects/TMIP/notebooks/.venv/lib/python3.12/site-packages/distributed/client.py", line 2391, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
future: <Task finished name='Task-16466' coro=<Client._gather.<locals>.wait() done, defined at /Users/benoitpasquier/Projects/TMIP/notebooks/.venv/lib/python3.12/site-packages/distributed/client.py:2382> exception=AllExit()>
Traceback (most recent call last):
  File "/Users/benoitpasquier/Projects/TMIP/notebooks/.venv/lib/python3.12/site-packages/distributed/client.py", line 2391, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
future: <Task fin

KeyboardInterrupt: 

Python(78585) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(98606) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(9169) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
Python(13708) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
2024-09-14 21:35:37,042 - tornado.application - ERROR - Exception in callback <bound method SystemMonitor.update of <SystemMonitor: cpu: 1 memory: 43 MB fds: 26>>
Traceback (most recent call last):
  File "/Users/benoitpasquier/Projects/TMIP/notebooks/.venv/lib/python3.12/site-packages/tornado/ioloop.py", line 937, in _run
    val = self.callback()
          ^^^^^^^^^^^^^^^
  File "/Users/benoitpasquier/Projects/TMIP/notebooks/.venv/lib/python3.12/site-packages/distributed/system_monitor.py", line 168, in update
    net_ioc = psutil.net_io_counters()
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/User

In [99]:
# vmo dataset
if (searched_cat.df.variable_id == 'vmo').any():
    print("Loading vmo data")
    vmo_datadask = select_latest_data(searched_cat,
        dict(
            # Decode times as cftime objects, like the umo load, so that the time axis
            # can be sliced with the cftime bounds start_time and end_time
            use_cftime=True,
            chunks={'i': 60, 'j': 60, 'time': -1, 'lev':50}
        ),
        variable_id = "vmo",
    )
    print("\nvmo_datadask: ", vmo_datadask)

In [None]:
# uo dataset
print("Loading uo data")
uo_datadask = select_latest_data(searched_cat,
    dict(
        # Decode times as cftime objects, like the umo load, so that the time axis
        # can be sliced with the cftime bounds start_time and end_time
        use_cftime=True,
        chunks={'i': 60, 'j': 60, 'time': -1, 'lev':50}
    ),
    variable_id = "uo",
)
print("\nuo_datadask: ", uo_datadask)

In [None]:
# Slice uo dataset for the time period
uo_datadask_sel = uo_datadask.sel(time=slice(start_time, end_time))
print("\nuo_datadask_sel: ", uo_datadask_sel)

In [None]:
# Take the time average of the monthly uo (using month length as weights)
uo = uo_datadask_sel["uo"].weighted(uo_datadask_sel.time.dt.days_in_month).mean(dim="time")
# The label names the variable that is actually printed (the time-averaged uo)
print("\nuo: ", uo)

In [None]:
uo.to_netcdf(f'{outputdir}/uo.nc', compute=True)

In [None]:
# vo dataset
if (searched_cat.df.variable_id == 'vo').any():
    print("Loading vo data")
    vo_datadask = select_latest_data(searched_cat,
        dict(
            # Decode times as cftime objects, consistently with the umo load
            use_cftime=True,
            chunks={'i': 60, 'j': 60, 'time': -1, 'lev':50}
        ),
        variable_id = "vo",
    )
    print("\nvo_datadask: ", vo_datadask)

In [None]:
# mlotst dataset
print("Loading mlotst data")
mlotst_datadask = select_latest_data(searched_cat,
    dict(
        # Decode times as cftime objects, like the umo load, so that the time axis
        # can be sliced with the cftime bounds start_time and end_time
        use_cftime=True,
        chunks={'i': 60, 'j': 60, 'time': -1, 'lev':50}
    ),
    variable_id = "mlotst",
)
print("\nmlotst_datadask: ", mlotst_datadask)

In [None]:
# Deal with thkcello for a different script,
# given that its location (fixed or time-dependent) depends on the model and/or project
# # thkcello dataset
# print("Loading thkcello data")
# thkcello_datadask = select_latest_data(searched_cat,
#     dict(
#         chunks={'i': 60, 'j': 60, 'time': -1, 'lev':50}
#     ),
#     variable_id = "thkcello",
#     frequency = "mon",
# )
# print("\nthkcello_datadask: ", thkcello_datadask)

In [None]:
# Slice vmo dataset for the time period
vmo_datadask_sel = vmo_datadask.sel(time=slice(start_time, end_time))
# Take the time average of the monthly vmo (using month length as weights)
vmo = vmo_datadask_sel["vmo"].weighted(vmo_datadask_sel.time.dt.days_in_month).mean(dim="time")
vmo

In [None]:
# Slice mlotst dataset for the time period
mlotst_datadask_sel = mlotst_datadask.sel(time=slice(start_time, end_time))
# Take the time mean of the yearly maximum of mlotst
mlotst_yearlymax = mlotst_datadask_sel.groupby("time.year").max(dim="time")
mlotst_yearlymax

In [None]:
mlotst = mlotst_yearlymax.mean(dim="year")
mlotst

In [None]:
# # Slice thkcello dataset for the time period
# thkcello_datadask_sel = thkcello_datadask.sel(time=slice(start_time, end_time))
# # Take the time average of the monthly evaporation (using month length as weights)
# thkcello = thkcello_datadask_sel["thkcello"].weighted(thkcello_datadask_sel.time.dt.days_in_month).mean(dim="time")

In [None]:
# Save to netcdfs (and compute!)
vmo.to_netcdf(f'{outputdir}/vmo.nc', compute=True)
mlotst.to_netcdf(f'{outputdir}/mlotst.nc', compute=True)
# thkcello.to_netcdf(f'{outputdir}/thkcello.nc', compute=True)

In [None]:
client.close()