# Make NetCDF GIS-ready

Date: 8 June, 2024

Author = {"name": "Thomas Moore", "affiliation": "CSIRO", "email": "thomas.moore@csiro.au", "orcid": "0000-0003-3930-1946"}

### BRAN2020 is on the order of 50TB of float data over nearly 9000 `netcdf` file assets in total.

#### required packages

In [None]:
import intake
import xarray as xr
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd

from dask.distributed import Client, LocalCluster
import dask
import datetime
import zarr

from rechunker import rechunk

import gc
import sys
import subprocess
from tabulate import tabulate
import os
import glob
import streamjoy
import pickle
import json

In [None]:
# Append the directory of the module to sys.path - import functions
sys.path.append('/g/data/es60/users/thomas_moore/code/Climatology-generator-demo/src/')
import bran2020_demo_functions as my_tools
from bran2020_demo_functions import keep_only_selected_vars, load_rechunker_config, print_chunks, rechunk_each_st_ocean, remove_zarr_encoding, version_table, concatinate_st_ocean_zarrs

#### start a local Dask client

In [None]:
# Set the distributed comm timeouts (worker connect and TCP) to 90 s.
comm_timeouts = {
    'distributed.comm.timeouts.connect': '90s',
    'distributed.comm.timeouts.tcp': '90s',
}
dask.config.set(comm_timeouts)

# Local cluster of 14 single-threaded workers. No per-worker memory limit is
# passed; memory_limit='8GB' is the option that was left disabled here.
cluster = LocalCluster(n_workers=14, threads_per_worker=1)
client = Client(cluster)

In [None]:
# Open the intermediate-result NetCDF files, one dataset per
# (variable, phase) pair, stored under the key '<var>_<phase>_ds'.
dynamic_ds = {}

# Variables and ENSO phases to process
var_values = ['temp', 'salt','u','v','eta_t','mld']
#var_values = ['u']
phase_values = ['alltime', 'neutral','la_nina','el_nino']

# The results directory does not depend on the loop variables: define it once.
results_path = '/g/data/es60/users/thomas_moore/clim_demo_results/daily/bran2020_intermediate_results/'

# Iterate over all combinations of var and phase
for var_name in var_values:
    for phase_name in phase_values:
        ds_name = f'{var_name}_{phase_name}_ds'

        pattern = results_path+'*_'+var_name+'_*'+phase_name+'*.nc'
        files = glob.glob(pattern)
        if not files:
            # Fail with the offending pattern instead of the generic
            # "no files to open" error; FileNotFoundError is still an OSError.
            raise FileNotFoundError(f"No files match pattern: {pattern}")

        # The list is passed unsorted: with open_mfdataset's default
        # combine='by_coords' the datasets are ordered by coordinate values,
        # so the previous (unused) sort by creation time was dropped.
        dynamic_ds[ds_name] = xr.open_mfdataset(files,parallel=True)

In [None]:
# Suffix every data variable with the phase of the dataset it lives in,
# e.g. '<var>' -> '<var>_<phase>', and store the renamed dataset back.
for ds_name in list(dynamic_ds):
    name_parts = ds_name.split('_')

    # Keys look like '<var>_<phase>_ds': first try everything between the
    # first token and the trailing 'ds' ...
    phase_name = '_'.join(name_parts[1:-1])
    if phase_name not in phase_values:
        # ... then skip one more token, for variable names that themselves
        # contain an underscore (e.g. 'eta_t').
        phase_name = '_'.join(name_parts[2:-1])

    if phase_name not in phase_values:
        print(f"No match found for phase name: {phase_name}")
        continue

    dataset = dynamic_ds[ds_name]
    # Snapshot the names first: `dataset` is rebound on every rename.
    for var_name in tuple(dataset.data_vars):
        dataset = dataset.rename({var_name: f'{var_name}_{phase_name}'})
    dynamic_ds[ds_name] = dataset

In [None]:
# Combine the per-phase datasets of each variable into one dataset per variable.
merged_datasets = {}
for var_name in var_values:
    # Keys of dynamic_ds are '<var>_<phase>_ds'. Match on the prefix rather
    # than on a substring so a short variable name can never match inside a
    # longer key (a substring test for 't_' would also hit 'eta_t_...').
    key_prefix = f'{var_name}_'
    var_datasets = [
        dataset
        for ds_name, dataset in dynamic_ds.items()
        if ds_name.startswith(key_prefix)
    ]

    # xr.merge combines the variables of all these datasets into one Dataset
    # (it is not a concatenation along a dimension).
    merged_dataset = xr.merge(var_datasets)

    # Store the merged dataset in the dictionary
    merged_datasets[var_name] = merged_dataset

In [None]:
# Add up the `nbytes` of every merged dataset, expressed in units of 1024**3 bytes.
total_size_gb = 0
for ds in merged_datasets.values():
    total_size_gb += ds.nbytes / (1024**3)
print(f"Total size of all datasets: {total_size_gb} GB")

##### `/g/data/es60/users/thomas_moore/clim_demo_results/daily/bran2020_intermediate_results du -hsc *.nc` = 532G	total

In [None]:
# Show the keys of the merged_datasets dictionary (one entry per variable)
print(merged_datasets.keys())

In [None]:
# Shallow-copy the mapping under a new name. The Dataset objects themselves
# are shared with merged_datasets; nothing is copied or loaded here.
lazy_datasets = dict(merged_datasets)

lazy_datasets.keys()

## coordinate nomenclature

In [None]:
# Per-variable coordinate names, grouped by axis.
# temp/salt/mld/eta_t use the 'yt_ocean'/'xt_ocean' horizontal coordinates,
# u/v use 'yu_ocean'/'xu_ocean'; only temp/salt/u/v have a depth entry.
coordinate_names = {
    "lat_name_dict": dict(
        temp="yt_ocean",
        salt="yt_ocean",
        u="yu_ocean",
        v="yu_ocean",
        mld="yt_ocean",
        eta_t="yt_ocean",
    ),
    "lon_name_dict": dict(
        temp="xt_ocean",
        salt="xt_ocean",
        u="xu_ocean",
        v="xu_ocean",
        mld="xt_ocean",
        eta_t="xt_ocean",
    ),
    "depth_name_dict": dict(
        temp="st_ocean",
        salt="st_ocean",
        u="st_ocean",
        v="st_ocean",
    ),
}

# prototype

In [None]:
core_variable_list = ['mld','eta_t','temp','salt','u','v']
phase_list = ['alltime','el_nino','la_nina','neutral']
statistics_list = ['mean','std','min','max','median','quantile_05','quantile_95']

# Every (statistic, core_variable, phase) triple, ordered by variable first,
# then statistic, then phase.
statistics_core_variable_phase_list = []
for core_variable in core_variable_list:
    for statistic in statistics_list:
        statistics_core_variable_phase_list.extend(
            (statistic, core_variable, phase) for phase in phase_list
        )
print(statistics_core_variable_phase_list)
print(len(statistics_core_variable_phase_list))

In [None]:
# Turn each (statistic, core_variable, phase) triple of strings into the
# name '<statistic>_<core_variable>_<phase>'.
statistics_core_variable_phase_string_list = [
    "_".join(combo) for combo in statistics_core_variable_phase_list
]
print(statistics_core_variable_phase_string_list)

In [None]:
# Report, for every expected name, the first dataset that contains it
# (coordinates count too, since `.variables` is searched).
for variable_name in statistics_core_variable_phase_string_list:
    for dataset_name, dataset in lazy_datasets.items():
        if variable_name in dataset.variables:
            found_variable = dataset[variable_name]
            print(f"Variable '{variable_name}' found in dataset '{dataset_name}'")
            break
    else:
        # Loop finished without a break: no dataset holds this name.
        found_variable = None
        print(f"Variable '{variable_name}' not found in any dataset")

In [None]:
# Nested mapping describing the delivery tree:
# BRAN2020_climatology/<phase>/<core_variable>, with empty dicts as leaves.
base_folder_name = 'BRAN2020_climatology'
folder_structure = {
    base_folder_name: {
        phase: {core_variable: {} for core_variable in core_variable_list}
        for phase in phase_list
    }
}

folder_structure

In [None]:
def get_folder_paths(folder_structure, parent_path=''):
    """Return every folder path described by a nested mapping.

    Args:
        folder_structure: mapping of folder name -> mapping of its subfolders
            (an empty mapping marks a leaf).
        parent_path: prefix joined with '/' in front of each top-level name;
            when empty, top-level names are returned as-is.

    Returns:
        List of '/'-joined paths, each parent listed before its children.
    """
    def _walk(tree, prefix):
        for name, children in tree.items():
            here = f"{prefix}/{name}" if prefix else name
            yield here
            if children:
                yield from _walk(children, here)

    return list(_walk(folder_structure, parent_path))

# Flatten the nested folder_structure mapping into '/'-joined relative paths
# and print them for inspection.
folder_paths = get_folder_paths(folder_structure)
print(folder_paths)

In [None]:
def create_folder_structure(base_path, folder_structure):
    """Create on disk the directory tree described by a nested mapping.

    Args:
        base_path: existing or creatable directory under which the tree goes.
        folder_structure: mapping of folder name -> mapping of its subfolders
            (an empty mapping marks a leaf).

    Directories that already exist are left untouched (``exist_ok=True``).
    """
    for folder_name in folder_structure:
        target = os.path.join(base_path, folder_name)
        os.makedirs(target, exist_ok=True)

        children = folder_structure[folder_name]
        if not children:
            continue
        create_folder_structure(target, children)

# Create the delivery directory tree on disk.
# NOTE: this is not just an example; running the cell creates (or reuses)
# the folders described by folder_structure under base_path.
base_path = '/scratch/es60/thomas_moore/bran2020_GISready_delivery'


create_folder_structure(base_path, folder_structure)

In [None]:
def get_folder_paths(folder_structure, parent_path=''):
    """Return every folder path described by a nested mapping.

    NOTE: this cell re-defines the function with the same name and contract
    that an earlier cell of this notebook already defines.

    Args:
        folder_structure: mapping of folder name -> mapping of its subfolders
            (an empty mapping marks a leaf).
        parent_path: prefix joined with '/' in front of each top-level name;
            when empty, top-level names are returned as-is.

    Returns:
        List of '/'-joined paths, each parent listed before its children.
    """
    paths = []
    # Explicit stack of (prefix, remaining entries) instead of recursion.
    # Each level's iterator is resumed after its children are exhausted,
    # which keeps the parent-before-children, in-order listing.
    pending = [(parent_path, iter(folder_structure.items()))]
    while pending:
        prefix, entries = pending[-1]
        for folder_name, subfolders in entries:
            current_path = f"{prefix}/{folder_name}" if prefix else folder_name
            paths.append(current_path)
            if subfolders:
                pending.append((current_path, iter(subfolders.items())))
                break
        else:
            pending.pop()
    return paths

# Recompute and print the flattened list of relative folder paths.
folder_paths = get_folder_paths(folder_structure)
print(folder_paths)

In [None]:
# For every '<core_variable>_<phase>' pair, collect the matching statistic
# DataArrays (one list per pair) from whichever dataset contains them.
GISready_datasets = {}
for core_variable in core_variable_list:
    for phase in phase_list:
        dataset_name = f"{core_variable}_{phase}"
        GISready_datasets[dataset_name] = []

        # Names are '<statistic>_<core_variable>_<phase>', so select on the
        # exact suffix. Independent substring tests ('u_' in name, phase in
        # name) would break as soon as one name is contained in another.
        name_suffix = f"_{dataset_name}"
        for variable_name in statistics_core_variable_phase_string_list:
            if not variable_name.endswith(name_suffix):
                continue

            # First dataset holding this data variable, or None if none does.
            found_variable = next(
                (
                    dataset[variable_name]
                    for dataset in lazy_datasets.values()
                    if variable_name in dataset.data_vars
                ),
                None,
            )
            if found_variable is None:
                print(f"Variable '{variable_name}' not found in any dataset")
            else:
                GISready_datasets[dataset_name].append(found_variable)

In [None]:
# Preview: merge the DataArrays collected for 'u_alltime' into one Dataset
# and take the first entry along the `month` dimension.
xr.merge(GISready_datasets['u_alltime']).isel(month=0)

In [None]:
# Keys of GISready_datasets that contain this substring are selected.
search_string = "temp_"
matching_datasets = [name for name in GISready_datasets if search_string in name]
print(matching_datasets)

In [None]:
# One list of DataArrays per matching key (keys missing from the dict are skipped) ...
list_list_of_DA = [
    GISready_datasets[name] for name in matching_datasets if name in GISready_datasets
]
# ... then chained into a single flat list, preserving order.
flattened_list_of_DA = []
for group in list_list_of_DA:
    flattened_list_of_DA.extend(group)

In [None]:
# Merge all the DataArrays gathered in flattened_list_of_DA into one Dataset
# (displayed as the cell output).
xr.merge(flattened_list_of_DA)

In [None]:
# Print '<phase><core_variable>' (no separator) for every combination,
# grouped by variable.
for core_variable in core_variable_list:
    for phase in phase_list:
        print(f"{phase}{core_variable}")

# rechunk all the datasets for 1,1,300,300, or 1,300,300

# $The$ $End$

In [None]:
# Shut down the Dask client created at the top of the notebook.
# NOTE(review): cells below this point run without that client.
client.shutdown()

## Plot current vectors for August

In [None]:
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter,
                               AutoMinorLocator)
import matplotlib.ticker as ticker
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cartopy
from matplotlib import mlab, cm, gridspec
import matplotlib.ticker as mticker
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
%matplotlib inline 

In [None]:
%%time
# Define the u and v components of the currents
# NOTE(review): `clim_uv` is not defined in any cell visible in this notebook;
# it has to be created or loaded before this cell can run. TODO confirm source.
time_choice = 8  # value selected on the `month` coordinate (8 = August, per the plot title below)
u = clim_uv.u.sel(month=time_choice)
v = clim_uv.v.sel(month=time_choice)
# Magnitude of the (u, v) vector at every point
speed = np.sqrt(u**2 + v**2)

In [None]:
# Plot the current speed as a colour map with the u/v vectors drawn on top.
# Relies on `speed`, `u`, `v` and `clim_uv` from the previous cell.
transform = ccrs.PlateCarree()  # CRS passed as `transform` for the plotted data
cmap = 'Spectral_r'
cbar_label='current speed'
plot_data = speed

# Figure and map axes: PlateCarree projection with central longitude 180,
# restricted to 142-160 in x and -25 to -10 in y (extent given in PlateCarree).
fig = plt.figure(num=None, figsize=(8, 6), dpi=300, facecolor='w', edgecolor='k')
ax = plt.subplot(projection=ccrs.PlateCarree(180))
ax.set_extent([142,160, -25, -10], ccrs.PlateCarree())
# White-filled land polygons and thin grey coastlines at 50m resolution
ax.add_feature(cfeature.NaturalEarthFeature('physical', 'land', '50m', edgecolor='face', facecolor='white'))
ax.coastlines('50m',linewidth=0.5,edgecolor='grey')
# No `ax` is passed here — presumably this draws on the current axes created
# above (TODO confirm). robust=True is forwarded to xarray's plot call.
plot_data.plot(transform=transform,cmap=cmap,cbar_kwargs={'label': cbar_label,'shrink':0.5},robust=True)

#plot u/v vectors
# Define the x and y coordinates
x = clim_uv.xu_ocean
y = clim_uv.yu_ocean
# Semi-transparent arrows (alpha=0.2) over the speed field
ax.quiver(x.values,y.values,u.values,v.values,transform=transform, units='x', width=0.01, scale=0.7, headwidth=2,alpha=0.2)
# NOTE: the month in the title is hard-coded; keep it in sync with time_choice.
ax.set_title('BRAN2020 1993-2022\ncurrent speed \n August Climatology')