### Generating a filename list

In [1]:
from zen.files import FilesList, date_seq, delta

In [2]:
# OEMC file-naming convention: ten underscore-separated placeholders followed by
# the '.tif' extension. The template is split over two adjacent string literals
# for readability only; Python concatenates them into a single string.
oemc_filename = (
    "{var_generic}_{var_procedure}_{variable}_{resolution}_{depth}"
    "_{start_date}_{end_date}_{bbox}_{epsg}_{version}.tif"
)

In [4]:
# Long-term dataset: expand the filename template into the full list of files
# (2 procedures x 12 month suffixes x 4 variables), point it at the file server
# directory and cache the resulting list in a JSON file.
long_term = (
    FilesList(oemc_filename)
    # Procedure names are '<prefix>m01' .. '<prefix>m12', prefix-major order
    .expand(var_procedure=[
        prefix + 'm{:02d}'.format(month)
        for prefix in ('mcd19a2v061.seasconv.m.', 'mcd19a2v061.seasconv.sd.')
        for month in range(1, 13)
    ])
    .expand(variable=['p25', 'p50', 'p75', 'sd'])
    # The remaining placeholders take a single value each
    .expand(
        var_generic=['wv'],
        resolution=['1km'],
        depth=['s'],
        start_date=['20000101'],
        end_date=['20221231'],
        bbox=['go'],
        epsg=['epsg.4326'],
        version=['v20230619'],
    )
    .set_dir('http://192.168.1.30:8333/global/wv/')
    .cache('wv/wv_long-term_files.json', num_threads=1, stop_on_error=True)
)

  0%|          | 0/96 [00:00<?, ?it/s]

100%|██████████| 96/96 [00:00<00:00, 127.60it/s]


In [8]:
# Reading from cache file: restores the list from the same JSON path that was
# passed to .cache() when the list was built (presumably without contacting the
# file server again -- confirm against zen's FilesList documentation)
long_term = FilesList.from_cache('wv/wv_long-term_files.json')

### Create a new deposition and upload files

In [3]:
from zen.api import Zenodo
import os

In [4]:
# Take the Zenodo API token from the environment instead of pasting it into the
# notebook. A token that is already exported is kept as-is; the empty placeholder
# is only installed when the variable is missing, so a real token is never
# blanked out. An `if` statement is used (rather than setdefault()) so the cell
# has no return value and the token is never echoed as cell output.
if 'ZENODO_ACCESS_TOKEN' not in os.environ:
    os.environ['ZENODO_ACCESS_TOKEN'] = ''

In [5]:
# Connect to Zenodo's sandbox service.
# The token is read from the ZENODO_ACCESS_TOKEN environment variable; the
# lookup raises KeyError if that variable is not set.
zen = Zenodo('https://sandbox.zenodo.org', token=os.environ['ZENODO_ACCESS_TOKEN'])

In [None]:
# Create a new deposition through the `zen` client (which targets the sandbox
# server) and keep a handle to it for the upload/metadata/publish steps
new_deposition = zen.depositions.create()

In [None]:
# Upload every file of the long_term file list to the new deposition.
# path_list is a method of FilesList and has to be called: iterating over the
# bare attribute raises "TypeError: 'method' object is not iterable".
for file in long_term.path_list():
    new_deposition.upload_file(file)

In [None]:
import json

# Load the deposition metadata from a JSON template file.
template = 'wv/wv_long-term_go_v20230619_2000_2022.json'
# JSON is UTF-8 by specification; state the encoding explicitly so parsing does
# not depend on the platform's locale default.
with open(template, 'r', encoding='utf-8') as file:
    metadata = json.load(file)

In [None]:
# Attach the metadata loaded from the JSON template to the deposition
# (whether the assignment is sent to the server immediately is decided by zen -- TODO confirm)
new_deposition.metadata = metadata

In [None]:
# Publish dataset
# NOTE(review): run this only once files and metadata are final -- publishing is
# presumably not something that can be undone from this notebook; confirm.
new_deposition.publish()

### Set up the topology of dataset depositions

In [6]:
# Fetch the depositions available to this client (presumably those owned by the
# account the access token belongs to -- confirm against the zen API)
deposition_list = zen.depositions.list()

In [7]:
# Sort the depositions in place by title so their order in the list is
# deterministic for the steps that follow
deposition_list.sort(key=lambda d: d.title)

In [8]:
# List the depositions' titles, in the current (sorted) list order, to check
# which record sits at which position
[d.title for d in deposition_list]

['Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Long-term data (2000-2022)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2000-2002)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2003-2005)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2006-2008)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2009-2011)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2012-2014)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2015-2017)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2018-2020)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2021-2022)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Yearly time-series (2000-2011)',
 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Yearly time-series (2012-2022)']

In [None]:
from zen.api import setup_linear_related_identifiers

In [None]:
# Switch depositions to edit mode
# (done for every deposition first, before any relation is rewritten)
for d in deposition_list:
    d.edit()

# Clear and set up a new set of relations between the depositions.
# The list was sorted by title beforehand; presumably that order defines the
# linear chain of related identifiers -- confirm in zen's documentation.
setup_linear_related_identifiers(deposition_list)

# Publish changes
# (every deposition that was switched to edit mode is published again)
for d in deposition_list:
    d.publish()

### Computing dataset size

In [9]:
# Show the current working directory: the relative paths used in this notebook
# ('wv/...', '../../zendata') are resolved against it
os.getcwd()

'/home/rolf/gl/zen/docs'

In [10]:
# Move to the data directory so the relative 'wv/...' cache paths resolve there.
# NOTE: the target is relative to the current directory, so this cell is not
# idempotent -- running it a second time in the same kernel session resolves
# '../../zendata' from the new location and targets a different directory.
os.chdir('../../zendata')

In [12]:
# Monthly time-series dataset files: the 'n' variable of the plain procedure is
# merged with the 'm'/'sd' variables of the two seasconv procedures (5
# procedure/variable pairs), then expanded over the monthly date ranges.
monthly = (
    FilesList(oemc_filename)
    .expand(var_procedure=['mcd19a2v061'])
    .expand(variable=['n'])
    .merge(
        FilesList(oemc_filename)
        .expand(var_procedure=['mcd19a2v061.seasconv', 'mcd19a2v061.seasconv.whittaker'])
        .expand(variable=['m', 'sd'])
    )
    # [1:] drops the first element of both sequences, i.e. the first monthly
    # period of the range is left out of the series
    .expand(
        start_date=date_seq('20000101', '20221231', delta(months=1))[1:],
        end_date=date_seq('20000131', '20221231', delta(months=1))[1:],
    )
    # The remaining placeholders take a single value each
    .expand(
        var_generic=['wv'],
        resolution=['1km'],
        depth=['s'],
        bbox=['go'],
        epsg=['epsg.4326'],
        version=['v20230619'],
    )
    .set_dir("http://192.168.1.30:8333/global/wv/")
    .cache('wv/wv_monthly_files.json', num_threads=10)
)

100%|██████████| 1375/1375 [00:02<00:00, 650.52it/s]


In [11]:
# Open files list from cache: restores the monthly list from the JSON file that
# .cache() was given when the list was built, instead of rebuilding it
monthly = FilesList.from_cache('wv/wv_monthly_files.json')

In [14]:
# Size in GB (decimal): data_size() is scaled by 1e-9, so it presumably returns
# the total size in bytes -- confirm against zen's FilesList documentation
monthly.data_size() * 1e-9

361.71041900200004

In [15]:
# Average size by year (GB); 23 is the number of calendar years covered by the
# 2000-2022 date range used to build the monthly list
monthly.data_size() * 1e-9 / 23

15.726539956608697

### Splitting the dataset and checking uploaded files

In [22]:
# Get the 'Monthly time-series (2018-2020)' deposition.
# It is selected by its exact title rather than by a fixed position in the list:
# a positional index silently points at a different record as soon as the
# account holds one more (or one fewer) deposition, and files are uploaded to
# this deposition later on. Raises StopIteration if no deposition has this title.
d_m18_20 = next(
    d for d in deposition_list
    if d.title == 'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2018-2020)'
)
d_m18_20.title

'Monthly aggregated Water Vapor MODIS MCD19A2 (1 km): Monthly time-series (2018-2020)'

In [23]:
# Filter all files in deposition of monthly WV 2018-2020: keep the entries of
# `monthly` that match the deposition's files (the matching criterion is the
# default defined by zen -- presumably the file name; TODO confirm)
m18_20 = monthly.filter_matched(d_m18_20.files)

In [24]:
# Alternative way to filter the files list, using the placeholders' values stored
# under the 'properties' key of each entry. The dates are compared as strings;
# for fixed-width 'YYYYMMDD' values that ordering equals chronological order.
m18_20 = monthly.filter(
    lambda f: (
        f['properties']['start_date'] >= '20180101'
        and f['properties']['end_date'] <= '20201231'
    )
)

In [25]:
# Paths of the 2018-2020 files that have no counterpart among the deposition's
# files when compared on the 'checksum' field
files_to_update = (
    m18_20
    .filter_unmatched(d_m18_20.files, field='checksum')
    .path_list()
)

In [26]:
# List of unmatched files: the paths whose 'checksum' was not matched by any
# file of the deposition (these are the candidates for a new upload)
files_to_update

['http://192.168.1.30:8333/global/wv/wv_mcd19a2v061.seasconv.whittaker_m_1km_s_20201101_20201130_go_epsg.4326_v20230619.tif']

In [None]:
# Upload again: send each unmatched file to the 2018-2020 deposition
# (presumably the upload replaces the stale copy stored there -- confirm with zen)
for file in files_to_update:
    d_m18_20.upload_file(file)