Create a mapping (saved as JSON) which points each rgi_id to its provide region and batch file:
  - e.g. P01/0_500
    - provide region 1
    - batch 0_500

# Imports

In [4]:
import xarray as xr
import numpy as np
import pandas as pd
import glob
import os
import sys
import json
import time
from oggm import utils

In [5]:
base_path = os.getcwd()

# Walk up from the current working directory until we reach the project
# base directory, i.e. the directory whose name is 'provide'.
path_to_add = base_path
while os.path.basename(path_to_add) != 'provide':
    parent_dir = os.path.dirname(path_to_add)
    if parent_dir == path_to_add:
        # We reached the filesystem root without finding the project
        # directory. Without this guard the loop would spin forever when
        # the notebook is started from outside the project tree.
        raise FileNotFoundError(
            "Could not find a parent directory named 'provide' above "
            f"{base_path!r}; start the notebook from inside the project.")
    path_to_add = parent_dir

# add paths for tools and data so the project modules below can be imported
things_to_add = ['general_tools', 'aggregation_tools', 'general_data_for_aggregation']
for thing in things_to_add:
    sys.path.append(os.path.join(path_to_add, thing))

# import stuff we need
from general_tools import check_if_notebook
from oggm_result_filepath_and_realisations import (gcms_mesmer, quantiles_mesmer,
    scenarios_mesmer, oggm_result_dir, provide_regions, raw_oggm_output_file)

In [6]:
# Directory inside the project root that holds the shared JSON lookup files;
# the cells below read their inputs from it and write the result into it.
output_dir_general_data = os.path.join(path_to_add, 'general_data_for_aggregation')

# Template for resulting conversion list (drop connectivity=2, drop RGI19, only common running glaciers)

In [30]:
# Fetch the RGI 6.2 statistics table through OGGM's file downloader and load
# it into a DataFrame.
rgi_stats_url = 'https://cluster.klima.uni-bremen.de/~oggm/rgi/rgi62_stats.h5'
frgi = utils.file_downloader(rgi_stats_url)
df_rgi = pd.read_hdf(frgi, index_col=0)

# NOTE: despite its name, this list holds the ids we want to KEEP, i.e. every
# entry whose 'Connect' value is different from 2.
is_not_connect_2 = df_rgi['Connect'] != 2
rgi_ids_connect_2 = list(df_rgi[is_not_connect_2].index)

In [44]:
# template structure: the keys of this mapping are all candidate rgi_ids
fp_rgi_prov_region = "rgi_ids_to_provide_region.json"
with open(os.path.join(output_dir_general_data, fp_rgi_prov_region), 'r') as f:
    dict_rgis_to_preg = json.load(f)

# open common running glaciers
fp_common_running = "commonly_running_glaciers.json"
with open(os.path.join(output_dir_general_data, fp_common_running), 'r') as f:
    list_common_running = json.load(f)

# start from every rgi_id which has a provide region assigned
candidate_rgi_ids = set(dict_rgis_to_preg)

# drop connectivity level 2 (rgi_ids_connect_2 holds the ids with Connect != 2)
candidate_rgi_ids &= set(rgi_ids_connect_2)

# drop rgi_region 19
# The slice is a string, so it must be compared against the string '19';
# comparing against the integer 19 is always unequal and dropped nothing.
# Assumes ids look like 'RGI60-19.00001', i.e. characters 6:8 are the region.
candidate_rgi_ids = {rgi_id for rgi_id in candidate_rgi_ids
                     if rgi_id[6:8] != '19'}

# keep only common running glaciers
candidate_rgi_ids &= set(list_common_running)

all_rgi_ids = list(candidate_rgi_ids)

# create template structure and set everything to None
dict_rgis_to_batch = dict.fromkeys(all_rgi_ids)

assert all(value is None for value in dict_rgis_to_batch.values())

In [45]:
# number of rgi_ids left in the template after all filters were applied
len(dict_rgis_to_batch)

206685

# define region and batch per rgi_id

In [48]:
# use one realisation for checking where glaciers are located
start_time = time.time()
dummy_file = raw_oggm_output_file.format(scenarios_mesmer[0],
                                         gcms_mesmer[0],
                                         quantiles_mesmer[0])
for region in provide_regions:
    print(f'Region {region}')
    for file_path in glob.glob(os.path.join(oggm_result_dir, region, dummy_file)):
        batch_nr = file_path.split('_')[-2] + '_' + file_path.split('_')[-1].split('.')[0]

        with xr.open_dataset(file_path) as ds:
            rgi_ids = ds.rgi_id.values
            # only keep rgi_ids which we need (e.g. only common running)
            rgi_ids = list(set(rgi_ids) & set(list(dict_rgis_to_batch)))

            # before updating check if all values are still None
            if not all(dict_rgis_to_batch[rgi_id] is None for rgi_id in rgi_ids):
                assigned_rgi_ids = []
                assigned_batch = []
                for rgi_id in rgi_ids:
                    if dict_rgis_to_batch[rgi_id] is not None:
                        assigned_rgi_ids.append(rgi_id)
                        assigned_batch.append(dict_rgis_to_batch[rgi_id])
                raise AttributeError(f'some rgi_ids ({assigned_rgi_ids}) where already assigned '
                                     f'to a different file ({assigned_batch})! '
                                     f'Current batch: {region}/{batch_nr}')
            dict_rgis_to_batch.update({rgi_id: f'{region}/{batch_nr}' for rgi_id in rgi_ids})

# at the end all should be assined
assert all(value is not None for value in dict_rgis_to_batch.values())

# save conversion list as json
with open(os.path.join(output_dir_general_data,
                       "rgi_ids_to_result_batch.json"), "w") as outfile: 
    json.dump(dict_rgis_to_batch, outfile)

print(f'Time needed: {time.time() - start_time:.1f} s')

Region P01
Region P02
Region P03
Region P04
Region P05
Region P06
Region P07
Region P08
Region P09
Region P10
Region P11
Region P12
Time needed: 18.2 s
