# Stitch and normalise

In [156]:
# Load the nb_black extension so each executed cell is reformatted with black.
%load_ext nb_black

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [157]:
import glob
import os
import os.path
import re
import traceback
import warnings
from concurrent.futures import as_completed, ProcessPoolExecutor
from multiprocessing import Pool
from time import sleep

import netcdf_scm.retractions
import netcdf_scm.stitching
import tqdm.autonotebook as tqdman

<IPython.core.display.Javascript object>

In [158]:
# Switch for the optional single-file check: when True, one file is run through
# stitch_and_normalise with catch=False and verbose=True and the result displayed.
RUN_CHECK = False

<IPython.core.display.Javascript object>

In [159]:
# Root directory that is searched recursively for crunched ``*.nc`` files.
CRUNCH_DIR = "./country-crunch"
# Directory the stitched and normalised output files are written to.
STITCHED_NORMALISED_DIR = "./country-crunch-stitched-normalised"
# Number of worker processes given to the ProcessPoolExecutor.
MAX_WORKERS = 60

<IPython.core.display.Javascript object>

In [160]:
# Create the output directory (and any missing parents); a no-op if it already
# exists. Pure Python avoids shell quoting issues with the interpolated path.
os.makedirs(STITCHED_NORMALISED_DIR, exist_ok=True)

<IPython.core.display.Javascript object>

In [161]:
# Find every crunched netCDF file below CRUNCH_DIR, then keep only paths that
# contain "ssp" but do not contain "ssp245-".
# (Remove the "ssp245-" test to keep those paths as well.)
all_nc_files = glob.glob(os.path.join(CRUNCH_DIR, "**", "*.nc"), recursive=True)

ssp_files = []
for nc_path in all_nc_files:
    if "ssp" not in nc_path:
        continue
    if "ssp245-" in nc_path:
        continue
    ssp_files.append(nc_path)

display(len(ssp_files))
ssp_files[:2]

311

['./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp370_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp585/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp585_r1i1p1f1_gn_201501-210012.nc']

<IPython.core.display.Javascript object>

In [162]:
# Peek at the last path in the list.
ssp_files[-1]

'./country-crunch/netcdf-scm-crunched/CMIP6/AerChemMIP/MIROC/MIROC6/ssp370-lowNTCF/r1i1p1f1/Amon/tas/gn/v20190807/netcdf-scm_tas_Amon_MIROC6_ssp370-lowNTCF_r1i1p1f1_gn_201501-205512.nc'

<IPython.core.display.Javascript object>

In [163]:
# Distinct climate models among the files. Component 6 of the split path is the
# model directory for paths shaped like
# ./country-crunch/netcdf-scm-crunched/CMIP6/<activity>/<institution>/<model>/...
cms = {path.split(os.sep)[6] for path in ssp_files}
display(len(cms))
cms

42

{'ACCESS-CM2',
 'ACCESS-ESM1-5',
 'AWI-CM-1-1-MR',
 'BCC-CSM2-MR',
 'BCC-ESM1',
 'CAMS-CSM1-0',
 'CESM2',
 'CESM2-WACCM',
 'CIESM',
 'CMCC-CM2-SR5',
 'CMCC-ESM2',
 'CNRM-CM6-1',
 'CNRM-CM6-1-HR',
 'CNRM-ESM2-1',
 'CanESM5',
 'CanESM5-CanOE',
 'EC-Earth3',
 'EC-Earth3-CC',
 'EC-Earth3-Veg',
 'EC-Earth3-Veg-LR',
 'FGOALS-f3-L',
 'FGOALS-g3',
 'GISS-E2-1-G',
 'HadGEM3-GC31-LL',
 'HadGEM3-GC31-MM',
 'IITM-ESM',
 'INM-CM4-8',
 'INM-CM5-0',
 'IPSL-CM6A-LR',
 'KACE-1-0-G',
 'KIOST-ESM',
 'MIROC-ES2L',
 'MIROC6',
 'MPI-ESM-1-2-HAM',
 'MPI-ESM1-2-HR',
 'MPI-ESM1-2-LR',
 'MRI-ESM2-0',
 'NESM3',
 'NorESM2-LM',
 'NorESM2-MM',
 'TaiESM1',
 'UKESM1-0-LL'}

<IPython.core.display.Javascript object>

In [164]:
# TODO: move this into netcdf_scm
# Turn each file's directory (path components 3 up to, but excluding, the file
# name) into a dot-separated id and pass the ids to check_retractions.
dataset_ids = [".".join(f.split(os.sep)[3:-1]) for f in ssp_files]
retracted_ids = netcdf_scm.retractions.check_retractions(
    dataset_ids, esgf_query_batch_size=20
)

# Map every retracted id back to the single crunched file in its directory.
crunched_root = os.path.join(CRUNCH_DIR, "netcdf-scm-crunched")
retracted_files = []
for retracted_id in retracted_ids:
    retracted_dir = os.path.join(crunched_root, retracted_id.replace(".", os.sep))
    dir_contents = os.listdir(retracted_dir)
    # Exactly one file is expected per dataset directory.
    assert len(dir_contents) == 1
    retracted_files.append(os.path.join(retracted_dir, dir_contents[0]))

retracted_files

Querying ESGF (submitting jobs):   0%|          | 0/16 [00:00<?, ?it/s]

Retrieving results from ESGF jobs:   0%|          | 0/16 [00:00<?, ?it/s]



['./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp245/r2i1p1f1/Amon/tas/gr/v20190925/netcdf-scm_tas_Amon_EC-Earth3-Veg_ssp245_r2i1p1f1_gr_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp245/r1i1p1f1/Amon/tas/gr/v20190629/netcdf-scm_tas_Amon_EC-Earth3-Veg_ssp245_r1i1p1f1_gr_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp119/r1i1p1f1/Amon/tas/gr/v20190711/netcdf-scm_tas_Amon_EC-Earth3-Veg_ssp119_r1i1p1f1_gr_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp585/r1i1p1f1/Amon/tas/gr/v20190119/netcdf-scm_tas_Amon_IPSL-CM6A-LR_ssp585_r1i1p1f1_gr_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp126/r1i1p1f1/Amon/tas/gr/v20190121/netcdf-scm_tas_Amon_IPSL-CM6A-LR_ssp126_r1i1p1f1_gr_201501-210012.nc',
 './country-crunch/netcdf-scm

<IPython.core.display.Javascript object>

In [165]:
# Drop the retracted files. A set gives constant-time membership tests instead
# of scanning the retracted list once per candidate file.
retracted_lookup = set(retracted_files)
ssp_files = [f for f in ssp_files if f not in retracted_lookup]
display(len(ssp_files))

288

<IPython.core.display.Javascript object>

In [166]:
def stitch_and_normalise(f, catch=True, norm_years=21, normalise=True, verbose=False):
    """
    Stitch a crunched file onto its parent data, normalise it and save the result.

    The file is stitched with
    ``netcdf_scm.stitching.get_continuous_timeseries_with_meta`` (CMIP6Output
    data reference syntax). Unless ``normalise`` is False, the stitched data is
    then normalised against the piControl file reported by the stitching step,
    using a running mean over ``norm_years`` years, and written to
    ``STITCHED_NORMALISED_DIR``.

    Parameters
    ----------
    f : str
        Path of the crunched file to process. The MIP table and the grid label
        are read from the third and second-to-last ``_``-separated fields of
        its file name.

    catch : bool
        If True, all warnings are suppressed while processing and any exception
        is re-raised as a ``ValueError`` whose message is ``"File failed: <f>"``
        (the original exception is chained as the cause). If False, warnings and
        exceptions propagate unchanged.

    norm_years : int
        Passed as ``nyears`` to ``NormaliserRunningMean``.

    normalise : bool
        If False, return the stitched data straight away; nothing is normalised
        or written.

    verbose : bool
        If True, print a progress message before each step.

    Returns
    -------
    object or None
        The stitched run when ``normalise`` is False. Otherwise None, both when
        a new file was written and when the output file already existed (in
        which case nothing is recomputed or overwritten).

    Raises
    ------
    ValueError
        If ``catch`` is True and any step fails.
    """
    # NOTE(review): netcdf_scm.io and netcdf_scm.normalisation are used below but
    # are not imported explicitly in the notebook's import cell; presumably they
    # are loaded as a side effect of importing netcdf_scm.stitching -- confirm.

    def get_result():
        if verbose:
            print(f"Loading and stitching {f}")
        (
            scmrun,
            picontrol_branching_time,
            picontrol_file,
        ) = netcdf_scm.stitching.get_continuous_timeseries_with_meta(
            f, drs="CMIP6Output"
        )

        if not normalise:
            if verbose:
                print("Not normalising, returning")
            return scmrun

        # Metadata used to build the output file name.
        variable = scmrun.get_unique_meta("variable", True)
        climate_model = scmrun.get_unique_meta("climate_model", True)
        scenario = scmrun.get_unique_meta("scenario", True)
        member_id = scmrun.get_unique_meta("member_id", True)

        # The output name carries the time span of the stitched (not the input) data.
        min_time = scmrun["time"].min()
        max_time = scmrun["time"].max()
        time_range = (
            f"{min_time.year}{min_time.month:02d}-{max_time.year}{max_time.month:02d}"
        )

        # Table and grid are not read from the run's metadata here, so take them
        # from the input file name.
        name_parts = os.path.basename(f).split("_")
        table = name_parts[2]
        grid = name_parts[-2]

        # Use the table parsed from the input name rather than a hard-coded
        # "Amon", so files from other tables keep their own label.
        out_name = (
            f"netcdf-scm_{variable}_{table}_{climate_model}_{scenario}_"
            f"{member_id}_{grid}_{time_range}.nc"
        )
        out_file = os.path.join(STITCHED_NORMALISED_DIR, out_name)

        # Skip the normalisation when a previous run already produced this file.
        if os.path.isfile(out_file):
            if verbose:
                print(f"Out file already exists: {out_file}")
            return None

        if verbose:
            print(f"Loading {picontrol_file}")

        picontrol_scmrun = netcdf_scm.io.load_scmrun(picontrol_file)
        picontrol_scmrun.metadata["netcdf-scm crunched file"] = picontrol_file

        if verbose:
            print(f"Normalising using {norm_years} years")

        normaliser = netcdf_scm.normalisation.NormaliserRunningMean(nyears=norm_years)
        normalised = normaliser.normalise_against_picontrol(
            scmrun, picontrol_scmrun, picontrol_branching_time
        )

        normalised["grid"] = grid

        # Write a copy whose metadata keys have their parentheses removed.
        # NOTE(review): presumably the netCDF writer cannot handle such keys -- confirm.
        normalised_to_disk = normalised.copy()
        normalised_to_disk.metadata = {
            k.replace("(", "").replace(")", ""): v
            for k, v in normalised_to_disk.metadata.items()
        }

        if verbose:
            print(f"Saving to {out_file}")

        normalised_to_disk.to_nc(out_file)

        return None

    if not catch:
        return get_result()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        try:
            return get_result()
        except Exception as exc:
            # Keep this message format stable: callers identify the failing
            # file from it.
            raise ValueError("File failed: {}".format(f)) from exc

<IPython.core.display.Javascript object>

In [167]:
# Use the first file whose path mentions "ACCESS" as the single-file test case.
access_files = [path for path in ssp_files if "ACCESS" in path]
checker = access_files[0]
checker

'./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/CSIRO-ARCCSS/ACCESS-CM2/ssp370/r2i1p1f1/Amon/tas/gn/v20200303/netcdf-scm_tas_Amon_ACCESS-CM2_ssp370_r2i1p1f1_gn_201501-210012.nc'

<IPython.core.display.Javascript object>

In [168]:
# Optional smoke test: process the single `checker` file in this process, with
# exceptions left unwrapped (catch=False) and progress messages printed.
if RUN_CHECK:
    tmp = stitch_and_normalise(checker, catch=False, verbose=True)
    display(tmp)

<IPython.core.display.Javascript object>

In [169]:
# source = netcdf_scm.io.load_scmrun(checker)

<IPython.core.display.Javascript object>

In [170]:
# scmdata.run_append([source, parent]).filter(region="World").lineplot()

<IPython.core.display.Javascript object>

In [171]:
# Debugging aid: restrict the run to files whose path contains this string.
# Set to None to process every file in ssp_files.
DEBUG_MODEL_FILTER = "MPI-ESM1-2-HR"

if DEBUG_MODEL_FILTER is not None:
    ssp_files = [f for f in ssp_files if DEBUG_MODEL_FILTER in f]
ssp_files

['./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR/ssp585/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp585_r2i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR/ssp126/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp126_r2i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp370/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp370_r2i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp370/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp370_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp585_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2

<IPython.core.display.Javascript object>

In [172]:
# Run stitch_and_normalise for every file in a pool of worker processes.
# `errors` buffers formatted tracebacks until the next flush; `all_errors`
# collects the path of every file whose job raised.
all_errors = []
errors = []

# The context manager shuts the executor down once the cell finishes, so the
# worker processes do not linger after the run (or after an exception).
with ProcessPoolExecutor(max_workers=MAX_WORKERS) as pool:
    # Remember which file each future belongs to, so a failure can be attributed
    # directly instead of being parsed back out of the traceback text.
    future_to_file = {
        pool.submit(stitch_and_normalise, f): f for f in tqdman.tqdm(ssp_files)
    }
    futures = list(future_to_file)

    for i, future in tqdman.tqdm(
        enumerate(as_completed(futures, timeout=None)), total=len(futures)
    ):
        try:
            future.result()
        except Exception:
            errors.append(traceback.format_exc())
            all_errors.append(future_to_file[future])

        # Flush buffered tracebacks after the 11th result, then every 50 results,
        # and once more after the final result.
        if i % 50 == 10 or i == len(futures) - 1:
            if errors:
                print("\n\n".join(errors))
            errors = []

all_errors

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

concurrent.futures.process._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "<ipython-input-166-141b9142bf74>", line 76, in stitch_and_normalise
    return get_result()
  File "<ipython-input-166-141b9142bf74>", line 9, in get_result
    ) = netcdf_scm.stitching.get_continuous_timeseries_with_meta(
  File "/data/ubuntu-znicholls/miniconda3/envs/cmip6-country-level-processing/lib/python3.8/site-packages/netcdf_scm/stitching.py", line 237, in get_continuous_timeseries_with_meta
    parent_file_path = get_parent_file_path(infile, parent_replacements, drs)
  File "/data/ubuntu-znicholls/miniconda3/envs/cmip6-country-level-processing/lib/python3.8/site-packages/netcdf_scm/stitching.py", line 357, in get_parent_file_path
    raise IOError(
OSError: No parent data (historical) available for ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp370/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp370_r2i1p1f1_gn_201501-210012.nc, we loo

['./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp245/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp245_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp370/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp370_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2-HR/ssp126/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp126_r2i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp370/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp370_r2i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp245/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp245_r2i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DWD/MPI-ESM1-2

<IPython.core.display.Javascript object>

In [69]:
# List the institution directories under the CMIP activity of the raw archive.
!ls /data/cmip6/CMIP6/CMIP/

AS-RCEC  CCCma	       CSIRO-ARCCSS	    INM    MPI-M      NIMS-KMA	 UA
AWI	 CCCR-IITM     E3SM-Project	    IPSL   MRI	      NOAA-GFDL
BCC	 CMCC	       EC-Earth-Consortium  KIOST  NASA-GISS  NUIST
CAMS	 CNRM-CERFACS  FIO-QLNM		    MIROC  NCAR       SNU
CAS	 CSIRO	       HAMMOZ-Consortium    MOHC   NCC	      THU


<IPython.core.display.Javascript object>

In [70]:
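# The historical data for MPI-ESM1-2-HR was submitted by MPI-M, but the scenarios
# were submitted by DKRZ (and DWD), which is presumably why no parent is found.
# Example of an affected file:
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp585_r1i1p1f1_gn_201501-210012.nc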

SyntaxError: invalid syntax (<ipython-input-70-a923ca543572>, line 2)

ERROR:root:Cannot parse: 2:1: ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/DKRZ/MPI-ESM1-2-HR/ssp585/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp585_r1i1p1f1_gn_201501-210012.nc
Traceback (most recent call last):
  File "/data/ubuntu-znicholls/miniconda3/envs/cmip6-country-level-processing/lib/python3.8/site-packages/lab_black.py", line 218, in format_cell
    formatted_code = _format_code(cell)
  File "/data/ubuntu-znicholls/miniconda3/envs/cmip6-country-level-processing/lib/python3.8/site-packages/lab_black.py", line 29, in _format_code
    return format_str(src_contents=code, mode=FileMode())
  File "/data/ubuntu-znicholls/miniconda3/envs/cmip6-country-level-processing/lib/python3.8/site-packages/black/__init__.py", line 974, in format_str
    src_node = lib2to3_parse(src_contents.lstrip(), mode.target_versions)
  File "/data/ubuntu-znicholls/miniconda3/envs/cmip6-country-level-processing/lib/python3.8/site-packages/black/__init__.py", line 1083, in

In [None]:
# parent_variant r2i1p1f1 doesn't exist...
# Show the parent-related attributes in the header of the crunched CIESM historical file.
!ncdump -h ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/THU/CIESM/historical/r1i1p1f1/Amon/tas/gr/v20200417/netcdf-scm_tas_Amon_CIESM_historical_r1i1p1f1_gr_185001-201412.nc | grep parent

In [None]:
# Notes on files that failed, grouped by cause (kept as comments so that the
# cell is valid Python).

# Branch time units wrong?
#   ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/NUIST/NESM3/ssp585/r1i1p1f1/Amon/tas/gn/v20190728/netcdf-scm_tas_Amon_NESM3_ssp585_r1i1p1f1_gn_201501-210012.nc

# piControl too short
#   ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp585/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp585_r1i1p1f1_gn_201501-210012.nc

# KeyError: branch_time_in_parent
#   ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/Amon/tas/gn/v20200702/netcdf-scm_tas_Amon_CESM2-WACCM_ssp585_r1i1p1f1_gn_201501-229912.nc

# Branching time `185804` not available in piControl data in ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/CAMS/CAMS-CSM1-0/piControl/r1i1p1f1/Amon/tas/gn/v20190729/netcdf-scm_tas_Amon_CAMS-CSM1-0_piControl_r1i1p1f1_gn_290001-339912.nc
#   ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/CAMS/CAMS-CSM1-0/ssp126/r1i1p1f1/Amon/tas/gn/v20190708/netcdf-scm_tas_Amon_CAMS-CSM1-0_ssp126_r1i1p1f1_gn_201501-209912.nc

# Branching time `003410` not available in piControl data in ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/CAS/FGOALS-f3-L/piControl/r1i1p1f1/Amon/tas/gr/v20191029/netcdf-scm_tas_Amon_FGOALS-f3-L_piControl_r1i1p1f1_gr_060001-116012.nc
#   ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/CAS/FGOALS-f3-L/ssp585/r1i1p1f1/Amon/tas/gr/v20191013/netcdf-scm_tas_Amon_FGOALS-f3-L_ssp585_r1i1p1f1_gr_201501-210012.nc

# Branching time `232001` not available in piControl data in ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/AS-RCEC/TaiESM1/piControl/r1i1p1f1/Amon/tas/gn/v20200211/netcdf-scm_tas_Amon_TaiESM1_piControl_r1i1p1f1_gn_020101-070012.nc
#   ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp585/r1i1p1f1/Amon/tas/gn/v20200901/netcdf-scm_tas_Amon_TaiESM1_ssp585_r1i1p1f1_gn_201501-210012.nc



In [None]:
# Number of failed-file entries collected by the pool run.
len(all_errors)

In [None]:
# List the output files whose name contains "_ssp245_"; the trailing
# "#| wc -l" is commented out -- re-enable it to count instead of list.
!find {STITCHED_NORMALISED_DIR} -name '*_ssp245_*' -type f  #| wc -l