# Stitch and normalise

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import datetime as dt
import glob
import os
import os.path
import re
import traceback
import warnings
from concurrent.futures import as_completed, ProcessPoolExecutor
from multiprocessing import Pool
from time import sleep

import netcdf_scm.retractions
import netcdf_scm.stitching
import pandas as pd
import scmdata
import tqdm.autonotebook as tqdman
import xarray as xr

  import tqdm.autonotebook as tqdman


<IPython.core.display.Javascript object>

In [3]:
# Set to True to run the step-by-step debugging cells (the `if RUN_CHECK:`
# cells further down), which work on the single file named by `checker`.
RUN_CHECK = False

<IPython.core.display.Javascript object>

In [4]:
# Root directory that is searched recursively for crunched `*.nc` input files.
CRUNCH_DIR = "./country-crunch"
# Output directory used by stitch_and_normalise when normalise=False.
STITCHED_DIR = "./country-crunch-stitched"
# Output directory used by stitch_and_normalise when normalise=True.
STITCHED_NORMALISED_DIR = "./country-crunch-stitched-normalised"
# Number of worker processes given to the ProcessPoolExecutor below.
MAX_WORKERS = 60

<IPython.core.display.Javascript object>

In [5]:
# Create the output directories, including any missing parents; do nothing if
# they already exist. Pure Python so it does not depend on a POSIX shell.
os.makedirs(STITCHED_DIR, exist_ok=True)
os.makedirs(STITCHED_NORMALISED_DIR, exist_ok=True)

<IPython.core.display.Javascript object>

In [6]:
# Select every crunched file whose path mentions "ssp", leaving out any path
# that contains "ssp245-".
# (To keep those files too, reduce the condition to `"ssp" in nc_path`.)
ssp_files = [
    nc_path
    for nc_path in glob.iglob(os.path.join(CRUNCH_DIR, "**", "*.nc"), recursive=True)
    if "ssp245-" not in nc_path and "ssp" in nc_path
]
display(len(ssp_files))
ssp_files[:2]

487

['./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp370_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp585/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp585_r1i1p1f1_gn_201501-210012.nc']

<IPython.core.display.Javascript object>

In [7]:
ssp_files[-1]

'./country-crunch/netcdf-scm-crunched/CMIP6/AerChemMIP/MIROC/MIROC6/ssp370-lowNTCF/r3i1p1f1/Amon/tas/gn/v20190807/netcdf-scm_tas_Amon_MIROC6_ssp370-lowNTCF_r3i1p1f1_gn_201501-205512.nc'

<IPython.core.display.Javascript object>

In [8]:
# Unique climate model names among the selected files.
# The crunched paths end with
#   .../<model>/<experiment>/<member>/<table>/<variable>/<grid>/<version>/<file>
# so the model is the 8th component from the end. Counting from the end keeps
# this correct even if CRUNCH_DIR is moved to a different directory depth.
cms = {f.split(os.sep)[-8] for f in ssp_files}
display(len(cms))
print("\n".join(sorted(cms)))

47

ACCESS-CM2
ACCESS-ESM1-5
AWI-CM-1-1-MR
BCC-CSM2-MR
BCC-ESM1
CAMS-CSM1-0
CAS-ESM2-0
CESM2
CESM2-WACCM
CIESM
CMCC-CM2-SR5
CMCC-ESM2
CNRM-CM6-1
CNRM-CM6-1-HR
CNRM-ESM2-1
CanESM5
CanESM5-CanOE
E3SM-1-1
EC-Earth3
EC-Earth3-CC
EC-Earth3-Veg
EC-Earth3-Veg-LR
FGOALS-f3-L
FGOALS-g3
FIO-ESM-2-0
GFDL-CM4
GFDL-ESM4
GISS-E2-1-G
HadGEM3-GC31-LL
HadGEM3-GC31-MM
IITM-ESM
INM-CM4-8
INM-CM5-0
IPSL-CM6A-LR
KACE-1-0-G
KIOST-ESM
MIROC-ES2L
MIROC6
MPI-ESM-1-2-HAM
MPI-ESM1-2-HR
MPI-ESM1-2-LR
MRI-ESM2-0
NESM3
NorESM2-LM
NorESM2-MM
TaiESM1
UKESM1-0-LL


<IPython.core.display.Javascript object>

In [9]:
# TODO: move this into netcdf_scm
retracted_ids = netcdf_scm.retractions.check_retractions(
    [".".join(f.split(os.sep)[3:-1]) for f in ssp_files], esgf_query_batch_size=20
)
retracted_files = []
for i in retracted_ids:
    retracted_dir = os.path.join(
        CRUNCH_DIR, "netcdf-scm-crunched", i.replace(".", os.sep)
    )
    retracted_files_dir = os.listdir(retracted_dir)
    assert len(retracted_files_dir) == 1
    retracted_files.append(os.path.join(retracted_dir, retracted_files_dir[0]))

sorted(retracted_files)

Querying ESGF (submitting jobs):   0%|          | 0/25 [00:00<?, ?it/s]

Retrieving results from ESGF jobs:   0%|          | 0/25 [00:00<?, ?it/s]

KeyboardInterrupt: 

<IPython.core.display.Javascript object>

In [10]:
# Remove the retracted files from the selection.
retracted_lookup = set(retracted_files)
ssp_files = [path for path in ssp_files if path not in retracted_lookup]
display(len(ssp_files))

NameError: name 'retracted_files' is not defined

<IPython.core.display.Javascript object>

In [11]:
def get_bcc_csm2_mr_hack_extension(picontrol, norm_years):
    """
    Extend a piControl run by holding its end-of-run mean constant.

    The mean over the years ``last_year - norm_years`` to ``last_year``
    (inclusive) is appended as one extra time point at 16 January 12:00 of the
    year after the last piControl year. The result is then interpolated onto
    its existing time points plus 99 further years of time points that reuse
    the month, day and hour of the final piControl year's points, with
    constant extrapolation.

    NOTE(review): the averaging range covers ``norm_years + 1`` calendar
    years; confirm whether that extra year is intended.

    Parameters
    ----------
    picontrol : scmdata.ScmRun
        piControl data to extend.
    norm_years : int
        Number of years subtracted from the last year to find the start of
        the averaging range.

    Returns
    -------
    scmdata.ScmRun
        Extended run carrying ``picontrol``'s metadata.
    """
    final_year = picontrol["year"].max()

    # Mean over the tail of the run, labelled with the new time point.
    tail_years = range(final_year - norm_years, final_year + 1)
    tail_mean = picontrol.filter(year=tail_years).timeseries().mean(axis=1)
    tail_mean.name = dt.datetime(final_year + 1, 1, 16, 12)

    extended = scmdata.ScmRun(pd.concat([picontrol.timeseries(), tail_mean], axis=1))

    # Target time axis: existing points followed by the repeated final-year
    # pattern for the following 99 years.
    existing_times = extended["time"].tolist()
    final_year_times = extended.filter(year=int(final_year))["time"].tolist()
    repeated_times = []
    for year in range(final_year + 1, final_year + 100):
        for stamp in final_year_times:
            repeated_times.append(
                dt.datetime(year, stamp.month, stamp.day, stamp.hour)
            )

    # The first generated stamp is dropped, presumably because it coincides
    # with the appended mean point (TODO confirm).
    extended = extended.interpolate(
        existing_times + repeated_times[1:],
        extrapolation_type="constant",
    )
    extended.metadata = picontrol.metadata

    return extended

<IPython.core.display.Javascript object>

In [12]:
def stitch_and_normalise(f, catch=True, norm_years=21, normalise=True, verbose=False):
    """
    Stitch a crunched file onto its parents, optionally normalise it, and save it.

    The file is loaded with
    ``netcdf_scm.stitching.get_continuous_timeseries_with_meta`` using the
    ``CMIP6Output`` DRS. With ``normalise=True`` the stitched data is
    normalised against its piControl run using a running mean and written to
    ``STITCHED_NORMALISED_DIR``; otherwise the stitched data is written as is
    to ``STITCHED_DIR``. If the output file already exists nothing is written.

    Parameters
    ----------
    f : str
        Path of the crunched file. The MIP table and grid label are parsed
        from its basename (third and second-to-last ``_``-separated fields).
    catch : bool
        If True, all warnings are silenced and any exception is re-raised as
        a ``ValueError`` naming ``f``, chained to the original exception.
        If False, warnings and exceptions propagate unchanged.
    norm_years : int
        Window, in years, passed to ``NormaliserRunningMean``; also passed to
        ``get_bcc_csm2_mr_hack_extension`` for BCC-CSM2-MR.
    normalise : bool
        Whether to normalise against piControl (also selects the output dir).
    verbose : bool
        If True, print progress messages.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        When ``catch`` is True and any step fails.
    """

    def get_result():
        if verbose:
            print(f"Loading and stitching {f}")
        (
            scmrun,
            picontrol_branching_time,
            picontrol_file,
        ) = netcdf_scm.stitching.get_continuous_timeseries_with_meta(
            f, drs="CMIP6Output", return_picontrol_info=normalise
        )

        variable = scmrun.get_unique_meta("variable", True)
        climate_model = scmrun.get_unique_meta("climate_model", True)
        scenario = scmrun.get_unique_meta("scenario", True)
        member_id = scmrun.get_unique_meta("member_id", True)

        min_time = scmrun["time"].min()
        max_time = scmrun["time"].max()

        # Basename pattern: netcdf-scm_<variable>_<table>_..._<grid>_<timerange>.nc
        name_fields = os.path.basename(f).split("_")
        table = name_fields[2]
        grid = name_fields[-2]

        # Use the table parsed from the input name rather than a hard-coded
        # "Amon", so files from other tables keep their own table label.
        out_name = (
            f"netcdf-scm_{variable}_{table}_{climate_model}_{scenario}_"
            f"{member_id}_{grid}_"
            f"{min_time.year}{min_time.month:02d}-"
            f"{max_time.year}{max_time.month:02d}.nc"
        )

        out_dir = STITCHED_NORMALISED_DIR if normalise else STITCHED_DIR
        out_file = os.path.join(out_dir, out_name)

        # Existing outputs are never overwritten.
        if os.path.isfile(out_file):
            if verbose:
                print(f"Out file already exists: {out_file}")
            return None

        if normalise:
            if verbose:
                print(f"Loading {picontrol_file}")

            picontrol_scmrun = netcdf_scm.io.load_scmrun(picontrol_file)
            picontrol_scmrun.metadata["netcdf-scm crunched file"] = picontrol_file

            # Known issue for this model: "piControl too short" (see the
            # failure notes at the end of the notebook), so extend it first.
            if climate_model == "BCC-CSM2-MR":
                if verbose:
                    print("Performing hack extension of piControl")

                picontrol_scmrun = get_bcc_csm2_mr_hack_extension(
                    picontrol_scmrun, norm_years
                )

            if verbose:
                print(f"Normalising using {norm_years} years")

            normaliser = netcdf_scm.normalisation.NormaliserRunningMean(
                nyears=norm_years
            )

            out = normaliser.normalise_against_picontrol(
                scmrun, picontrol_scmrun, picontrol_branching_time
            )
        else:
            out = scmrun

        out["grid"] = grid

        # Parentheses are stripped from metadata keys on the copy that is
        # written to disk.
        out_to_disk = out.copy()
        out_to_disk.metadata = {
            k.replace("(", "").replace(")", ""): v
            for k, v in out_to_disk.metadata.items()
        }

        if verbose:
            print(f"Saving to {out_file}")

        out_to_disk.to_nc(out_file)

        return None

    if not catch:
        return get_result()

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        try:
            return get_result()
        except Exception as exc:
            # The message format is parsed by the error-collection regex in
            # the pool cell, so keep it unchanged.
            raise ValueError("File failed: {}".format(f)) from exc

<IPython.core.display.Javascript object>

In [13]:
# Single crunched file that the `if RUN_CHECK:` debugging cells operate on.
checker = "./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp370_r1i1p1f1_gn_201501-210012.nc"
checker

'./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp370_r1i1p1f1_gn_201501-210012.nc'

<IPython.core.display.Javascript object>

In [14]:
# RUN_CHECK = True
# %pdb off

<IPython.core.display.Javascript object>

In [15]:
if RUN_CHECK:
    # Run the whole pipeline on the check file with catch=False, so warnings
    # and the original exception are shown directly, and with progress
    # messages on. stitch_and_normalise always returns None, so `tmp` is None.
    tmp = stitch_and_normalise(checker, catch=False, verbose=True)
    display(tmp)

<IPython.core.display.Javascript object>

In [16]:
if RUN_CHECK:
    # Load the check file on its own (no stitching) for inspection.
    source = netcdf_scm.io.load_scmrun(checker)
    display(source)

<IPython.core.display.Javascript object>

In [17]:
if RUN_CHECK:
    # Parent replacements for `source`; fed to get_parent_file_path in the
    # next debugging cell.
    parent_replacements = netcdf_scm.stitching.get_parent_replacements(source)
    display(parent_replacements)

<IPython.core.display.Javascript object>

In [18]:
if RUN_CHECK:
    # Path of the parent experiment's file for the check file, under the
    # CMIP6Output DRS.
    parent_file = netcdf_scm.stitching.get_parent_file_path(
        checker, parent_replacements, "CMIP6Output"
    )
    display(parent_file)

<IPython.core.display.Javascript object>

In [19]:
if RUN_CHECK:
    # Load the parent experiment's data from the path found above.
    parent = netcdf_scm.io.load_scmrun(parent_file)

<IPython.core.display.Javascript object>

In [20]:
if RUN_CHECK:
    # Show the branch time of `parent` as reported with parent=True and with
    # the default setting, for side-by-side comparison.
    display(netcdf_scm.stitching.get_branch_time(parent, parent=True))
    display(netcdf_scm.stitching.get_branch_time(parent))

<IPython.core.display.Javascript object>

In [21]:
if RUN_CHECK:
    # Print the header attributes of the parent file that mention "parent".
    !ncdump -h {parent_file} | grep parent

<IPython.core.display.Javascript object>

In [22]:
if RUN_CHECK:
    # Load the BCC-CSM2-MR piControl run directly from its hard-coded path.
    picontrol = netcdf_scm.io.load_scmrun(
        "./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/BCC/BCC-CSM2-MR/piControl/r1i1p1f1/Amon/tas/gn/v20181016/netcdf-scm_tas_Amon_BCC-CSM2-MR_piControl_r1i1p1f1_gn_185001-244912.nc"
    )

<IPython.core.display.Javascript object>

In [23]:
if RUN_CHECK:
    # Inspect the loaded piControl run.
    display(picontrol)

<IPython.core.display.Javascript object>

In [24]:
if RUN_CHECK:
    # Build a copy of piControl whose time axis is shifted back by a fixed
    # number of years, so it can be compared with the other runs.
    picontrol_new_time = picontrol.timeseries(time_axis="year-month")
    # NOTE(review): 2289 and 1850 are not explained here; presumably the
    # piControl branch year and the start year it should line up with. Confirm.
    year_shift = 2289 - 1850
    # year_shift = 3030 - 1850
    picontrol_new_time.columns = picontrol_new_time.columns.map(
        lambda x: x - year_shift
    )
    # picontrol_new_time = picontrol_new_time.rolling(
    #     window=21 * 12, center=True, axis="columns"
    # ).mean()
    picontrol_new_time = scmdata.ScmRun(picontrol_new_time)
    display(picontrol_new_time)

<IPython.core.display.Javascript object>

In [25]:
if RUN_CHECK:
    # `hack_extension` is only ever a local variable inside
    # get_bcc_csm2_mr_hack_extension, so build it here from the piControl run
    # loaded in the earlier debugging cell. 21 years matches the default
    # `norm_years` of stitch_and_normalise.
    hack_extension = get_bcc_csm2_mr_hack_extension(picontrol, norm_years=21)
    ax = (
        # Alternatives to plot instead of the hack extension:
        #         scmdata.run_append([source, parent, picontrol])
        #         scmdata.run_append([source, parent, picontrol_new_time])
        scmdata.run_append([source, parent, hack_extension])
        .filter(region="World")
        .time_mean("AC")
        #     .filter(year=range(1850, 1950))
        .lineplot(time_axis="year", style="climate_model")
    )
    ax.set_xlim([1650, 2500])

<IPython.core.display.Javascript object>

In [26]:
# NOTE(review): this narrows the selection to BCC-CSM2-MR files only, so the
# pool run below processes just that model. Skip this cell to process every
# selected file.
ssp_files = [f for f in ssp_files if "BCC-CSM2-MR" in f]
ssp_files

['./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp370/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp370_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp585/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp585_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp126/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp126_r1i1p1f1_gn_201501-210012.nc',
 './country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp245/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp245_r1i1p1f1_gn_201501-210012.nc']

<IPython.core.display.Javascript object>

In [27]:
# Toggle: True writes stitched + normalised output to STITCHED_NORMALISED_DIR,
# False writes stitched-only output to STITCHED_DIR.
normalise = True

all_errors = []
errors = []

# The context manager shuts the worker processes down once all results have
# been collected, instead of leaving the pool alive in the kernel.
with ProcessPoolExecutor(max_workers=MAX_WORKERS) as pool:
    futures = []
    for f in tqdman.tqdm(ssp_files):
        futures.append(pool.submit(stitch_and_normalise, f, normalise=normalise))

    for i, future in tqdman.tqdm(
        enumerate(as_completed(futures, timeout=None)), total=len(futures)
    ):
        try:
            future.result()
        except Exception:
            # Keep the formatted traceback; the failing file name is pulled
            # out of its "File failed: ..." message below.
            errors.append(traceback.format_exc())

        # Report failures in batches (and once more after the last future)
        # rather than one traceback at a time.
        if i % 50 == 10 or i == len(futures) - 1:
            if errors:
                print("\n\n".join(errors))
            # A set de-duplicates paths that appear more than once within a
            # single traceback.
            all_errors += list(
                {
                    v
                    for e in errors
                    for v in re.findall(r".*File failed: (.*\.nc)", e)
                }
            )
            #         if errors:
            #             break
            errors = []

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]




<IPython.core.display.Javascript object>

In [28]:
all_errors

[]

<IPython.core.display.Javascript object>

In [29]:
len(all_errors)

0

<IPython.core.display.Javascript object>

In [30]:
!find {STITCHED_NORMALISED_DIR} -name '*MPI-ESM1-2-HR*_ssp245_*' -type f  #| wc -l

./country-crunch-stitched-normalised/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_ssp245_r1i1p1f1_gn_185001-210012.nc


<IPython.core.display.Javascript object>

In [31]:
!find {CRUNCH_DIR} -name '*MPI-ESM1-2-HR*' -type f  #| wc -l

./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/piControl/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_piControl_r1i1p1f1_gn_185001-234912.nc
./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r2i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_historical_r2i1p1f1_gn_185001-201412.nc
./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_185001-201412.nc
./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r10i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_historical_r10i1p1f1_gn_185001-201412.nc
./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r4i1p1f1/Amon/tas/gn/v20190710/netcdf-scm_tas_Amon_MPI-ESM1-2-HR_historical_r4i1p1f1_gn_185001-201412.nc
./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/histori

<IPython.core.display.Javascript object>

In [32]:
!ls /data/cmip6/CMIP6/CMIP/

AS-RCEC  CCCma	       CSIRO-ARCCSS	    INM    MPI-M      NIMS-KMA	 UA
AWI	 CCCR-IITM     E3SM-Project	    IPSL   MRI	      NOAA-GFDL
BCC	 CMCC	       EC-Earth-Consortium  KIOST  NASA-GISS  NUIST
CAMS	 CNRM-CERFACS  FIO-QLNM		    MIROC  NCAR       SNU
CAS	 CSIRO	       HAMMOZ-Consortium    MOHC   NCC	      THU


<IPython.core.display.Javascript object>

In [33]:
# parent_variant r2i1p1f1 doesn't exist...
!ncdump -h ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/THU/CIESM/historical/r1i1p1f1/Amon/tas/gr/v20200417/netcdf-scm_tas_Amon_CIESM_historical_r1i1p1f1_gr_185001-201412.nc | grep parent

		:branch_time_in_parent = 182500. ;
		:parent_activity_id = "CMIP" ;
		:parent_experiment_id = "piControl" ;
		:parent_mip_era = "CMIP6" ;
		:parent_source_id = "CIESM" ;
		:parent_time_units = "days since 0001-01-01" ;
		:parent_variant_label = "r2i1p1f1" ;


<IPython.core.display.Javascript object>

In [34]:
# # branch time units wrong?
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/NUIST/NESM3/ssp585/r1i1p1f1/Amon/tas/gn/v20190728/netcdf-scm_tas_Amon_NESM3_ssp585_r1i1p1f1_gn_201501-210012.nc

# # piControl too short
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/BCC/BCC-CSM2-MR/ssp585/r1i1p1f1/Amon/tas/gn/v20190314/netcdf-scm_tas_Amon_BCC-CSM2-MR_ssp585_r1i1p1f1_gn_201501-210012.nc

# # key error, branch_time_in_parent
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/NCAR/CESM2-WACCM/ssp585/r1i1p1f1/Amon/tas/gn/v20200702/netcdf-scm_tas_Amon_CESM2-WACCM_ssp585_r1i1p1f1_gn_201501-229912.nc

# # Branching time `185804` not available in piControl data in ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/CAMS/CAMS-CSM1-0/piControl/r1i1p1f1/Amon/tas/gn/v20190729/netcdf-scm_tas_Amon_CAMS-CSM1-0_piControl_r1i1p1f1_gn_290001-339912.nc
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/CAMS/CAMS-CSM1-0/ssp126/r1i1p1f1/Amon/tas/gn/v20190708/netcdf-scm_tas_Amon_CAMS-CSM1-0_ssp126_r1i1p1f1_gn_201501-209912.nc

# # Branching time `003410` not available in piControl data in ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/CAS/FGOALS-f3-L/piControl/r1i1p1f1/Amon/tas/gr/v20191029/netcdf-scm_tas_Amon_FGOALS-f3-L_piControl_r1i1p1f1_gr_060001-116012.nc
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/CAS/FGOALS-f3-L/ssp585/r1i1p1f1/Amon/tas/gr/v20191013/netcdf-scm_tas_Amon_FGOALS-f3-L_ssp585_r1i1p1f1_gr_201501-210012.nc

# # Branching time `232001` not available in piControl data in ./country-crunch/netcdf-scm-crunched/CMIP6/CMIP/AS-RCEC/TaiESM1/piControl/r1i1p1f1/Amon/tas/gn/v20200211/netcdf-scm_tas_Amon_TaiESM1_piControl_r1i1p1f1_gn_020101-070012.nc
# ./country-crunch/netcdf-scm-crunched/CMIP6/ScenarioMIP/AS-RCEC/TaiESM1/ssp585/r1i1p1f1/Amon/tas/gn/v20200901/netcdf-scm_tas_Amon_TaiESM1_ssp585_r1i1p1f1_gn_201501-210012.nc



<IPython.core.display.Javascript object>