In [1]:
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cftime

import xarray
import dask

In [2]:
# Make the multiprocessing ("processes") scheduler dask's default for the
# computations run later in this notebook.
dask.config.set(scheduler="processes")

<dask.config.set at 0x77762ede8290>

# Locate data

In [3]:
# Remote inventory (zipped CSV); missing cells are replaced by empty strings
# so that string comparisons on its columns never meet NaN.
esgf_va_catalog = pd.read_csv(
    "https://hub.climate4r.ifca.es/thredds/fileServer/public/inventory.csv.zip"
).fillna("")

# Local catalog listing, among other columns, the member used for each model.
atlas_ipcc_catalog = pd.read_csv("atlas-tas.csv")

# model name -> member label
model_runs = atlas_ipcc_catalog.set_index("model")["member"].to_dict()

  esgf_va_catalog = pd.read_csv("https://hub.climate4r.ifca.es/thredds/fileServer/public/inventory.csv.zip").fillna("")


In [4]:
def find_model_runs(esgf_va_catalog, atlas_ipcc_catalog, model_runs, experiment):
    """Find, for one experiment, the dataset URL of every usable model run.

    For each model in ``model_runs`` the daily CMIP6 ``tas`` rows of
    ``esgf_va_catalog`` are inner-joined with ``atlas_ipcc_catalog`` on
    (project, model, experiment, version). When rows match, the location of
    the chosen replica is opened and kept only if the model's member appears
    in the dataset's ``variant_label`` values.

    Parameters
    ----------
    esgf_va_catalog : pandas.DataFrame
        Must provide the columns project, model, table, variable, experiment,
        version, replica and location.
    atlas_ipcc_catalog : pandas.DataFrame
        Must provide the columns project, model, experiment, version and
        variable.
    model_runs : dict
        Maps a model name to its member label (e.g. ``"r1i1p1f1"``).
    experiment : str
        One of "historical", "ssp126", "ssp245", "ssp370", "ssp585".

    Returns
    -------
    pandas.DataFrame
        One row per accepted model with columns ``url`` and ``model_run``
        (``"<model>_<member>"``). The two columns exist even when no model is
        accepted, so callers can always ``set_index("model_run")``.

    Raises
    ------
    KeyError
        If rows are found for a model and ``experiment`` is not one of the
        names listed above.
    """
    # Models that are never probed, with the reason for each.
    ignored_models = [
        "EC-Earth3-Veg", # replica unavailable
        "MIROC-ES2L", # no replica seems to work
        "BCC-CSM2-MR", # no replica seems to work
        "FGOALS-g3", # time series split across different versions
        "KACE-1-0-G", # unavailable
    ]

    default_replica = "esgf.ceda.ac.uk"
    # Models whose replica differs from the default; the same override
    # applies to every experiment.
    replica_overrides = {
        "HadGEM3-GC31-LL": "", # esgf-data04.diasjp.net netcdf-java error, ...
        "UKESM1-0-LL": "",
    }
    preferred_replicas = {
        name: dict(replica_overrides)
        for name in ("historical", "ssp126", "ssp245", "ssp370", "ssp585")
    }

    join_keys = ["project", "model", "experiment", "version"]
    # The atlas side of the join does not depend on the model: build it once.
    atlas_ipcc_subset = atlas_ipcc_catalog.drop(["variable"], axis=1).set_index(join_keys)

    esgf_va_model_runs = []
    for model in model_runs:
        if model in ignored_models:
            continue

        esgf_va_subset = (
            esgf_va_catalog
            .query(f"project == 'CMIP6' & model == '{model}' & table == 'day' & variable == 'tas' & experiment == '{experiment}'")
            .set_index(join_keys)
            .drop(["variable"], axis=1))
        subset = esgf_va_subset.join(atlas_ipcc_subset, on=join_keys, how="inner")
        if len(subset) == 0:
            continue

        # Choose the preferred replica or default to esgf.ceda.ac.uk
        replica = preferred_replicas[experiment].get(model, default_replica)

        # Probe if the required model member is included in the esgf-va
        print(model)
        # NOTE(review): looks like a debugging snapshot (the file is overwritten
        # on every iteration) -- kept to preserve the side effect; confirm it
        # can be dropped.
        subset.to_csv("subset.csv", index=False)

        candidates = subset[subset["replica"] == replica]["location"]
        if candidates.empty:
            # No row for the chosen replica: skip the model instead of failing
            # with an IndexError on .iloc[0].
            print(f"{model}: no location for replica {replica!r}, skipping")
            continue
        url = candidates.iloc[0]
        print(url)

        # Only the variant labels are needed, so close the remote dataset as
        # soon as the membership test is done.
        with xarray.open_dataset(
                url,
                decode_times=False,
                drop_variables=["tracking_id", "further_info_url", "time_bnds", "lat_bnds", "lon_bnds"]) as ds:
            # The labels are compared as ASCII byte strings.
            has_member = model_runs[model].encode("ascii") in ds["variant_label"].values

        if has_member:
            record = {"url": url, "model_run": model + "_" + model_runs[model]}
            esgf_va_model_runs.append(record)

    return pd.DataFrame.from_records(esgf_va_model_runs, columns=["url", "model_run"])

In [13]:
def _experiment_urls(experiment):
    """Return the URLs found for `experiment`, indexed by model run, in a column named after the experiment."""
    runs = find_model_runs(esgf_va_catalog, atlas_ipcc_catalog, model_runs, experiment)
    return runs.set_index("model_run").rename(columns={"url": experiment})


# The four scenarios are probed first, the historical experiment last.
ssp126_model_runs = _experiment_urls("ssp126")
ssp245_model_runs = _experiment_urls("ssp245")
ssp370_model_runs = _experiment_urls("ssp370")
ssp585_model_runs = _experiment_urls("ssp585")
historical_model_runs = _experiment_urls("historical")

AWI-CM-1-1-MR
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_AWI_AWI-CM-1-1-MR_ssp126_day_gn_v20190529/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_AWI_AWI-CM-1-1-MR_ssp126_day_tas_gn_v20190529.ncml
CAMS-CSM1-0
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CAMS_CAMS-CSM1-0_ssp126_day_gn_v20191106/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CAMS_CAMS-CSM1-0_ssp126_day_tas_gn_v20191106.ncml
CMCC-CM2-SR5
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CMCC_CMCC-CM2-SR5_ssp126_day_gn_v20200717/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CMCC_CMCC-CM2-SR5_ssp126_day_tas_gn_v20200717.ncml
CNRM-CM6-1
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp126_day_gr_v20190219/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp126_day_tas_gr_v20190219.ncml
CNRM-ESM2-1
http

  var = coder.decode(var, name=name)


CMCC-CM2-SR5
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CMCC_CMCC-CM2-SR5_ssp585_day_gn_v20200622/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CMCC_CMCC-CM2-SR5_ssp585_day_tas_gn_v20200622.ncml
CNRM-CM6-1
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp585_day_gr_v20190219/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1_ssp585_day_tas_gr_v20190219.ncml
CNRM-CM6-1-HR
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1-HR_ssp585_day_gr_v20191202/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-CM6-1-HR_ssp585_day_tas_gr_v20191202.ncml
CNRM-ESM2-1
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/ScenarioMIP/day/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-ESM2-1_ssp585_day_gr_v20191021/replicas/esgf.ceda.ac.uk/CMIP6_ScenarioMIP_CNRM-CERFACS_CNRM-ESM2-1_ssp585_day_tas_

  var = coder.decode(var, name=name)


CMCC-CM2-SR5
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/CMIP/day/CMIP6_CMIP_CMCC_CMCC-CM2-SR5_historical_day_gn_v20200616/replicas/esgf.ceda.ac.uk/CMIP6_CMIP_CMCC_CMCC-CM2-SR5_historical_day_tas_gn_v20200616.ncml
CNRM-CM6-1
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/CMIP/day/CMIP6_CMIP_CNRM-CERFACS_CNRM-CM6-1_historical_day_gr_v20180917/replicas/esgf.ceda.ac.uk/CMIP6_CMIP_CNRM-CERFACS_CNRM-CM6-1_historical_day_tas_gr_v20180917.ncml
CNRM-CM6-1-HR
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/CMIP/day/CMIP6_CMIP_CNRM-CERFACS_CNRM-CM6-1-HR_historical_day_gr_v20191021/replicas/esgf.ceda.ac.uk/CMIP6_CMIP_CNRM-CERFACS_CNRM-CM6-1-HR_historical_day_tas_gr_v20191021.ncml
CNRM-ESM2-1
https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/ensemble/CMIP6/CMIP/day/CMIP6_CMIP_CNRM-CERFACS_CNRM-ESM2-1_historical_day_gr_v20181206/replicas/esgf.ceda.ac.uk/CMIP6_CMIP_CNRM-CERFACS_CNRM-ESM2-1_historical_day_tas_gr_v20181206.ncml
EC-Earth3
https://hub.ipcc.ifca.es

In [18]:
# Left-join every scenario onto the historical runs: one row per historical
# model run, one URL column per experiment, NaN where a scenario is missing.
df = (
    historical_model_runs
    .join(ssp585_model_runs)
    .join(ssp126_model_runs)
    .join(ssp245_model_runs)
    .join(ssp370_model_runs)
)
df

Unnamed: 0_level_0,historical,ssp585,ssp126,ssp245,ssp370
model_run,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACCESS-CM2_r1i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...
ACCESS-ESM1-5_r1i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...
AWI-CM-1-1-MR_r1i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...
CAMS-CSM1-0_r2i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...
CanESM5_r1i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,,,
CESM2-WACCM_r1i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,,,
CMCC-CM2-SR5_r1i1p1f1,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...
CNRM-CM6-1_r1i1p1f2,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...
CNRM-CM6-1-HR_r1i1p1f2,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,,,
CNRM-ESM2-1_r1i1p1f2,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...,,https://hub.ipcc.ifca.es/thredds/dodsC/esgeva/...


Show the number of available members for each scenario.

In [23]:
# Count the non-missing URLs in each experiment column.
df.notna().sum()

historical    29
ssp585        17
ssp126        13
ssp245        11
ssp370        13
dtype: int64

Load the datasets.

In [27]:
def load_datasets(locations, members, experiment):
    """Open the dataset at each location and select the member of its model.

    Parameters
    ----------
    locations : iterable
        Dataset URLs. Missing entries (NaN or None) are skipped.
    members : dict
        Maps a model name to the member label to select.
    experiment : str
        "historical", or any name starting with "ssp".

    Returns
    -------
    list
        One opened dataset per non-missing location, restricted to the
        model's member and carrying a ``model_run`` global attribute
        (``"<model>_<member>"``). Scenario datasets are cut at the end of 2100.

    Raises
    ------
    ValueError
        If ``experiment`` is neither "historical" nor an "ssp*" name. This is
        checked up front, before any dataset is opened.
    """
    is_scenario = experiment.startswith("ssp")
    if not is_scenario and experiment != "historical":
        raise ValueError("Invalid experiment... exiting")

    # 31390 == 86 * 365 time steps.
    # NOTE(review): presumably the daily steps of 2015-2100 in a no-leap
    # calendar -- confirm.
    max_scenario_steps = 31390

    dss = []
    for location in locations:
        # pd.isna catches every missing marker (any NaN object, None), not
        # only the np.nan singleton that an identity test would match.
        if pd.isna(location):
            continue

        # The model is the fourth "_"-separated field of the file name, e.g.
        # CMIP6_ScenarioMIP_AWI_AWI-CM-1-1-MR_ssp126_day_tas_gn_v20190529.ncml
        name = location.split("/")[-1]
        model = name.split("_")[3]

        # variant_label is selected with the ASCII-encoded member label.
        ds = xarray.open_dataset(
            location,
            drop_variables=["tracking_id", "further_info_url", "time_bnds", "lat_bnds", "lon_bnds"]).sel(
            variant_label=members[model].encode("ascii"))

        if is_scenario:
            # Cap by position first, then by date.
            ds = ds.isel(time=slice(None, max_scenario_steps))
            # limit to 2100, avoid cf time indexing issues
            if isinstance(ds["time"][0].item(), cftime.Datetime360Day):
                # A 360-day calendar has no 31 December.
                ds = ds.sel(time=slice("20150101", "21001230"))
            else:
                ds = ds.sel(time=slice("20150101", "21001231"))

        # add the model_run global attribute
        ds.attrs["model_run"] = f"{model}_{members[model]}"
        dss.append(ds)

    return dss

In [29]:
# Open the datasets of every experiment, one list per column of `df`.
historical_dss = load_datasets(
    locations=df["historical"], members=model_runs, experiment="historical")
ssp126_dss = load_datasets(
    locations=df["ssp126"], members=model_runs, experiment="ssp126")
ssp245_dss = load_datasets(
    locations=df["ssp245"], members=model_runs, experiment="ssp245")
ssp370_dss = load_datasets(
    locations=df["ssp370"], members=model_runs, experiment="ssp370")
ssp585_dss = load_datasets(
    locations=df["ssp585"], members=model_runs, experiment="ssp585")

  var = coder.decode(var, name=name)
  ds = xarray.open_dataset(
  var = coder.decode(var, name=name)


In [34]:
# Size of each historical tas array: element count times bytes per element,
# reported in units of 2**20 bytes.
for ds in historical_dss:
    tas = ds["tas"]
    size_mb = tas.size * tas.dtype.itemsize / 2**20
    print(f'{size_mb:.2f} MB')

6356.07 MB
6400.21 MB
16949.53 MB
11770.51 MB
1882.03 MB
12703.92 MB
12703.71 MB
7533.12 MB
59588.20 MB
7533.12 MB
30132.50 MB
11909.73 MB
11909.73 MB
6264.84 MB
4149.17 MB
4962.39 MB
4962.39 MB
4733.95 MB
4234.57 MB
7533.12 MB
4237.38 MB
16949.53 MB
4237.38 MB
11770.51 MB
4237.38 MB
3175.93 MB
12703.71 MB
12703.71 MB
6264.84 MB


In [35]:
# Size of each ssp126 tas array: element count times bytes per element,
# reported in units of 2**20 bytes.
for ds in ssp126_dss:
    tas = ds["tas"]
    size_mb = tas.size * tas.dtype.itemsize / 2**20
    print(f'{size_mb:.2f} MB')

8828.44 MB
6063.67 MB
6621.33 MB
3923.75 MB
3923.75 MB
7305.50 MB
6207.50 MB
2137.44 MB
2465.76 MB
3923.75 MB
8828.44 MB
2207.11 MB
3265.31 MB


In [36]:
# Size of each ssp245 tas array: element count times bytes per element,
# reported in units of 2**20 bytes.
for ds in ssp245_dss:
    tas = ds["tas"]
    size_mb = tas.size * tas.dtype.itemsize / 2**20
    print(f'{size_mb:.2f} MB')

3310.66 MB
3333.65 MB
8828.44 MB
6063.67 MB
6621.33 MB
3923.75 MB
2137.44 MB
2465.76 MB
8828.44 MB
6130.86 MB
3265.31 MB


In [37]:
# Size of each ssp370 tas array: element count times bytes per element,
# reported in units of 2**20 bytes.
for ds in ssp370_dss:
    tas = ds["tas"]
    size_mb = tas.size * tas.dtype.itemsize / 2**20
    print(f'{size_mb:.2f} MB')

3310.66 MB
3333.65 MB
8828.44 MB
6063.67 MB
6621.33 MB
3923.75 MB
3923.75 MB
2112.31 MB
2465.76 MB
1052.93 MB
8828.44 MB
6130.86 MB
3265.31 MB


In [38]:
# Size of each ssp585 tas array: element count times bytes per element,
# reported in units of 2**20 bytes.
for ds in ssp585_dss:
    tas = ds["tas"]
    size_mb = tas.size * tas.dtype.itemsize / 2**20
    print(f'{size_mb:.2f} MB')

3310.66 MB
3333.65 MB
8828.44 MB
6063.67 MB
980.94 MB
6621.33 MB
6621.33 MB
3923.75 MB
31037.48 MB
3923.75 MB
5479.50 MB
3265.31 MB
2137.44 MB
2465.76 MB
8828.44 MB
1655.33 MB
6621.33 MB


- Historical - 227 GB, Wall time: 1h 59min 6s

In [51]:
%%time

# Mean of tas over lat/lon for every historical dataset, read in chunks of
# 100 time steps and computed on 8 worker processes. Results are appended one
# by one, so an interrupted run keeps what was already computed.
# NOTE(review): the mean is unweighted -- confirm that no latitude (area)
# weighting is intended.
historical_spatial_mean = []
for ds in historical_dss:
    lazy_mean = ds["tas"].chunk({"time": 100}).mean(["lat", "lon"])
    spatial_mean = lazy_mean.compute(num_workers=8, scheduler="processes")
    historical_spatial_mean.append(spatial_mean)

CPU times: user 43.3 s, sys: 2.23 s, total: 45.5 s
Wall time: 1h 59min 6s


In [52]:
%%time

# Mean of tas over lat/lon for every ssp126 dataset, read in chunks of 100
# time steps and computed on 8 worker processes. Results are appended one by
# one, so an interrupted run keeps what was already computed.
ssp126_spatial_mean = []
for ds in ssp126_dss:
    lazy_mean = ds["tas"].chunk({"time": 100}).mean(["lat", "lon"])
    spatial_mean = lazy_mean.compute(num_workers=8, scheduler="processes")
    ssp126_spatial_mean.append(spatial_mean)

CPU times: user 9.22 s, sys: 550 ms, total: 9.76 s
Wall time: 29min 56s


In [53]:
%%time

# Mean of tas over lat/lon for every ssp245 dataset, read in chunks of 100
# time steps and computed on 8 worker processes. Results are appended one by
# one, so an interrupted run keeps what was already computed.
ssp245_spatial_mean = []
for ds in ssp245_dss:
    lazy_mean = ds["tas"].chunk({"time": 100}).mean(["lat", "lon"])
    spatial_mean = lazy_mean.compute(num_workers=8, scheduler="processes")
    ssp245_spatial_mean.append(spatial_mean)

CPU times: user 7.29 s, sys: 500 ms, total: 7.79 s
Wall time: 22min 38s


In [54]:
%%time

# Mean of tas over lat/lon for every ssp370 dataset, read in chunks of 100
# time steps and computed on 8 worker processes. Results are appended one by
# one, so an interrupted run keeps what was already computed.
ssp370_spatial_mean = []
for ds in ssp370_dss:
    lazy_mean = ds["tas"].chunk({"time": 100}).mean(["lat", "lon"])
    spatial_mean = lazy_mean.compute(num_workers=8, scheduler="processes")
    ssp370_spatial_mean.append(spatial_mean)

CPU times: user 7.63 s, sys: 547 ms, total: 8.17 s
Wall time: 24min 29s


In [55]:
%%time

# Mean of tas over lat/lon for every ssp585 dataset, read in chunks of 100
# time steps and computed on 8 worker processes. Results are appended one by
# one, so an interrupted run keeps what was already computed.
ssp585_spatial_mean = []
for ds in ssp585_dss:
    lazy_mean = ds["tas"].chunk({"time": 100}).mean(["lat", "lon"])
    spatial_mean = lazy_mean.compute(num_workers=8, scheduler="processes")
    ssp585_spatial_mean.append(spatial_mean)

CPU times: user 11.6 s, sys: 754 ms, total: 12.4 s
Wall time: 40min 38s
