# CMIP6 variable availability

This notebook creates a color-coded overview table showing which models provide each variable in a given list.
The cells are colored and labelled as follows:
- green (all): All variables are available for a specific model and scenario.
- orange (list of variables): Fewer than ```limit``` variables are missing. The missing variables are listed inside the cell.
- red (missing): ```limit``` or more variables are missing.
- black (none): No data available, e.g. the scenario wasn't run for this model.

In [1]:
# Imports
import intake
import pandas as pd
import seaborn as sns

# Load the catalog
# The path is absolute, so it presumably only resolves on a machine where
# /work/ik1017 is mounted — TODO confirm.
col_url = "/work/ik1017/Catalogs/dkrz_cmip6_disk.json"
# `col` stays at notebook level because search_catalog() reads it as a global.
col = intake.open_esm_datastore(col_url)

  df = pd.read_csv(


Setup complete


In [2]:
# Define variables and scenarios to search for
# (the trailing comment on `variables` keeps a longer alternative list for reference)
variables = ["fgco2", "nbp", "npp", "gpp"] # ["fgco2", "nbp", "fLuc", "npp", "gpp", "rh", "ra", "cVeg", "cSoil", "cLitter"]
scenarios = ["historical", "ssp126", "ssp245", "ssp370", "ssp585"]

# Threshold read by the labelling step: a cell lists its missing variables only
# while fewer than `limit` are missing; from `limit` on it is labelled "missing".
limit = 2 # number of missing variables at which a cell is labelled as missing

## Functions

In [18]:
def search_catalog(variables: list, scenarios: list, limit=2):
    """
    Search the CMIP6 catalog for the given variables and summarise their availability.

    For every scenario the notebook-level catalog ``col`` is queried, the number of
    members per model and variable is counted, and each model/scenario pair is reduced
    to a single descriptive label.

    Parameters:
        variables (list): List of CMIP6 variable IDs to search for in the catalog.
        scenarios (list): List of scenarios (e.g., 'historical', 'ssp245') to query.
                          'historical' is searched under activity 'CMIP', every other
                          scenario under 'ScenarioMIP'.
        limit (int, optional): Number of missing variables at which a cell is labelled
                               "missing" instead of listing the variables. Defaults to 2.

    Returns:
        pd.DataFrame: One row per source model (index ``source_id``) and one column per
                      scenario. Each cell is one of:
                      - "all": All requested variables are available.
                      - The string form of the list of missing variables (sorted by
                        name) if fewer than `limit` are missing.
                      - "missing": `limit` or more variables are missing.
                      - "none": The model has no data at all for that scenario.
    """

    def _label(missing):
        # Cells created by the outer join hold NaN instead of a list: the model
        # did not show up in that scenario's search result at all.
        if not isinstance(missing, list):
            return "none"
        if not missing:
            return "all"
        if len(missing) < limit:
            return str(missing)
        return "missing"

    # Requested variables in a stable (alphabetical) order; used to force one column
    # per variable so that a variable nobody provides still counts as missing.
    wanted = sorted(set(variables))

    # One single-column frame (lists of missing variables) per scenario
    per_scenario = []

    # Loop through each scenario and query the catalog
    for scenario in scenarios:
        activity = 'CMIP' if scenario == "historical" else 'ScenarioMIP'

        # Search the catalog for this scenario
        query = dict(activity_id=activity, variable_id=variables, experiment_id=scenario)
        df_found = col.search(**query).df

        if df_found.empty:
            # Nothing found: no model gets a row, so every model ends up as "none"
            missing = pd.Series(
                [],
                index=pd.Index([], dtype=object, name="source_id"),
                name=scenario,
                dtype=object,
            )
        else:
            # Number of distinct members per model (rows) and variable (columns)
            counts = (
                df_found.groupby(['source_id', 'variable_id'])
                .member_id.nunique()
                .unstack()
                .reindex(columns=wanted)
            )
            available = counts.notna()

            # List variables without data for each model
            missing = pd.Series(
                [
                    [var for var, present in zip(wanted, row) if not present]
                    for row in available.to_numpy()
                ],
                index=available.index,
                name=scenario,
                dtype=object,
            )

        per_scenario.append(missing.to_frame())

    # No scenarios requested: return an empty overview instead of failing
    if not per_scenario:
        return pd.DataFrame(index=pd.Index([], dtype=object, name="source_id"))

    # Merge all frames; the outer join keeps models that exist in only some scenarios
    merged_df = per_scenario[0]
    for frame in per_scenario[1:]:
        merged_df = merged_df.join(frame, how='outer')

    # Replace the lists (or NaN) in every column by their descriptive label
    for column in merged_df.columns:
        merged_df[column] = merged_df[column].apply(_label)

    return merged_df

In [4]:
def map(val, threshold=None):
    """
    Map the input value to a descriptive label based on its length.

    NOTE: the name shadows the builtin ``map``; it is kept because other cells
    call this function by that name.

    Parameters:
        val (list or other): The value to categorize, normally the list of
                             missing variables for one model and scenario.
        threshold (int, optional): Number of missing variables at which the label
                                   becomes "missing". When omitted, the notebook-level
                                   ``limit`` is used.

    Returns:
        str: A descriptive label:
             - "all" if the list is empty.
             - A string representation of the list if its length is less than the threshold.
             - "missing" if its length is equal to or greater than the threshold.
             - "none" if the value has no length (e.g. a number or NaN).
    """

    if threshold is None:
        # Read the notebook-level setting; fails loudly if it was never defined
        threshold = limit

    # Only "value has no length" means "no data"; any other error should surface
    try:
        n_missing = len(val)
    except TypeError:
        return "none"

    if n_missing == 0:
        return "all"
    if n_missing < threshold:
        return str(val)
    return "missing"

def highlight_color(val: str):
    """
    Return the CSS background style that belongs to a cell label.

    Parameters:
        val (str): The cell label, expected to be "all", "none", "missing"
                   or the string form of a list of variables.

    Returns:
        str: A ``background-color`` CSS declaration:
             - Green for "all".
             - Black for "none".
             - Red for "missing".
             - Orange for everything else.
    """

    # Known labels are compared in this order; the first match wins
    known_styles = (
        ("all", 'background-color: rgba(0,255,50)'),
        ("none", 'background-color: rgba(0,0,0)'),
        ("missing", 'background-color: rgba(255,0,0)'),
    )
    for label, css in known_styles:
        if val == label:
            return css

    # Any other value (typically a list of missing variables) is shown in orange
    return 'background-color: rgba(255,165,0)'

In [20]:
def style_dataframe(df: pd.DataFrame):
    """
    Style a DataFrame for better visualization, including color-coded cells and column headers.

    Parameters:
        df (pd.DataFrame): The DataFrame to style. Every cell is passed to
                           ``highlight_color`` to pick its background color.

    Returns:
        pd.io.formats.style.Styler: A styled version of the DataFrame with:
                                    - Color-coded cells based on their content.
                                    - Rotated column headers for compactness.
                                    - Grey dotted borders for all cells.
    """
    # Define styles for visualization
    # NOTE(review): 'left' is not a standard value for CSS `vertical-align`, so that
    # declaration is probably ignored by the browser; kept as in the original.
    table_styles = [
        dict(selector="th", props=[('width', '60px')]),
        dict(selector="th.col_heading", props=[('transform', 'rotateZ(-90deg)',),
                                               ('max-height', '180px'),
                                               ('height', '160px'),
                                               ('max-width', '5px'),
                                               ('vertical-align', 'left')])]

    styler = df.style

    # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name is
    # deprecated; use the new name when it exists and fall back on older pandas.
    color_cells = getattr(styler, "map", None)
    if color_cells is None:
        color_cells = styler.applymap

    # Apply styles and return the Styler
    styled_df = (
        color_cells(highlight_color)
        .set_table_styles(table_styles)
        .set_properties(**{'border-color': 'grey', 'border-style': 'dotted', 'border-width': 'thin'}))

    return styled_df

## Overview Table

In [19]:
# Build the availability table with the threshold configured next to
# `variables` and `scenarios`, rather than repeating a hard-coded number here.
df = search_catalog(variables, scenarios, limit=limit)
# Last expression of the cell, so the notebook displays the returned Styler.
style_dataframe(df)

Unnamed: 0_level_0,historical,ssp126,ssp245,ssp370,ssp585
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACCESS-ESM1-5,all,missing,all,all,missing
AWI-ESM-1-1-LR,missing,none,none,none,none
AWI-ESM-1-REcoM,missing,missing,missing,none,missing
BCC-CSM2-MR,['nbp'],['nbp'],['nbp'],['nbp'],['nbp']
BCC-ESM1,['nbp'],none,none,none,none
CESM2,all,missing,missing,missing,missing
CESM2-FV2,all,none,none,none,none
CESM2-WACCM,all,missing,all,all,all
CESM2-WACCM-FV2,all,none,none,none,none
CMCC-CM2-SR5,missing,missing,missing,missing,missing
