In [2]:

import xarray as xr
import pandas as pd
from pyclim_noresm.general_util_funcs import global_avg                             
from workflow.scripts.utils import (calculate_CI, diff_means_greater_than_varability, 
                                    masked_average, t_test_diff_sample_means)
import numpy as np
import yaml
import dataframe_image as dfi
from functools import partial
from scipy.stats import ttest_ind, t
import re

In [9]:
def create_dict(paths_ctrl, paths_exp):
    """
    Pair every control diagnostic file with its experiment counterpart.

    The variable key of a file is the part of its basename that precedes the
    first underscore (``/some/dir/rsut_Amon.nc`` -> ``rsut``). Files are paired
    on that exact key. Matching on the key instead of searching the whole path
    for ``var + '_'`` prevents a short variable name from being paired with an
    unrelated file (``ts`` vs ``rsuts_...``) or with a directory name that
    happens to contain ``ts_``.

    Parameters
    ----------
    paths_ctrl : iterable of str
        Paths to the control diagnostics. These define which variables appear
        in the result, in order of first appearance.
    paths_exp : iterable of str
        Paths to the experiment diagnostics.

    Returns
    -------
    dict
        ``{variable: {'ctrl': control_path, 'exp': experiment_path}}``. If
        several files share a variable key, the first one listed is used.

    Raises
    ------
    IndexError
        If a variable found among the control files has no experiment file.
        (Same exception type as before, now with an explanatory message.)
    """
    def _variable_key(path):
        # Basename up to the first underscore.
        return path.split('/')[-1].split('_')[0]

    def _first_path_per_variable(candidates):
        # setdefault keeps the first path seen for each variable key.
        first_seen = {}
        for path in candidates:
            first_seen.setdefault(_variable_key(path), path)
        return first_seen

    ctrl_by_var = _first_path_per_variable(paths_ctrl)
    exp_by_var = _first_path_per_variable(paths_exp)

    paths = {}
    for var, ctrl_path in ctrl_by_var.items():
        if var not in exp_by_var:
            raise IndexError(
                f"no experiment file found for variable '{var}' "
                f"(control file: {ctrl_path})"
            )
        paths[var] = {'ctrl': ctrl_path, 'exp': exp_by_var[var]}

    return paths

In [10]:
# Experiment and control diagnostics: the directly available files plus the
# derived ones, concatenated into one list per run.
paths_exp = snakemake.input.experiments_diags + snakemake.input.exp_derived_diag
paths_ctrl = snakemake.input.control_diags + snakemake.input.control_derived_diag

# {variable: {'ctrl': path, 'exp': path}}
paths = create_dict(paths_ctrl, paths_exp)

# Cell areas; fall back to the common grid file when the rule does not
# provide an 'areacello' input.
areacello = xr.open_dataset(snakemake.input.get('areacello','workflow/input_data/common_grid.nc'))

# Look the optional mask up by *name*. `"mask" in snakemake.input` is a
# membership test on the list of input file paths (snakemake's named input
# list is a list subclass), so it would not detect a named 'mask' input.
# `.get` is the same by-name lookup already used for 'areacello' above.
mask_path = snakemake.input.get('mask')
if mask_path is not None:
    mask = xr.open_dataset(mask_path)
else:
    mask = None

# ERF files keyed by variable name (basename up to the first underscore).
paths_erfs = {path.split('/')[-1].split('_')[0] : path for path in snakemake.input.erfs}
# Slice applied along the time dimension; the default keeps every time step.
time_slice = snakemake.params.get('time_slice', slice(0,None))
# Significance level compared against the t-test p-value.
CI_alpha = snakemake.params.get('CI_alpha', 0.05)

In [21]:
# Per-variable area-averaged time series (experiment - control, control, experiment).
diff={}
ctrl={}
exp={}
# One column per variable (diagnostics first, then ERFs), one row per statistic.
meta_data = pd.DataFrame(index=['units','description','greater_than_variability',
                                'CI_low','CI_high','CI_alpha','pval', 'tval', 'diff_significant','ctrl_mean', 
                                'exp_mean'],
                         columns=list(paths.keys())+ list(paths_erfs.keys()))
experiment_meta_data = {}
for var, pathdict in paths.items():

    # Same positional time window for control and experiment.
    ds_ctrl = xr.open_dataset(pathdict['ctrl']).isel(time=time_slice)
    ds_exp = xr.open_dataset(pathdict['exp']).isel(time=time_slice)

    da_ctrl = ds_ctrl[var]
    da_exp = ds_exp[var]
    units = da_exp.units
    # NOTE: every write below uses `meta_data.at[row, column]`. The previous
    # `meta_data[var][row] = value` form is chained assignment: it writes into
    # an intermediate Series and is not guaranteed to update `meta_data`
    # (it is a no-op under pandas Copy-on-Write).
    meta_data.at['units', var] = units

    # Average over lon/lat using the (optional) mask and the cell areas.
    expTs = masked_average(da_exp, mask,areacello['cell_area'], dim=['lon','lat'])
    ctrlTs = masked_average(da_ctrl, mask,areacello['cell_area'],dim=['lon','lat'])
    diff[var] = expTs-ctrlTs
    ctrl[var] = ctrlTs
    exp[var] = expTs

    meta_data.at['greater_than_variability', var] = diff_means_greater_than_varability(expTs, ctrlTs, mask, areacello['cell_area']).values

    CI_low, CI_high, _ = calculate_CI(expTs, ctrlTs, mask=mask, weights=areacello['cell_area'])
    meta_data.at['CI_low', var] = CI_low.values
    meta_data.at['CI_high', var] = CI_high.values
    meta_data.at['CI_alpha', var] = CI_alpha
    meta_data.at['description', var] = da_exp.long_name
    meta_data.at['ctrl_mean', var] = ctrlTs.mean().values
    meta_data.at['exp_mean', var] = expTs.mean().values
    t_value, p_value, _ = t_test_diff_sample_means(expTs, ctrlTs, mask=mask, weights=areacello['cell_area'])
    meta_data.at['pval', var] = p_value
    meta_data.at['tval', var] = t_value
    # Flagged significant when the p-value is below the configured alpha.
    meta_data.at['diff_significant', var] = p_value < CI_alpha


# Run-level metadata. `ctrlTs`, `ds_ctrl` and `ds_exp` below are the objects
# left over from the LAST loop iteration, so this section requires `paths`
# to be non-empty.
experiment_meta_data['total_area'] = float(areacello['cell_area'].sum(dim=['lon','lat']))
experiment_meta_data['total_area_units'] = 'm^2'
experiment_meta_data['source_id'] = snakemake.wildcards.model
experiment_meta_data['experiment_id'] = 'piClim-2xdust'
experiment_meta_data['control_id'] = 'piClim-control'
experiment_meta_data['time_slice'] = [time_slice.start, time_slice.stop]
experiment_meta_data['nyear'] = len(ctrlTs)
experiment_meta_data['member_id_ctrl'] = ds_ctrl.attrs['variant_label']
experiment_meta_data['member_id_exp'] = ds_exp.attrs['variant_label']

In [22]:

# Turn the per-variable control / experiment series into tables (one column
# per variable) and write each one to its Snakemake CSV output.
ctrl = pd.DataFrame(ctrl)
exp = pd.DataFrame(exp)
for frame, destination in ((exp, snakemake.output.exp),
                           (ctrl, snakemake.output.ctrl)):
    frame.to_csv(destination)

In [24]:
# Add the ERF diagnostics to `diff` and record their units / description.
for erf, path in paths_erfs.items():
    erf_ds = xr.open_dataset(path)
    # Single-step `.at` writes: the chained form `meta_data[erf]['units'] = ...`
    # assigns into an intermediate Series and is not guaranteed to update
    # `meta_data` (it is a no-op under pandas Copy-on-Write).
    meta_data.at['units', erf] = erf_ds[erf].units
    meta_data.at['description', erf] = erf_ds[erf].long_name
    # Some ERF files use a 'year' dimension; rename it so the slice below applies.
    if 'year' in erf_ds.dims:
        erf_ds = erf_ds.rename_dims({'year':'time'})
    # Positional slice, matching the `.isel(time=time_slice)` applied to the
    # control and experiment datasets. Label-based `.sel` only acts
    # positionally when 'time' has no coordinate index.
    erf_ds = erf_ds.isel(time=time_slice)
    erf_ts = masked_average(erf_ds[erf], mask, areacello['cell_area'], dim=['lon','lat'])
    diff[erf] = erf_ts.values

In [None]:
# Write the combined difference table and the per-variable metadata table as CSV.
diff = pd.DataFrame(diff)
diff.to_csv(snakemake.output.diff)
meta_data.to_csv(snakemake.output.metadata)

# Dump the run-level metadata as block-style YAML.
with open(snakemake.output.info_yaml, mode='w') as yaml_file:
    yaml.safe_dump(experiment_meta_data, stream=yaml_file, default_flow_style=False)