## Data Processing Functions

#### File Operations

In [7]:
def download(url: str, dest_folder: str, dest_file: str, overwrite=False, create_dest=False):
    """
    Wrapper function to download data from a URL and save it to a folder.

    Local destinations are handled by download_to_folder(). For Google Cloud Storage
    destinations (paths starting with 'gs://'):
      * '.nc' URLs are handed to download_to_gs_as_zarr()
      * '.csv' URLs are downloaded to '../tmp/' and then uploaded as-is
      * any other extension is rejected with a message

    Parameters
    ----------
    url : string
        The URL to the direct download
    dest_folder: string
        Desired file path for the destination folder
    dest_file: string
        Desired file name for the downloaded file
    overwrite: boolean
        if true, will overwrite the dest_file at the dest_folder location
    create_dest: boolean
        if true, the destination folder will be created automatically;
        otherwise, the function will stop and require a user to manually create the folder as an extra validation step
        (not consulted by the csv-to-cloud branch, which always touches the destination)

    Returns
    ----------
    None
    """
    import os  # imported locally like the sibling functions, so the cleanup below never depends on a global import

    if not dest_folder.startswith('gs://'):
        download_to_folder(url, dest_folder, dest_file, overwrite, create_dest)
        return None

    if url.endswith('.nc'):
        download_to_gs_as_zarr(url, dest_folder, dest_file, overwrite, create_dest)
    elif url.endswith('.csv'):
        print("Attempting to transfer non-ARCO file extension to cloud. This function is in-progress and may not work for large files.")
        tmp_folder = '../tmp/'
        tmp_path = tmp_folder + dest_file
        download_to_folder(url, tmp_folder, dest_file, overwrite, create_dest=True)

        # download_to_folder only prints on an HTTP failure, so confirm the file is really there
        if not os.path.isfile(tmp_path):
            print(f"Cancelling - no file was downloaded to {tmp_path}.")
            return None

        import gcsfs
        fs = gcsfs.GCSFileSystem()
        try:
            fs.touch(dest_folder)  # create empty file to organize folder structure
            fs.put(tmp_path, dest_folder + dest_file, recursive=True)
            print(f'Moved to GS {dest_folder+dest_file}.')
        finally:
            # always clear the temporary copy, even when the upload fails
            os.remove(tmp_path)
    else:
        print('Cancelling - This cloud storage function currently only supports transfer of netcdf files; please confirm the download file extension.')
        return None

In [3]:
#Some modification may be needed for additional operating systems
def download_to_folder(url: str, dest_folder: str, dest_file: str, overwrite=False, create_dest=False):
    """
    Downloads data from a URL and saves it to a folder
    Modified from https://stackoverflow.com/questions/56950987/download-file-from-url-and-save-it-in-a-folder-python

    The data is streamed into '<dest_file>.part' and only renamed to dest_file once the
    download has finished, so an interrupted download never leaves a truncated file that a
    later call would mistake for a completed one.

    Parameters
    ----------
    url : string
        The URL to the direct download
    dest_folder: string
        File path for the destination folder
    dest_file: string
        File name for the downloaded file
    overwrite: boolean
        if true, will overwrite the dest_file at the dest_folder location
    create_dest: boolean
        if true, the destination folder will be created automatically;
        otherwise, the function will stop and require a user to manually create the folder as an extra validation step

    Returns
    ----------
    None (HTTP failures are reported with a printed message, not an exception)
    """
    import os

    if not os.path.exists(dest_folder):
        if not create_dest:
            print(f"Please confirm the destination folder exists: {dest_folder}. Or set create_dest=True.")  #extra check to place data in correct spot
            return None
        os.makedirs(dest_folder, exist_ok=True)  # create folder if it does not exist

    file_path = os.path.join(dest_folder, dest_file)

    # only download if overwriting was requested or the file is not there yet
    if os.path.isfile(file_path) and not overwrite:
        print(f"File {dest_file} already exists at {dest_folder} - (skipping download from {url} )")
        return None

    import requests  # deferred so the skip paths above do not need the package

    part_path = file_path + '.part'
    with requests.get(url, stream=True) as r:  # context manager releases the connection
        if not r.ok:  # HTTP status code 4XX/5XX
            print(f"Download failed: status code {r.status_code}\n{r.text}")
            print(url)
            return None

        print(f"Saving {url} to {file_path}...")
        try:
            with open(part_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024 * 10):  #10 MB chunk size; could increase for faster download speed
                    if chunk:
                        f.write(chunk)
                # one flush/fsync at the end is enough to get the data on disk
                f.flush()
                os.fsync(f.fileno())
            os.replace(part_path, file_path)
        finally:
            # after a successful rename the partial file no longer exists
            if os.path.exists(part_path):
                os.remove(part_path)
    print("Complete")

    return None

In [4]:
#Some modification may be needed for additional operating systems
def download_to_gs_as_zarr(url: str, dest_folder: str, dest_file: str, overwrite=False, create_dest=False):
    """
    Downloads NetCDF data from a URL and saves it to a temporary folder; then loads and copies it to destination Google Storage as a zarr file

    Parameters
    ----------
    url : string
        The URL to the direct download
    dest_folder: string
        File path for the destination folder
    dest_file: string
        File name for the downloaded file; a trailing '.nc' is swapped for '.zarr' at the destination
    overwrite: boolean
        if true, will overwrite the dest_file at the dest_folder location
    create_dest: boolean
        if true, the destination folder will be created automatically;
        otherwise, the function will stop and require a user to manually create the folder as an extra validation step

    Returns
    ----------
    None
    """
    import os
    import gcsfs
    import xarray as xr

    fs = gcsfs.GCSFileSystem()
    tmp_folder = '../tmp/'
    tmp_file_path = tmp_folder + dest_file

    # swap only the trailing extension; a blanket str.replace would also rewrite
    # any '.nc' that appears elsewhere in the name or folder
    if dest_file.endswith('.nc'):
        zarr_dest_file = dest_file[:-len('.nc')] + '.zarr'
    else:
        zarr_dest_file = dest_file
    zarr_file_path = os.path.join(dest_folder, zarr_dest_file)

    #check if destination path exists
    if not fs.exists(dest_folder):
        if not create_dest:
            print(f"Please confirm the destination folder exists using the touch() function: {dest_folder}. Or set create_dest=True.")  #extra check to place data in correct spot
            return None
        print(f"Creating destination folder: {dest_folder}...")
        fs.touch(dest_folder)  #create empty file to organize folder structure

    #check if file already exists in destination
    if fs.exists(zarr_file_path) and (not overwrite):
        #TODO - the exists() function seems unstable; if you remove files from another kernels the result may not register
        print(f"File {zarr_dest_file} already exists at {dest_folder} - (skipping download from {url} )")
        return None

    #download to temp folder
    print(f"Downloading to ../tmp/...")
    download_to_folder(url, tmp_folder, dest_file, overwrite=overwrite, create_dest=True)

    # download_to_folder only prints on an HTTP failure, so verify before opening
    if not os.path.isfile(tmp_file_path):
        print(f"Cancelling - no file was downloaded to {tmp_file_path}.")
        return None

    #copy to destination
    try:
        # the context manager closes the dataset before the temp file is deleted
        with xr.open_dataset(tmp_file_path) as tmp_xr:
            print(f"Transferring to GS {zarr_file_path}...")
            tmp_xr.to_zarr(zarr_file_path, mode='w')
    finally:
        #remove temp file, even when the transfer fails
        os.remove(tmp_file_path)
    print("Complete")

    return None

In [33]:
def cdsapi_custom_download(year: int, months: list, variable: str, dest_folder: str, dest_file: str,
                           overwrite=False, create_dest=False):
    """
    Downloads data using the cdsapi (European Centre for Medium-Range Weather Forecasts)
    The file is always retrieved to '../tmp/' first. If the destination folder is cloud storage
    ('gs://...') it is then written to Google Storage as a zarr file; otherwise it is written to
    the destination folder as a netCDF file. The temporary file is removed afterwards.

    Parameters
    ----------
    year: int
        The year of the data to be downloaded
    months:
        A list of months (set of integers from 1 to 12) of the data to be downloaded
    variable: string
        The variable name passed to the 'reanalysis-era5-single-levels-monthly-means' request
    dest_folder: string
        File path for the destination folder (local, or starting with 'gs://')
    dest_file: string
        File name for the downloaded file; for cloud destinations a trailing '.nc' is swapped for '.zarr'
    overwrite: boolean
        if true, will overwrite the dest_file at the dest_folder location
    create_dest: boolean
        if true, the destination folder will be created automatically;
        otherwise, the function will stop and require a user to manually create the folder as an extra validation step

    Returns
    ----------
    None
    """
    import os

    tmp_folder = '../tmp'
    tmp_file_path = '../tmp/' + dest_file
    file_path = os.path.join(dest_folder, dest_file)
    gfs = dest_folder.startswith('gs://')

    if gfs:
        import gcsfs  # only needed for cloud destinations
        fs = gcsfs.GCSFileSystem()

        # swap only the trailing extension (a blanket str.replace would touch every '.nc' in the path)
        if dest_file.endswith('.nc'):
            zarr_dest_file = dest_file[:-len('.nc')] + '.zarr'
        else:
            zarr_dest_file = dest_file
        zarr_file_path = os.path.join(dest_folder, zarr_dest_file)

        #check if destination path exists
        if not fs.exists(dest_folder):
            if not create_dest:
                print(f"Please confirm the destination folder exists using the touch() function: {dest_folder}. Or set create_dest=True.")  #extra check to place data in correct spot
                return None
            print(f"Creating destination folder: {dest_folder}...")
            fs.touch(dest_folder)  #create empty file to organize folder structure

        #check if data already exists or was previously downloaded
        if fs.exists(zarr_file_path) and (not overwrite):
            print(f"File {zarr_dest_file} already exists - (skipping download for {year} )")
            return None
    else:
        #check if destination path exists
        if not os.path.exists(dest_folder):
            if not create_dest:
                print(f"Please confirm the destination folder exists: {dest_folder}. Or set create_dest=True.")  #extra check to place data in correct spot
                return None
            os.makedirs(dest_folder, exist_ok=True)

        #check if data already exists or was previously downloaded
        if os.path.isfile(file_path) and (not overwrite):
            print(f"File {dest_file} already exists - (skipping download from {year} )")
            return None

    # heavy dependencies are imported only once a download is actually going to happen
    import cdsapi
    import xarray as xr

    #now download to temp folder
    os.makedirs(tmp_folder, exist_ok=True)
    c = cdsapi.Client()
    try:
        c.retrieve(
            'reanalysis-era5-single-levels-monthly-means',
            {
                'format': 'netcdf',
                'year': year,
                'variable': variable,
                'product_type': 'monthly_averaged_reanalysis',
                'month': months,
                'time': '00:00'
            },
            tmp_file_path)

        # the context manager closes the dataset before the temp file is deleted
        with xr.open_dataset(tmp_file_path) as tmp_xr:
            if gfs:  #load and transfer as zarr
                print(f"Transferring to GS {zarr_file_path}...")
                tmp_xr.to_zarr(zarr_file_path, mode='w')
            else:
                print(f"Transferring to Destination {file_path}...")
                tmp_xr.to_netcdf(file_path)
    finally:
        # never leave a (possibly partial) temp file behind
        if os.path.exists(tmp_file_path):
            os.remove(tmp_file_path)
    return None
        

In [31]:
def output_xarray_with_date(out_xarray, dest_folder: str, dest_filename: str, filetype='.nc', with_date=True, overwrite=False):
    """
    Outputs a file to a specified location and names it according to the date range contained in the xarray
    Must have a coordinate dimension named 'time' if outputting with_date=True

    Parameters
    ----------
    out_xarray : xarray dataset
        The object to output as a netCDF or Zarr file
    dest_folder: string
        file path for the destination folder (local, or starting with 'gs://')
    dest_filename: string
        file name desired for the output data; a trailing '.nc' or '.zarr' is stripped
        because the suffix is taken from filetype
    filetype: str
        Either '.nc' or '.zarr'; specifying the type of output
    with_date: boolean
        if true, the time range of the xarray (using dimension "time") will be appended to the end of file name as '_YYYYMM-YYYYMM'
    overwrite: boolean
        if true, will overwrite the dest_file at the dest_folder location

    Returns
    ----------
    None
    """
    import os

    #clean up if file name already includes the filetype suffix
    stripped_name = dest_filename.strip()
    base_name = dest_filename
    for suffix in ('.nc', '.zarr'):
        if stripped_name.endswith(suffix):
            base_name = stripped_name[:-len(suffix)]
            break

    if with_date:
        # min/max of the time coordinate, formatted as YYYYMM
        min_yearmonth = str(out_xarray.time.min().data.astype('datetime64[s]').item().strftime('%Y%m'))
        max_yearmonth = str(out_xarray.time.max().data.astype('datetime64[s]').item().strftime('%Y%m'))
        processed_filename = base_name + '_' + min_yearmonth + '-' + max_yearmonth + filetype
    else:
        processed_filename = base_name + filetype
    processed_file_path = os.path.join(dest_folder, processed_filename)

    #check if file already exists
    # os.path.exists (not isfile): a local .zarr store is a directory and must count as existing
    already_exists = os.path.exists(processed_file_path)
    if dest_folder[0:5] == 'gs://':
        import gcsfs
        fs = gcsfs.GCSFileSystem()
        if fs.exists(processed_file_path):
            already_exists = True

    if already_exists and not overwrite:
        print(f"Cancelling output - {processed_filename} already exists in {dest_folder}")
        return None

    if filetype == '.nc':
        out_xarray.to_netcdf(processed_file_path)
        print(f"Saved {processed_filename} to {dest_folder}")
    elif filetype == '.zarr':
        out_xarray.to_zarr(processed_file_path, mode='w')
        print(f"Saved {processed_filename} to {dest_folder}")
    else:
        print("Unsupported file output type; please choose '.nc' or '.zarr'")
    return None

#### XArray Operations

In [5]:
def xr_open_dataset_custom(file :str, decode_times=True):
    """
    Wrapper function for xarray.open_dataset() but compatible with either .nc (netcdf) or .zarr files.

    Three attempts are made in order: the default engine, the zarr engine on the same path,
    and (only for 'gs://...' paths ending in '.nc') the zarr engine on the path with the
    suffix swapped to '.zarr'.

    Parameters
    ----------
    file : string
        The file location of an xarray dataset
    decode_times : bool, optional
        If True, decode times encoded in the standard NetCDF datetime format
        into datetime objects. Otherwise, leave them encoded as numbers.

    Returns
    ----------
    xr : an xarray dataset object

    Raises
    ----------
    Exception
        whatever the last attempted xarray.open_dataset() call raised
    """
    import xarray as xr  # local import, like the other functions in this file

    #correct if a user specified a .nc but meant .zarr on Google cloud:
    cleaned = file.strip()
    updated_file = file
    if cleaned.startswith('gs://') and cleaned.endswith('.nc'):
        updated_file = cleaned[:-len('.nc')] + '.zarr'  #hardcoded for netcdfs

    # "except Exception" (not a bare except) so Ctrl-C / SystemExit are never swallowed
    try:
        return xr.open_dataset(file, decode_times=decode_times)
    except Exception as err:
        default_engine_error = err

    try:
        return xr.open_dataset(file, decode_times=decode_times, engine='zarr', chunks={})
    except Exception as zarr_error:
        if updated_file == file:
            # no alternate path to try; retrying the identical call would only fail again
            raise zarr_error from default_engine_error

    print(f'Encountered an error - trying with {updated_file}...')
    xr_ds = xr.open_dataset(updated_file, decode_times=decode_times, engine='zarr', chunks={})
    print('Success.')

    return xr_ds

In [5]:
def xr_open_mfdataset_custom(file: str):
    """
    Wrapper function for xarray.open_mfdataset() but compatible with either .nc (netcdf) or .zarr files.

    Three attempts are made in order: the default engine, the zarr engine on the same path,
    and (only for 'gs://...' paths ending in '.nc') the zarr engine on the path with the
    suffix swapped to '.zarr'.

    Parameters
    ----------
    file : string
        The file location of an xarray dataset

    Returns
    ----------
    xr : an xarray dataset object

    Raises
    ----------
    Exception
        whatever the last attempted xarray.open_mfdataset() call raised
    """
    import xarray as xr  # local import, like the other functions in this file

    #correct if a user specified a .nc but meant .zarr on Google cloud:
    cleaned = file.strip()
    updated_file = file
    if cleaned.startswith('gs://') and cleaned.endswith('.nc'):
        updated_file = cleaned[:-len('.nc')] + '.zarr'  #hardcoded for netcdfs

    # "except Exception" (not a bare except) so Ctrl-C / SystemExit are never swallowed
    try:
        return xr.open_mfdataset(file)
    except Exception as err:
        default_engine_error = err

    try:
        return xr.open_mfdataset(file, engine='zarr', chunks={})
    except Exception as zarr_error:
        if updated_file == file:
            # no alternate path to try; retrying the identical call would only fail again
            raise zarr_error from default_engine_error

    print(f'Encountered an error - trying with {updated_file}')
    xr_ds = xr.open_mfdataset(updated_file, engine='zarr', chunks={})
    print('Success.')

    return xr_ds

In [34]:
#This function was modified from Luke's
def add_time_to_globcolour(file: str):
    """
    Outputs an xarray dataset with a 'time' dimension based on an inputted file
    Must have a date in the file name in the YYYYMM format (e.g. '..._199802.nc' or '..._199802.zarr')

    Parameters
    ----------
    file : string
        The file location of an xarray dataset that is missing a time dimension.

    Returns
    ----------
    ds_tmp : an xarray dataset with the additional time coordinate (the 15th day of the month in the file name)

    Raises
    ----------
    ValueError
        if no standalone 6-digit YYYYMM group can be found in the file name
    """
    import os
    import re
    import numpy as np
    import pandas as pd

    # extract start date
    # Look at the file name without folder or extension, so the suffix length ('.nc' vs
    # '.zarr') does not matter; take the last standalone 6-digit group as YYYYMM.
    stem = os.path.splitext(os.path.basename(file.strip().rstrip('/')))[0]
    candidates = re.findall(r'(?<!\d)\d{6}(?!\d)', stem)
    if not candidates:
        raise ValueError(f"Could not find a YYYYMM date in file name: {file}")
    file_month = candidates[-1]
    pd_datetime = pd.to_datetime(file_month, format='%Y%m') + np.timedelta64(14, 'D')  #add days to be mid-month

    # open dataset and create time coordinate and dimension
    ds = xr_open_dataset_custom(file)
    ds_tmp = ds.assign_coords({'time': pd_datetime}).expand_dims(dim='time', axis=0)
    return ds_tmp

#chl_test = add_time_to_globcolour(data_folder_root+'CHL/originals/CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_199802.nc') 

In [None]:
def log_or_0_xr(xr_to_upd, field_name):
    """
    Take the base-10 logarithm of a DataArray, storing 0 wherever the log is -inf
    (i.e. where the input value is exactly zero).

    Parameters
    ----------
    xr_to_upd : DataArray
        must have time, ylat, xlon coordinates
    field_name : String
        desired name of the new log field in the output DataArray

    Returns
    ----------
    res : a DataArray of the same shape with log values
    """
    import numpy as np
    import xarray as xr

    # log10(0) triggers a divide-by-zero warning; silence it and patch the -inf afterwards
    with np.errstate(divide='ignore'):
        log_values = np.log10(xr_to_upd).values  #use .to_numpy() for newer versions of xr
    log_values = np.where(np.isneginf(log_values), 0.0, log_values)

    coordinates = {
        'time': xr_to_upd.time,
        'ylat': xr_to_upd.ylat,
        'xlon': xr_to_upd.xlon,
    }
    return xr.DataArray(log_values, coords=coordinates, dims=["time", "ylat", "xlon"], name=field_name)

In [1]:
def find_least_date_range(xarraylist):
    """
    Function to compute the minimum overlapping time range of a set of xarray datasets

    Parameters
    ----------
    xarraylist : list
        A list of xarray datasets. Each must contain a 'time' coordinate

    Returns
    ----------
    min_date : string
        The most recent start date (in YYYY-MM format) among the datasets
    max_date : string
        The earliest end date (in YYYY-MM format) among the datasets

    Raises
    ----------
    ValueError
        if xarraylist is empty
    """
    if len(xarraylist) == 0:
        raise ValueError("xarraylist must contain at least one dataset")

    def _as_datetime(value):
        # convert a numpy datetime64 result into a python datetime so strftime is available
        return value.data.astype('datetime64[s]').item()

    start_dates = [_as_datetime(ds.time.min()) for ds in xarraylist]
    end_dates = [_as_datetime(ds.time.max()) for ds in xarraylist]

    # the overlap starts at the latest start and ends at the earliest end
    return max(start_dates).strftime('%Y-%m'), min(end_dates).strftime('%Y-%m')

#### fCO2_to_pCO2

These functions were taken from the fCO2_to_pCO2.ipynb notebook with no changes other than updated variable names.

In [28]:
class UnitError(Exception):
    """Raised when array values fall outside the limits expected for their unit."""


def check_array_bounds(arr, lims, action="warn", name=""):
    """
    Checks that units are within the given limits. If not, then
    will raise/warn the user. Will always raise an error if more
    than half of the non-nan values are outside the limits.
    Inputs with two or fewer elements are returned unchecked.
    Parameters
    ----------
    arr : array-like
        The array that will be checked
    lims : tuple
        lower and upper limits of checks
        note that limits are exclusive (i.e. < and >, and not >=/<=)
    action: string
        raise - will raise an error and not continue
        warn - will throw a warning and mask values with nan
        quiet - same as warn, but without warning
        ignore - nothing will be done, but may result in bad data
    name: string
        if given, will inform the user of the name of the array
        to make debugging easier
    Return
    ------
    arr : array-like
        a float copy of the input with at least one dimension;
        if warn or quiet, out-of-limit values are masked with nans
    Raises
    ------
    UnitError
        if more than half of the non-nan values are outside the limits,
        or if action is "raise" and any value is outside the limits
    Exception
        if action is not one of raise/warn/quiet/ignore
    """

    from numpy import array, nan, isnan
    from warnings import warn

    arr = array(arr, ndmin=1, dtype=float)
    if arr.size <= 2:
        return arr

    outside = (arr < lims[0]) | (arr > lims[1])
    n_outside = int(outside.sum())

    non_nan_count = arr.size - isnan(arr).sum()
    if n_outside > (non_nan_count * 0.5):
        raise UnitError(
            f"More than half of the values in {name} are outside the limits "
            f"{str(lims)}. Check that input contains the correct units."
        )

    # Build the base message unconditionally; only the " of <name>" tail is optional.
    # (A conditional expression wrapped around the concatenated f-strings blanks the whole message.)
    msg = (
        f"There are {n_outside:d} values that do not fall within "
        f"the given limits {str(lims)}"
    )
    if name != "":
        msg += f" of {name}"

    if action == "raise":
        # in-range data must pass silently rather than fall through to the invalid-action error
        if n_outside:
            raise UnitError(msg)
    elif action == "warn":
        if n_outside:
            warn(msg, Warning)
        arr[outside] = nan
    elif action == "quiet":
        arr[outside] = nan
    elif action == "ignore":
        pass
    else:
        raise Exception("action must have raise/warn/quiet/ignore as inputs")

    return arr

In [3]:
def temp_K(temp_K):
    """
    Run check_array_bounds on a temperature given in Kelvin, using the limits
    (270, 318.5) and the "warn" action.
    """
    kelvin_limits = (270, 318.5)
    return check_array_bounds(temp_K, kelvin_limits, "warn", "temperature (K)")

In [4]:
def pres_atm(pres_atm):
    """
    Run check_array_bounds on a pressure given in atmospheres, using the limits
    (0.5, 1.5) and the "warn" action.
    """
    atm_limits = (0.5, 1.5)
    return check_array_bounds(pres_atm, atm_limits, "warn", "Pressure (atm)")

In [7]:
def CO2_mol(CO2_mol):
    """
    Run check_array_bounds on a CO2 mole fraction, using the limits
    (5e-6, 0.08) and the "warn" action.
    """
    mole_fraction_limits = (5e-6, 0.08)
    return check_array_bounds(CO2_mol, mole_fraction_limits, "warn", "CO2 mole fraction (ppm)")

In [5]:
def temperature_correction(temp_in, temp_out):
    """
    Correction factor for moving a pCO2 value from one temperature to another,
    following the empirical relationship used in Takahashi et al. 1993:
        pCO2_Tout = pCO2_Tin * factor
    Parameters
    ----------
    temp_in : np.array
        temperature at which original pCO2 is measured
    temp_out : np.array
        temperature for which pCO2 should be represented
    Return
    ------
    factor : np.array
        a correction factor to be multiplied to pCO2 (unitless)
    References
    ----------
    Takahashi, Taro et al. (1993). Seasonal variation of CO2 and nutrients in
        the high-latitude surface oceans: A comparative study. Global
        Biogeochemical Cycles, 7(4), 843–878. https://doi.org/10.1029/93GB02263
    """
    import numpy as np

    t_from = np.array(temp_in)
    t_to = np.array(temp_out)

    # linear and quadratic terms of the exponent (see the Takahashi 1993 paper)
    linear_term = 0.0433 * (t_to - t_from)
    quadratic_term = 4.35e-05 * (t_to ** 2 - t_from ** 2)

    return np.exp(linear_term - quadratic_term)

In [16]:
def virial_coeff(temp_K1, pres_atm1, xCO2_mol=None):
    """
    Calculate the ideal gas correction factor for converting pCO2 to fCO2.
    fCO2 = pCO2 * virial_expansion
    pCO2 = fCO2 / virial_expansion
    Based on the Lewis and Wallace 1998 Correction.
    Parameters
    ----------
    temp_K1 : np.array
        temperature in degrees Kelvin (validated with temp_K)
    pres_atm1 : np.array
        uncorrected pressure in atm (validated with pres_atm)
    xCO2_mol : np.array
        mole fraction of CO2. Can be pCO2/fCO2 if xCO2 is not defined or can
        leave this as undefined as makes only a small impact on output
    Return
    ------
    virial_expression : np.array
        the factor to multiply with pCO2. Unitless
    Examples
    --------
    The example below is from Dickson et al. (2007)
    >>> 350 * virial_coeff(298.15, 1)  # CO2 [uatm] * correction factor
    348.8836492182758
    References
    ----------
    Weiss, R. (1974). Carbon dioxide in water and seawater: the solubility of a
        non-ideal gas. Marine Chemistry, 2(3), 203–215.
        https://doi.org/10.1016/0304-4203(74)90015-2
    Compared with the Seacarb package in R
    """
    from numpy import array, exp

    # validate once; the returned arrays are what the calculation uses
    T = temp_K(temp_K1)
    P = pres_atm(pres_atm1)
    R = 82.057  # gas constant for ATM

    # B is the virial coefficient for pure CO2
    B = -1636.75 + 12.0408 * T - 0.0327957 * T ** 2 + 3.16528e-5 * T ** 3
    # d is the virial coefficient for CO2 in air
    d = 57.7 - 0.118 * T

    # "x2" term often neglected (assumed = 1) in applications of Weiss's
    # (1974) equation 9
    if xCO2_mol is not None:
        C = array(xCO2_mol)
        # called for its warning/error only: the unmasked C is what enters x2
        CO2_mol(C)
        x2 = (1 - C) ** 2
    else:
        x2 = 1

    ve = exp(P * (B + 2 * x2 * d) / (R * T))

    return ve

#350 * virial_coeff(298.15, 1) #348.88364922

In [None]:
#This function is not required in this set of code but preserving for other use
def fCO2_to_pCO2(fCO2SW_uatm, tempSW_C, pres_hPa=1013.25, tempEQ_C=None):
    """
    Convert seawater fCO2 to pCO2 (SOCAT convention).

    The basic relation is
        pCO2sw = fCO2sw / virial_exp
    with the virial expansion evaluated at an approximate equilibrator xCO2:
        xCO2eq = fCO2sw * deltaTemp(sw - eq) / press_eq
        pCO2sw = fCO2sw / virial_exp(xCO2eq)
    Parameters
    ----------
    fCO2SW_uatm : array
        seawater fugacity of CO2 in micro atmospheres
    tempSW_C : array
        sea water temperature in degrees C
    pres_hPa : array
        equilibrator pressure in hecto Pascals
    tempEQ_C : array
        equilibrator temperature in degrees C; when omitted the sea water
        temperature is used and no temperature correction is applied
    Returns
    -------
    pCO2SW_uatm : array
        partial pressure of CO2 in seawater
    Note
    ----
    xCO2 is a first estimate based on fCO2 rather than pCO2. FluxEngine solves
    for it iteratively; the difference between the two approaches is on the
    order of 1e-14 atm (1e-8 uatm).
    Examples
    --------
    >>> fCO2_to_pCO2(380, 8)
    381.50806485658234
    >>> fCO2_to_pCO2(380, 8, pres_hPa=985)
    381.4659553134281
    >>> fCO2_to_pCO2(380, 8, pres_hPa=985, tempEQ_C=14)
    381.466027968504
    """
    # remember whether an equilibrator temperature was supplied before defaulting it
    has_equilibrator_temp = tempEQ_C is not None
    if not has_equilibrator_temp:
        tempEQ_C = tempSW_C

    # unit conversion + range validation
    fCO2sw = CO2_mol(fCO2SW_uatm * 1e-6)
    Tsw = temp_K(tempSW_C + 273.15)
    Teq = temp_K(tempEQ_C + 273.15)
    Peq = pres_atm(pres_hPa / 1013.25)

    # identical temperatures need no correction, so skip the computation
    if has_equilibrator_temp:
        dT = temperature_correction(Tsw, Teq)
    else:
        dT = 1.0

    # approximate equilibrator xCO2 (the exact value would need pCO2 / Peq)
    xCO2eq = fCO2sw * dT / Peq

    return fCO2sw / virial_coeff(Tsw, Peq, xCO2eq) * 1e6

#fCO2_to_pCO2(380, 8) #381.50806486

In [None]:
#This function is not required in this set of code but preserving for other use
def pCO2_to_fCO2(pCO2SW_uatm, tempSW_C, pres_hPa=None, tempEQ_C=None):
    """
    Convert pCO2 to fCO2 for SOCAT in sea water. A simple version of the
    equation would simply be:
        fCO2sw = pCO2sw * virial_exp
    where the virial expansion is calculated without xCO2
    We get a simple approximate for equilibrator xCO2 with:
        xCO2eq = pCO2sw * deltaTemp(sw - eq) / press_eq
    fCO2sw is then calculated with:
        fCO2sw = pCO2sw * virial_exp(xCO2eq)
    Parameters
    ----------
    pCO2SW_uatm : array
        seawater partial pressure of CO2 in micro atmospheres
    tempSW_C : array
        sea water temperature in degrees C
    pres_hPa : array
        equilibrator pressure in hecto Pascals (defaults to 1013.25)
    tempEQ_C : array
        equilibrator temperature in degrees C (defaults to tempSW_C)
    Returns
    -------
    fCO2SW_uatm : array
        fugacity of CO2 in seawater in micro atmospheres
    Note
    ----
    In FluxEngine, they account for the change in xCO2. This error is so small
    that it is not significant to be concerned about it. Their correction is
    more precise, but the difference between their iterative correction and our
    approximation is less than 1e-14 atm (or 1e-8 uatm).
    Examples
    --------
    >>> pCO2_to_fCO2(380, 8)
    378.49789637942064
    >>> pCO2_to_fCO2(380, 8, pres_hPa=985)
    378.53967828231225
    >>> pCO2_to_fCO2(380, 8, pres_hPa=985, tempEQ_C=14)
    378.53960618459695
    """
    # if equilibrator inputs are None then make defaults Patm=1, tempEQ=tempSW
    if tempEQ_C is None:
        tempEQ_C = tempSW_C
    if pres_hPa is None:
        pres_hPa = 1013.25

    # standardise the inputs and convert units
    pCO2sw = CO2_mol(pCO2SW_uatm * 1e-6)
    Tsw = temp_K(tempSW_C + 273.15)
    Teq = temp_K(tempEQ_C + 273.15)
    Peq = pres_atm(pres_hPa / 1013.25)

    # calculate the CO2 diff due to equilibrator and seawater temperatures
    dT = temperature_correction(Tsw, Teq)
    # a best estimate of xCO2 - this is an approximation
    # one would have to use pCO2 / Peq to get real xCO2
    xCO2eq = pCO2sw * dT / Peq

    fCO2sw = pCO2sw * virial_coeff(Tsw, Peq, xCO2eq)
    fCO2sw_uatm = fCO2sw * 1e6

    return fCO2sw_uatm

#pCO2_to_fCO2(380, 8) #378.49789638

## Machine Learning Functions

#### Model Evaluation

In [None]:
def evaluate_test(y, pred):
    """
    Compute a set of evaluation metrics comparing predictions with observations
    Parameters
    ----------
    y : numpy array
        actual values for a dependent variable
    pred : numpy array
        predicted values for the dependent variable
    Returns
    -------
    scores : dictionary
        a dictionary of 13 metrics: error measures (mse, mae, medae, max_error,
        bias, cent_rmse), agreement measures (r2, corr, amp_ratio), the spread
        of the predictions (stdev) and of the reference (stdev_ref, range_ref, iqr_ref)
    """
    import numpy as np
    from sklearn.metrics import r2_score, max_error, mean_squared_error, mean_absolute_error, median_absolute_error

    # centered RMSE: RMSE after removing each series' own mean
    pred_anomaly = pred - np.mean(pred)
    y_anomaly = y - np.mean(y)
    centered_rmse = np.sqrt(np.square(pred_anomaly - y_anomaly).sum() / pred.size)

    scores = {}
    scores['mse'] = mean_squared_error(y, pred)
    scores['mae'] = mean_absolute_error(y, pred)
    scores['medae'] = median_absolute_error(y, pred)
    scores['max_error'] = max_error(y, pred)
    scores['bias'] = pred.mean() - y.mean()
    scores['r2'] = r2_score(y, pred)
    scores['corr'] = np.corrcoef(y, pred)[0, 1]
    scores['cent_rmse'] = centered_rmse
    scores['stdev'] = np.std(pred)
    # ratio of predicted to observed value range; added when doing temporal decomposition
    scores['amp_ratio'] = (np.max(pred) - np.min(pred)) / (np.max(y) - np.min(y))
    scores['stdev_ref'] = np.std(y)
    scores['range_ref'] = np.max(y) - np.min(y)
    upper_quartile, lower_quartile = np.percentile(y, [75, 25])
    scores['iqr_ref'] = np.subtract(upper_quartile, lower_quartile)
    return scores

In [None]:
#This function is useful to format the output from evaluate_test()
def print_dict_as_table(seq, columns=4):
    """
    Prints a dictionary of numeric values as a fixed-width table, filling column by column
    Parameters
    ----------
    seq : dictionary
        a dictionary to print; keys are strings, values are numbers
    columns : int
        number of columns to print
    Returns
    -------
    None
    """
    keys = list(seq)
    total = len(keys)
    rows = (total // columns) + 1

    lines = []
    for row in range(rows):
        cells = []
        for column in range(columns):
            position = row + rows * column
            if position < total:
                key = keys[position]
                # key padded/truncated to 9 characters, value shown with 3 decimals
                text = '{:.9s}: {:.3f}'.format(key + ' ' * 20, round(seq[key], 3))
            else:
                text = ''
            cells.append(text.ljust(24))
        lines.append(''.join(cells) + '\n')
    print(''.join(lines))

## Other Functions

TBD