# Compute additional carbon chemistry variables and extract surface and bottom data
Created by Ivan Lima on Thu Jan  6 2022 11:17:56 -0500

In this notebook we compute additional carbon chemistry variables, extract the data near the surface and bottom, and create seasonal and monthly fields of surface and bottom variables (temperature, salinity, DIC, TA and the derived carbon chemistry variables) using interpolation.

In [1]:
import pandas as pd
import numpy as np
import datetime, warnings
from tqdm import notebook
# Silence library warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')
# Record when the notebook was last executed.
run_time = datetime.datetime.now()
print('Last updated on {}'.format(run_time.ctime()))

Last updated on Wed Apr  6 17:03:29 2022


## Read data

In [2]:
# Columns carried through the rest of the notebook.
cols = ['Cast', 'Latitude', 'Longitude', 'Date', 'Depth', 'Temperature', 'Salinity', 'DIC', 'TA', 'bottom_depth', 'Platform_Type']

# Season label for each calendar month (winter = Dec-Feb, ..., fall = Sep-Nov).
month_to_season = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall',
}

df_wod = pd.read_hdf('data/CombinedCTD_satellite_bgc_pytorch_2002-2019.h5', key='df_ctd')[cols]

# Keep contemporary data (sampled after 2012) from the shelf (shallower than 600).
# NOTE(review): the shelf cut is applied to the sample depth (Depth), not to
# bottom_depth, so shallow samples from deep-water casts are retained -- confirm
# that this is the intended criterion.
is_contemporary = df_wod.Date.dt.year > 2012
is_shelf = df_wod.Depth < 600
df_wod = df_wod[is_contemporary & is_shelf]

# Label every sample with the season of its sampling month.
df_wod = df_wod.assign(season=df_wod.Date.dt.month.map(month_to_season))
df_wod.head()

Unnamed: 0,Cast,Latitude,Longitude,Date,Depth,Temperature,Salinity,DIC,TA,bottom_depth,Platform_Type,season
5686516,15637258.0,44.267502,-63.317699,2013-01-09,0.991947,3.952,30.871,2026.018433,2128.127441,144,1.0,winter
5686517,15637258.0,44.267502,-63.317699,2013-01-09,1.487919,3.943,30.868999,2026.168091,2128.027832,144,1.0,winter
5686518,15637258.0,44.267502,-63.317699,2013-01-09,1.98389,3.937,30.870001,2026.392456,2127.993164,144,1.0,winter
5686519,15637258.0,44.267502,-63.317699,2013-01-09,2.479859,3.936,30.870001,2026.564087,2127.935303,144,1.0,winter
5686520,15637258.0,44.267502,-63.317699,2013-01-09,2.975827,3.936,30.870001,2026.731812,2127.877686,144,1.0,winter


## Remove outliers ($|x - \mu| \geq 3\sigma$)

In [3]:
# Screen the four measured variables: a row is kept only when every one of them
# lies strictly within three standard deviations of its column mean. Rows with a
# missing value in any of the four variables fail the test and are dropped too.
df = df_wod[['Temperature', 'Salinity', 'DIC', 'TA']]
deviation = (df - df.mean()).abs()
within_limits = (deviation < 3*df.std()).all(axis=1)
df_clean = df[within_limits]

n_orig = len(df_wod)
n_clean = len(df_clean)
n_removed = n_orig - n_clean

# Restrict the full table to the rows that survived the screening.
df_wod = df_wod.reindex(df_clean.index)
print('Original data: {:,} points'.format(n_orig))
print('Cleaned data:  {:,} points'.format(n_clean))
print('{:,} points removed ({:.2f}%)'.format(n_removed, n_removed/n_orig * 100))

Original data: 8,417,747 points
Cleaned data:  8,379,138 points
38,609 points removed (0.46%)


## Compute additional carbon chemistry variables

In [4]:
import PyCO2SYS as pyco2
import gsw

# Water pressure in dbar at every sample; Depth is negated to form the z
# argument handed to gsw.p_from_z.
pressure = gsw.p_from_z(-df_wod.Depth.values, df_wod.Latitude.values)

# Inputs and option codes for PyCO2SYS: the carbonate system is solved from the
# TA/DIC pair at the sample salinity, temperature and pressure.
kwargs = {
    'par1': df_wod.TA.values,    # TA
    'par2': df_wod.DIC.values,   # DIC
    'par1_type': 1,              # type 1 = alkalinity
    'par2_type': 2,              # type 2 = DIC
    'salinity': df_wod.Salinity.values,
    'temperature': df_wod.Temperature.values,
    'pressure': pressure,
    'opt_k_carbonic': 10,        # LDK00, Lueker et al 2000
    'opt_k_bisulfate': 1,        # D90a, Dickson 1990
    'opt_total_borate': 2,       # LKB10, Lee et al 2010
    'opt_k_fluoride': 2,         # PF87, Perez & Fraga 1987
}
results = pyco2.sys(**kwargs)

# Derived variables copied from the PyCO2SYS results into the data frame.
co2sys_vars = ['pH', 'pCO2', 'fCO2', 'saturation_calcite', 'saturation_aragonite']
df_wod = df_wod.assign(**{vname: results[vname] for vname in co2sys_vars})

df_wod.head()

Unnamed: 0,Cast,Latitude,Longitude,Date,Depth,Temperature,Salinity,DIC,TA,bottom_depth,Platform_Type,season,pH,pCO2,fCO2,saturation_calcite,saturation_aragonite
5686516,15637258.0,44.267502,-63.317699,2013-01-09,0.991947,3.952,30.871,2026.018433,2128.127441,144,1.0,winter,7.989841,428.509684,426.723842,1.991852,1.244721
5686517,15637258.0,44.267502,-63.317699,2013-01-09,1.487919,3.943,30.868999,2026.168091,2128.027832,144,1.0,winter,7.989217,429.129701,427.341061,1.988314,1.242497
5686518,15637258.0,44.267502,-63.317699,2013-01-09,1.98389,3.937,30.870001,2026.392456,2127.993164,144,1.0,winter,7.988471,429.892801,428.100838,1.98466,1.240216
5686519,15637258.0,44.267502,-63.317699,2013-01-09,2.479859,3.936,30.870001,2026.564087,2127.935303,144,1.0,winter,7.987753,430.634282,428.839205,1.981431,1.238205
5686520,15637258.0,44.267502,-63.317699,2013-01-09,2.975827,3.936,30.870001,2026.731812,2127.877686,144,1.0,winter,7.987031,431.382436,429.58424,1.978266,1.236234


## Save data to HDF5 file

In [5]:
# The output file name carries the first and last sampling year in the data.
sample_years = df_wod.Date.dt.year
yr_min = sample_years.min()
yr_max = sample_years.max()
outfile = 'data/bgc_all_{}-{}.h5'.format(yr_min, yr_max)
print('writing {}'.format(outfile))
# mode='w' replaces any existing file of the same name.
df_wod.to_hdf(outfile, key='df_wod_bgc', mode='w', complevel=9)

writing data/bgc_all_2013-2019.h5


## Extract surface & bottom data

In [6]:
# Exclude glider data (Platform_Type 2); rows with a missing platform type are kept.
not_glider = df_wod.Platform_Type.ne(2)
df_wod = df_wod[not_glider]

In [7]:
# Surface values for each cast: average of all samples in the upper 5 m.
df_upper = df_wod[df_wod.Depth<=5]

# Average only the numeric columns. numeric_only=True makes this explicit:
# pandas >= 2.0 no longer drops non-numeric columns silently and would raise on
# the string 'season' column. Date must not be averaged here either, because it
# is supplied by df_tmp2 below and a second Date column would be duplicated.
df_tmp1 = df_upper.groupby('Cast').mean(numeric_only=True)

# Date and season of each cast, indexed by Cast. first() (first non-null value
# per column in each group) is used instead of nth(0): since pandas 2.0 nth()
# acts as a filter that keeps the original row index, so its result would no
# longer align with df_tmp1 on Cast in the concat below.
df_tmp2 = df_upper[['Cast','Date','season']].groupby('Cast').first()

# Join the two pieces on Cast and put the columns in the standard order.
df_surface = pd.concat([df_tmp1, df_tmp2], axis=1)[cols[1:] + ['season'] + co2sys_vars]

# write data to hdf5 file
outfile = 'data/bgc_surface_{}-{}.h5'.format(yr_min, yr_max)
print('writing {}'.format(outfile))
df_surface.to_hdf(outfile, key='df_surface', mode='w', complevel=9)
# df_surface.head()

writing data/bgc_surface_2013-2019.h5


In [8]:
# get values at the bottom of each cast
def get_bottom_vals(dfgrp):
    """Return the deepest sample of one cast as a one-row DataFrame.

    dfgrp holds the rows of a single cast. The rows are ordered by Depth and
    the last (deepest) one is returned, restricted to the columns
    cols[1:] + ['season'] + co2sys_vars (module-level lists). An empty input
    gives an empty result.
    """
    keep = cols[1:] + ['season'] + co2sys_vars
    by_depth = dfgrp.sort_values('Depth')
    return by_depth.tail(1)[keep]

# Deepest row of every cast. apply() returns a (Cast, original row label)
# MultiIndex; dropping the inner level leaves Cast as the index.
bottom_rows = df_wod.groupby('Cast').apply(get_bottom_vals)
df_bottom = bottom_rows.droplevel(1)

# write data to hdf5 file
outfile = 'data/bgc_bottom_{}-{}.h5'.format(yr_min, yr_max)
print('writing {}'.format(outfile))
df_bottom.to_hdf(outfile, key='df_bottom', mode='w', complevel=9)
# df_bottom.head()

writing data/bgc_bottom_2013-2019.h5


## Interpolate data into horizontal fields

### Create grid

In [9]:
# Grid limits. The lower longitude and latitude bounds follow the data (rounded
# to whole degrees); the upper bounds are fixed for the NELME region.
# Data-driven alternative for all four bounds:
# lon_min, lon_max = np.round(df_wod.Longitude.min()), np.round(df_wod.Longitude.max())
# lat_min, lat_max = np.round(df_wod.Latitude.min()), np.round(df_wod.Latitude.max())
lon_min = np.round(df_wod.Longitude.min())
lat_min = np.round(df_wod.Latitude.min())
lon_max = -65  # NELME region
lat_max = 45.5 # NELME region

# Alternative: grid built with a fixed step instead of a fixed number of points
# dx = 0.10 # grid resolution in degrees
# xx = np.arange(lon_min, lon_max+dx/4, dx)
# yy = np.arange(lat_min, lat_max+dx/4, dx)
# lon, lat = np.meshgrid(xx, yy)

npts = 89 # number of points -> 0.125 degree resolution
# Ratio of latitude to longitude extent, used to scale the number of latitude
# points relative to npts.
c = (lat_max - lat_min)/(lon_max - lon_min)
n_lat = int(np.round(c * npts))
xx = np.linspace(lon_min, lon_max, npts)
yy = np.linspace(lat_min, lat_max, n_lat)
lon, lat = np.meshgrid(xx, yy)

In [10]:
# import hvplot.pandas
# df_grid = pd.DataFrame({'lon':lon.ravel(), 'lat':lat.ravel()})
# df_grid.hvplot.points('lon', 'lat', geo=True, xlabel='longitude', ylabel='latitude',
#                       title='Data grid', tiles='EsriTerrain', frame_width=300, size=1, c='red')

### Create topography mask

In [11]:
import xarray as xr
from scipy.interpolate import griddata

# Interpolate the bathymetry in data/etopo5.nc onto the analysis grid. The
# dataset is opened in a context manager so that the file is closed even if
# the interpolation raises.
with xr.open_dataset('data/etopo5.nc') as ds_topo:
    X = np.where(ds_topo.X>180, ds_topo.X-360, ds_topo.X) # 0:360 -> -180:180
    lon_topo, lat_topo = np.meshgrid(X, ds_topo.Y.values)
    grid_topo = griddata((lon_topo.ravel(), lat_topo.ravel()), ds_topo.bath.values.ravel(),
                         (lon,lat), method='linear')

# Shelf mask: grid cells whose interpolated bath value lies between -600 and -5
# (presumably elevation in metres, negative below sea level -- confirm against the file).
shelf_mask = (grid_topo<-5) & (grid_topo>-600)
da_mask = xr.DataArray(shelf_mask, coords={'lat':yy, 'lon':xx},
                       attrs = {'long_name':'shelf mask'}, name='SHELF_MASK')

In [12]:
# import hvplot.xarray
# da_topo = xr.DataArray(grid_topo, coords={'lat':yy, 'lon':xx},
#                        attrs = {'long_name':'topography'}, name='Topography')
# da_topo.hvplot(geo=True, coastline='50m', frame_width=300, cmap='kbc', title='Topography')

### Seasonal fields

#### Interpolate surface fields

In [13]:
import pyresample as prs

# Metadata (long name and units) attached to each interpolated variable.
varinfo = {
    'Temperature':          {'long_name':'temperature', 'units':'degree C'},
    'Salinity':             {'long_name':'salinity', 'units':'PSU'},
    'DIC':                  {'long_name':'dissolved inorganic carbon', 'units':'micro mol/kg'},
    'TA':                   {'long_name':'total alkalinity', 'units':'micro mol/kg'},
    'pH':                   {'long_name':'pH', 'units':''},
    'pCO2':                 {'long_name':'seawater partial pressure of CO2', 'units':'micro atm'},
    'fCO2':                 {'long_name':'seawater fugacity of CO2', 'units':'micro atm'},
    'saturation_calcite':   {'long_name':'calcite saturation state', 'units':''},
    'saturation_aragonite': {'long_name':'aragonite saturation state', 'units':''},
}

# Target grid and resampling settings (reused by the cells that follow).
out_grid = prs.geometry.SwathDefinition(lons=lon, lats=lat)
radius = 100000 # in meters

def wf(r):
    """Weighting function: 1 at zero distance, decreasing linearly to 0 at `radius`."""
    return 1 - r/radius

# One list of 2-D fields per variable, filled in season order.
datavars = {vname: [] for vname in varinfo}

# Interpolate the surface data of each season onto the grid.
season_names = ['winter','spring','summer','fall']
for season in notebook.tqdm(season_names, desc='season'):
    df = df_surface[df_surface.season==season]
    in_grid = prs.geometry.SwathDefinition(lons=df.Longitude.values, lats=df.Latitude.values)
    for vname, fields in datavars.items():
        # alternative: griddata((df.Longitude, df.Latitude), df[vname], (lon,lat), method='linear')
        fields.append(
            prs.kd_tree.resample_custom(in_grid, df[vname].values, out_grid,
                                        radius_of_influence=radius, weight_funcs=wf,
                                        fill_value=np.nan, nprocs=2))

# Stack the seasonal fields of each variable into a (season, lat, lon) array and
# blank out everything outside the shelf mask.
seasons = range(1,5)
vardict = {}
for vname, fields in datavars.items():
    da = xr.DataArray(np.stack(fields, axis=0),
                      coords={'season':seasons, 'lat':yy, 'lon':xx},
                      attrs=varinfo[vname], name=vname)
    vardict[vname] = da.where(da_mask)

# Collect everything in a Dataset and write it to a netCDF file.
created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Seasonal interpolated surface data',
         'seasons':'1 = winter (DJF), 2 = spring (MAM), 3 = summer (JJA), 4 = fall (SON)',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
ds_surface_seasonal = xr.Dataset(vardict, attrs=attrs)
ds_surface_seasonal.to_netcdf('data/bgc_surface_seasonal.nc', mode='w')

season:   0%|          | 0/4 [00:00<?, ?it/s]

#### Interpolate bottom fields

In [14]:
# Variables interpolated for the bottom fields (pCO2 and fCO2 are left out).
bottom_names = ['Temperature', 'Salinity', 'DIC', 'TA', 'pH', 'saturation_calcite', 'saturation_aragonite']
varinfo_bot = {k:varinfo[k] for k in bottom_names}

# One list of 2-D fields per variable, filled in season order.
datavars = {vname: [] for vname in varinfo_bot}

# Interpolate the bottom data of each season onto the grid.
for season in notebook.tqdm(['winter','spring','summer','fall'], desc='season'):
    df = df_bottom[df_bottom.season==season]
    in_grid = prs.geometry.SwathDefinition(lons=df.Longitude.values, lats=df.Latitude.values)
    for vname, fields in datavars.items():
        # alternative: griddata((df.Longitude, df.Latitude), df[vname], (lon,lat), method='linear')
        fields.append(
            prs.kd_tree.resample_custom(in_grid, df[vname].values, out_grid,
                                        radius_of_influence=radius, weight_funcs=wf,
                                        fill_value=np.nan, nprocs=2))

# Stack the seasonal fields of each variable into a (season, lat, lon) array and
# blank out everything outside the shelf mask.
vardict = {}
for vname, fields in datavars.items():
    da = xr.DataArray(np.stack(fields, axis=0),
                      coords={'season':seasons, 'lat':yy, 'lon':xx},
                      attrs=varinfo[vname], name=vname)
    vardict[vname] = da.where(da_mask)

# Collect everything in a Dataset and write it to a netCDF file.
created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Seasonal interpolated bottom data',
         'seasons':'1 = winter (DJF), 2 = spring (MAM), 3 = summer (JJA), 4 = fall (SON)',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
ds_bottom_seasonal = xr.Dataset(vardict, attrs=attrs)
ds_bottom_seasonal.to_netcdf('data/bgc_bottom_seasonal.nc', mode='w')

season:   0%|          | 0/4 [00:00<?, ?it/s]

### Monthly fields

#### Interpolate surface fields

In [15]:
# One list of 2-D fields per variable, filled in month order.
datavars = {vname: [] for vname in varinfo}

# Interpolate the surface data of each calendar month onto the grid.
months = range(1,13)
for mon in notebook.tqdm(months, desc='month'):
    df = df_surface[df_surface.Date.dt.month==mon]
    in_grid = prs.geometry.SwathDefinition(lons=df.Longitude.values, lats=df.Latitude.values)
    for vname, fields in datavars.items():
        # alternative: griddata((df.Longitude, df.Latitude), df[vname], (lon,lat), method='linear')
        fields.append(
            prs.kd_tree.resample_custom(in_grid, df[vname].values, out_grid,
                                        radius_of_influence=radius, weight_funcs=wf,
                                        fill_value=np.nan, nprocs=2))

# Stack the monthly fields of each variable into a (month, lat, lon) array and
# blank out everything outside the shelf mask.
vardict = {}
for vname, fields in datavars.items():
    da = xr.DataArray(np.stack(fields, axis=0),
                      coords={'month':months, 'lat':yy, 'lon':xx},
                      attrs=varinfo[vname], name=vname)
    vardict[vname] = da.where(da_mask)

# Collect everything in a Dataset and write it to a netCDF file.
created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Monthly interpolated surface data',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
ds_surface_monthly = xr.Dataset(vardict, attrs=attrs)
ds_surface_monthly.to_netcdf('data/bgc_surface_monthly.nc', mode='w')

month:   0%|          | 0/12 [00:00<?, ?it/s]

#### Interpolate bottom fields

In [16]:
# One list of 2-D fields per bottom variable, filled in month order.
datavars = {vname: [] for vname in varinfo_bot}

# Interpolate the bottom data of each calendar month onto the grid.
for mon in notebook.tqdm(months, desc='month'):
    df = df_bottom[df_bottom.Date.dt.month==mon]
    in_grid = prs.geometry.SwathDefinition(lons=df.Longitude.values, lats=df.Latitude.values)
    for vname, fields in datavars.items():
        # alternative: griddata((df.Longitude, df.Latitude), df[vname], (lon,lat), method='linear')
        fields.append(
            prs.kd_tree.resample_custom(in_grid, df[vname].values, out_grid,
                                        radius_of_influence=radius, weight_funcs=wf,
                                        fill_value=np.nan, nprocs=2))

# Stack the monthly fields of each variable into a (month, lat, lon) array and
# blank out everything outside the shelf mask.
vardict = {}
for vname, fields in datavars.items():
    da = xr.DataArray(np.stack(fields, axis=0),
                      coords={'month':months, 'lat':yy, 'lon':xx},
                      attrs=varinfo[vname], name=vname)
    vardict[vname] = da.where(da_mask)

# Collect everything in a Dataset and write it to a netCDF file.
created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Monthly interpolated bottom data',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
ds_bottom_monthly = xr.Dataset(vardict, attrs=attrs)
ds_bottom_monthly.to_netcdf('data/bgc_bottom_monthly.nc', mode='w')

month:   0%|          | 0/12 [00:00<?, ?it/s]