# Extract surface and bottom DIC and total alkalinity (TA) data
Created by Ivan Lima on Thu Jan  6 2022 11:17:56 -0500

In this notebook we extract the estimated DIC and TA near the surface and bottom and create seasonal and monthly fields of surface and bottom DIC and TA using interpolation. 

In [1]:
import pandas as pd
import numpy as np
import datetime
# Stamp the notebook with the wall-clock time of this run.
last_updated = datetime.datetime.now().ctime()
print('Last updated on {}'.format(last_updated))

Last updated on Thu Jan  6 13:50:45 2022


## Read data

In [2]:
# Columns kept from the combined CTD table.
cols = ['Cast', 'Latitude', 'Longitude', 'Date', 'Depth', 'Temperature', 'Salinity', 'DIC', 'TA', 'bottom_depth', 'Platform_Type']

# Calendar month -> season label (winter = Dec-Feb, spring = Mar-May,
# summer = Jun-Aug, fall = Sep-Nov).
month_to_season = {12: 'winter', 1: 'winter', 2: 'winter',
                   3: 'spring', 4: 'spring', 5: 'spring',
                   6: 'summer', 7: 'summer', 8: 'summer',
                   9: 'fall', 10: 'fall', 11: 'fall'}

df_ctd = pd.read_hdf('data/CombinedCTD_satellite_bgc_pytorch_2002-2019.h5', key='df_ctd')
df_ctd = df_ctd[cols]
df_ctd = df_ctd[df_ctd.Date.dt.year>2012] # use only contemporary data
# NOTE(review): this filters on the sample Depth, not on bottom_depth, so the
# upper 600 m of deeper casts is kept as well -- confirm this is the intended
# definition of "shelf data".
df_ctd = df_ctd[df_ctd.Depth<600]         # use only shelf data
# Build the season column in one vectorised step on a new frame rather than
# writing into a filtered slice with four masked .loc assignments (the pattern
# behind SettingWithCopyWarning).
df_ctd = df_ctd.assign(season=df_ctd.Date.dt.month.map(month_to_season))
df_ctd.head()

Unnamed: 0,Cast,Latitude,Longitude,Date,Depth,Temperature,Salinity,DIC,TA,bottom_depth,Platform_Type,season
5686516,15637258.0,44.267502,-63.317699,2013-01-09,0.991947,3.952,30.871,2061.806152,2135.737793,144,1.0,winter
5686517,15637258.0,44.267502,-63.317699,2013-01-09,1.487919,3.943,30.868999,2061.752686,2135.686523,144,1.0,winter
5686518,15637258.0,44.267502,-63.317699,2013-01-09,1.98389,3.937,30.870001,2061.651855,2135.631348,144,1.0,winter
5686519,15637258.0,44.267502,-63.317699,2013-01-09,2.479859,3.936,30.870001,2061.583984,2135.582031,144,1.0,winter
5686520,15637258.0,44.267502,-63.317699,2013-01-09,2.975827,3.936,30.870001,2061.519287,2135.533203,144,1.0,winter


## Remove outliers ($|x - \bar{x}| \geq 3\sigma$)

In [3]:
# A row is kept only if all four variables lie strictly within 3 standard
# deviations of their column mean. Comparisons against NaN are False, so rows
# with a missing value in any of these columns are dropped as well.
df = df_ctd[['Temperature', 'Salinity', 'DIC', 'TA']]
within_3sd = ((df - df.mean()).abs() < 3*df.std()).all(axis=1)
df_clean = df[within_3sd]
n_orig, n_clean = len(df_ctd), len(df_clean)
# Select rows with the boolean mask instead of reindexing by label: it picks
# the same rows, but does not fail when the index contains duplicate labels.
df_ctd = df_ctd[within_3sd]
print('Original data: {:,} points'.format(n_orig))
print('Cleaned data:  {:,} points'.format(n_clean))
print('{:,} points removed ({:.2f}%)'.format(n_orig - n_clean, (n_orig - n_clean)/n_orig * 100))

Original data: 8,417,747 points
Cleaned data:  8,347,704 points
70,043 points removed (0.83%)


## Extract surface & bottom data

In [4]:
# get surface values (mean for upper 5 m)
surface_by_cast = df_ctd[df_ctd.Depth<=5].groupby('Cast')
# numeric_only=True: Date and season cannot be averaged. Older pandas dropped
# such columns silently, newer pandas raises unless this is requested.
df_tmp1 = surface_by_cast.mean(numeric_only=True)
# One Date/season per cast, indexed by Cast. first() is used instead of nth(0)
# because nth() keeps the original row index in pandas >= 2.0, which would
# misalign the concat below. Date and season are never missing here, so the
# first non-null value is the value of the first row.
df_tmp2 = surface_by_cast[['Date', 'season']].first()
df_surface = pd.concat([df_tmp1, df_tmp2], axis=1)[cols[1:]+['season']]
df_surface.to_hdf('data/bgc_surface.h5', key='df_surface', mode='w', complevel=9) # write data to hdf5 file
df_surface.head()

Unnamed: 0_level_0,Latitude,Longitude,Date,Depth,Temperature,Salinity,DIC,TA,bottom_depth,Platform_Type,season
Cast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15637258.0,44.267502,-63.317699,2013-01-09,2.975823,3.938111,30.869889,2061.527588,2135.535645,144.0,1.0,winter
15637379.0,44.930698,-66.849503,2013-01-14,3.47158,5.903143,32.403,2075.17334,2185.747559,23.0,1.0,winter
15637792.0,44.267502,-63.317501,2013-01-29,2.975823,2.007222,30.969,2065.900879,2149.934082,144.0,1.0,winter
15638442.0,44.930698,-66.849503,2013-02-11,3.967516,3.2588,32.4018,2081.686279,2192.593994,23.0,1.0,winter
15638623.0,44.268002,-63.319199,2013-02-15,2.727838,1.0801,31.0658,2061.78418,2145.269775,144.0,1.0,winter


In [5]:
# get values at the bottom of each cast
def get_bottom_vals(dfgrp, columns=None):
    """Return the deepest sample of one cast as a one-row DataFrame.

    Parameters
    ----------
    dfgrp : pandas.DataFrame
        Rows belonging to a single cast; must contain a 'Depth' column.
    columns : list of str, optional
        Columns to return. Defaults to ``cols[1:] + ['season']``, where
        ``cols`` is the notebook-level column list (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        The single row with the largest 'Depth', restricted to `columns`,
        keeping its original index label.
    """
    if columns is None:
        columns = cols[1:] + ['season']
    # na_position='first' keeps rows with a missing depth away from the end of
    # the sort, so they are never mistaken for the deepest sample.
    by_depth = dfgrp.sort_values('Depth', na_position='first')
    return by_depth.iloc[-1:][list(columns)]

# Apply get_bottom_vals to every cast. The result is indexed by
# (Cast, original row label); dropping the second level leaves Cast as index.
bottom_per_cast = df_ctd.groupby('Cast').apply(get_bottom_vals)
df_bottom = bottom_per_cast.droplevel(1)
df_bottom.to_hdf('data/bgc_bottom.h5', key='df_bottom', mode='w', complevel=9) # write data to hdf5 file
df_bottom.head()

Unnamed: 0_level_0,Latitude,Longitude,Date,Depth,Temperature,Salinity,DIC,TA,bottom_depth,Platform_Type,season
Cast,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
15637258.0,44.267502,-63.317699,2013-01-09,159.146027,6.626,32.963001,2114.399902,2218.093262,144,1.0,winter
15637379.0,44.930698,-66.849503,2013-01-14,91.233437,6.349,32.580002,2086.559082,2196.546631,23,1.0,winter
15637792.0,44.267502,-63.317501,2013-01-29,155.67688,9.827,34.460999,2162.613525,2301.360596,144,1.0,winter
15638442.0,44.930698,-66.849503,2013-02-11,100.651962,3.259,32.416,2079.348389,2189.548828,23,1.0,winter
15638623.0,44.268002,-63.319199,2013-02-15,148.242767,6.496,33.110001,2121.177246,2235.505371,144,1.0,winter


## Interpolate data into horizontal fields

### Create grid

In [6]:
# Alternative (unused): take all four bounds from the data extent
# lon_min, lon_max = np.round(df_ctd.Longitude.min()), np.round(df_ctd.Longitude.max())
# lat_min, lat_max = np.round(df_ctd.Latitude.min()), np.round(df_ctd.Latitude.max())

# Lower bounds follow the data (rounded to whole degrees); upper bounds are
# fixed for the NELME region.
lon_min = np.round(df_ctd.Longitude.min())
lat_min = np.round(df_ctd.Latitude.min())
lon_max = -65
lat_max = 45.5
dx = 0.10 # grid resolution in degrees

# The stop value is padded by dx/4 so that np.arange can include the upper bound.
xx = np.arange(lon_min, lon_max + dx/4, dx)
yy = np.arange(lat_min, lat_max + dx/4, dx)
# 2-D coordinate arrays of shape (len(yy), len(xx))
lon, lat = np.meshgrid(xx, yy)

In [7]:
# import hvplot.pandas
# df_grid = pd.DataFrame({'lon':lon.ravel(), 'lat':lat.ravel()})
# df_grid.hvplot.points('lon', 'lat', geo=True, xlabel='longitude', ylabel='latitude',
#                       title='Data grid', tiles=tiles, frame_width=300, size=1)

### Create topography mask

In [8]:
import xarray as xr
from scipy.interpolate import griddata

# Load the topography arrays inside a context manager so the file is closed
# even if a later step fails.
with xr.open_dataset('data/etopo5.nc') as ds_topo:
    topo_x = ds_topo.X.values
    topo_y = ds_topo.Y.values
    topo_bath = ds_topo.bath.values

X = np.where(topo_x>180, topo_x-360, topo_x) # 0:360 -> -180:180
lon_topo, lat_topo = np.meshgrid(X, topo_y)
# Linear interpolation of the topography onto the analysis grid (lon, lat).
# NOTE(review): every point of the topography file is passed to griddata even
# though only the grid region is needed; subsetting first would be cheaper.
grid_topo = griddata((lon_topo.ravel(), lat_topo.ravel()), topo_bath.ravel(), (lon,lat), method='linear')

# True where the interpolated topography lies strictly between -600 and -5
# (assumes bath is elevation in metres, negative below sea level -- TODO confirm).
shelf_mask = (grid_topo<-5) & (grid_topo>-600)
da_mask = xr.DataArray(shelf_mask, coords={'lat':yy, 'lon':xx},
                       attrs = {'long_name':'shelf mask'}, name='SHELF_MASK')

In [9]:
# import hvplot.xarray
# da_topo = xr.DataArray(grid_topo, coords={'lat':yy, 'lon':xx},
#                        attrs = {'long_name':'topography'}, name='Topography')
# da_topo.hvplot(geo=True, coastline='50m', frame_width=300, cmap='kbc', title='Topography')

### Seasonal fields

#### Interpolate surface DIC and TA

In [10]:
# Seasonal surface fields: one interpolated layer per season. The order of
# season_names must match the integer codes in `seasons` (1 = winter ... 4 = fall).
seasons = range(1,5)
season_names = ['winter','spring','summer','fall']
casts_by_season = [df_surface[df_surface.season==name] for name in season_names]

# Linear interpolation of the scattered cast values onto the (lon, lat) grid.
arr_list_dic = [griddata((casts.Longitude, casts.Latitude), casts.DIC, (lon,lat), method='linear')
                for casts in casts_by_season]
arr_list_alk = [griddata((casts.Longitude, casts.Latitude), casts.TA, (lon,lat), method='linear')
                for casts in casts_by_season]

# Stack the per-season 2-D fields along a new leading axis.
dic = np.stack(arr_list_dic, axis=0)
alk = np.stack(arr_list_alk, axis=0)

grid_coords = {'season':seasons, 'lat':yy, 'lon':xx}
da_dic = xr.DataArray(dic, coords=grid_coords, name='DIC',
                      attrs = {'long_name':'dissolved inorganic carbon', 'units':'micro mol/kg'})
da_alk = xr.DataArray(alk, coords=grid_coords, name='TA',
                      attrs = {'long_name':'total alkalinity', 'units':'micro mol/kg'})

created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Seasonal interpolated surface DIC & TA data',
         'seasons':'1 = winter, 2 = spring, 3 = summer, 4 = fall',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
# Keep values only where the shelf mask is True.
ds_surface_seasonal = xr.Dataset({'DIC':da_dic.where(da_mask), 'TA':da_alk.where(da_mask)}, attrs=attrs)
ds_surface_seasonal.to_netcdf('data/bgc_surface_seasonal.nc', mode='w') # write data to netcdf file

#### Interpolate bottom DIC and TA

In [11]:
# Seasonal bottom fields: one interpolated layer per season, in the order
# winter, spring, summer, fall (matching the codes in `seasons`).
casts_by_season = [df_bottom[df_bottom.season==name]
                   for name in ['winter','spring','summer','fall']]

# Linear interpolation of the scattered cast values onto the (lon, lat) grid.
arr_list_dic = [griddata((casts.Longitude, casts.Latitude), casts.DIC, (lon,lat), method='linear')
                for casts in casts_by_season]
arr_list_alk = [griddata((casts.Longitude, casts.Latitude), casts.TA, (lon,lat), method='linear')
                for casts in casts_by_season]

# Stack the per-season 2-D fields along a new leading axis.
dic = np.stack(arr_list_dic, axis=0)
alk = np.stack(arr_list_alk, axis=0)

grid_coords = {'season':seasons, 'lat':yy, 'lon':xx}
da_dic = xr.DataArray(dic, coords=grid_coords, name='DIC',
                      attrs = {'long_name':'dissolved inorganic carbon', 'units':'micro mol/kg'})
da_alk = xr.DataArray(alk, coords=grid_coords, name='TA',
                      attrs = {'long_name':'total alkalinity', 'units':'micro mol/kg'})

created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Seasonal interpolated bottom DIC & TA data',
         'seasons':'1 = winter, 2 = spring, 3 = summer, 4 = fall',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
# Keep values only where the shelf mask is True.
ds_bottom_seasonal = xr.Dataset({'DIC':da_dic.where(da_mask), 'TA':da_alk.where(da_mask)}, attrs=attrs)
ds_bottom_seasonal.to_netcdf('data/bgc_bottom_seasonal.nc', mode='w') # write data to netcdf file

### Monthly fields

#### Interpolate surface DIC and alkalinity

In [12]:
# Monthly surface fields: one interpolated layer per calendar month (1-12).
months = range(1,13)
casts_by_month = [df_surface[df_surface.Date.dt.month==mon] for mon in months]

# Linear interpolation of the scattered cast values onto the (lon, lat) grid.
arr_list_dic = [griddata((casts.Longitude, casts.Latitude), casts.DIC, (lon,lat), method='linear')
                for casts in casts_by_month]
arr_list_alk = [griddata((casts.Longitude, casts.Latitude), casts.TA, (lon,lat), method='linear')
                for casts in casts_by_month]

# Stack the per-month 2-D fields along a new leading axis.
dic = np.stack(arr_list_dic, axis=0)
alk = np.stack(arr_list_alk, axis=0)

grid_coords = {'month':months, 'lat':yy, 'lon':xx}
da_dic = xr.DataArray(dic, coords=grid_coords, name='DIC',
                      attrs = {'long_name':'dissolved inorganic carbon', 'units':'micro mol/kg'})
da_alk = xr.DataArray(alk, coords=grid_coords, name='TA',
                      attrs = {'long_name':'total alkalinity', 'units':'micro mol/kg'})

created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Monthly interpolated surface DIC & TA data',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
# Keep values only where the shelf mask is True.
ds_surface_monthly = xr.Dataset({'DIC':da_dic.where(da_mask), 'TA':da_alk.where(da_mask)}, attrs=attrs)
ds_surface_monthly.to_netcdf('data/bgc_surface_monthly.nc', mode='w') # write data to netcdf file

#### Interpolate bottom DIC and alkalinity

In [13]:
# Monthly bottom fields: one interpolated layer per entry of `months`.
casts_by_month = [df_bottom[df_bottom.Date.dt.month==mon] for mon in months]

# Linear interpolation of the scattered cast values onto the (lon, lat) grid.
arr_list_dic = [griddata((casts.Longitude, casts.Latitude), casts.DIC, (lon,lat), method='linear')
                for casts in casts_by_month]
arr_list_alk = [griddata((casts.Longitude, casts.Latitude), casts.TA, (lon,lat), method='linear')
                for casts in casts_by_month]

# Stack the per-month 2-D fields along a new leading axis.
dic = np.stack(arr_list_dic, axis=0)
alk = np.stack(arr_list_alk, axis=0)

grid_coords = {'month':months, 'lat':yy, 'lon':xx}
da_dic = xr.DataArray(dic, coords=grid_coords, name='DIC',
                      attrs = {'long_name':'dissolved inorganic carbon', 'units':'micro mol/kg'})
da_alk = xr.DataArray(alk, coords=grid_coords, name='TA',
                      attrs = {'long_name':'total alkalinity', 'units':'micro mol/kg'})

created_on = datetime.datetime.now().ctime()
attrs = {'contents':'Monthly interpolated bottom DIC & TA data',
         'history':'Created by Ivan Lima <ilima@whoi.edu> on {}'.format(created_on)}
# Keep values only where the shelf mask is True.
ds_bottom_monthly = xr.Dataset({'DIC':da_dic.where(da_mask), 'TA':da_alk.where(da_mask)}, attrs=attrs)
ds_bottom_monthly.to_netcdf('data/bgc_bottom_monthly.nc', mode='w') # write data to netcdf file