# Apply PyTorch neural network model to CTD data
Created by Ivan Lima on Fri Dec 10 2021 11:38:55 -0500

In [1]:
import pandas as pd
import numpy as np
import os, datetime, warnings
# Stamp the notebook with the wall-clock time at which this cell was run
run_stamp = datetime.datetime.now().ctime()
print('Last updated on {}'.format(run_stamp))

Last updated on Wed Apr  6 16:30:57 2022


In [2]:
# Show up to 50 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 50)
# Silence every warning for the remainder of the session
warnings.filterwarnings(action='ignore')

## Read merged CTD & satellite data 

In [3]:
# Predictor columns, in the order in which they are handed to the scaler and the network
features = [
    'Depth', 'bottom_depth', 'Temperature', 'Salinity',
    'pCO2_monthave', 'SLA', 'SST_hires', 'log_KD490',
]

# Read the merged table, append the log-transformed KD490 column and rename the pCO2 columns
df_ctd = (
    pd.read_hdf('data/CombinedCTD_satellite_1981-2021.h5', key='df_ctd_sat')
    .assign(log_KD490=lambda tbl: np.log(tbl.KD490))
    .rename(columns={'pCO2_year':'pCO2_yearhave', 'pCO2_month':'pCO2_monthave'})
)

# Keep only the rows in which every predictor is present
has_all_features = df_ctd[features].notnull().all(axis=1)
df_ctd = df_ctd[has_all_features]

print('Total number of valid CTD points: {:,}\n'.format(len(df_ctd)))
df_ctd.info()

Total number of valid CTD points: 11,015,435

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11015435 entries, 3298535 to 14539801
Data columns (total 21 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Cast           float64       
 1   Latitude       float64       
 2   Longitude      float64       
 3   Date           datetime64[ns]
 4   Depth          float64       
 5   Temperature    float64       
 6   Salinity       float64       
 7   Oxygen         float64       
 8   pCO2_yearhave  float64       
 9   pCO2_monthave  float64       
 10  Platform_Type  float64       
 11  Temp_flag      float64       
 12  Salt_flag      float64       
 13  DO_flag        float64       
 14  bottom_depth   int64         
 15  SLA            float64       
 16  SST            float64       
 17  SST_hires      float64       
 18  Chl            float64       
 19  KD490          float64       
 20  log_KD490      float64       
dtypes: datetime64[ns](1), f

## Load neural network model and data scaler

In [4]:
import torch, joblib
import torch.nn as nn

# Load the feature scaler saved with joblib; it rescales the inputs before they reach the network.
# NOTE(review): joblib.load unpickles the file, so only load files from a trusted source.
scaler = joblib.load('models/scaler.joblib')

n_features = 8 # number of input variables
n_targets = 2  # number of output variables
n_hidden = 256 # number of units in each hidden layer (the network has two hidden layers)
learning_rate = 0.001 # step size passed to the Adam optimizer

class MLPReg(nn.Module):
    """Fully connected regression network with two hidden layers.

    Layout: Linear -> LeakyReLU -> Linear -> LeakyReLU -> Linear.

    The attribute names (l1, l2, l3) prefix the parameter names in the state
    dict, so they must stay as they are for saved weights to load.
    """

    def __init__(self, n_features, n_hidden, n_targets):
        """Create the layers.

        n_features: size of each input sample.
        n_hidden:   number of units in each of the two hidden layers.
        n_targets:  size of each output sample.
        """
        super().__init__()
        self.l1 = nn.Linear(n_features, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_hidden)
        self.l3 = nn.Linear(n_hidden, n_targets)
        self.activ = nn.LeakyReLU()  # one activation module, reused after l1 and after l2

    def forward(self, x):
        """Return the network output for the input tensor x (no activation on the last layer)."""
        hidden = self.activ(self.l1(x))
        hidden = self.activ(self.l2(hidden))
        return self.l3(hidden)

# The layer sizes must match those of the saved state dict; otherwise load_state_dict
# raises a size-mismatch error.
nn_reg = MLPReg(n_features=n_features, n_hidden=n_hidden, n_targets=n_targets) # create model instance
loss_func = nn.MSELoss()                                                       # loss function (mean square error)
optimizer = torch.optim.Adam(nn_reg.parameters(), lr=learning_rate)            # optimizer
# NOTE(review): loss_func and optimizer are not referenced by the inference cells of this notebook.

# map_location='cpu' lets a checkpoint that was saved from a GPU be read on a machine
# without CUDA; the model is applied to CPU tensors in the cells that follow.
nn_reg.load_state_dict(torch.load('models/nn_reg_noO2_state.pth', map_location='cpu'))
nn_reg.eval() # switch to evaluation mode; as the last expression it also displays the layer layout

MLPReg(
  (l1): Linear(in_features=8, out_features=256, bias=True)
  (l2): Linear(in_features=256, out_features=256, bias=True)
  (l3): Linear(in_features=256, out_features=2, bias=True)
  (activ): LeakyReLU(negative_slope=0.01)
)

## Apply neural network model to CTD data

In [5]:
# Build the input matrix with the columns in the order given by `features`
X_numpy = df_ctd[features].values
X_numpy_scaled = scaler.transform(X_numpy) # rescale features
X = torch.from_numpy(X_numpy_scaled.astype(np.float32)) # convert array to tensor

# apply model to rescaled features (no_grad: gradients are not needed for inference)
with torch.no_grad():
    Y_pred = nn_reg(X)

# add estimated DIC & TA to dataframe
# Convert each tensor column to a NumPy array explicitly instead of relying on pandas
# to coerce a torch.Tensor; output column 0 is stored as DIC and column 1 as TA.
df_ctd['DIC'] = Y_pred[:,0].numpy()
df_ctd['TA'] = Y_pred[:,1].numpy()

df_ctd[features +['DIC','TA']].head()

Unnamed: 0,Depth,bottom_depth,Temperature,Salinity,pCO2_monthave,SLA,SST_hires,log_KD490,DIC,TA
3298535,1.98435,158,8.57,32.401001,375.69,0.004074,9.777556,-2.493288,2062.703613,2242.923584
3298536,2.976518,158,7.91,32.492001,375.69,0.004074,9.777556,-2.493288,2062.651611,2242.545654
3298537,3.968682,158,7.74,32.466,375.69,0.004074,9.777556,-2.493288,2062.236572,2241.744629
3298538,4.96084,158,7.64,32.556999,375.69,0.004074,9.777556,-2.493288,2064.441895,2243.039307
3298539,5.952994,158,7.57,32.673,375.69,0.004074,9.777556,-2.493288,2067.389893,2244.98291


## Write merged CTD & satellite data with DIC & TA to HDF5 file

In [6]:
# The output file name carries the first and last calendar year found in the Date column
yr_min = df_ctd['Date'].dt.year.min()
yr_max = df_ctd['Date'].dt.year.max()
outfile = 'data/CombinedCTD_satellite_bgc_pytorch_{}-{}.h5'.format(yr_min, yr_max)
print('writing {}'.format(outfile))
# mode='w' opens the file for writing from scratch; complevel=9 requests compression level 9
df_ctd.to_hdf(outfile, key='df_ctd', mode='w', complevel=9)

writing data/CombinedCTD_satellite_bgc_pytorch_2002-2019.h5


## Use 1/4-degree SST instead of high-resolution SST

In [7]:
# Same predictors as `features`, except that 'SST' takes the place of 'SST_hires'
features2 = [
    'Depth', 'bottom_depth', 'Temperature', 'Salinity',
    'pCO2_monthave', 'SLA', 'SST', 'log_KD490',
]

# Re-read the merged table, append the log-transformed KD490 column and rename the pCO2 columns
df_ctd = (
    pd.read_hdf('data/CombinedCTD_satellite_1981-2021.h5', key='df_ctd_sat')
    .assign(log_KD490=lambda tbl: np.log(tbl.KD490))
    .rename(columns={'pCO2_year':'pCO2_yearhave', 'pCO2_month':'pCO2_monthave'})
)

# Keep only the rows in which every predictor in features2 is present
has_all_features = df_ctd[features2].notnull().all(axis=1)
df_ctd = df_ctd[has_all_features]

print('Total number of valid CTD points: {:,}\n'.format(len(df_ctd)))

Total number of valid CTD points: 11,546,497



In [8]:
# Build the input matrix with the columns in the order given by `features2`
X_numpy = df_ctd[features2].values
X_numpy_scaled = scaler.transform(X_numpy) # rescale features
X = torch.from_numpy(X_numpy_scaled.astype(np.float32)) # convert array to tensor
with torch.no_grad(): # apply model to rescaled features
    Y_pred = nn_reg(X)

# add estimated DIC & TA to dataframe
# Convert each tensor column to a NumPy array explicitly instead of relying on pandas
# to coerce a torch.Tensor; output column 0 is stored as DIC and column 1 as TA.
df_ctd['DIC'] = Y_pred[:,0].numpy()
df_ctd['TA'] = Y_pred[:,1].numpy()

# The output file name carries the first and last calendar year found in the Date column
yr_min, yr_max = df_ctd.Date.dt.year.min(), df_ctd.Date.dt.year.max()
outfile = 'data/CombinedCTD_satellite_bgc_pytorch_{}-{}.h5'.format(yr_min, yr_max)
print('writing {}'.format(outfile))
df_ctd.to_hdf(outfile, key='df_ctd', mode='w', complevel=9)

writing data/CombinedCTD_satellite_bgc_pytorch_1998-2019.h5
