# Apply PyTorch neural network model to CTD data
Created by Ivan Lima on Wed Dec 15 2021 16:11:53 -0500

This version of the neural network model does not include satellite data as input features.

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os, datetime, warnings
# Record when the notebook was last executed (local wall-clock time).
timestamp = datetime.datetime.now().ctime()
print('Last updated on {}'.format(timestamp))

Last updated on Mon Mar  7 10:49:12 2022


In [2]:
# Plot styling: seaborn 'paper' context with the 'ticks' style and tab10 palette,
# plus matplotlib overrides for figure resolution, default size and grid lines.
rc_overrides = {'figure.dpi':100, 'figure.figsize':[5, 5], 'axes.grid':True}
sns.set_theme(context='paper', style='ticks', palette='tab10', rc=rc_overrides)

# Show up to 50 columns when a DataFrame is displayed.
pd.options.display.max_columns = 50

# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')

## Read merged CTD data 

In [3]:
# Input features, in the column order later handed to the scaler and the network
features = ['Depth', 'bottom_depth', 'Temperature', 'Salinity', 'pCO2_monthave']

# Load the merged CTD table and rename the pCO2 columns.
# NOTE(review): 'pCO2_yearhave' looks like a typo for 'pCO2_yearave' (compare
# 'pCO2_monthave'); kept as-is because the name is carried into the output file -- confirm.
pco2_names = {'pCO2_year':'pCO2_yearhave', 'pCO2_month':'pCO2_monthave'}
df_ctd = pd.read_hdf('data/CombinedCTD_600m_1873-2021.h5', key='df_ctd').rename(columns=pco2_names)

# Keep only the rows in which every input feature is present
has_all_features = df_ctd[features].notna().all(axis=1)
df_ctd = df_ctd[has_all_features]

print('Total number of valid CTD points: {:,}\n'.format(len(df_ctd)))
df_ctd.info()

Total number of valid CTD points: 11,976,547

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11976547 entries, 46495 to 12023041
Data columns (total 15 columns):
 #   Column         Dtype         
---  ------         -----         
 0   Cast           float64       
 1   Latitude       float64       
 2   Longitude      float64       
 3   Depth          float64       
 4   Temperature    float64       
 5   Temp_flag      float64       
 6   Salinity       float64       
 7   Salt_flag      float64       
 8   Oxygen         float64       
 9   DO_flag        float64       
 10  Platform_Type  float64       
 11  pCO2_yearhave  float64       
 12  pCO2_monthave  float64       
 13  Date           datetime64[ns]
 14  bottom_depth   int64         
dtypes: datetime64[ns](1), float64(13), int64(1)
memory usage: 1.4 GB


## Load neural network model and data scaler

In [4]:
import torch, joblib
import torch.nn as nn

# Load the feature scaler saved with joblib; it is used below to rescale the inputs.
# NOTE: joblib.load unpickles the file, so only load scaler files from a trusted source.
scaler = joblib.load('models/scaler_nosat.joblib')

# Network dimensions: these have to match the shapes stored in the saved state dict.
n_features = len(features) # number of input variables
n_targets = 2  # number of output variables
n_hidden = 256 # number of units in each hidden layer (layer width, not the number of layers)
learning_rate = 0.001 # learning rate handed to the Adam optimizer

class MLPReg(nn.Module):
    """Fully connected regression network with two hidden layers.

    The input goes through three linear layers. A LeakyReLU is applied after
    the first and the second layer; the output of the third layer is returned
    without an activation.

    Parameters
    ----------
    n_features : int
        Size of each input sample.
    n_hidden : int
        Number of units in each of the two hidden layers.
    n_targets : int
        Size of each output sample.
    """

    def __init__(self, n_features, n_hidden, n_targets):
        super().__init__()
        # The attribute names below become the state-dict keys ('l1.weight', ...),
        # so they have to stay as they are for saved weights to load.
        self.l1 = nn.Linear(n_features, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_hidden)
        self.l3 = nn.Linear(n_hidden, n_targets)
        self.activ = nn.LeakyReLU()

    def forward(self, x):
        """Return the network output for the input tensor ``x``."""
        hidden = self.activ(self.l1(x))
        hidden = self.activ(self.l2(hidden))
        return self.l3(hidden)

# Build the network with the dimensions defined above.
nn_reg = MLPReg(n_features=n_features, n_hidden=n_hidden, n_targets=n_targets) # create model instance
# The loss function and optimizer are created here but not used in the cells shown in this notebook.
loss_func = nn.MSELoss()                                                       # loss function (mean square error)
optimizer = torch.optim.Adam(nn_reg.parameters(), lr=learning_rate)            # optimizer

# Restore the saved weights. map_location='cpu' lets a checkpoint that was saved
# from a GPU be loaded on a machine without CUDA; the model is created on the CPU
# in this notebook, so this is where the weights are needed.
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
state_dict = torch.load('models/nn_reg_nosat_state.pth', map_location='cpu')
nn_reg.load_state_dict(state_dict)

# Switch to evaluation mode; eval() returns the module, so the cell displays its layers.
nn_reg.eval()

MLPReg(
  (l1): Linear(in_features=5, out_features=256, bias=True)
  (l2): Linear(in_features=256, out_features=256, bias=True)
  (l3): Linear(in_features=256, out_features=2, bias=True)
  (activ): LeakyReLU(negative_slope=0.01)
)

## Apply neural network model to CTD data

In [5]:
# Rescale the input features and convert them to a float32 tensor
X_numpy = df_ctd[features].values
X_numpy_scaled = scaler.transform(X_numpy) # rescale features
X = torch.from_numpy(X_numpy_scaled.astype(np.float32)) # convert array to tensor

# Apply the model to the rescaled features in fixed-size chunks of rows.
# A single pass over every row allocates one float32 activation matrix of shape
# (len(X), n_hidden) per hidden layer; chunking caps that at (batch_size, n_hidden).
# Rows are independent, so the predictions are the same up to floating-point rounding.
batch_size = 1_000_000
with torch.no_grad():
    Y_pred = torch.cat([nn_reg(chunk) for chunk in torch.split(X, batch_size)])

# Add estimated DIC & TA to the dataframe; convert explicitly to NumPy instead of
# relying on pandas to coerce a torch tensor.
# NOTE(review): the first rows displayed in the saved output (dated 1958) show DIC/TA around
# 3000, which looks high -- confirm the inputs are within the range the model was trained on.
df_ctd['DIC'] = Y_pred[:, 0].numpy()
df_ctd['TA'] = Y_pred[:, 1].numpy()

df_ctd[features + ['Date','DIC','TA']].head()

Unnamed: 0,Depth,bottom_depth,Temperature,Salinity,pCO2_monthave,Date,DIC,TA
46495,54.0,55,15.77,36.0,315.7,1958-03-01,2879.59668,3238.353271
46496,0.0,55,6.66,32.659,315.7,1958-03-02,3030.841797,3282.488525
46497,0.0,55,5.38,32.569,315.7,1958-03-03,3055.965576,3295.482178
46498,0.0,55,4.555,32.039001,315.7,1958-03-04,3071.268311,3307.949463
46499,0.0,55,4.88,32.519001,315.7,1958-03-05,3066.236816,3300.008545


## Write merged CTD data with DIC & TA to HDF5 file

In [6]:
# Name the output file after the first and last year present in the data
years = df_ctd['Date'].dt.year
yr_min, yr_max = years.min(), years.max()
outfile = 'data/CombinedCTD_600m_bgc_pytorch_{}-{}.h5'.format(yr_min, yr_max)

# mode='w' replaces any existing file of that name; complevel=9 is the highest compression level
print('writing {}'.format(outfile))
df_ctd.to_hdf(outfile, key='df_ctd', mode='w', complevel=9)

writing data/CombinedCTD_600m_bgc_pytorch_1958-2021.h5
