In [20]:
import pandas as pd
import xarray as xr
import numpy as np
from numpy import errstate,isneginf,array
import datetime
import os
import yaml

import matplotlib.pyplot as plt
import cmocean as cm    

#from xgboost import XGBRegressor

In [21]:
#This file contains configuration details like API keys and passwords
global_vars = yaml.safe_load(open('../config.yml', 'r') )

#This has custom functions - log transform
%run './00_custom_functions.ipynb'
#more functions for flux conversions
%run './00_co2_flux_equations.ipynb'

In [22]:
# Base folders, both taken from the loaded configuration
result_folder = global_vars['reconstruction_folder']
data_folder_root = global_vars['download_folder']
print(result_folder)

# Output file type.
# On cloud storage (a 'gs://' root) an ARCO (Analysis-Ready Cloud-Optimized)
# format such as Zarr is recommended over NetCDF; otherwise NetCDF is written.
# NOTE(review): the choice is keyed on data_folder_root, not result_folder — confirm that is intended.
output_file_type = '.zarr' if data_folder_root.startswith('gs://') else '.nc'
print(f'Files will be outputed as: {output_file_type}')

/data/artemis/workspace/afay/LDEO_HPD/data/
Files will be outputed as: .nc


In [5]:
# Processed input files, each given as a path below data_folder_root.
# Note that these were processed such that they already share a coordinate set.
sst_processed  = f'{data_folder_root}SST/processed/SST_NOAA_OI-V2-1x1_198201-202312.nc'
sss_processed  = f'{data_folder_root}SSS/processed/SSS_Met-Office-Hadley-Centre_EN422f-g10-analyses_198201-202312.nc'  # previously 202303
mld_processed  = f'{data_folder_root}MLD/processed/MLD_IFREMER-deBoyer_DT02-c1m-1x1_198201-202312.nc'
chl_processed  = f'{data_folder_root}CHL/processed/CHL_ARI-ST-GlobColour_L3m-GLOB-100-merged-GSM-CHL1_198201-202312.nc'
pco2_processed = f'{data_folder_root}pCO2/processed/fCO2_SOCAT-weighted_198201-202312.nc'  # updated for fco2
xco2_processed = f'{data_folder_root}xCO2/processed/xCO2_NOAA_xCO2-mm-gl-monthly_198201-202312.nc'

# Optional additional sources (uncomment here and in the list below if desired)
# sst_processed_option2 = data_folder_root + 'SST/processed/SST_ECMWF_ERA5-monthly-reanalysis-1x1-SST_198201-202304.nc'
# sst_processed_option3 = data_folder_root + 'SST/processed/SST_JMA_JRA55-do-monthly-reanalysis-SST_198201-202304.nc'
# mld_processed_option2 = data_folder_root + 'MLD/processed/MLD_UCSD-Argo_MLD-dt-mean-1x1_198201-202304.nc'

# Every file in this list is opened and merged into one dataset further down
list_for_df = [
    sst_processed,
    sss_processed,
    mld_processed,
    chl_processed,
    pco2_processed,
    xco2_processed,
    # sst_processed_option2, sst_processed_option3, mld_processed_option2,
]

In [6]:
# Parameters for the ML algorithm that finds the long term pCO2 mean feature

# XGBoost hyperparameters, used both for the pCO2 residual and for creating the
# long term pCO2 mean feature. They were determined via a grid search in
# previous iterations.
best_params = {
    'max_depth': 9,
    'n_estimators': 1000,
}

# Random seed used for training
random_seed = 47

# Number of cores available for model training; -1 means all available ones
jobs = -1

# Features used for the long term pCO2 mean machine learning
feature_sel = [
    'sst', 'sst_anomaly',
    'sss', 'sss_anomaly',
    'chl_log', 'chl_log_anomaly',
    'mld_log',
    'xco2_trend',
    'A', 'B', 'C',
    'T0', 'T1',
]

# Target variable (previously was pco2)
target_sel = ['fco2']

# Name of the SST variable used for calculating the residual component of the target
sst_variable_option = ['sst']

# Create Features
### Base Features

In [7]:
# Open every processed input file and merge them into a single Dataset.
# compat='broadcast_equals' requires variables that appear in more than one
# file to be equal after broadcasting against each other.
xrfull = xr.merge([xr.open_dataset(f) for f in list_for_df], compat='broadcast_equals')

# Drop the global attributes, since they won't be accurate for the merged dataset.
# attrs is a mapping, so reset it with an empty dict; the previous empty string
# only worked because dict("") happens to be empty.
xrfull.attrs = {}
#xrfull

In [9]:
# Derived log features for mld and chl, built with the log_or_0_xr helper
# (presumably defined in 00_custom_functions.ipynb — verify there).
xrfull = xrfull.assign(
    mld_log=log_or_0_xr(xrfull.mld, 'mld_log'),
    chl_log=log_or_0_xr(xrfull.chl, 'chl_log'),
    # mld_argo_log=log_or_0_xr(xrfull.mld_argo, 'mld_argo_log'),
)

In [10]:
%%time
# Add anomaly fields: each value minus the mean of its calendar month over the
# whole time axis.
# Only the SST, SSS and log-CHL anomalies are needed, so subset BEFORE grouping
# rather than computing anomalies for every variable and discarding most of them.
# (extras if the optional sources are enabled: 'sst_era5', 'sst_jra55')
anomaly_source = xrfull[['sst', 'sss', 'chl_log']]
anomalies = anomaly_source.groupby("time.month") - anomaly_source.groupby("time.month").mean("time")

# The groupby arithmetic leaves a 'month' coordinate on the result; remove it.
# Dataset.drop is deprecated, drop_vars is its replacement.
anomalies = anomalies.drop_vars('month')

anomalies = anomalies.rename({
    'sst': 'sst_anomaly',
    'sss': 'sss_anomaly',
    'chl_log': 'chl_log_anomaly',
    # 'sst_era5': 'sst_era5_anomaly',
    # 'sst_jra55': 'sst_jra55_anomaly',
})
#anomalies

# compat='identical' makes the merge raise instead of silently combining
# variables that conflict with what is already in xrfull.
xrfull = xrfull.merge(anomalies, compat='identical')

CPU times: user 3.68 s, sys: 3.48 s, total: 7.16 s
Wall time: 9.27 s


In [11]:
# Time and space helper variables: day of year, plus longitude/latitude in radians
xrfull = xrfull.assign(
    days_idx=xrfull.time.dt.dayofyear,
    lon_rad=np.radians(xrfull.xlon),
    lat_rad=np.radians(xrfull.ylat),
)

# Derived features:
#   T0, T1  - cosine / sine of the day of year, using a 365-day period
#   A, B, C - sine/cosine combinations of latitude and longitude
# NOTE(review): dayofyear reaches 366 in leap years while the period is 365 — confirm this is intended.
xrfull = xrfull.assign(
    T0=np.cos(xrfull.days_idx * 2 * np.pi / 365),
    T1=np.sin(xrfull.days_idx * 2 * np.pi / 365),
    A=np.sin(xrfull.lat_rad),
    B=np.cos(xrfull.lat_rad)*np.sin(xrfull.lon_rad),
    C=-np.cos(xrfull.lat_rad)*np.cos(xrfull.lon_rad),
)

In [12]:
# Bare expression: the notebook displays the dataset so the base features can be inspected
xrfull

## Add Features
#### GOBM Features

In [15]:
# Only one extra file needs adding here (with about 8 variables): the new GOBMs are
# combined with the dataset built so far, since all else is the same.
# The network mask variable is added as well.
# NOTE(review): the file name says 10 models while the note above says 8 vars — confirm which is current.
netmask = xr_open_dataset_custom(f'{data_folder_root}Masks/processed/mask_LEAP_land-sea-network.nc')
gobms = xr_open_dataset_custom(f'{data_folder_root}GOBM/processed/GOBM_GCB-2024_fco2-10-models_198201-202312.nc')

In [16]:
# Bare expression: the notebook displays the GOBM dataset for inspection before merging
gobms

In [1]:
# Attach the GOBM fields first, then the network mask.
# compat='identical' makes either merge raise rather than silently combine
# variables that conflict with ones already present.
xrfull = (
    xrfull
    .merge(gobms, compat='identical')
    .merge(netmask, compat='identical')
)


In [19]:
# Export the full pre-ML dataset into result_folder (may be a large file)
output_xarray_with_date(
    xrfull,
    result_folder,
    'HPD_LEAP_fco2-full-dataset-preML_2024models',
    filetype=output_file_type,
)



Saved HPD_LEAP_fco2-full-dataset-preML_2024models_198201-202312.nc to /data/artemis/workspace/afay/LDEO_HPD/data/
