# Training Data Preparation for Random Forest

In [2]:
# Import Packages

#Data Structure/Organization Packages
import pandas as pd
import xarray as xr
import numpy as np
import numpy.ma as ma

# Plotting Packages
import matplotlib.pyplot as plt
import seaborn

# Machine Learning/Statistics Packages
from scipy.stats import kde
from scipy.stats import iqr
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import xgboost as xgb
from pygam import LinearGAM, l, s, f
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import mean_squared_error

# Utility Packages
import random as rand
import os
import math
import dataframe_image as dfi
import itertools
from scipy import interpolate

# Oceanography Packages
import gsw
import PyCO2SYS as pyco2

import sogos_module as sg

[Ship Data Link](https://uwnetid-my.sharepoint.com/:f:/g/personal/sangsong_uw_edu/ErLtPwS6pdZClgo0Flp9lq8BoNBWNN6mE2LJuJJVmt874A?e=qHB9qp)

[Float Data Link](https://uwnetid-my.sharepoint.com/:f:/g/personal/sangsong_uw_edu/Es-ESkVfIlpHhpFq7o5LTaoBk6IOFXMqQ8wFCgLMA512uw?e=7cWT6J)


In [3]:
# Set up directories
float_directory = "/Users/sangminsong/OneDrive - UW/Code/SOGOS/data/float/"
ship_filepath = "/Users/sangminsong/OneDrive - UW/Code/SOGOS/data/ship/bottle_data.csv"


## Ship Data

In [4]:
# Load Ship Data (Already QC Filtered) into Pandas Dataframe
ship_DF = pd.read_csv(ship_filepath)
ship_DF["Julian Day"] = pd.to_datetime(ship_DF["Julian Day"].values/1440, unit='D', origin=pd.Timestamp('1980-01-01'))
ship_DF['Yearday'] = sg.datetime2ytd(ship_DF['Julian Day'])

In [15]:
ship_DF

Unnamed: 0,Latitude,Longitude,Julian Day,Bottle Depth,Pressure,Temperature,Salinity,Oxygen,Nitrate,DIC,Alkalinity,pH,Yearday
0,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,522.0,0.7214,34.6535,210.8,32.03,2252.1,2348.73,7.5811,105.379861
1,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,505.4,0.7200,34.6525,211.0,32.02,,,,105.379861
2,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,490.7,0.7193,34.6523,211.0,31.94,2250.7,2346.14,7.5831,105.379861
3,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,439.7,0.7221,34.6491,211.1,32.01,,,,105.379861
4,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,388.8,0.4779,34.5828,221.2,32.09,2250.4,2339.87,7.5826,105.379861
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1817,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,97.7,20.1644,35.6228,220.7,0.26,2043.3,2332.37,8.0089,130.749306
1818,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,73.9,20.1626,35.6230,220.8,0.25,2044.6,2331.96,8.0090,130.749306
1819,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,48.2,20.1614,35.6224,220.8,0.27,2046.8,2332.79,8.0090,130.749306
1820,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,33.1,20.1627,35.6224,221.3,0.25,2045.6,2332.62,8.0100,130.749306


## Float Data and Corrections

In [5]:
floatDSdict = {}
for filename in os.listdir(float_directory):
    fp = float_directory + filename
    DS = xr.open_dataset(fp, decode_times=False)
    floatDSdict[filename[0:7]] = DS

In [6]:
## Check float quality control
flag = 0
for floatnum in floatDSdict:
    for prof in floatDSdict[floatnum]["PARAMETER_DATA_MODE"]:
        data_modes = np.concatenate((prof.values[-2:],prof.values[0:4]))
        if np.all((data_modes == b'D') | (data_modes == b'A')) == False:
            flag = 1
if flag == 0:
    print("Data Mode is all D or A! All Good!")

Data Mode is all D or A! All Good!


In [7]:
def make_float_DF(floatDSdict):
    """
    Create a dictionary with all necesary data fields from Argo.

    # We Want the following data: 
    # Data Type, Date Update, LATITUDE, LONGITUDE, Position QC, PRES_ADJUSTED, PRES_ADJUSTED_QC,
    # TEMP_ADJUSTED, TEMP_ADJUSTED_QC, PSAL_ADJUSTED, PSAL_ADJUSTED_QC, DOXY_ADJUSTED, DOXY_ADJUSTED_QC
    # CHLA_ADJUSTED, CHLA_ADJUSTED_QC, NITRATE_ADJUSTED, NITRATE_ADJUSTED_QC

    # Data Type is structured differently so we will process the QC in a different way.
    """
    
    TrainingData = {
        # "Serial Number" : [],
        # "Data Mode": [],
        "Julian Day" : [],
        "Julian Day QC" : [],
        "Latitude" : [],
        "Longitude" : [],
        "Position QC" : [],
        "Pressure" : [],
        "Pressure QC": [],
        "Temperature Adjusted" : [],
        "Temperature Adjusted QC" : [],
        "Temperature" : [],
        "Temperature QC" : [],
        "Salinity Adjusted" : [],
        "Salinity Adjusted QC" : [],
        "Salinity" : [],
        "Salinity QC" : [],
        "Oxygen" : [],
        "Oxygen QC" : [],
        "Nitrate": [],
        "Nitrate QC": [],
        "pH": [],
        "pH QC": []
    }

    # Iterate through all floats in float dictionary
    for floatnum in floatDSdict:

        # Establish empty lists to append data to
        # float_serial = []
        # data_mode = []
        float_juld = []
        juld_QC = []
        float_lat = []
        float_lon = []
        position_QC = []
        float_pres = []
        pres_QC = []
        float_temp_adj = []
        temp_adj_QC = []
        float_temp = []
        temp_QC = []
        float_sal = []
        sal_QC = []
        float_sal_adj = []
        sal_adj_QC = []
        float_ox = []
        ox_QC = []
        nit = []
        nit_QC = []
        pH = []
        pH_QC = []
        # Iterate through every profile of a specific float
        for iprof in floatDSdict[floatnum].N_PROF.values:
            # Append Pressure, Temperature, P Salinity, Dis Oxygen, Nitrate, and pH, as well as associated QC
            float_pres = float_pres + list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values)
            pres_QC = pres_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].PRES_ADJUSTED_QC[iprof].values]
            float_temp_adj = float_temp_adj + list(floatDSdict[floatnum].TEMP_ADJUSTED[iprof].values)
            temp_adj_QC = temp_adj_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].TEMP_ADJUSTED_QC[iprof].values]
            float_temp = float_temp + list(floatDSdict[floatnum].TEMP[iprof].values)
            temp_QC = temp_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].TEMP_QC[iprof].values]
            float_sal_adj = float_sal_adj + list(floatDSdict[floatnum].PSAL_ADJUSTED[iprof].values)
            sal_adj_QC = sal_adj_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].PSAL_ADJUSTED_QC[iprof].values]
            float_sal = float_sal + list(floatDSdict[floatnum].PSAL[iprof].values)
            sal_QC = sal_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].PSAL_QC[iprof].values]
            float_ox = float_ox + list(floatDSdict[floatnum].DOXY_ADJUSTED[iprof].values)
            ox_QC = ox_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].DOXY_ADJUSTED_QC[iprof].values]
            nit = nit + list(floatDSdict[floatnum].NITRATE_ADJUSTED[iprof].values)
            nit_QC = nit_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].NITRATE_ADJUSTED_QC[iprof].values]
            pH = pH + list(floatDSdict[floatnum].PH_IN_SITU_TOTAL_ADJUSTED[iprof].values)
            pH_QC = pH_QC + [int(i.decode("utf-8")) if type(i) != float else np.nan for i in floatDSdict[floatnum].PH_IN_SITU_TOTAL_ADJUSTED_QC[iprof].values]
            # print(float_pres)

            # Append Julian Day, lat, lon, and QC. Values stay the same for whole profile
            # float_serial_temp = np.array2string(floatDSdict[floatnum].FLOAT_SERIAL_NO[iprof].values)
            float_juld_temp = pd.to_datetime(floatDSdict[floatnum].JULD.values[iprof], unit='D', origin=pd.Timestamp('1950-01-01'))
            juld_QC_temp = int(floatDSdict[floatnum].JULD_QC.values[iprof])
            float_lat_temp = float(floatDSdict[floatnum].LATITUDE.values[iprof])
            float_lon_temp = float(floatDSdict[floatnum].LONGITUDE.values[iprof])
            position_QC_temp = int(floatDSdict[floatnum].POSITION_QC.values[iprof])
            data_mode_temp = [str(i.decode("utf-8")) for i in floatDSdict[floatnum].PARAMETER_DATA_MODE[iprof].values]
            # print(data_mode_temp)
            # print(position_QC_temp)

            # Duplicate values to be of the right array size since values stay the same for whole profile
            # float_serial = [float_serial_temp] * len(float_pres)
            float_juld = float_juld + ([float_juld_temp] * len(list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values)))
            juld_QC = juld_QC + ([juld_QC_temp] * len(list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values)))
            float_lat = float_lat + ([float_lat_temp] * len(list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values)))
            float_lon = float_lon + ([float_lon_temp] * len(list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values)))
            position_QC = position_QC + ([position_QC_temp] * len(list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values)))
            # data_mode = data_mode + [data_mode_temp] * len(list(floatDSdict[floatnum].PRES_ADJUSTED[iprof].values))
        print("float", floatnum, "completed")

        # Add to overall training data dictionary
        # TrainingData["Serial Number"] = TrainingData["Serial Number"] + float_serial
        # TrainingData["Data Mode"] = TrainingData["Data Mode"] + data_mode
        TrainingData["Julian Day"] = TrainingData["Julian Day"] + float_juld
        TrainingData["Julian Day QC"] = TrainingData["Julian Day QC"] + juld_QC
        TrainingData["Latitude"] = TrainingData["Latitude"] + float_lat
        TrainingData["Longitude"] = TrainingData["Longitude"] + float_lon
        TrainingData["Position QC"] = TrainingData["Position QC"] + position_QC
        TrainingData["Pressure"] = TrainingData["Pressure"] + float_pres
        TrainingData["Pressure QC"] = TrainingData["Pressure QC"] + pres_QC
        TrainingData["Temperature Adjusted"] = TrainingData["Temperature Adjusted"] + float_temp_adj
        TrainingData["Temperature Adjusted QC"] = TrainingData["Temperature Adjusted QC"] + temp_adj_QC
        TrainingData["Temperature"] = TrainingData["Temperature"] + float_temp
        TrainingData["Temperature QC"] = TrainingData["Temperature QC"] + temp_QC
        TrainingData["Salinity Adjusted"] = TrainingData["Salinity Adjusted"] + float_sal_adj
        TrainingData["Salinity Adjusted QC"] = TrainingData["Salinity Adjusted QC"] + sal_adj_QC
        TrainingData["Salinity"] = TrainingData["Salinity"] + float_sal
        TrainingData["Salinity QC"] = TrainingData["Salinity QC"] + sal_QC
        TrainingData["Oxygen"] = TrainingData["Oxygen"] + float_ox
        TrainingData["Oxygen QC"] = TrainingData["Oxygen QC"] + ox_QC
        TrainingData["Nitrate"] = TrainingData["Nitrate"] + nit
        TrainingData["Nitrate QC"] = TrainingData["Nitrate QC"] + nit_QC
        TrainingData["pH"] = TrainingData["pH"] + pH
        TrainingData["pH QC"] = TrainingData["pH QC"] + pH_QC   
    # Convert dictionary into pandas dataframe object
    TrainingDF = pd.DataFrame.from_dict(TrainingData)

    #QC for Pressure, Position, Temperature, and Salinity, Nitrate, Oxygen, pH
    # 1: Good data
    # 2: Probably good data
    # 8: Estimated
    # A: real time adjusted values
    # D: Delayed mode
    # Taken from https://argo.ucsd.edu/data/how-to-use-argo-files/

    TrainingDF = TrainingDF[(TrainingDF["Position QC"] == 1) | (TrainingDF["Position QC"] == 2)]
    TrainingDF = TrainingDF[(TrainingDF["Julian Day QC"] == 1) | (TrainingDF["Julian Day QC"] == 2)]
    TrainingDF = TrainingDF[(TrainingDF["Pressure QC"] == 1) | (TrainingDF["Pressure QC"] == 2)]
    TrainingDF = TrainingDF[(TrainingDF["Temperature QC"] == 1) | (TrainingDF["Temperature QC"] == 2) | (TrainingDF["Temperature QC"] == 8)]
    TrainingDF = TrainingDF[(TrainingDF["Salinity QC"] == 1) | (TrainingDF["Salinity QC"] == 2) | (TrainingDF["Salinity QC"] == 8)]
    TrainingDF = TrainingDF[(TrainingDF["Oxygen QC"] == np.nan) | ((TrainingDF["Oxygen QC"] == 1) | (TrainingDF["Oxygen QC"] == 2))]
    TrainingDF = TrainingDF[(TrainingDF["Nitrate QC"] == 1) | (TrainingDF["Nitrate QC"] == 2)]
    TrainingDF = TrainingDF[(TrainingDF["pH QC"] == 1) | (TrainingDF["pH QC"] == 2)]

    # Spatial Bounds
    # TrainingDF = TrainingDF[(TrainingDF["Longitude"] < 33) & (TrainingDF["Longitude"] > 27)]

    return TrainingDF

In [8]:
float_DF = make_float_DF(floatDSdict)
float_DF["Salinity Adjusted"] = gsw.SA_from_SP(float_DF["Salinity Adjusted"],float_DF["Pressure"],float_DF["Longitude"],float_DF["Latitude"]).values
float_DF["Temperature Adjusted"] = gsw.CT_from_t(float_DF["Salinity Adjusted"], float_DF["Temperature Adjusted"], float_DF["Pressure"])

float_DF['Yearday'] = sg.datetime2ytd(float_DF['Julian Day'])
float_DF.head()

float 5906036 completed
float 5906030 completed
float 5906031 completed
float 5906034 completed
float 5906035 completed
float 5906033 completed
float 5906032 completed


Unnamed: 0,Julian Day,Julian Day QC,Latitude,Longitude,Position QC,Pressure,Pressure QC,Temperature Adjusted,Temperature Adjusted QC,Temperature,...,Salinity Adjusted QC,Salinity,Salinity QC,Oxygen,Oxygen QC,Nitrate,Nitrate QC,pH,pH QC,Yearday
2,2019-05-11 07:44:49.000814848,1,-35.025,29.988,1,7.800000,1.0,21.239601,1.0,21.266001,...,1.0,35.581001,1.0,220.721359,1.0,-0.594459,1.0,8.070192,1.0,130.322789
4,2019-05-11 07:44:49.000814848,1,-35.025,29.988,1,11.800000,1.0,21.238828,1.0,21.266001,...,1.0,35.581001,1.0,220.629868,1.0,-0.747419,1.0,8.069442,1.0,130.322789
6,2019-05-11 07:44:49.000814848,1,-35.025,29.988,1,15.800000,1.0,21.230985,1.0,21.259001,...,1.0,35.583000,1.0,220.523651,1.0,-0.743400,1.0,8.069372,1.0,130.322789
9,2019-05-11 07:44:49.000814848,1,-35.025,29.988,1,21.799999,1.0,21.193513,1.0,21.223000,...,1.0,35.591999,1.0,220.215622,1.0,-0.763693,1.0,8.067467,1.0,130.322789
11,2019-05-11 07:44:49.000814848,1,-35.025,29.988,1,25.799999,1.0,21.175701,1.0,21.205999,...,1.0,35.592999,1.0,219.909927,1.0,-0.769277,1.0,8.066362,1.0,130.322789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298708,2021-10-21 18:49:44.002002176,1,-49.326,90.346,1,1600.020020,1.0,2.334355,1.0,2.444800,...,1.0,34.667801,1.0,172.802490,1.0,32.603031,1.0,7.839884,1.0,1024.784537
298709,2021-10-21 18:49:44.002002176,1,-49.326,90.346,1,1700.199951,1.0,2.250816,1.0,2.368700,...,1.0,34.695599,1.0,176.090378,1.0,32.125603,1.0,7.846263,1.0,1024.784537
298710,2021-10-21 18:49:44.002002176,1,-49.326,90.346,1,1800.000000,1.0,2.198804,1.0,2.324600,...,1.0,34.722301,1.0,180.613174,1.0,31.137655,1.0,7.857783,1.0,1024.784537
298711,2021-10-21 18:49:44.002002176,1,-49.326,90.346,1,1899.449951,1.0,2.119027,1.0,2.252200,...,1.0,34.734402,1.0,183.152924,1.0,31.005705,1.0,7.858600,1.0,1024.784537


In [None]:
# Optional save to file
# file_path = '/Users/sangminsong/OneDrive - UW/Code/SOGOS/gridded-vars/'
# float_DF.to_csv(file_path + 'FloatData_DF.csv')
# ship_DF.to_csv(file_path + 'ShipData_DF.csv')

In [9]:
ship_DF

Unnamed: 0,Latitude,Longitude,Julian Day,Bottle Depth,Pressure,Temperature,Salinity,Oxygen,Nitrate,DIC,Alkalinity,pH,Yearday
0,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,522.0,0.7214,34.6535,210.8,32.03,2252.1,2348.73,7.5811,105.379861
1,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,505.4,0.7200,34.6525,211.0,32.02,,,,105.379861
2,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,490.7,0.7193,34.6523,211.0,31.94,2250.7,2346.14,7.5831,105.379861
3,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,439.7,0.7221,34.6491,211.1,32.01,,,,105.379861
4,-68.3422,31.5350,2019-04-16 09:07:00.000000000,524,388.8,0.4779,34.5828,221.2,32.09,2250.4,2339.87,7.5826,105.379861
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1817,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,97.7,20.1644,35.6228,220.7,0.26,2043.3,2332.37,8.0089,130.749306
1818,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,73.9,20.1626,35.6230,220.8,0.25,2044.6,2331.96,8.0090,130.749306
1819,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,48.2,20.1614,35.6224,220.8,0.27,2046.8,2332.79,8.0090,130.749306
1820,-38.4994,30.0015,2019-05-11 17:58:59.999999744,4301,33.1,20.1627,35.6224,221.3,0.25,2045.6,2332.62,8.0100,130.749306


## Combine into full training dataset

In [None]:
## Combine float and ship data
TrainingDF = float_DF[["Latitude", "Longitude", "Julian Day", "Pressure", "Temperature Adjusted", "Salinity Adjusted", "Oxygen", "Nitrate", "pH"]]
ship_DF = ship_DF[["Latitude", "Longitude", "Julian Day", "Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate", "pH"]]
TrainingDF = pd.concat([TrainingDF, ship_DF], ignore_index=True)

# Limiting pressure to 2000
TrainingDF = TrainingDF[TrainingDF["Pressure"] < 2000]
TrainingDF = TrainingDF.reset_index(drop = True)

TrainingDF.head()


# Add Optional Training Fields

In [None]:
# Add sigma variables and N squared
TrainingDF["Sigma 0"] = gsw.sigma0(TrainingDF["Salinity"].values, TrainingDF["Temperature"].values)
TrainingDF["Sigma 1"] = gsw.sigma1(TrainingDF["Salinity"].values, TrainingDF["Temperature"].values)
TrainingDF["Sigma 2"] = gsw.sigma2(TrainingDF["Salinity"].values, TrainingDF["Temperature"].values)
TrainingDF["N_Squared"] = np.nan
TrainingDF["N_Squared_v2"] = np.nan
TrainingDF["Profile"] = np.nan

In [None]:
# Last Drop NA Check
TrainingDF = TrainingDF.dropna()
TrainingDF = TrainingDF.reset_index(drop = True)
TrainingDF

# may need to put one step here with unique dates


# Checking data distributions

In [None]:
# Pair Wise Plots
PWDF = TrainingDF[["Decimal Date", "Inverted Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate"]]
seaborn.pairplot(PWDF)
# plt.savefig("PairPlot.png")
plt.show()

# Random Forest Example

In [None]:
# Split data 80/10/10.
# Better way to do this like in class with reshuffle method


training = pd.DataFrame()
test = pd.DataFrame()
validation = pd.DataFrame()

profile_numbers = pd.DataFrame(list(range(1, profile_num)))
training_index = profile_numbers.sample(frac=.80, random_state=1)
validation_index = profile_numbers.drop(training_index.index)
test_index = validation_index.sample(frac=0.5, random_state=1)
validation_index = validation_index.drop(test_index.index)
counter = 0
for i in list(training_index[0].values):
    if counter == 0:
        training = TrainingDF[TrainingDF["Profile"] == float(i)]
    else:
        training = training.append(TrainingDF[TrainingDF["Profile"] == float(i)])
    counter = counter + 1
counter = 0
for i in list(validation_index[0].values):
    if counter == 0:
        validation = TrainingDF[TrainingDF["Profile"] == float(i)]
    else:
        validation = validation.append(TrainingDF[TrainingDF["Profile"] == float(i)])
    counter = counter + 1
counter = 0
for i in list(test_index[0].values):
    if counter == 0:
        test = TrainingDF[TrainingDF["Profile"] == float(i)]
    else:
        test = test.append(TrainingDF[TrainingDF["Profile"] == float(i)])
    counter = counter + 1


: 

In [None]:
training.head()
training.to_csv(file_path + 'SplitTraining_FloatShip_DF.csv')
test.to_csv(file_path + 'SplitTest_FloatShip_DF.csv')
validation.to_csv(file_path + 'SplitValidation_FloatShip_DF.csv')

One issue -- spatiotemporal aliasing if variables are correlated?

Can you pass latitude/longitude as parameters?

In [None]:
# Random Forest Model

ntrees = 1000
Mdl = RandomForestRegressor(ntrees, max_features=1/3, oob_score = True, random_state = 0)
variable = 'pH'

if variable == "pH":
    #all
    # Create X Variables for each subset of data.
    X_training = training[["Latitude", "Longitude", "Season(sin)", "Season(cos)","Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate", "N_Squared_v2"]].to_numpy()
    X_validation = validation[["Latitude", "Longitude", "Season(sin)", "Season(cos)","Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate", "N_Squared_v2"]].to_numpy()
    X_test = test[["Latitude", "Longitude", "Season(sin)", "Season(cos)","Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate", "N_Squared_v2"]].to_numpy()

    Y_training = training[variable].to_numpy()
    Y_validation = validation[variable].to_numpy()
    Y_test = test[variable].to_numpy()
    

# Fit the model to the training subset of data
Mdl.fit(X_training, Y_training)

In [None]:
# Predict Y (pH) in each subset of data (test at the end of development)
Y_pred_training = Mdl.predict(X_training)
Y_pred_validation = Mdl.predict(X_validation)
Y_pred_test = Mdl.predict(X_test)

# Create Error Metrics 
AE_RF_training = Y_pred_training - Y_training
IQR_RF_training = iqr(abs(AE_RF_training))

AE_RF_validation = Y_pred_validation - Y_validation
IQR_RF_validation = iqr(abs(AE_RF_validation))

AE_RF_test = Y_pred_test - Y_test
IQR_RF_test = iqr(abs(AE_RF_test))

RF_dict = {}
RF_dict["Model"] = ["Training", "Validation"]
RF_dict["Median Absolute Error"] = [np.nanmedian(abs(AE_RF_training)), np.nanmedian(abs(AE_RF_validation))]
RF_dict["IQR"] = [IQR_RF_training, IQR_RF_validation]
RF_DF = pd.DataFrame.from_dict(RF_dict)
RF_DF

In [14]:
# temp check for song
ds = xr.open_dataset('/Users/sangminsong/Library/CloudStorage/OneDrive-UW/Code/SOGOS/325020190403_bottle.nc')
ds.bottle_salinity