# Training Data Preparation for Random Forest

In [None]:
# Import Packages

#Data Structure/Organization Packages
import pandas as pd
import xarray as xr
import numpy as np
import numpy.ma as ma

# Plotting Packages
import matplotlib.pyplot as plt
import seaborn

# Machine Learning/Statistics Packages
from scipy.stats import kde
from scipy.stats import iqr
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import xgboost as xgb
from pygam import LinearGAM, l, s, f
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import mean_squared_error

# Utility Packages
import random as rand
import os
import math
import dataframe_image as dfi
import itertools
from scipy import interpolate

# Oceanography Packages
import gsw
import PyCO2SYS as pyco2

import sgmod_main as sg

## Ship Data

In [None]:
# Read the ship bottle data (already QC filtered upstream) into a pandas DataFrame
ship_fp = "/Users/sangminsong/OneDrive - UW/Code/SOGOS/data/ship/bottle_data.csv"
ship_DF = pd.read_csv(ship_fp)

# Replace the numeric "Julian Day" column with timestamps: the raw values are
# divided by 1440 and then read as days elapsed since 1980-01-01
# (presumably the raw values are minutes since that date -- TODO confirm).
ship_DF["Julian Day"] = pd.to_datetime(
    ship_DF["Julian Day"].values/1440,
    unit='D',
    origin=pd.Timestamp('1980-01-01'),
)

# Year-day column computed by the project helper sg.datetime2ytd from the timestamps
ship_DF['Yearday'] = sg.datetime2ytd(ship_DF['Julian Day'])

## Float Data and Corrections

In [None]:
# Open every float file in the float directory and store the datasets in a
# dictionary keyed by the first seven characters of the file name
# (presumably the float's WMO number -- TODO confirm).
floatdir = "/Users/sangminsong/OneDrive - UW/Code/SOGOS/data/float/"
floatDSdict = {}
# os.listdir returns entries in arbitrary order and also lists hidden files
# (e.g. macOS ".DS_Store") and sub-directories. Sort for a reproducible float
# order and skip anything that is not a regular, visible file so that
# xr.open_dataset is only handed data files.
for filename in sorted(os.listdir(floatdir)):
    fp = os.path.join(floatdir, filename)
    if filename.startswith(".") or not os.path.isfile(fp):
        continue
    # decode_times=False leaves the time variables as raw numbers
    DS = xr.open_dataset(fp, decode_times=False)
    floatDSdict[filename[0:7]] = DS

In [None]:
## Float Corrections
# Sanity check: for every profile of every float, the selected entries of
# PARAMETER_DATA_MODE must be b'D' (delayed mode) or b'A' (real-time adjusted).
# `flag` stays 0 when every profile passes and is set to 1 otherwise.
flag = 0
bad_floats = []  # floats with at least one profile that fails the check
for floatnum in floatDSdict:
    for prof in floatDSdict[floatnum]["PARAMETER_DATA_MODE"]:
        # Only the last two and the first four entries of the profile's
        # data-mode vector are inspected (which parameters these positions
        # correspond to depends on the PARAMETER variable -- verify per float).
        data_modes = np.concatenate((prof.values[-2:],prof.values[0:4]))
        if not np.all((data_modes == b'D') | (data_modes == b'A')):
            flag = 1
            if floatnum not in bad_floats:
                bad_floats.append(floatnum)
if flag == 0:
    print("Data Mode is all D or A! All Good!")
else:
    # Previously a failed check printed nothing, which was indistinguishable
    # from the cell not having been run; report the offending floats instead.
    print("WARNING: data mode other than D or A found in float(s):",
          ", ".join(str(num) for num in bad_floats))

In [None]:
def make_float_DF(floatDSdict):
    """
    Build a QC-filtered pandas DataFrame of per-level Argo float observations.

    Every profile of every float in ``floatDSdict`` is flattened so that each
    pressure level becomes one row. Per-profile values (date, position and
    their QC flags) are repeated for every level of that profile.

    Parameters
    ----------
    floatDSdict : dict
        Maps a float identifier to a dataset object exposing the variables
        N_PROF, JULD, JULD_QC, LATITUDE, LONGITUDE, POSITION_QC and, for each
        of PRES_ADJUSTED, TEMP_ADJUSTED, TEMP, PSAL_ADJUSTED, PSAL,
        DOXY_ADJUSTED, NITRATE_ADJUSTED and PH_IN_SITU_TOTAL_ADJUSTED, both
        the variable itself and its "<name>_QC" companion.

    Returns
    -------
    pandas.DataFrame
        Columns "Julian Day", "Julian Day QC", "Latitude", "Longitude",
        "Position QC", followed by a value column and a "<column> QC" column
        for Pressure, Temperature Adjusted, Temperature, Salinity Adjusted,
        Salinity, Oxygen, Nitrate and pH. Only rows passing the QC filters
        below are returned; the original row index is kept (not reset).

    Notes
    -----
    QC flag meanings (from https://argo.ucsd.edu/data/how-to-use-argo-files/):
    1 = good data, 2 = probably good data, 8 = estimated.

    Rows are kept when Position, Julian Day, Pressure, Nitrate and pH QC are
    1 or 2; Temperature and Salinity QC are 1, 2 or 8; and Oxygen QC is 1, 2
    or missing (NaN). The missing-oxygen case used to be tested with
    ``== np.nan``, which is always False, so those rows were silently dropped;
    it is now tested with ``isna()``.

    NOTE(review): the temperature/salinity filters use the QC of the raw TEMP
    and PSAL variables, not of the *_ADJUSTED variables -- confirm intended.
    """

    # (output column, Argo variable) pairs read for every level of a profile.
    # Each variable's "<variable>_QC" flags go to the "<column> QC" column.
    # The order here fixes the column order of the returned DataFrame.
    level_fields = [
        ("Pressure", "PRES_ADJUSTED"),
        ("Temperature Adjusted", "TEMP_ADJUSTED"),
        ("Temperature", "TEMP"),
        ("Salinity Adjusted", "PSAL_ADJUSTED"),
        ("Salinity", "PSAL"),
        ("Oxygen", "DOXY_ADJUSTED"),
        ("Nitrate", "NITRATE_ADJUSTED"),
        ("pH", "PH_IN_SITU_TOTAL_ADJUSTED"),
    ]

    def decode_qc(flags):
        # QC flags are byte characters (e.g. b'1'); entries that are Python
        # floats (missing flags) are passed through as NaN.
        return [int(i.decode("utf-8")) if type(i) != float else np.nan for i in flags]

    TrainingData = {
        "Julian Day" : [],
        "Julian Day QC" : [],
        "Latitude" : [],
        "Longitude" : [],
        "Position QC" : [],
    }
    for column, _ in level_fields:
        TrainingData[column] = []
        TrainingData[column + " QC"] = []

    # Iterate through all floats in float dictionary
    for floatnum in floatDSdict:
        DS = floatDSdict[floatnum]

        # Iterate through every profile of a specific float
        for iprof in DS.N_PROF.values:
            # Per-level measurements and their QC flags. list.extend is used
            # instead of repeated `a = a + b`, which re-copies the whole list
            # on every profile.
            for column, varname in level_fields:
                TrainingData[column].extend(getattr(DS, varname)[iprof].values)
                TrainingData[column + " QC"].extend(
                    decode_qc(getattr(DS, varname + "_QC")[iprof].values)
                )

            # Date, position and their QC are constant for the whole profile.
            # JULD is interpreted as days since 1950-01-01.
            float_juld_temp = pd.to_datetime(DS.JULD.values[iprof], unit='D', origin=pd.Timestamp('1950-01-01'))
            juld_QC_temp = int(DS.JULD_QC.values[iprof])
            float_lat_temp = float(DS.LATITUDE.values[iprof])
            float_lon_temp = float(DS.LONGITUDE.values[iprof])
            position_QC_temp = int(DS.POSITION_QC.values[iprof])

            # Repeat the per-profile values once per pressure level so all
            # columns end up with the same length.
            n_levels = len(DS.PRES_ADJUSTED[iprof].values)
            TrainingData["Julian Day"].extend([float_juld_temp] * n_levels)
            TrainingData["Julian Day QC"].extend([juld_QC_temp] * n_levels)
            TrainingData["Latitude"].extend([float_lat_temp] * n_levels)
            TrainingData["Longitude"].extend([float_lon_temp] * n_levels)
            TrainingData["Position QC"].extend([position_QC_temp] * n_levels)
        print("float", floatnum, "completed")

    # Convert dictionary into pandas dataframe object
    TrainingDF = pd.DataFrame.from_dict(TrainingData)

    # QC filtering (see docstring for flag meanings)
    good = [1, 2]
    good_or_estimated = [1, 2, 8]
    keep = (
        TrainingDF["Position QC"].isin(good)
        & TrainingDF["Julian Day QC"].isin(good)
        & TrainingDF["Pressure QC"].isin(good)
        & TrainingDF["Temperature QC"].isin(good_or_estimated)
        & TrainingDF["Salinity QC"].isin(good_or_estimated)
        # NaN never compares equal to itself, so a missing flag needs isna()
        & (TrainingDF["Oxygen QC"].isna() | TrainingDF["Oxygen QC"].isin(good))
        & TrainingDF["Nitrate QC"].isin(good)
        & TrainingDF["pH QC"].isin(good)
    )
    TrainingDF = TrainingDF[keep]

    return TrainingDF

In [None]:
# Flatten all float profiles into one QC-filtered table
float_DF = make_float_DF(floatDSdict)

# Overwrite the adjusted salinity with the output of gsw.SA_from_SP
# (Absolute Salinity from Practical Salinity, pressure, longitude, latitude).
float_DF["Salinity Adjusted"] = gsw.SA_from_SP(
    float_DF["Salinity Adjusted"],
    float_DF["Pressure"],
    float_DF["Longitude"],
    float_DF["Latitude"],
).values

# Overwrite the adjusted temperature with the output of gsw.CT_from_t
# (Conservative Temperature from in-situ temperature). This must run after the
# salinity conversion because it reads the freshly converted salinity column.
float_DF["Temperature Adjusted"] = gsw.CT_from_t(
    float_DF["Salinity Adjusted"],
    float_DF["Temperature Adjusted"],
    float_DF["Pressure"],
)

# Year-day column computed by the project helper sg.datetime2ytd
float_DF['Yearday'] = sg.datetime2ytd(float_DF['Julian Day'])
float_DF

In [None]:
# Output directory for the intermediate tables. This assignment had been
# commented out, which made the two to_csv calls below fail with a NameError
# on a fresh kernel (Restart & Run All).
file_path = '/Users/sangminsong/OneDrive - UW/Code/SOGOS/gridded-vars/'
float_DF.to_csv(file_path + 'FloatData_DF.csv')
ship_DF.to_csv(file_path + 'ShipData_DF.csv')

## Combine into full training dataset

In [None]:
## Combine float and ship data
# Both sources are reduced to the same set of columns before being stacked.
# NOTE(review): for the floats this selects the raw "Temperature"/"Salinity"
# columns, not the "... Adjusted" columns converted earlier -- confirm intended.
training_columns = ["Latitude", "Longitude", "Julian Day", "Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate", "pH"]
ship_DF = ship_DF[training_columns]
TrainingDF = pd.concat([float_DF[training_columns], ship_DF], ignore_index=True)

# Limiting pressure to 2000, then renumber the remaining rows from zero
TrainingDF = TrainingDF[TrainingDF["Pressure"] < 2000].reset_index(drop = True)

TrainingDF.head()


# Add Training Fields

In [None]:
# Add seasonal variables
# The calendar month of each observation is mapped onto the unit circle
# (angle = 2*pi*month/12) and stored as its sine and cosine components.
months = [date.month for date in TrainingDF["Julian Day"]]
seasonal_sin = [math.sin(2*math.pi*month/12) for month in months]
seasonal_cos = [math.cos(2*math.pi*month/12) for month in months]
TrainingDF["Season(sin)"] = seasonal_sin
TrainingDF["Season(cos)"] = seasonal_cos


In [None]:
# Add sigma variables and N squared
# gsw.sigma0/1/2 are each called with the "Salinity" and "Temperature" columns.
# NOTE(review): these gsw functions are documented as taking Absolute Salinity
# and Conservative Temperature -- confirm the columns hold those quantities.
salinity_vals = TrainingDF["Salinity"].values
temperature_vals = TrainingDF["Temperature"].values
for sigma_column, sigma_func in (
    ("Sigma 0", gsw.sigma0),
    ("Sigma 1", gsw.sigma1),
    ("Sigma 2", gsw.sigma2),
):
    TrainingDF[sigma_column] = sigma_func(salinity_vals, temperature_vals)

# Placeholder columns, entirely NaN at this point; nothing in this cell fills them.
for placeholder_column in ("N_Squared", "N_Squared_v2", "Profile"):
    TrainingDF[placeholder_column] = np.nan

In [None]:
# Last Drop NA Check
# Columns that are still entirely NaN (e.g. not-yet-computed placeholder
# columns) are excluded from the check: a plain dropna() would otherwise
# discard every row of the table. Any column holding at least one value is
# still required to be non-missing in each retained row.
populated_columns = TrainingDF.columns[TrainingDF.notna().any()]
TrainingDF = TrainingDF.dropna(subset=populated_columns)
TrainingDF = TrainingDF.reset_index(drop = True)
TrainingDF

# TODO: may need to put one step here with unique dates


# Checking data distributions

In [None]:
# Pair Wise Plots
# Scatter-matrix of the selected variables to eyeball their joint distributions.
# NOTE(review): "Decimal Date" and "Inverted Pressure" are not created in the
# cells visible here -- confirm they are added to TrainingDF before this runs.
pairplot_columns = ["Decimal Date", "Inverted Pressure", "Temperature", "Salinity", "Oxygen", "Nitrate"]
PWDF = TrainingDF[pairplot_columns]
seaborn.pairplot(PWDF)
# plt.savefig("PairPlot.png")
plt.show()