# Predicting Diatom concentration with forecasting regression based on the oceanographic boxes (spatial means)

## Importing

In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import r_regression

from skforecast.recursive import ForecasterRecursive
from skforecast.direct import ForecasterDirect
from skforecast.preprocessing import RollingFeatures


from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import root_mean_squared_error as rmse

import os
import lzma
import dill

from tqdm import tqdm

import cmocean.cm as cm
import salishsea_tools.viz_tools as sa_vi

np.warnings.filterwarnings('ignore') # For the nan mean warning


## Datasets Preparation

In [2]:
# Creation of the training - testing datasets
def datasets_preparation(dataset, dataset2, regions, name, inputs_names):
    """Flatten the target and input fields and keep only the usable grid cells.

    Parameters
    ----------
    dataset : dataset holding the target variable `name` and a `time_counter`
        coordinate (assumed to be the leading axis of every field — TODO confirm).
    dataset2 : dataset holding the input variables plus `x` and `y`.
    regions : 2-D map of region ids; it is flattened and filtered with the
        same spatial selection as the data.
    name : name of the target variable in `dataset`.
    inputs_names : names of the input variables in `dataset2`.

    Returns
    -------
    tuple
        `(inputs, targets, indx, regions)` where `inputs` has shape
        (n_inputs, time, cells), `targets` has shape (time, cells), `indx` is
        the `np.where` tuple of the retained flattened cell indices, and
        `regions` holds the region id of every retained cell.
    """

    # Time steps that fall on 29 February
    leap_days = np.where((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))

    # Collapse every field to (leading axis, everything else)
    target_field = dataset[name].to_numpy()
    targets = target_field.reshape(target_field.shape[0], -1)

    inputs = []
    for input_name in inputs_names:
        field = dataset2[input_name].to_numpy()
        inputs.append(field.reshape(field.shape[0], -1))

    inputs = np.array(inputs)

    # Deleting 29 of February
    inputs = np.delete(inputs, leap_days, axis=1)
    targets = np.delete(targets, leap_days, axis=0)

    # Splitting in years and averaging the yearly chunks together
    n_years = len(np.unique(dataset.time_counter.dt.year))
    inputs = np.nanmean(np.split(inputs, n_years, axis=1), axis=0)
    targets = np.nanmean(np.split(targets, n_years, axis=0), axis=0)

    # Flattened x / y coordinate of every grid cell (row-major: x varies fastest)
    x =  np.tile(dataset2.x, len(dataset2.y))
    y =  np.tile(np.repeat(dataset2.y, len(dataset2.x)),1)

    # Keep cells whose target is never NaN and that pass the coordinate filter
    # NOTE(review): the 10 / 100 / 880 thresholds are domain-specific — confirm their meaning
    indx = np.where((~np.isnan(targets).any(axis=0)) & (x>10) & ((x>100) | (y<880)))
    inputs = inputs[:,:,indx[0]]
    targets = targets[:,indx[0]]

    # `indx` only addresses spatial positions, so the flattened map can be
    # indexed directly; tiling it over every time step first (as was done
    # previously) only allocated a much larger temporary with the same result.
    regions = np.ravel(regions)[indx[0]]

    return(inputs, targets, indx, regions)


## File Creation

In [3]:
def file_creation(path, variable, name):
    """Append `variable` to `<path>targets_predictions.nc` under the name `name`.

    The file is opened in append mode and the variable is written with zlib
    compression at level 9. `path` is concatenated as-is, so it is expected
    to end with a path separator.
    """

    out_file = path + 'targets_predictions.nc'
    compression = {name:{"zlib": True, "complevel": 9}}

    variable.to_dataset(name=name).to_netcdf(path = out_file, mode='a', encoding=compression)
    

## Scatter Plot

In [4]:
def scatter_plot(dates,targets, predictions, name):
    """Scatter predictions against targets with a fitted line and the 1:1 line.

    Points are coloured by the month of the first 75 entries of `dates`, so
    `targets` and `predictions` must be compatible with that colour array
    (presumably one season of data — TODO confirm). The legend labels are
    hard-coded to February, March and April.

    Returns the slope of the least-squares line fitted to predictions vs targets.
    """

    # Month of each date from the first year, used as the point colour
    months = pd.DatetimeIndex(dates[0:75]).month

    # Least-squares fit: predictions ~ slope * targets + intercept
    slope, intercept = np.polyfit(targets, predictions, deg=1)

    fig, ax = plt.subplots()

    points = ax.scatter(targets, predictions, s = 10, c = months)

    # Shared extent for both axes, taken from the auto-scaled limits
    extents = [ax.get_xlim(), ax.get_ylim()]
    lims = [np.min(extents), np.max(extents)]

    # Fitted line
    ax.axline(xy1=(0, intercept), slope=slope, color='r')

    ax.set_xlabel('targets')
    ax.set_ylabel('predictions')
    ax.set_xlim(lims)
    ax.set_ylim(lims)
    ax.set_aspect('equal')

    month_handles = points.legend_elements()[0]
    ax.legend(handles=month_handles, labels=['February','March','April'])

    # 1:1 reference line
    ax.plot(lims, lims, linestyle = '--', color = 'k')

    fig.suptitle(name)

    plt.show()

    return slope

## Pre-training

In [5]:
def pre_training(dataset,dataset2,boxes,regions0,name,inputs_names):
    """Compute per-year, per-box spatial means of the inputs and the target.

    Each year of `dataset` / `dataset2` is passed separately through
    `datasets_preparation`, and the retained grid cells are averaged inside
    every box (cells whose region id equals the box index).

    Parameters
    ----------
    dataset, dataset2 : datasets with a `time_counter` coordinate; `dataset`
        holds the target `name`, `dataset2` the variables in `inputs_names`.
    boxes : sequence of boxes; only its length is used here.
    regions0 : 2-D map giving the box index of every grid cell.
    name : target variable name.
    inputs_names : input variable names.

    Returns
    -------
    tuple
        `(regions_indiv_d, regions_indiv_t, indx)`: inputs with shape
        (n_inputs, n_days, n_years, n_boxes), targets with shape
        (n_days, n_years, n_boxes), and the cell selection of the last
        processed year.
    """

    # `np.warnings` no longer exists in NumPy >= 1.24; use the stdlib module.
    import warnings

    warnings.filterwarnings('ignore') # For the nan mean warning

    ds = dataset
    ds2 = dataset2

    # Computed once instead of on every loop iteration
    years = np.unique(ds.time_counter.dt.year)
    years2 = np.unique(ds2.time_counter.dt.year)

    # NOTE(review): the "-1" presumably removes the extra day-of-year value that
    # leap years add once 29 February is dropped — confirm for periods
    # without any leap year.
    n_days = len(np.unique(ds.time_counter.dt.dayofyear))-1

    regions_indiv_t = np.zeros((n_days,len(years),len(boxes)))
    regions_indiv_d = np.zeros((len(inputs_names),n_days,len(years),len(boxes)))

    for i in range(len(years)):

        # Restrict both datasets to the i-th year of their own time axis
        dataset = ds.sel(time_counter = slice(str(years[i]), str(years[i])))
        dataset2 = ds2.sel(time_counter = slice(str(years2[i]), str(years2[i])))

        inputs, targets, indx, _ = datasets_preparation(dataset, dataset2, regions0, name, inputs_names)

        # Box index of every retained grid cell
        regions1 = np.ravel(regions0)[indx]

        for j in range(len(boxes)):

            # Blank out the cells outside box j, then average over space
            temp = xr.where(regions1==j, inputs, np.nan)
            regions_indiv_d[:,:,i,j] = np.nanmean(temp,axis=2)

            temp = xr.where(regions1==j, targets, np.nan)
            regions_indiv_t[:,i,j] = np.nanmean(temp,axis=1)

    return(regions_indiv_d,regions_indiv_t,indx)


## Plotting (Mean Values)

In [6]:
def plotting_mean_values(dates,boxes,targets,predictions,r_train,rms_train,slope_train,category,units,region,boxnames):
    """Plot target and prediction time series for every box, with its metrics.

    For each box a one-row table (r, rms, slope) is displayed, followed by a
    line plot of column `i` of `targets` and `predictions`. Every 75th row is
    masked so the lines break there, and those positions are used as x-ticks
    labelled with the unique years of `dates` (75 is presumably the number of
    days per season — TODO confirm).
    """

    years = np.unique(dates.year)

    # Row index at which each 75-row segment starts
    ticks = list(range(0, targets.shape[0], 75))

    targets_masked = np.ma.array(targets)
    predictions_masked = np.ma.array(predictions)

    # Mask the first row of every segment so consecutive segments are not joined
    for series in (targets_masked, predictions_masked):
        series[ticks] = np.ma.masked

    for box in range(len(boxes)):

        fig, _ = plt.subplots(figsize=(19,5))

        metrics = np.vstack((r_train[box],rms_train[box],slope_train[box])).transpose()
        display(pd.DataFrame(metrics,index=[boxnames[box]],columns=['r','rms','slope']))

        plt.plot(targets_masked[:,box], label = 'targets')
        plt.plot(predictions_masked[:,box], label = 'predictions')
        plt.xlabel('Years')
        plt.xticks(ticks,years)
        plt.suptitle('Mean '+category + ' ' +units + ' (15 Feb - 30 Apr) ' + region + ' ' + boxnames[box])
        plt.legend()

        plt.show()


## Plotting (Mean Differences)

In [7]:
def plotting_mean_differences(dates,boxes,targets,predictions,category,units,region,boxnames):
    """Plot 3-D target and prediction arrays as one continuous series per box.

    `targets` and `predictions` are reshaped in Fortran order from
    (axis0, axis1, n_boxes) to (axis0*axis1, n_boxes). The first row of every
    axis0-long segment is masked so the plotted lines break there, and those
    positions are used as x-ticks labelled with the unique years of `dates`.
    """

    years = np.unique(dates.year)

    # Start row of every segment in the flattened series
    segment_length = targets.shape[0]
    ticks = list(range(0, segment_length*targets.shape[1], segment_length))

    flat_targets_shape = (targets.shape[0]*targets.shape[1],targets.shape[2])
    flat_predictions_shape = (predictions.shape[0]*predictions.shape[1],predictions.shape[2])
    targets = np.reshape(targets, flat_targets_shape, order = 'F')
    predictions = np.reshape(predictions, flat_predictions_shape, order = 'F')

    targets_masked = np.ma.array(targets)
    predictions_masked = np.ma.array(predictions)

    # Mask the first row of every segment so consecutive segments are not joined
    for series in (targets_masked, predictions_masked):
        series[ticks] = np.ma.masked

    for box in range(len(boxes)):

        fig, _ = plt.subplots(figsize=(19,5))

        plt.plot(targets_masked[:,box], label = 'targets')
        plt.plot(predictions_masked[:,box], label = 'predictions')
        plt.xlabel('Years')
        plt.xticks(ticks,years)
        plt.suptitle('Mean '+category + ' ' +units + ' (15 Feb - 30 Apr) ' + region + ' ' + boxnames[box])
        plt.legend()

        plt.show()


## Post-Training

In [8]:
def post_training(dates,boxes,targets,predictions,category,units,region,boxnames):
    """Score predictions against targets for every box and plot the series.

    For column `i` of `targets` / `predictions` this computes the correlation
    coefficient (rounded to 3 decimals), the root-mean-squared error and the
    slope of the least-squares line of predictions vs targets (rounded to
    3 decimals). The results are then passed to `plotting_mean_values`.

    Returns `(r_train, rms_train, slope_train)`, one value per box.
    """

    n_boxes = len(boxes)

    r_train, rms_train, slope_train = (np.full(n_boxes,np.nan) for _ in range(3))

    for box in range(n_boxes):

        observed = np.ravel(targets[:,box])
        modelled = np.ravel(predictions[:,box])

        r_train[box] = np.round(np.corrcoef(observed,modelled)[0][1],3)
        rms_train[box] = rmse(observed,modelled)

        slope, _ = np.polyfit(observed,modelled, deg=1)
        slope_train[box] = np.round(slope,3)

    plotting_mean_values(dates,boxes,targets,predictions,r_train,rms_train,slope_train,category,units,region,boxnames)

    return (r_train,rms_train,slope_train)


## Plotting (regions)

In [9]:
def plot_box(ax, corn, colour):
    """Draw the closed outline of a rectangle on `ax`.

    `corn[0]` and `corn[1]` are the two vertical-axis bounds, `corn[2]` and
    `corn[3]` the two horizontal-axis bounds. The outline is drawn as a solid
    line in `colour`.
    """

    y_first, y_second = corn[0], corn[1]
    x_first, x_second = corn[2], corn[3]

    # Walk the four corners and return to the starting one to close the box
    outline_x = [x_first, x_second, x_second, x_first, x_first]
    outline_y = [y_first, y_first, y_second, y_second, y_first]

    ax.plot(outline_x, outline_y, '-', color=colour)

## Initialization

In [10]:
# Target variable and the labels used in the plot titles
name, units, category = 'Diatom', '[mmol m-2]', 'Concentrations'

# Input variables and lags for the chosen target; each `lags` list has nine
# entries, one per box (the training cell uses lags[i] for box i).
if name != 'Diatom':
    inputs_names = ['Summation_of_solar_radiation','Mean_air_temperature','Mean_pressure', 'Mean_precipitation', 'Mean_specific_humidity']
    lags = [28,4,4,3,30,8,12,11,24]
else:
    inputs_names = ['Summation_of_solar_radiation','Mean_wind_speed','Mean_air_temperature']
    lags = [13,13,4,11,30,22,29,19,3]

# ds is used for the target fields, ds2 for the input (driver) fields
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')


## Regions

In [None]:
# Background map: first time step of the Diatom field
fig, ax = plt.subplots(1, 1, figsize=(5, 9))
mycmap = cm.deep
mycmap.set_bad('grey')  # note: this modifies the cm.deep object itself
ax.pcolormesh(ds['Diatom'][0], cmap=mycmap)
sa_vi.set_aspect(ax)

# Box corners as [y_start, y_stop, x_start, x_stop] grid indices
SoG_north = [650, 730, 100, 200]
SoG_center = [450, 550, 200, 300]
Fraser_plume = [380, 460, 260, 330]
SoG_south = [320, 380, 280, 350]
Haro_Boundary = [290, 350, 210, 280]
JdF_west = [250, 425, 25, 125]
JdF_east = [200, 290, 150, 260]
PS_all = [0, 200, 80, 320]
PS_main = [20, 150, 200, 280]

boxnames = ['SoG_north','SoG_center','Fraser_plume','SoG_south', 'Haro_Boundary', 'JdF_west', 'JdF_east', 'PS_all', 'PS_main']
boxes = [SoG_north,SoG_center,Fraser_plume,SoG_south,Haro_Boundary,JdF_west,JdF_east,PS_all,PS_main]

# Outline every box, in the same order as `boxnames`, so the legend entries match
for corners, colour in zip(boxes, ['g', 'b', 'm', 'k', 'm', 'c', 'w', 'm', 'r']):
    plot_box(ax, corners, colour)

fig.legend(boxnames)

# Map of box indices (NaN outside every box); where boxes overlap, the box
# listed later overwrites the earlier one.
regions0 = np.full((len(ds.y),len(ds.x)),np.nan)

for i, box in enumerate(boxes):
    regions0[box[0]:box[1], box[2]:box[3]] = i

regions0 = xr.DataArray(regions0,dims = ['y','x'])

# # Low resolution
# temp = []

# for i in boxes:
#     temp.append([x//5 for x in i])

# boxes = temp

In [12]:
# Low resolution

# ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
#     x=(np.arange(ds.x[0], ds.x[-1], 5)))

# ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
#     x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

# regions0 = regions0.isel(y=(np.arange(regions0.y[0], regions0.y[-1], 5)), 
#     x=(np.arange(regions0.x[0], regions0.x[-1], 5)))

# Training period: 2007-2020 for both the target and the input datasets
dataset, dataset2 = (source.sel(time_counter = slice('2007', '2020')) for source in (ds, ds2))

years = np.unique(dataset.time_counter.dt.year)

r_inputs = np.zeros((len(boxnames), len(inputs_names)))

# Box means with separate day and year axes
inputs0,targets0,indx = pre_training(dataset,dataset2,boxes,regions0,name,inputs_names)

# Merge the day and year axes (Fortran order) into a single sample axis
inputs = inputs0.reshape((len(inputs_names),targets0.shape[0]*targets0.shape[1],len(boxes)),order='F')
targets = targets0.reshape((targets0.shape[0]*targets0.shape[1],len(boxes)),order='F')


## Training

In [None]:
# Training dates, without 29 February (matching the arrays built by pre_training)
dataset = ds.sel(time_counter = slice('2007', '2020'))
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx2 = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx2]

# Day of year of every sample, replicated for every box: shape (1, n_dates, n_boxes)
day = dataset.time_counter.dt.dayofyear[indx2].values

day = np.tile(day,len(boxes))
day = np.reshape(day, (len(dates),len(boxes)), order='F')
day = np.expand_dims(day, axis=0)

inputs_names2=list(inputs_names) # For the Dataframes 

inputs_names2.append('day')
# NOTE: this rebinds the global `inputs`; re-running the cell without
# re-running the previous one stacks the `day` layer a second time.
inputs = np.vstack((inputs,day))

inputs_training = []
targets_training = []

regr_all = []
predictions = np.full(targets.shape,np.nan)

for i in range (0,len(boxnames)):
    
    # rename_axis returns a new object, so its result has to be kept for the
    # index to actually be named 'date'.
    temp = pd.DataFrame(inputs[:,:,i].transpose(),dates,columns=inputs_names2)
    temp = temp.rename_axis('date')
    inputs_training.append(temp)

    temp = pd.DataFrame({name:targets[:,i]},dates)
    temp = temp.rename_axis('date')
    targets_training.append(temp)

    # One forecaster per box, each with its own number of lags
    forecaster = ForecasterRecursive(regressor= BaggingRegressor(estimator=HistGradientBoostingRegressor(categorical_features=['day']), n_estimators=12), lags=lags[i])
    forecaster.fit(y=targets_training[i][name], exog=inputs_training[i])
    regr_all.append(forecaster)

    predictions[:,i] = regr_all[i].predict(steps=len(dates), exog=inputs_training[i], check_inputs=False)


In [None]:
# Notebook output: the list of per-box input DataFrames built in the training cell
inputs_training


## Time-series (Training)

In [None]:
# post_training expects (..., category, units, region, boxnames); the two labels
# were previously passed in swapped order, which garbled the plot titles.
r_train,rms_train,slope_train = post_training(dates,boxes,targets,predictions,category,units,' ',boxnames)

# Mean over the year axis of targets0: one seasonal curve per box
season = np.mean(targets0,axis=1)

season_train = np.tile(season,len(np.unique(dates.year))) # Broadcasting season to all training years
season_train = np.reshape(season_train,targets0.shape)
season_train = np.reshape(season_train,targets.shape,order='F')

plt.plot(season)
plt.legend(boxnames)
plt.suptitle('Long-term seasonalities (2007-2020, targets)')
plt.show()

# Same metrics after subtracting the seasonal curve from both series
r_train_season,_,slope_train_season = post_training(dates,boxes,targets-season_train,predictions-season_train,category,units,'(removed seasonality)',boxnames)

# Differences for the testing time-series
diff = np.zeros(len(boxes))

for i in range (0, len(boxes)):
    mean = np.mean(targets0[:,:,i])
    std = np.std(targets0[:,:,i])  # was np.mean, which made `std` a copy of `mean`
    # The std term is currently weighted by 0, so the threshold is the box mean
    diff[i] = mean + 0*std


## Other Years

In [15]:
# Testing period: 2021-2024
dataset = ds.sel(time_counter = slice('2021', '2024'))
dataset2 = ds2.sel(time_counter = slice('2021', '2024'))

# Testing dates, without 29 February
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx]

years = np.unique(dataset.time_counter.dt.year)

inputs0_test,targets0_test,indx_test = pre_training(dataset,dataset2,boxes,regions0,name,inputs_names)

# Merge the day and year axes (Fortran order); the day count is taken from the
# test arrays themselves rather than from the training array targets0.
inputs_test = np.reshape(inputs0_test, (len(inputs_names),targets0_test.shape[0]*len(years),len(boxnames)), order='F')
targets_test = np.reshape(targets0_test, (targets0_test.shape[0]*len(years),len(boxnames)), order='F')

# Day of year of every sample, replicated for every box: shape (1, n_dates, n_boxes)
day = dataset.time_counter.dt.dayofyear[indx].values

day = np.tile(day,len(boxes))
day = np.reshape(day, (len(dates),len(boxes)), order='F')
day = np.expand_dims(day,axis=0)

inputs_test = np.vstack((inputs_test,day))

season_test = np.tile(season,len(years)) # Broadcasting season to all testing years
season0_test = np.reshape(season_test, targets0_test.shape) # Needed for the annual calculations
season_test = np.reshape(season0_test, targets_test.shape, order='F')

inputs_testing = []

predictions_test = np.full(targets_test.shape,np.nan)

for i in range (0,len(boxnames)):
    
    # rename_axis returns a new object, so its result has to be kept for the
    # index to actually be named 'date'.
    temp = pd.DataFrame(inputs_test[:,:,i].transpose(),dates,columns=inputs_names2)
    temp = temp.rename_axis('date')
    inputs_testing.append(temp)

    predictions_test[:,i] = regr_all[i].predict(steps=len(dates), exog=inputs_testing[i], check_inputs=False)

# Back to separate day and year axes, matching targets0_test
predictions0_test = predictions_test.reshape(targets0_test.shape, order='F')


## Time-series (Testing)

In [None]:
r_test,rms_test,slope_test = np.zeros(len(boxes)), np.zeros(len(boxes)), np.zeros(len(boxes))

r_test_season, slope_test_season = np.zeros(len(boxes)), np.zeros(len(boxes))

targets_sum, predictions_sum = np.zeros((len(boxes),len(years))), np.zeros((len(boxes),len(years)))

targets_mean, predictions_mean = np.zeros((len(boxes),len(years))), np.zeros((len(boxes),len(years)))

targets_diff, predictions_diff = np.zeros((len(boxes),targets0_test.shape[0],len(years))), np.zeros((len(boxes),targets0_test.shape[0],len(years)))

rss = np.zeros(len(boxes))

# post_training expects (..., category, units, region, boxnames); the two labels
# were previously passed in swapped order, unlike the plotting_mean_differences
# call at the end of this cell.
r_test_season,_,slope_test_season = post_training(dates,boxes,targets_test-season_test,predictions_test-season_test,category,units,'(removed Seasonality)',boxnames)

for i in range (0,len(boxes)):

    # Metrics on the full (seasonality included) test series of box i
    r_test[i] = np.round(np.corrcoef(np.ravel(targets_test[:,i]),np.ravel(predictions_test[:,i]))[0][1],3)
    rms_test[i] = rmse(np.ravel(targets_test[:,i]),np.ravel(predictions_test[:,i]))
    m,_ = np.polyfit(np.ravel(targets_test[:,i]),np.ravel(predictions_test[:,i]), deg=1)
    slope_test[i] = np.round(m,3)

    rss[i] = np.sum((np.ravel(targets_test[:,i])-np.ravel(predictions_test[:,i]))**2) # Similar to rms, is not affected by the seasonality

    for j in range (0, len(years)):

        # Yearly sum and mean of the anomalies (seasonal curve subtracted)
        targets_sum[i,j] = np.sum(targets0_test[:,j,i]-season0_test[:,j,i])
        predictions_sum[i,j] = np.sum(predictions0_test[:,j,i]-season0_test[:,j,i])

        targets_mean[i,j] = np.mean(targets0_test[:,j,i]-season0_test[:,j,i])
        predictions_mean[i,j] = np.mean(predictions0_test[:,j,i]-season0_test[:,j,i])

        # Keep only the values above the box threshold diff[i]; the rest become NaN
        targets_diff[i,:,j] = np.where(targets0_test[:,j,i]>diff[i],targets0_test[:,j,i], np.nan)
        predictions_diff[i,:,j] = np.where(predictions0_test[:,j,i]>diff[i],predictions0_test[:,j,i], np.nan)

# Reorder to (days, years, boxes), the layout plotting_mean_differences reshapes
targets_diff = np.transpose(targets_diff,(1,2,0))
predictions_diff = np.transpose(predictions_diff,(1,2,0))
plotting_mean_differences(dates,boxes,targets_diff,predictions_diff,category,units,'Differences',boxnames)


## Saving

In [17]:
# path = '/data/ibougoudis/MOAD/files/results/' + name + '/for_reg_boxes_s3/'

# os.makedirs(path, exist_ok=True)
# with lzma.open(path + 'regr_all.xz', 'wb') as f:
    
#     dill.dump(regr_all, f)

# with open(path + 'train_metrics.pkl', 'wb') as f:
#     dill.dump([r_train,rms_train,slope_train,r_train_season,slope_train_season,season.transpose()], f)

# with open(path + 'test_metrics.pkl', 'wb') as f:
#     dill.dump([r_test,rms_test,slope_test,r_test_season,slope_test_season,targets_sum,predictions_sum,targets_mean,predictions_mean,targets_diff,predictions_diff,rss], f)

# with open(path + 'targets-predictions.pkl', 'wb') as f:
#     dill.dump([targets0_test,predictions0_test], f)
