# Predicting phytoplankton concentration with functional regression (target set by `name`, currently Flagellate)

### Version 1: Stacking y and x from the dataarray, then working per year

## Importing

In [17]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import root_mean_squared_error as rmse

from skfda.ml.regression import KNeighborsRegressor
from skfda.representation.grid import FDataGrid
from skfda.representation.basis import FourierBasis

import os
import lzma
import dill

import random

import salishsea_tools.viz_tools as sa_vi
import cmocean.cm as cm


## Datasets Preparation

In [18]:
# Creation of the training - testing datasets
def datasets_preparation(ds, ds2, ref, name):
    """Assemble the regression inputs and targets, year by year.

    For every year found in ``ds.time_counter`` the three drivers read from
    ``ds2`` are stacked together with the y and x coordinates of each sample,
    and the yearly blocks are then joined along their last axis.

    Parameters
    ----------
    ds : dataset providing the target variable ``name``.
    ds2 : dataset providing 'Summation_of_solar_radiation',
        'Mean_wind_speed' and 'Mean_air_temperature'.
    ref : reference dataset; only the lengths of its ``time_counter``,
        ``y`` and ``x`` and the values of ``y`` and ``x`` are used.
    name : name of the target variable in ``ds``.

    Returns
    -------
    inputs : array with 5 features on the first axis and the retained
        samples on the last one.
    targets : array of the target variable, retained samples on the last axis.
    indx : tuple returned by ``np.where`` with the retained sample positions.

    NOTE(review): the per-year arrays are presumably (time, stacked x-y cell)
    as produced by the pre-processing cell - confirm before reuse elsewhere.
    """
    years = np.unique(ds.time_counter.dt.year)
    n_years = len(years)

    # The coordinate layout of a single year only depends on ``ref``, so it is
    # built once instead of once per loop iteration.
    n_days = len(ref.time_counter)
    y_year = np.tile(ref.y, n_days * len(ref.x))
    x_year = np.tile(np.repeat(ref.x, len(ref.y)), n_days)

    yearly_inputs = []
    yearly_targets = []

    for year in years:

        dataset = ds.sel(time_counter=str(year))
        dataset2 = ds2.sel(time_counter=str(year))

        # Converted once; its shape also drives the coordinate reshapes
        solar = dataset2['Summation_of_solar_radiation'].to_numpy()

        yearly_inputs.append(np.stack([
            solar,
            dataset2['Mean_wind_speed'].to_numpy(),
            dataset2['Mean_air_temperature'].to_numpy(),
            y_year.reshape(solar.shape),
            x_year.reshape(solar.shape),
            ]))

        yearly_targets.append(dataset[name].to_numpy())

    # Grouping all the years
    inputs = np.concatenate(yearly_inputs, axis=2)
    targets = np.concatenate(yearly_targets, axis=1)

    # Coordinates of every sample of the joined array (one block per year)
    y = np.tile(ref.y, n_years * len(ref.x))
    x = np.tile(np.repeat(ref.x, len(ref.y)), n_years)

    # Keep samples without any NaN target that also satisfy the x / y limits
    indx = np.where((~np.isnan(targets).any(axis=0)) & (x > 10) & ((x > 100) | (y < 880)))
    inputs = inputs[:, :, indx[0]]
    targets = targets[:, indx[0]]

    return (inputs, targets, indx)


## Datasets Preparation 2

In [19]:
# Creation of the data arrays
def datasets_preparation2(variable, name, units, dataset, ref, indx):
    """Put the retained samples back on the (time_counter, y, x) grid.

    Parameters
    ----------
    variable : values to place at the flat positions ``indx[0]``.
    name : text stored in the 'description' attribute.
    units : text stored in the 'units' attribute.
    dataset : provides the ``time_counter`` coordinate.
    ref : provides the ``y`` and ``x`` coordinates.
    indx : tuple whose first element holds the flat cell positions.

    Returns
    -------
    xarray.DataArray with dims ('time_counter', 'y', 'x'); cells that are not
    listed in ``indx[0]`` are NaN.
    """
    n_times = len(dataset.time_counter)
    n_y = len(ref.y)
    n_x = len(ref.x)

    # All-NaN flat maps, filled only where samples were retained
    flat_maps = np.full((n_times, n_y * n_x), np.nan)
    flat_maps[:, indx[0]] = variable

    # Fortran-order reshape: consecutive flat cells advance along y before x
    maps = flat_maps.reshape((n_times, n_y, n_x), order='F')

    return xr.DataArray(
        maps,
        coords={'time_counter': dataset.time_counter, 'y': ref.y, 'x': ref.x},
        dims=['time_counter', 'y', 'x'],
        attrs={'description': name, 'units': units},
    )


## File Creation

In [20]:
def file_creation(path, variable, name):
    """Write ``variable`` as ``name`` into 'targets_predictions.nc' under ``path``.

    The file is opened in append mode and the variable is stored with zlib
    compression at level 9. ``path`` is concatenated as-is, so it must already
    end with a path separator.
    """
    compression = {name: {"zlib": True, "complevel": 9}}
    target_file = path + 'targets_predictions.nc'

    variable.to_dataset(name=name).to_netcdf(path=target_file, mode='a', encoding=compression)
    

## Regressor

In [21]:
def regressor(inputs, targets):
    """Scale the data and fit a Nadaraya-Watson kernel regression.

    Parameters
    ----------
    inputs : 3-D array; the first axis holds the 5 features and the last one
        the samples.
    targets : 2-D array whose last axis holds the samples.

    Returns
    -------
    (regr, scaler_inputs, scaler_targets) : the fitted regressor and the two
    fitted scalers, to be reused by ``scaling``.
    """
    from skfda.misc.hat_matrix import NadarayaWatsonHatMatrix
    from skfda.misc.hat_matrix import KNeighborsHatMatrix
    from skfda.misc.hat_matrix import LocalLinearRegressionHatMatrix
    from skfda.ml.regression import KernelRegression

    n_features, n_steps, n_samples = inputs.shape

    # Scaling the inputs: one row per (step, sample), one column per feature.
    # Columns 0-2 are min-max scaled, the remaining ones are binned.
    flat = inputs.reshape(n_features, n_steps * n_samples).T
    scaler_inputs = make_column_transformer(
        (MinMaxScaler(), [0, 1, 2]),
        remainder=KBinsDiscretizer(n_bins=155, encode='ordinal', strategy='uniform'),
    )
    flat = scaler_inputs.fit_transform(flat)
    scaled_inputs = flat.T.reshape(n_features, n_steps, n_samples)

    # Samples first, features last
    scaled_inputs = scaled_inputs.transpose(2, 1, 0)

    # Scaling the targets with a single min-max range for all values
    scaler_targets = MinMaxScaler()
    scaled_targets = scaler_targets.fit_transform(targets.reshape(-1, 1))
    scaled_targets = scaled_targets.reshape(targets.shape)

    # Final transformations: one row per sample; the targets stay a plain array
    scaled_targets = scaled_targets.T
    grid = np.arange(scaled_targets.shape[1])
    functional_inputs = FDataGrid(data_matrix=scaled_inputs, grid_points=grid)

    regr = KernelRegression(kernel_estimator=NadarayaWatsonHatMatrix(bandwidth=1))
    regr.fit(functional_inputs, scaled_targets)

    return (regr, scaler_inputs, scaler_targets)


## Scaling

In [22]:
def scaling(regr,inputs,scaler_inputs,targets,scaler_targets):
    """Scale ``inputs`` with the fitted scaler, predict, and un-scale the result.

    Parameters
    ----------
    regr : fitted regressor exposing ``predict``.
    inputs : 3-D array laid out like the one given to ``regressor``.
    scaler_inputs : fitted input transformer.
    targets : only its length is used, to size the functional grid.
    scaler_targets : fitted target scaler, used for the inverse transform.

    Returns
    -------
    (inputs, predictions) : the inputs as an ``FDataGrid`` and the predictions
    transposed back after the inverse scaling.
    """
    n_features, n_steps, n_samples = inputs.shape

    # Scaling the inputs: one row per (step, sample), one column per feature
    flat = inputs.reshape(n_features, n_steps * n_samples).T
    flat = scaler_inputs.transform(flat)
    scaled = flat.T.reshape(n_features, n_steps, n_samples)

    # Samples first, features last
    scaled = scaled.transpose(2, 1, 0)
    inputs = FDataGrid(data_matrix=scaled, grid_points=np.arange(len(targets)))

    predictions = regr.predict(inputs)

    # Scaling the predictions back to the original target range
    column = np.ravel(predictions).reshape(-1, 1)
    rescaled = scaler_targets.inverse_transform(column)
    predictions = rescaled.reshape(predictions.shape).transpose()

    return (inputs, predictions)


## Scatter Plot

In [23]:
def scatter_plot(targets, predictions, name):
    """Draw a scatter plot and a 2-D histogram of predictions against targets.

    Both panels show the 1:1 line (dashed black) and the least-squares line
    fitted with targets on the x-axis (red). ``name`` is used as the figure
    title.

    Returns
    -------
    The slope of the fitted line.
    """
    # Least-squares line: predictions = slope * targets + intercept
    slope, intercept = np.polyfit(targets, predictions, deg=1)

    fig, (ax_points, ax_density) = plt.subplots(2, figsize=(5,10), layout='constrained')

    ax_points.scatter(targets, predictions, alpha=0.2, s=10)

    # Common square range taken from the automatic limits of the scatter plot
    auto_limits = [ax_points.get_xlim(), ax_points.get_ylim()]
    lims = [np.min(auto_limits), np.max(auto_limits)]

    ax_points.axline(xy1=(0, intercept), slope=slope, color='r')

    ax_points.set_xlabel('targets')
    ax_points.set_ylabel('predictions')
    ax_points.set_xlim(lims)
    ax_points.set_ylim(lims)
    ax_points.set_aspect('equal')

    ax_points.plot(lims, lims, linestyle='--', color='k')

    density = ax_density.hist2d(
        targets, predictions, bins=100, cmap='jet',
        range=[lims, lims], cmin=0.1, norm='log')

    ax_density.plot(lims, lims, linestyle='--', color='k')
    ax_density.axline(xy1=(0, intercept), slope=slope, color='r')

    ax_density.set_xlabel('targets')
    ax_density.set_ylabel('predictions')
    ax_density.set_aspect('equal')

    # The fourth element returned by hist2d is the image used by the colorbar
    fig.colorbar(density[3], ax=ax_density, location='bottom')

    fig.suptitle(name)

    plt.show()

    return slope


## Seasonality

In [24]:
def seasonality(dates, targets):
    """Return the mean seasonal cycle of ``targets``, repeated once per year.

    The values are averaged per calendar date (month, day) over all the years
    in ``dates``. Grouping by calendar date rather than by day of year keeps
    leap and non-leap years aligned after 28 Feb, and does not rely on the
    series starting on a particular day.

    Parameters
    ----------
    dates : datetime-like index, one entry per element of ``targets``.
    targets : 1-D values (array or DataArray) aligned with ``dates``.

    Returns
    -------
    numpy.ndarray holding the calendar-date means in chronological order,
    tiled as many times as there are distinct years in ``dates``.
    """
    dates = pd.DatetimeIndex(dates)
    values = np.asarray(targets, dtype=float)

    # 29 Feb never takes part in the seasonal cycle
    keep = ~((dates.month == 2) & (dates.day == 29))
    kept_dates = dates[keep]

    # Sorted (month, day) keys give the chronological order within a year
    climatology = pd.Series(values[keep]).groupby(
        [np.asarray(kept_dates.month), np.asarray(kept_dates.day)]).mean()

    season = np.tile(climatology.to_numpy(), len(np.unique(dates.year)))

    return season
    

## Plotting (Criteria)

In [25]:
def plotting_criteria(dates, variable, year_variable, title):
    """Plot a daily metric coloured by month, with the yearly value on top.

    ``variable`` is scattered against ``dates``; ``year_variable`` is drawn in
    red at every 15 March found in ``dates``. The legend labels are fixed to
    February, March and April.
    """
    calendar = pd.DatetimeIndex(dates)
    mid_march = (calendar.month == 3) & (calendar.day == 15)

    fig, ax = plt.subplots()

    points = ax.scatter(dates, variable, marker='.', c=calendar.month)
    plt.xticks(rotation=70)

    month_handles = points.legend_elements()[0]
    ax.legend(handles=month_handles, labels=['February','March','April'])

    ax.plot(dates[mid_march], year_variable, color='red', marker='*')
    fig.suptitle(title + ' (15 Feb - 30 Apr)')

    fig.show()


## Plotting (Mean Values)

In [26]:
def plotting_mean_values(dates, mean_targets, mean_predictions, units, category, region):
    """Plot the target and prediction series with one x tick per year.

    The first sample of every year after the first one is masked in both
    series, which breaks the plotted lines between consecutive years.
    """
    years = np.unique(dates.year)

    fig, _ = plt.subplots(figsize=(19,5))

    masked_targets = np.ma.array(mean_targets)
    masked_predictions = np.ma.array(mean_predictions)

    ticks = [0]
    for year in years[:-1]:
        # Position right after the last sample of this year
        next_year_start = np.where(dates.year == year)[0][-1] + 1
        ticks.append(next_year_start)
        masked_targets[next_year_start] = np.ma.masked
        masked_predictions[next_year_start] = np.ma.masked

    plt.plot(masked_targets, label='targets')
    plt.plot(masked_predictions, label='predictions')
    plt.xlabel('Years')
    plt.xticks(ticks, years)
    plt.suptitle('Mean '+category + ' ' +units + ' (15 Feb - 30 Apr) ' + region)
    plt.legend()
    fig.show()
    

## Plotting (Maps)

In [27]:
def plotting_maps(targets, predictions, name, units):
    """Map the targets, the predictions and their difference for one time step.

    Targets and predictions share the colour range of the targets; the
    difference panel keeps its own range. The fourth panel is switched off.
    """
    fig, ax = plt.subplots(2,2, figsize = (10,15), layout='tight')

    cmap = plt.get_cmap('cubehelix')
    cmap.set_bad('gray')

    colorbar_label = name + ' ' + units
    target_range = {'vmin': targets.min(), 'vmax': targets.max()}

    # (axis, field, panel title, colour limits)
    panels = [
        (ax[0,0], targets, 'Targets', target_range),
        (ax[0,1], predictions, 'Predictions', target_range),
        (ax[1,0], targets-predictions, 'Targets-Predictions', {}),
    ]

    for axis, field, _, limits in panels:
        field.plot(ax=axis, cmap=cmap, cbar_kwargs={'label': colorbar_label}, **limits)

    plt.subplots_adjust(left=0.1,
        bottom=0.1,
        right=0.95,
        top=0.95,
        wspace=0.35,
        hspace=0.35)

    for axis, _, _, _ in panels:
        sa_vi.set_aspect(axis)

    # Titles are set after plotting so they replace the ones set by .plot()
    for axis, _, heading, _ in panels:
        axis.title.set_text(heading)
    ax[1,1].axis('off')

    fig.suptitle(name + ' '+ str(targets.time_counter.dt.date.values))

    plt.show()
    

## Plotting (Regions)

In [28]:
def plot_box(ax, corn, colour):
    """Draw the closed outline of a box on ``ax``.

    ``corn`` holds the box limits as [y_min, y_max, x_min, x_max]; the outline
    is drawn as a solid line in ``colour``.
    """
    y_low, y_high = corn[0], corn[1]
    x_low, x_high = corn[2], corn[3]

    # Walk around the four corners and return to the first one
    xs = [x_low, x_high, x_high, x_low, x_low]
    ys = [y_low, y_low, y_high, y_high, y_low]

    ax.plot(xs, ys, '-', color=colour)
    

## Plotting (Regional analysis)

In [29]:
def plotting_regional(metric,box,years,category):
    """Plot one line per region of a yearly metric.

    ``metric`` is indexed as [year, region]; ``box`` holds the region labels
    in column order.
    """
    fig, ax = plt.subplots()

    for column, label in enumerate(box):
        ax.plot(years, metric[:, column], marker='*', label=label)

    plt.suptitle(category+ ' (Regional analysis)')
    plt.legend()
    fig.show()
    

## Evaluation

In [30]:
def evaluation(regr,ds,ds2,name,units,ref,scaler_inputs,scaler_targets):
    """Evaluate the fitted regressor separately on every year of ``ds``.

    For each year the data are prepared, predicted and compared with the
    targets (scatter plot, correlation, RMSE and fitted slope); the yearly
    targets and predictions are also placed back on the map grid.

    Returns
    -------
    (r_years, rms_years, slope_years, targets_all, predictions_all) : three
    arrays with one value per year, and the targets and predictions of all
    the years concatenated along 'time_counter'.
    """
    years = np.unique(ds.time_counter.dt.year)

    # One score per year
    r_scores = []
    rms_scores = []
    slopes = []

    # Gridded arrays of every year
    yearly_targets = []
    yearly_predictions = []

    for year in years:

        dataset = ds.sel(time_counter=str(year))
        dataset2 = ds2.sel(time_counter=str(year))

        inputs, targets, indx = datasets_preparation(dataset, dataset2, ref, name)
        inputs, predictions = scaling(regr, inputs, scaler_inputs, targets, scaler_targets)

        # Annual scores computed over all the samples of the year
        flat_targets = np.ravel(targets)
        flat_predictions = np.ravel(predictions)

        slopes.append(scatter_plot(flat_targets, flat_predictions, name + ' for '+ str(year)))
        r_scores.append(np.corrcoef(flat_targets, flat_predictions)[0][1])
        rms_scores.append(rmse(flat_targets, flat_predictions))

        # Daily arrays
        yearly_targets.append(
            datasets_preparation2(targets, name + ' _targets', units, dataset, ref, indx))
        yearly_predictions.append(
            datasets_preparation2(predictions, name + ' _predictions', units, dataset, ref, indx))

    r_years = np.array(r_scores, dtype=float)
    rms_years = np.array(rms_scores, dtype=float)
    slope_years = np.array(slopes, dtype=float)

    # Daily arrays of all the years
    targets_all = xr.concat(yearly_targets, dim='time_counter')
    predictions_all = xr.concat(yearly_predictions, dim='time_counter')

    return (r_years, rms_years, slope_years, targets_all, predictions_all)


## Pre-processing of datasets (run only once!)

In [31]:
# Target variable and the labels used in the figure titles
name = 'Flagellate'
units = '[mmol m-2]'
category = 'Concentrations'

# ds holds the variable to predict, ds2 the drivers read by datasets_preparation()
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

# Keep every 5th grid point along y and x (the arange stop is exclusive).
# NOTE(review): coordinate values are passed to isel() as positions, which
# presumably relies on y / x being 0-based integer indices - confirm.
ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

# Unstacked reference (2007 only): supplies the time / y / x sizes and coordinates
ref = ds.sel(time_counter = slice('2007', '2007'))

dataset = ds.sel(time_counter = slice('2007', '2020'))
quant_train = dataset[name] # Keeping it for the regional seasonalities

# Collapse the horizontal dimensions into a single one, with y varying fastest
ds = ds.stack(z=('x','y'))
ds2 = ds2.stack(z=('x','y'))

# Drop every 29 Feb from both datasets (the mask is built from ds and reused for ds2)
indx = ~((ds.time_counter.dt.month==2) & (ds.time_counter.dt.day==29))
ds = ds.sel(time_counter=indx)
ds2 = ds2.sel(time_counter=indx)


## Training

In [32]:
# Fit the regressor on 2007-2020 and report its skill on the same period
dataset = ds.sel(time_counter = slice('2007', '2020'))
dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

inputs, targets, indx = datasets_preparation(dataset, dataset2, ref, name)

regr,scaler_inputs,scaler_targets = regressor(inputs, targets)

inputs,predictions = scaling(regr,inputs,scaler_inputs,targets,scaler_targets)

print ('The correlation coefficient during training is: ' + str(np.round(np.corrcoef(np.ravel(targets),np.ravel(predictions))[0][1],3)))
print ('The rmse during training is: ' + str(rmse(np.ravel(targets),np.ravel(predictions))))
# Targets go on the x-axis, the same orientation scatter_plot() uses for the
# testing years, so that training and testing slopes can be compared
m,_ = np.polyfit(np.ravel(targets),np.ravel(predictions), deg=1)
print('The slope of the best fitting line during training is: '+str(np.round(m,3)))

# Daily mean plot
dates = pd.DatetimeIndex(dataset['time_counter'].values)
indx2 = ~((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
dates = dates[indx2]

n_years = len(np.unique(dates.year))
samples_per_year = int(len(indx[0])/n_years)

# The years sit side by side along the last axis: split them apart and
# re-stack them so that every row is one day of one year, then average each row
targets = np.ravel(np.split(targets,n_years,axis=1))
targets = np.reshape(targets,(len(dates),samples_per_year))
targets = np.mean(targets,axis=1)

predictions = np.ravel(np.split(predictions,n_years,axis=1))
predictions = np.reshape(predictions,(len(dates),samples_per_year))
predictions = np.mean(predictions,axis=1)

plotting_mean_values(dates, targets, predictions, units, category, 'Salish Sea')


## Code for splitting into training and testing

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(np.transpose(inputs,axes=(2,0,1)), targets.transpose(), test_size=0.33)
# inputs = np.transpose(X_train,axes=(1,2,0))
# targets = y_train.transpose()

# scaler_inputs = []

# # Scaling the inputs
# for j in range (0, len(inputs)):

#     scaler = StandardScaler()
#     temp = np.ravel(inputs[j])
#     temp = np.expand_dims(temp,-1)
#     temp = scaler.fit_transform(temp)
#     inputs[j] = temp.reshape(inputs[j].shape)
#     scaler_inputs.append(scaler)

# inputs = np.transpose(inputs,axes=(2,1,0))

# targets0 = targets

# # Scaling the targets
# scaler_targets = StandardScaler()
# temp = np.ravel(targets)
# temp = np.expand_dims(temp,-1)
# temp = scaler_targets.fit_transform(temp)
# targets = temp.reshape(targets.shape)
# targets = targets.transpose()

# # Final transformations
# inputs = FDataGrid(data_matrix=inputs, grid_points=np.arange(0,len(targets[0])))
# targets = FDataGrid(data_matrix=targets, grid_points=np.arange(0,len(targets[0])))

# # Smoothing
# # targets = targets.to_basis(FourierBasis(n_basis=5))

# regr = KNeighborsRegressor(n_neighbors=10,n_jobs=4)
# regr.fit(inputs, targets)

# predictions = regr.predict(inputs)

# # Post-processing of predictions
# predictions = np.array(predictions.to_grid(np.arange(0,len(targets0))).data_matrix)
# predictions = np.squeeze(predictions,2)
# # Scaling the predictions
# temp = np.ravel(predictions)
# temp = np.expand_dims(temp,axis=-1)
# temp = scaler_targets.inverse_transform(temp)
# predictions = temp.reshape(predictions.shape)
# predictions = predictions.transpose()

# print ('The correlation coefficient during training is: ' + str(np.round(np.corrcoef(np.ravel(predictions),np.ravel(targets0))[0][1],3)))
# print ('The rmse during training is: ' + str(rmse(np.ravel(predictions),np.ravel(targets0))))
# m,_ = np.polyfit(np.ravel(predictions),np.ravel(targets0), deg=1)
# print('The slope of the best fitting line during training is: '+str(np.round(m,3)))

# inputs = np.transpose(X_test,axes=(1,2,0))
# targets0 = y_test.transpose()

# # Scaling the inputs
# for j in range (0, len(inputs)):

#     temp = np.ravel(inputs[j])
#     temp = np.expand_dims(temp,axis=-1)
#     temp = scaler_inputs[j].transform(temp)
#     inputs[j] = temp.reshape(inputs[j].shape)

# inputs = np.transpose(inputs,axes=(2,1,0))    

# inputs = FDataGrid(data_matrix=inputs, grid_points=np.arange(0,len(targets0)))

# predictions = regr.predict(inputs)

# # Post-processing of predictions
# predictions = np.array(predictions.to_grid(np.arange(0,len(targets0))).data_matrix)
# predictions = np.squeeze(predictions,2)
# # Scaling the predictions
# temp = np.ravel(predictions)
# temp = np.expand_dims(temp,axis=-1)
# temp = scaler_targets.inverse_transform(temp)
# predictions = temp.reshape(predictions.shape)
# predictions = predictions.transpose()

# print ('The correlation coefficient during testing is: ' + str(np.round(np.corrcoef(np.ravel(predictions),np.ravel(targets0))[0][1],3)))
# print ('The rmse during testing is: ' + str(rmse(np.ravel(predictions),np.ravel(targets0))))
# m,_ = np.polyfit(np.ravel(predictions),np.ravel(targets0), deg=1)
# print('The slope of the best fitting line during testing is: '+str(np.round(m,3)))


In [None]:
# Long-term seasonality of the daily means computed in the training cell
season = seasonality(dates,targets)

# Samples 75-149 of the tiled series, i.e. its second yearly repetition
# (assuming 75 samples per year - TODO confirm)
plt.plot(season[75:150])
plt.suptitle('Long-term seasonality (2007-2020)')

plotting_mean_values(dates, targets-season, predictions-season, units, category, 'Salish Sea (removed seasonality)')

# Keep the training dates: `dates` is reassigned by the 'Other Years' cell
dates_season = dates


## Other Years

In [None]:
# Years 2021-2024 are evaluated with the already fitted regressor and scalers
dataset = ds.sel(time_counter = slice('2021', '2024'))
dataset2 = ds2.sel(time_counter = slice('2021', '2024'))

# From here on `dates` refers to the evaluation period
dates = pd.DatetimeIndex(dataset['time_counter'].values)

r_years, rms_years, slope_years, targets_all, predictions_all = evaluation(regr,dataset,dataset2,name,units,ref,scaler_inputs,scaler_targets)

# Daily metrics: reduced over x and y, which leaves one value per time step
r_days = xr.corr(targets_all,predictions_all, dim=['x','y'])
rms_days = xs.rmse(targets_all,predictions_all, dim=['x','y'], skipna=True)
slope_days = xs.linslope(targets_all,predictions_all, dim=['x','y'], skipna=True)


## Plotting (Results)

In [None]:
# Daily metrics with the yearly value overlaid
for title, daily_metric, yearly_metric in (
    ('Correlation Coefficients', r_days, r_years),
    ('Root Mean Square Errors', rms_days, rms_years),
    ('Slopes of the best fitting line', slope_days, slope_years),
):
    plotting_criteria(dates, daily_metric, yearly_metric, title)

# Daily maps for 10 time steps drawn at random, without replacement
maps = random.sample(range(len(targets_all.time_counter)), 10)
for i in maps:

    day_targets = targets_all[i]
    day_predictions = predictions_all[i]

    # Only cells with a finite target take part in the scatter plot
    flat_targets = np.ravel(day_targets)
    idx = np.isfinite(flat_targets)
    scatter_plot(flat_targets[idx], np.ravel(day_predictions)[idx], name + ' '+ str(day_targets.time_counter.dt.date.values))

    plotting_maps(day_targets, day_predictions, name, units)
    

## Regional analysis

In [None]:
# NOTE(review): this rebinds the global `ds` to the raw, unstacked file; the
# training / evaluation cells should not be re-run after this cell without
# re-running the pre-processing cell first.
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')

fig, ax = plt.subplots(1, 1, figsize=(5, 9))
# Work on a copy: calling set_bad() on cm.deep itself would change the shared
# cmocean colormap for every later use in the session
mycmap = cm.deep.copy()
mycmap.set_bad('grey')
ax.pcolormesh(ds[name][0], cmap=mycmap)
sa_vi.set_aspect(ax)

# Region limits on the full-resolution grid: [y_min, y_max, x_min, x_max]
SoG_north = [650, 730, 100, 200]
SoG_center = [450, 550, 200, 300]
Fraser_plume = [380, 460, 260, 330]
SoG_south = [320, 380, 280, 350]
Haro_Boundary = [290, 350, 210, 280]
JdF_west = [250, 425, 25, 125]
JdF_east = [200, 290, 150, 260]
PS_main = [20, 150, 200, 280]
PS_all = [0, 200, 80, 320]

# NOTE(review): three regions share the colour 'm', so they cannot be told
# apart in the legend - consider distinct colours.
region_colours = [
    (SoG_north, 'g'),
    (SoG_center, 'b'),
    (Fraser_plume, 'm'),
    (SoG_south, 'k'),
    (Haro_Boundary, 'm'),
    (JdF_west, 'c'),
    (JdF_east, 'w'),
    (PS_main, 'r'),
    (PS_all, 'm'),
]
for corners, colour in region_colours:
    plot_box(ax, corners, colour)

boxnames = ['SoG_north','SoG_center','Fraser_plume','SoG_south', 'Haro_Boundary', 'JdF_west', 'JdF_east', 'PS_main', 'PS_all']
fig.legend(boxnames)

# The whole domain is added as the first region for the metrics (it is not drawn)
SS_all = [0, 895, 0, 395]
boxes = [SS_all,SoG_north,SoG_center,Fraser_plume,SoG_south,Haro_Boundary,JdF_west,JdF_east,PS_main, PS_all]
boxnames.insert(0,'SS_all')

# Low resolution: the limits are divided by 5 to match the subsampled grid
boxes = [[edge//5 for edge in box] for box in boxes]


In [None]:
# Regional skill per evaluated year, with and without the long-term seasonality.
# The yearly series are selected explicitly with a boolean mask on `dates`;
# turning a GroupBy object into an array builds a ragged array, which recent
# NumPy versions reject, and `np.warnings` is no longer available there.
test_years = np.unique(dates.year)

r = np.zeros((len(test_years),len(boxnames)))
rms = np.zeros((len(test_years),len(boxnames)))

r_season = np.zeros((len(test_years),len(boxnames)))
rms_season = np.zeros((len(test_years),len(boxnames)))

# 29 Feb is removed from the training series, as done in the pre-processing cell
leap_day = (quant_train.time_counter.dt.month==2) & (quant_train.time_counter.dt.day==29)

for i, box in enumerate(boxes):

    # Region limits: [y_min, y_max, x_min, x_max]
    y0, y1, x0, x1 = box

    targets = targets_all[:, y0:y1, x0:x1]
    predictions = predictions_all[:, y0:y1, x0:x1]

    mean_targets = targets.mean(dim=['x','y'], skipna=True)
    mean_predictions = predictions.mean(dim=['x','y'], skipna=True)
    plotting_mean_values(dates, mean_targets, mean_predictions, units, category, boxnames[i])

    climatology = quant_train[:, y0:y1, x0:x1]
    mean_targets_clim = climatology.mean(dim=['x','y'], skipna=True)
    mean_targets_clim = mean_targets_clim.sel(time_counter=~leap_day) # Removing 29 of Feb
    season = seasonality(dates_season,mean_targets_clim)

    # The seasonality repeats every year, so its first len(dates) samples line
    # up with the evaluated period (which must not be longer than the training one)
    season_test = season[:len(dates)]

    anomaly_targets = mean_targets-season_test
    anomaly_predictions = mean_predictions-season_test
    plotting_mean_values(dates, anomaly_targets, anomaly_predictions, units, category, boxnames[i]+' (removed seasonality)')

    for k, year in enumerate(test_years):

        in_year = dates.year == year

        targets_year = mean_targets.to_numpy()[in_year]
        predictions_year = mean_predictions.to_numpy()[in_year]
        r[k,i] = np.round(np.corrcoef(targets_year,predictions_year)[0][1],3)
        rms[k,i] = rmse(targets_year,predictions_year)

        targets_year_season = anomaly_targets.to_numpy()[in_year]
        predictions_year_season = anomaly_predictions.to_numpy()[in_year]
        r_season[k,i] = np.round(np.corrcoef(targets_year_season,predictions_year_season)[0][1],3)
        rms_season[k,i] = rmse(targets_year_season,predictions_year_season)

years = list(test_years)

plotting_regional(r,boxnames,years, 'Correlation coefficients')
plotting_regional(rms,boxnames,years, 'Root mean square errors')

plotting_regional(r_season,boxnames,years, 'Correlation coefficients (removed seasonality)')
plotting_regional(rms_season,boxnames,years, 'Root mean square errors (removed seasonality)')


## Functional comparison

In [None]:
# One row per grid cell and one column per time step; cells holding a NaN at
# any time step are dropped before the conversion
targets = targets_all.stack(z=('x','y')).dropna('z').to_numpy().transpose()
predictions = predictions_all.stack(z=('x','y')).dropna('z').to_numpy().transpose()

In [None]:
# Displays (number of retained grid cells, number of time steps)
targets.shape

In [None]:
# Time series of a single grid cell (row 1001 of the arrays built above)
sample = 1001
plt.plot(targets[sample], label = "targets")
plt.plot(predictions[sample], label = "predictions")
plt.legend()

## Saving

In [None]:
# path = '/data/ibougoudis/MOAD/files/results/' + name + '/func_reg/'
# os.makedirs(path, exist_ok=True)
# with lzma.open(path + 'regr.xz', 'wb') as f:
#     dill.dump(regr, f)

# file_creation(path, targets_all, 'Targets')
# file_creation(path, predictions_all, 'Predictions')
# file_creation(path, (targets_all-predictions_all), 'Targets - Predictions')
