# Finding the optimal parameters for the Historical Linear Regression algorithm (spatial means)

## Importing

In [31]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import r_regression

from skfda.representation.grid import FDataGrid
from skfda.ml.clustering import KMeans

from skfda.misc.hat_matrix import NadarayaWatsonHatMatrix, LocalLinearRegressionHatMatrix, KNeighborsHatMatrix
from skfda.preprocessing.smoothing import KernelSmoother

from skfda.ml.regression import HistoricalLinearRegression

from sklearn.metrics import root_mean_squared_error as rmse

import cmocean.cm as cm
import salishsea_tools.viz_tools as sa_vi

from tqdm import tqdm


## Datasets Preparation

In [32]:
# Creation of the training - testing datasets

def datasets_preparation(dataset, boxes, regions, name, inputs_names):
    """Build per-region mean series of the inputs and the target, split by year.

    Parameters
    ----------
    dataset : xarray.Dataset-like
        Must expose ``time_counter`` (with a ``.dt`` datetime accessor) and the
        variables ``name`` and ``inputs_names``; each variable has time as its
        first axis, all remaining axes are flattened into one "cell" axis.
    boxes : sequence
        Region definitions. Only ``len(boxes)`` is used here; region ``j`` is
        made of the cells where ``regions == j``.
    regions : array-like
        Region index of every cell, flattened in the same (C) order as the
        non-time axes of the variables.
    name : str
        Name of the target variable.
    inputs_names : sequence of str
        Names of the input variables.

    Returns
    -------
    inputs : numpy.ndarray, shape (n_inputs, n_days, n_years, n_boxes)
    targets : numpy.ndarray, shape (n_days, n_years, n_boxes)
        Region means that ignore NaN cells. A region without any valid cell
        yields NaN (numpy emits a RuntimeWarning).

    Raises
    ------
    ValueError
        From ``np.split`` if, once 29 February is removed, the time axis
        cannot be divided evenly among the years present.
    """
    time = dataset.time_counter

    # 29 February is dropped so every year contributes the same number of days
    keep_days = ~np.asarray((time.dt.month == 2) & (time.dt.day == 29))
    n_years = len(np.unique(time.dt.year))

    # Flatten every non-time axis: (time, cells)
    targets = dataset[name].to_numpy()
    targets = targets.reshape(targets.shape[0], -1)[keep_days]

    inputs = []
    for variable in inputs_names:
        values = dataset[variable].to_numpy()
        inputs.append(values.reshape(values.shape[0], -1))
    inputs = np.array(inputs)[:, keep_days]  # (n_inputs, time, cells)

    # Splitting in years, then reordering to (n_inputs, days, years, cells)
    # and (days, years, cells)
    inputs = np.transpose(np.split(inputs, n_years, axis=1), (1, 2, 0, 3))
    targets = np.transpose(np.split(targets, n_years, axis=0), (1, 0, 2))

    # Keep only the cells whose target is not NaN on the first day of any year
    valid = ~np.isnan(targets[0]).any(axis=0)
    inputs = inputs[..., valid]
    targets = targets[..., valid]
    regions = np.ravel(regions)[valid]

    # Sizes come from the arrays themselves. Deriving n_days from
    # len(unique(dayofyear)) - 1 is only right when the period contains a
    # leap year and fails with a shape mismatch otherwise.
    n_days = targets.shape[0]
    n_boxes = len(boxes)
    region_inputs = np.zeros((len(inputs_names), n_days, n_years, n_boxes))
    region_targets = np.zeros((n_days, n_years, n_boxes))

    for j in range(n_boxes):
        # Selecting the region's cells avoids building a full-size NaN-padded
        # copy of the data for every box
        in_region = regions == j
        region_inputs[..., j] = np.nanmean(inputs[..., in_region], axis=-1)
        region_targets[..., j] = np.nanmean(targets[..., in_region], axis=-1)

    return (region_inputs, region_targets)


## Regressor

In [33]:
def regressor(inputs, targets, lag, n_intervals=5, bandwidth=1):
    """Standardise the data, smooth the inputs and fit a historical linear model.

    Parameters
    ----------
    inputs : numpy.ndarray, shape (n_features, n_points, n_samples)
        Input curves; every feature is standardised over all its
        (point, sample) values.
    targets : numpy.ndarray, shape (n_points, n_samples)
        Target curves, standardised over all their values.
    lag : float
        ``lag`` argument forwarded to ``HistoricalLinearRegression``.
    n_intervals : int, default 5
        ``n_intervals`` argument forwarded to ``HistoricalLinearRegression``.
    bandwidth : float, default 1
        Bandwidth of the local linear kernel used to smooth the inputs.

    Returns
    -------
    tuple
        ``(regr, scaler_inputs, scaler_targets, smoother)``: the fitted model,
        the fitted input and target scalers, and the fitted smoother. They
        are needed to apply the same transformations to new data.
    """
    n_features, n_points, n_samples = inputs.shape

    # Scaling the inputs: one column per feature, one row per (point, sample)
    flat_inputs = np.reshape(inputs, (n_features, n_points * n_samples), order='F')
    flat_inputs = flat_inputs.transpose()
    scaler_inputs = make_column_transformer((StandardScaler(), np.arange(0, n_features)))
    flat_inputs = scaler_inputs.fit_transform(flat_inputs)
    flat_inputs = flat_inputs.transpose()
    inputs = np.reshape(flat_inputs, (n_features, n_points, n_samples), order='F')

    # Scaling the targets as a single column
    scaler_targets = StandardScaler()
    flat_targets = np.reshape(targets, (targets.shape[0] * targets.shape[1]), order='F')
    flat_targets = np.expand_dims(flat_targets, -1)
    flat_targets = scaler_targets.fit_transform(flat_targets)
    targets = flat_targets.reshape(targets.shape, order='F')

    # Final transformations: FDataGrid expects (samples, grid points[, features])
    inputs = np.transpose(inputs, axes=(2, 1, 0))
    targets = targets.transpose()
    grid_points = np.arange(0, len(targets[0]))
    inputs = FDataGrid(data_matrix=inputs, grid_points=grid_points)
    targets = FDataGrid(data_matrix=targets, grid_points=grid_points)

    # Smoothing (inputs only)
    kernel_estimator = LocalLinearRegressionHatMatrix(bandwidth=bandwidth)
    smoother = KernelSmoother(kernel_estimator=kernel_estimator)
    inputs = smoother.fit_transform(inputs)

    model = HistoricalLinearRegression(n_intervals=n_intervals, lag=lag)
    regr = model.fit(inputs, targets)

    return (regr, scaler_inputs, scaler_targets, smoother)


## Scaling

In [34]:
def scaling(regr,inputs,scaler_inputs,targets,scaler_targets,smoother):
    """Predict with a fitted model and return the predictions in original units.

    The inputs go through the already fitted input scaler and smoother, are
    fed to ``regr.predict``, and the predictions are mapped back with the
    inverse of the fitted target scaler.

    Parameters
    ----------
    regr : fitted regressor exposing ``predict``
    inputs : numpy.ndarray, shape (n_features, n_points, n_samples)
    scaler_inputs : fitted transformer exposing ``transform``
    targets : sized
        Only ``len(targets)`` is used; it sets the number of grid points.
    scaler_targets : fitted scaler exposing ``inverse_transform``
    smoother : fitted smoother exposing ``transform``

    Returns
    -------
    numpy.ndarray
        Predictions evaluated on the grid, transposed so that the grid-point
        axis comes first.
    """
    n_features = len(inputs)
    n_points, n_samples = inputs.shape[1], inputs.shape[2]
    grid = np.arange(0, len(targets))

    # Scaling the inputs: the scaler works on rows = (point, sample) pairs
    flat = np.reshape(inputs, (n_features, n_points * n_samples), order='F')
    scaled = scaler_inputs.transform(flat.transpose()).transpose()
    inputs = np.reshape(scaled, (n_features, n_points, n_samples), order='F')

    # Functional representation, smoothed like the training inputs
    curves = FDataGrid(data_matrix=np.transpose(inputs, axes=(2, 1, 0)), grid_points=grid)
    curves = smoother.transform(curves)

    predicted = regr.predict(curves)

    # Post-processing of predictions: evaluate on the grid, drop the last axis
    values = np.squeeze(np.array(predicted.to_grid(grid).data_matrix), 2)

    # Scaling the predictions back through the single-column target scaler
    column = np.expand_dims(np.ravel(values), axis=-1)
    rescaled = scaler_targets.inverse_transform(column).reshape(values.shape)

    return rescaled.transpose()


## Plotting (regions)

In [35]:
def plot_box(ax, corn, colour):
    """Draw the outline of a rectangular box on ``ax``.

    ``corn`` holds the corners as ``[y_min, y_max, x_min, x_max]``; the
    outline is drawn as one closed solid line in ``colour``.
    """
    y_min, y_max = corn[0], corn[1]
    x_min, x_max = corn[2], corn[3]

    # Walk round the four corners and return to the first to close the box
    xs = [x_min, x_max, x_max, x_min, x_min]
    ys = [y_min, y_min, y_max, y_max, y_min]
    ax.plot(xs, ys, '-', color=colour)
    

## Initiation

In [36]:
# Target variable and its labels
name = 'Diatom'
units = '[mmol m-2]'
category = 'Concentrations'

# Input variables: a dedicated set for 'Diatom', a common set for any other target
inputs_names = (
    ['Summation_of_solar_radiation', 'Mean_wind_speed', 'Mean_air_temperature']
    if name == 'Diatom'
    else ['Summation_of_solar_radiation', 'Mean_air_temperature', 'Mean_pressure',
          'Mean_precipitation', 'Mean_specific_humidity']
)

ds = xr.open_dataset('/data/ibougoudis/MOAD/files/jan_apr.nc')


## Regions

In [None]:
# Background map: first time step of the 'Diatom' field, invalid cells in grey
fig, ax = plt.subplots(1, 1, figsize=(5, 9))
mycmap = cm.deep
mycmap.set_bad('grey')
ax.pcolormesh(ds['Diatom'][0], cmap=mycmap)
sa_vi.set_aspect(ax)

# Boxes are given as [y_min, y_max, x_min, x_max] in grid indices
SoG_north = [650, 730, 100, 200]
SoG_center = [450, 550, 200, 300]
Fraser_plume = [380, 460, 260, 330]
SoG_south = [320, 380, 280, 350]
Haro_Boundary = [290, 350, 210, 280]
JdF_west = [250, 425, 25, 125]
JdF_east = [200, 290, 150, 260]
PS_all = [0, 200, 80, 320]
PS_main = [20, 150, 200, 280]

boxnames = ['SoG_north','SoG_center','Fraser_plume','SoG_south', 'Haro_Boundary', 'JdF_west', 'JdF_east', 'PS_all', 'PS_main']
boxes = [SoG_north,SoG_center,Fraser_plume,SoG_south,Haro_Boundary,JdF_west,JdF_east,PS_all,PS_main]
box_colours = ['g', 'b', 'm', 'k', 'm', 'c', 'w', 'm', 'r']

# Outlines are drawn in the order of `boxes`, which is also the legend order
for corners, colour in zip(boxes, box_colours):
    plot_box(ax, corners, colour)

fig.legend(boxnames)

# Region map: each cell holds the index of its box, NaN outside every box.
# NOTE(review): where boxes overlap the later one wins (PS_main lies inside
# PS_all, Fraser_plume overlaps SoG_center), so the earlier region loses
# those cells -- confirm this is intended.
regions0 = np.full((len(ds.y),len(ds.x)),np.nan)

for index, (y_min, y_max, x_min, x_max) in enumerate(boxes):
    regions0[y_min:y_max, x_min:x_max] = index

regions0 = xr.DataArray(regions0,dims = ['y','x'])

# # Low resolution

# temp = []

# for i in boxes:
#     temp.append([x//5 for x in i])

# boxes = temp


## Pre-processing

In [38]:
# Low resolution

# ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
#     x=(np.arange(ds.x[0], ds.x[-1], 5)))

# regions0 = regions0.isel(y=(np.arange(regions0.y[0], regions0.y[-1], 5)), 
#     x=(np.arange(regions0.x[0], regions0.x[-1], 5)))

train_years = slice('2007', '2020')
test_years = slice('2021', '2024')

# One slot per (region, input variable); only allocated here
r_inputs = np.zeros((len(boxnames), len(inputs_names)))

# Training
dataset = ds.sel(time_counter=train_years)
inputs, targets = datasets_preparation(dataset, boxes, regions0, name, inputs_names)

# Testing (`dataset` now refers to the testing period)
dataset = ds.sel(time_counter=test_years)
inputs_test, targets_test = datasets_preparation(dataset, boxes, regions0, name, inputs_names)

# Candidate lags: roughly the multiples of (days - 1) / n_intervals up to
# days - 1. Keep exactly one `lags` line active, matching the season length.

# days=75 or days=76
# lags = [24.6, 49.3, 74] # n_intervals=3
# lags = [18.5, 37, 55.5, 74] # n_intervals=4
# lags = [14.8, 29.6, 44.4, 59.2, 74] # n_intervals=5

# days=120
# lags = [39.6, 79.2, 119] # n_intervals=3
# lags = [29.75, 59.5, 89.25, 119] # n_intervals=4
lags = [23.8, 47.6, 71.4, 95.2, 119] # n_intervals=5

# Lags (days=153)

# lags = [50.6, 101.3, 152] # n_intervals=3
# lags = [38, 76, 114, 152] # n_intervals=4
# lags = [30.4, 60.8, 91.2, 121.6, 152] # n_intervals=5


## Training - Testing

In [None]:
def _fit_metrics(observed, predicted):
    """Return (correlation, RMSE, slope rounded to 3 decimals) of predicted vs observed."""
    obs = np.ravel(observed)
    pred = np.ravel(predicted)
    correlation = np.corrcoef(obs, pred)[0][1]
    error = rmse(obs, pred)
    slope, _ = np.polyfit(obs, pred, deg=1)
    return correlation, error, np.round(slope, 3)


# One row per region, one column per candidate lag
scores_shape = (len(boxes), len(lags))

r_train = np.zeros(scores_shape)
rms_train = np.zeros(scores_shape)
slope_train = np.zeros(scores_shape)

r_test = np.zeros(scores_shape)
rms_test = np.zeros(scores_shape)
slope_test = np.zeros(scores_shape)

for i in tqdm(range(0, len(boxes))):

    # Data of region i (the same for every lag)
    train_inputs, train_targets = inputs[:, :, :, i], targets[:, :, i]
    test_inputs, test_targets = inputs_test[:, :, :, i], targets_test[:, :, i]

    for j, lag in enumerate(lags):

        regr, scaler_inputs, scaler_targets, smoother = regressor(train_inputs, train_targets, lag)

        # Skill on the data the model was fitted on
        predictions = scaling(regr, train_inputs, scaler_inputs, train_targets, scaler_targets, smoother)
        r_train[i, j], rms_train[i, j], slope_train[i, j] = _fit_metrics(train_targets, predictions)

        # Skill on the held-out years, reusing the fitted scalers and smoother
        predictions_test = scaling(regr, test_inputs, scaler_inputs, test_targets, scaler_targets, smoother)
        r_test[i, j], rms_test[i, j], slope_test[i, j] = _fit_metrics(test_targets, predictions_test)


## Printing

In [None]:
# For every region, report the lag with the highest correlation and the lag
# with the lowest RMSE, for training and for testing
for i, _ in enumerate(boxes):

    region = boxnames[i]

    best_r_train = lags[np.argmax(r_train[i])]
    best_rms_train = lags[np.argmin(rms_train[i])]
    best_r_test = lags[np.argmax(r_test[i])]
    best_rms_test = lags[np.argmin(rms_test[i])]

    print(f'The best correlation coefficient for training for region {region} is with lag: {best_r_train}')
    print(f'The best root mean square error for training for region {region} is with lag: {best_rms_train}')

    print(f'The best correlation coefficient for testing for region {region} is with lag: {best_r_test}')
    print(f'The best root mean square error for testing for region {region} is with lag: {best_rms_test}')

    print('\n')
