# Finding the optimal parameters for Historical Linear Regression algorithm

## Importing

In [14]:
import xarray as xr
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xskillscore as xs

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import r_regression

from skfda.representation.grid import FDataGrid
from skfda.ml.clustering import KMeans

from skfda.ml.regression import FPLSRegression

from sklearn.metrics import root_mean_squared_error as rmse

import os
import lzma
import dill

import random

import cmocean.cm as cm
import salishsea_tools.viz_tools as sa_vi
from tqdm import tqdm

np.warnings.filterwarnings('ignore') # For the nan mean warning


## Datasets Preparation

In [15]:
# Creation of the training - testing datasets
def datasets_preparation(dataset, dataset2, name):
    
    indx = np.where((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))
    
    targets = dataset[name].to_numpy().reshape(*dataset[name].to_numpy().shape[:1],-1)

    day = np.repeat(dataset.time_counter.dt.dayofyear, len(dataset.x)*len(dataset.y)) # if needed

    inputs = np.stack([
        dataset2['Summation_of_solar_radiation'].to_numpy().reshape(*dataset2['Summation_of_solar_radiation'].to_numpy().shape[:1],-1),
        dataset2['Mean_wind_speed'].to_numpy().reshape(*dataset2['Mean_wind_speed'].to_numpy().shape[:1],-1),
        dataset2['Mean_air_temperature'].to_numpy().reshape(*dataset2['Mean_air_temperature'].to_numpy().shape[:1],-1),
        dataset2['Latitude'].to_numpy().reshape(*dataset2['Latitude'].to_numpy().shape[:1],-1),
        dataset2['Longitude'].to_numpy().reshape(*dataset2['Longitude'].to_numpy().shape[:1],-1),
        ])

    # Deleting 29 of February
    inputs = np.delete(inputs,indx,axis=1)
    targets = np.delete(targets,indx,axis=0)

    # Splitting in years
    inputs = np.split(inputs,len(np.unique(dataset.time_counter.dt.year)),axis=1)
    targets = np.split(targets,len(np.unique(dataset.time_counter.dt.year)),axis=0)

    # Grouping all the years (amount of days for one year * amount of grid boxes)
    inputs = np.concatenate(inputs,axis=2)
    targets = np.concatenate(targets,axis=1)

    x = np.tile(dataset.x, len(np.unique(dataset.time_counter.dt.year))*len(dataset.y))
    y = np.tile(np.repeat(dataset.y, len(dataset.x)), len(np.unique(dataset.time_counter.dt.year)))

    indx = np.where((~np.isnan(targets).any(axis=0))& (x>10) & ((x>100) | (y<880)))
    inputs = inputs[:,:,indx[0]]
    targets = targets[:,indx[0]]


    return(inputs, targets, indx)


## Regressor

In [16]:
def regressor (inputs, targets, name,i):

    temp_inputs = np.reshape(inputs,(len(inputs),inputs.shape[1]*inputs.shape[2]))
    temp_inputs = temp_inputs.transpose()
    temp_targets = np.ravel(targets)

    # Scaling the inputs
    scaler_inputs = make_column_transformer((StandardScaler(), [0,1,2,3,4]))
    temp_inputs = scaler_inputs.fit_transform(temp_inputs)
    temp_inputs = temp_inputs.transpose()
    inputs = np.reshape(temp_inputs,(len(inputs),inputs.shape[1],inputs.shape[2]))   
    inputs = np.transpose(inputs,axes=(2,1,0))
    
    # Scaling the targets
    scaler_targets = StandardScaler()
    temp_targets = np.expand_dims(temp_targets,-1)
    temp_targets = scaler_targets.fit_transform(temp_targets)
    targets = temp_targets.reshape(targets.shape)

     # Final transformations
    targets = targets.transpose()
    inputs = FDataGrid(data_matrix=inputs, grid_points=np.arange(0,len(targets[0])))
    targets = FDataGrid(data_matrix=targets, grid_points=np.arange(0,len(targets[0])))

    ## Smoothing
    from skfda.representation.basis import VectorValuedBasis,FourierBasis

    # x_basis = np.tile(FourierBasis(domain_range=(0,74),n_basis=10),5)
    # basis = VectorValuedBasis(x_basis)
    # inputs = inputs.to_basis(basis)
    
    # targets = targets.to_basis(FourierBasis(n_basis=10))

    from skfda.ml.regression import HistoricalLinearRegression,LinearRegression,FPCARegression

    model = HistoricalLinearRegression(n_intervals=i,lag=25)
    regr = model.fit(inputs,targets)

    return(regr,scaler_inputs,scaler_targets)


## Scaling

In [17]:
def scaling(regr,inputs,scaler_inputs,targets,scaler_targets):

    # Scaling the inputs
    temp = np.reshape(inputs,(len(inputs),inputs.shape[1]*inputs.shape[2]))
    temp = temp.transpose()
    temp = scaler_inputs.transform(temp)
    temp = temp.transpose()        
    inputs = np.reshape(temp,(len(inputs),inputs.shape[1],inputs.shape[2]))
        
    inputs = np.transpose(inputs,axes=(2,1,0))
    inputs = FDataGrid(data_matrix=inputs, grid_points=np.arange(0,len(targets)))

    predictions = regr.predict(inputs)

    # Post-processing of predictions
    predictions = np.array(predictions.to_grid(np.arange(0,len(targets))).data_matrix)
    predictions = np.squeeze(predictions,2)

    # Scaling the predictions
    temp = np.ravel(predictions)
    temp = np.expand_dims(temp,axis=-1)
    temp = scaler_targets.inverse_transform(temp)
    predictions = temp.reshape(predictions.shape)
    predictions = predictions.transpose()

    return(predictions)


## Plotting (Mean Values)

In [18]:
def plotting_mean_values(dates,mean_targets,mean_predictions,category,units,region):

    years = np.unique(dates.year)
    ticks = [0]
    
    fig, _ = plt.subplots(figsize=(19,5))
    
    mean_targets = np.ma.array(mean_targets)
    mean_predictions = np.ma.array(mean_predictions)

    for year in years[:-1]:
        ticks.append((np.where(dates.year==year)[0][-1]+1))
        mean_targets[(np.where(dates.year==year)[0][-1]+1)] = np.ma.masked
        mean_predictions[(np.where(dates.year==year)[0][-1]+1)] = np.ma.masked

    plt.plot(mean_targets, label = 'targets')
    plt.plot(mean_predictions, label = 'predictions')
    plt.xlabel('Years')
    plt.xticks(ticks,years)
    plt.suptitle('Mean '+category + ' ' +units + ' (15 Feb - 30 Apr) ' + region)
    plt.legend()
    
    fig.show()


## Training

In [19]:
name = 'Diatom'
units = '[mmol m-2]'
category = 'Concentrations'

ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

dataset = ds.sel(time_counter = slice('2007', '2020'))
dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

dataset_test = ds.sel(time_counter = slice('2021', '2024'))
dataset2_test = ds2.sel(time_counter = slice('2021', '2024'))

inputs, targets, indx = datasets_preparation(dataset, dataset2, name)

inputs_test, targets_test, indx = datasets_preparation(dataset_test, dataset2_test, name)



In [23]:
r_train = []
rms_train = []
slope_train = []

r_test = []
rms_test = []
slope_test = []

for i in tqdm(range(5,6)):

    regr,scaler_inputs,scaler_targets = regressor(inputs, targets, name, i)

    # predictions = scaling(regr,inputs,scaler_inputs,targets,scaler_targets)

    # r_train.append(np.corrcoef(np.ravel(targets),np.ravel(predictions))[0][1])
    # rms_train.append(rmse(np.ravel(targets),np.ravel(predictions)))
    # m,_ = np.polyfit(np.ravel(targets), np.ravel(predictions), deg=1)
    # slope_train.append(np.round(m,3))

    predictions = scaling(regr,inputs_test,scaler_inputs,targets_test,scaler_targets)

    test = np.reshape(targets_test,(75,4,1838))
    test2 = np.reshape(predictions,(75,4,1838))
    a = np.mean(test,axis=2)
    b = np.mean(test2,axis=2)
    np.corrcoef(np.ravel(a),np.ravel(b))[0][1]

    r_test.append(np.corrcoef(np.ravel(a),np.ravel(b))[0][1])
    rms_test.append(rmse(np.ravel(a),np.ravel(b)))
    m,_ = np.polyfit(np.ravel(a),np.ravel(b), deg=1)
    slope_test.append(np.round(m,3))


100%|██████████| 1/1 [38:47<00:00, 2327.55s/it]


In [9]:
print('The best correlation coefficient for training is with number of components: ' +str(r_train.index(max(r_train))+1))
print('The best root mean square error for training is with number of components: ' +str(rms_train.index(min(rms_train))+1))

ValueError: max() arg is an empty sequence

In [None]:
print('The best correlation coefficient for testing is with number of components: ' +str(r_test.index(max(r_test))+1))
print('The best root mean square error for testing is with number of components: ' +str(rms_test.index(min(rms_test))+1))

In [24]:
rms_test

[0.05059774095135961]

In [25]:
r_test

[0.9181224502617207]

In [21]:
rms_test

[0.04222890183061277]

In [22]:
r_test

[0.9432235070504909]