# Assessing the performance of the model

## Importing

In [16]:
import xarray as xr
import numpy as np
import pandas as pd


from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

from skfda.representation.grid import FDataGrid
from skfda.representation.basis import FourierBasis
from skfda.misc.hat_matrix import NadarayaWatsonHatMatrix
from skfda.ml.regression import KernelRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate

from sklearn.metrics import root_mean_squared_error as rmse


## Datasets Preparation

In [17]:
def datasets_preparation(ds, ds2, ref, name):
    """Assemble year-by-year inputs and targets and drop unusable columns.

    For every calendar year found in ``ds.time_counter`` the three drivers
    read from ``ds2`` and the ``y``/``x`` coordinates taken from ``ref`` are
    stacked on a new leading axis, and the variable ``name`` is read from
    ``ds``. The yearly pieces are then joined along their last axis and the
    columns that fail the mask below are removed.

    Parameters
    ----------
    ds
        Dataset holding ``time_counter`` and the target variable ``name``.
    ds2
        Dataset holding 'Summation_of_solar_radiation', 'Mean_wind_speed'
        and 'Mean_air_temperature'.
    ref
        Object exposing ``time_counter``, ``x`` and ``y``; only the three
        lengths and the ``x``/``y`` values are used.
    name
        Key of the target variable in ``ds``.

    Returns
    -------
    inputs
        Array indexed ``[variable, axis 1, column]`` with five variables
        (three drivers, then y, then x).
    targets
        Array indexed ``[axis 1, column]``.
    indx
        The tuple returned by ``np.where`` for the retained columns.
    """
    years = np.unique(ds.time_counter.dt.year)

    n_time = len(ref.time_counter)
    n_x = len(ref.x)
    n_y = len(ref.y)

    # The coordinate layout does not depend on the year, so build the flat
    # per-year coordinate vectors once instead of on every iteration.
    # assumes the stacked cell axis is ordered x-major (x repeated, y tiled),
    # as a stack over ('x', 'y') would produce -- TODO confirm
    y_per_year = np.tile(ref.y, n_time * n_x)
    x_per_year = np.tile(np.repeat(ref.x, n_y), n_time)

    yearly_inputs = []
    yearly_targets = []

    for year in years:

        dataset = ds.sel(time_counter=str(year))
        dataset2 = ds2.sel(time_counter=str(year))

        solar = dataset2['Summation_of_solar_radiation'].to_numpy()

        # The coordinates are reshaped to the driver's shape so that all five
        # variables can be stacked together.
        yearly_inputs.append(np.stack([
            solar,
            dataset2['Mean_wind_speed'].to_numpy(),
            dataset2['Mean_air_temperature'].to_numpy(),
            y_per_year.reshape(solar.shape),
            x_per_year.reshape(solar.shape),
            ]))

        yearly_targets.append(dataset[name].to_numpy())

    # Grouping all the years (columns of consecutive years side by side)
    inputs = np.concatenate(yearly_inputs, axis=2)
    targets = np.concatenate(yearly_targets, axis=1)

    # One coordinate pair per column of the concatenated arrays
    y = np.tile(ref.y, len(years) * n_x)
    x = np.tile(np.repeat(ref.x, n_y), len(years))

    # Keep columns whose target has no NaN along axis 0 and whose coordinates
    # pass the thresholds.
    # NOTE(review): the x/y thresholds presumably exclude part of the domain;
    # their rationale is not visible in this file.
    indx = np.where((~np.isnan(targets).any(axis=0)) & (x > 10) & ((x > 100) | (y < 880)))
    inputs = inputs[:, :, indx[0]]
    targets = targets[:, indx[0]]

    return (inputs, targets, indx)


## Scaling (Training)

In [18]:
def scaling_train(inputs,targets):
    """Fit the input and target scalers and return the scaled training data.

    ``inputs`` is indexed ``[variable, axis 1, axis 2]``. Variables 0-2 are
    min-max scaled; the remaining variables are discretised into 155 uniform
    ordinal bins. ``targets`` is min-max scaled with a single scaler fitted
    on all of its values.

    Returns
    -------
    tuple
        ``(inputs, scaler_inputs, targets, scaler_targets)`` where the scaled
        inputs have their axes reversed to ``[axis 2, axis 1, variable]`` and
        the scaled targets are transposed.
    """
    n_vars = len(inputs)
    n_rows, n_cols = inputs.shape[1], inputs.shape[2]

    # The column transformer works on a 2-D table with one column per
    # variable, so flatten the two trailing axes and put variables last.
    as_table = np.reshape(inputs, (n_vars, n_rows * n_cols)).transpose()

    scaler_inputs = make_column_transformer(
        (MinMaxScaler(), [0, 1, 2]),
        remainder=KBinsDiscretizer(n_bins=155, encode='ordinal', strategy='uniform'),
    )
    scaled_table = scaler_inputs.fit_transform(as_table).transpose()

    # Back to the original cube, then reverse the axis order.
    cube = np.reshape(scaled_table, (n_vars, n_rows, n_cols))
    inputs = np.transpose(cube, axes=(2, 1, 0))

    # A single scaler for every target value: fit on one long column.
    scaler_targets = MinMaxScaler()
    target_column = np.expand_dims(np.ravel(targets), -1)
    scaled_column = scaler_targets.fit_transform(target_column)
    targets = scaled_column.reshape(targets.shape).transpose()

    return (inputs, scaler_inputs, targets, scaler_targets)


## Scaling (Testing)

In [19]:
def scaling_test(regr,inputs,scaler_inputs,targets,scaler_targets):
    """Scale raw inputs, predict with ``regr`` and undo the target scaling.

    Parameters
    ----------
    regr
        Fitted estimator exposing ``predict``.
    inputs
        Unscaled array indexed ``[variable, axis 1, axis 2]``.
    scaler_inputs
        Fitted transformer applied to the inputs.
    targets
        Only ``len(targets)`` is used, to build the grid points.
    scaler_targets
        Fitted scaler whose ``inverse_transform`` is applied to the
        predictions.

    Returns
    -------
    The inverse-scaled predictions, transposed.
    """
    n_vars = len(inputs)
    n_rows, n_cols = inputs.shape[1], inputs.shape[2]

    # Same table layout the input scaler was fitted on: variables as columns.
    as_table = np.reshape(inputs, (n_vars, n_rows * n_cols)).transpose()
    scaled_table = scaler_inputs.transform(as_table).transpose()

    # Rebuild the cube, reverse its axes and wrap it for the regressor.
    cube = np.reshape(scaled_table, (n_vars, n_rows, n_cols))
    curves = FDataGrid(
        data_matrix=np.transpose(cube, axes=(2, 1, 0)),
        grid_points=np.arange(0, len(targets)),
    )

    raw_predictions = regr.predict(curves)

    # The target scaler was fitted on a single column, so flatten, invert and
    # restore the prediction shape before transposing.
    column = np.expand_dims(np.ravel(raw_predictions), axis=-1)
    restored = scaler_targets.inverse_transform(column).reshape(raw_predictions.shape)

    return (restored.transpose())


## Regressor (Training with all)

In [20]:
def regressor(inputs, targets, table, *, return_scalers=False):
    """Fit the kernel regressor on all samples and score it on the same data.

    Writes r, rms and slope into column 0 of ``table`` (rows 0, 1 and 2).

    Parameters
    ----------
    inputs, targets
        Unscaled arrays as returned by ``datasets_preparation``.
    table
        2-D array updated in place.
    return_scalers
        When true, also return the fitted input and target scalers, which
        ``evaluation`` requires. Defaults to False, so existing callers still
        receive only the estimator.

    Returns
    -------
    The fitted estimator, or ``(regr, scaler_inputs, scaler_targets)`` when
    ``return_scalers`` is true.
    """
    # Keep the raw arrays under their own names: scaling_test() rescales its
    # inputs itself and needs the unscaled data, not the FDataGrid.
    scaled_inputs, scaler_inputs, scaled_targets, scaler_targets = scaling_train(inputs, targets)

    # Final transformations
    curves = FDataGrid(data_matrix=scaled_inputs, grid_points=np.arange(0, len(scaled_targets[0])))

    kernel_estimator = NadarayaWatsonHatMatrix(bandwidth=1)
    model = KernelRegression(kernel_estimator=kernel_estimator)
    regr = model.fit(curves, scaled_targets)

    predictions = scaling_test(regr, inputs, scaler_inputs, targets, scaler_targets)

    # Compare in original units and as flat vectors: np.polyfit only accepts
    # a 1-D x, and np.corrcoef on 2-D input would correlate rows instead.
    observed = np.ravel(targets)
    predicted = np.ravel(predictions)

    table[0,0] = np.round(np.corrcoef(predicted, observed)[0][1], 3)
    table[1,0] = rmse(predicted, observed)
    m, _ = np.polyfit(observed, predicted, deg=1)
    table[2,0] = np.round(m, 3)

    if return_scalers:
        return (regr, scaler_inputs, scaler_targets)
    return (regr)


## Regressor (Training with 75%, testing with 25%)

In [21]:
def regressor2(inputs, targets, table, *, return_scalers=False):
    """Fit on a random 75% of the samples and score on both parts.

    Writes r, rms and slope for the training part into column 2 of ``table``
    and for the held-out 25% into column 3. The split is not seeded, so it
    differs between calls.

    Parameters
    ----------
    inputs, targets
        Unscaled arrays as returned by ``datasets_preparation``; samples lie
        on the last axis of both.
    table
        2-D array updated in place.
    return_scalers
        When true, also return the scalers fitted on the training part, which
        ``evaluation`` requires. Defaults to False, so existing callers still
        receive only the estimator.

    Returns
    -------
    The fitted estimator, or ``(regr, scaler_inputs, scaler_targets)`` when
    ``return_scalers`` is true.
    """
    # train_test_split() cuts along axis 0, but the samples are the columns
    # (last axis) of both arrays, so split the column indices instead.
    sample_ids = np.arange(targets.shape[1])
    train_ids, test_ids = train_test_split(sample_ids, test_size=0.25)

    X_train, X_test = inputs[:, :, train_ids], inputs[:, :, test_ids]
    y_train, y_test = targets[:, train_ids], targets[:, test_ids]

    # Scaled copies get their own names; the raw arrays are still needed by
    # scaling_test(), which applies the scalers itself.
    scaled_X, scaler_inputs, scaled_y, scaler_targets = scaling_train(X_train, y_train)

    # Final transformations
    curves = FDataGrid(data_matrix=scaled_X, grid_points=np.arange(0, len(scaled_y[0])))

    kernel_estimator = NadarayaWatsonHatMatrix(bandwidth=1)
    regr = KernelRegression(kernel_estimator=kernel_estimator)
    regr.fit(curves, scaled_y)

    # Metrics use flat vectors in original units: np.polyfit only accepts a
    # 1-D x, and np.corrcoef on 2-D input would correlate rows instead.
    predictions = scaling_test(regr, X_train, scaler_inputs, y_train, scaler_targets)
    observed = np.ravel(y_train)
    predicted = np.ravel(predictions)

    table[0,2] = np.round(np.corrcoef(observed, predicted)[0][1], 3)
    table[1,2] = rmse(observed, predicted)
    m, _ = np.polyfit(observed, predicted, deg=1)
    table[2,2] = np.round(m, 3)

    predictions = scaling_test(regr, X_test, scaler_inputs, y_test, scaler_targets)
    observed = np.ravel(y_test)
    predicted = np.ravel(predictions)

    table[0,3] = np.round(np.corrcoef(observed, predicted)[0][1], 3)
    table[1,3] = rmse(observed, predicted)
    m, _ = np.polyfit(observed, predicted, deg=1)
    table[2,3] = np.round(m, 3)

    if return_scalers:
        return (regr, scaler_inputs, scaler_targets)
    return (regr)


## Cross Validation (4 folds)

In [22]:
def regressor3(inputs, targets, table):
    """Run a 4-fold cross-validation of the kernel regressor.

    Fills ``table`` in place:

    * columns 5-8: per-fold training r (row 0) and rms (row 1);
    * columns 9-12: per-fold test r (row 0) and rms (row 1);
    * column 13: r, rms and slope of the out-of-fold predictions against the
      targets.

    Parameters
    ----------
    inputs, targets
        Unscaled arrays as returned by ``datasets_preparation``.
    table
        2-D array updated in place.
    """
    # NOTE(review): the scalers are fitted on all samples before the folds
    # are formed, so each fold's scaling has seen its own test data.
    scaled_inputs, _, scaled_targets, scaler_targets = scaling_train(inputs, targets)

    # Final transformations
    curves = FDataGrid(data_matrix=scaled_inputs, grid_points=np.arange(0, len(scaled_targets[0])))

    kernel_estimator = NadarayaWatsonHatMatrix(bandwidth=1)
    regr = KernelRegression(kernel_estimator=kernel_estimator)

    scores = cross_validate(regr, curves, scaled_targets, cv=4, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)

    # The folds are scored on min-max scaled targets. Dividing by the
    # scaler's scale_ converts the RMSE back to the targets' original units,
    # matching the other columns of the table.
    to_original_units = 1.0 / scaler_targets.scale_[0]

    # NOTE(review): r is reported here as sqrt(|R^2|), which is not the same
    # quantity as the Pearson r used in the other columns.
    table[0,5:9] = np.round(np.sqrt(np.abs(scores['train_r2'])), 3)
    table[1,5:9] = np.abs(scores['train_neg_root_mean_squared_error']) * to_original_units

    table[0,9:13] = np.round(np.sqrt(np.abs(scores['test_r2'])), 3)
    table[1,9:13] = np.abs(scores['test_neg_root_mean_squared_error']) * to_original_units

    # Overall score from out-of-fold predictions. The estimator is never
    # fitted outside the folds, so it cannot be asked to predict directly.
    scaled_predictions = cross_val_predict(regr, curves, scaled_targets, cv=4)

    # Undo the target scaling (one long column, as the scaler was fitted) and
    # return to the orientation of the raw targets.
    column = np.reshape(scaled_predictions, (-1, 1))
    predictions = scaler_targets.inverse_transform(column).reshape(scaled_predictions.shape).transpose()

    # Flat vectors: np.polyfit only accepts a 1-D x, and np.corrcoef on 2-D
    # input would correlate rows instead.
    observed = np.ravel(targets)
    predicted = np.ravel(predictions)

    table[0,13] = np.round(np.corrcoef(predicted, observed)[0][1], 3)
    table[1,13] = rmse(predicted, observed)
    m, _ = np.polyfit(observed, predicted, deg=1)
    table[2,13] = np.round(m, 3)

## Evaluation (2021-2024)


In [23]:
def evaluation(ds, ds2, regr, scaler_inputs, scaler_targets, name, table, i):
    """Score a fitted regressor on the 2021-2024 period.

    Writes r, rms and slope into column ``i`` of ``table`` (rows 0, 1, 2).

    Parameters
    ----------
    ds, ds2
        Datasets holding the target variable and the drivers; both are
        restricted to 2021-2024 here.
    regr
        Fitted estimator exposing ``predict``.
    scaler_inputs, scaler_targets
        Scalers fitted together with ``regr``.
    name
        Key of the target variable in ``ds``.
    table
        2-D array updated in place.
    i
        Column of ``table`` to fill.
    """
    dataset = ds.sel(time_counter=slice('2021', '2024'))
    dataset2 = ds2.sel(time_counter=slice('2021', '2024'))

    # datasets_preparation() takes the reference grid as its third argument;
    # `ref` is the module-level reference dataset built in the pre-processing
    # cell.
    inputs, targets, _ = datasets_preparation(dataset, dataset2, ref, name)

    predictions = scaling_test(regr, inputs, scaler_inputs, targets, scaler_targets)

    # Flat vectors: np.polyfit only accepts a 1-D x, and np.corrcoef on 2-D
    # input would correlate rows instead.
    observed = np.ravel(targets)
    predicted = np.ravel(predictions)

    table[0,i] = np.round(np.corrcoef(predicted, observed)[0][1], 3)
    table[1,i] = rmse(predicted, observed)
    m, _ = np.polyfit(observed, predicted, deg=1)
    table[2,i] = np.round(m, 3)


## Printing

In [24]:
def printing(table,criteria,categories,metric):
    """Show one results table under its title.

    ``table`` is transposed so that each entry of ``categories`` labels a row
    and each entry of ``criteria`` labels a column; ``metric`` is printed
    first as the title and a blank line follows the table.
    """
    print(metric)
    display(pd.DataFrame(table.transpose(), index=categories, columns=criteria))
    print('\n')
    

## Pre-processing of datasets (run only once!)


In [None]:
# `ds` supplies the target variables and `ds2` the three drivers read by
# datasets_preparation().
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

# Thin both grids to every 5th point along y and x. The index range is built
# from the first and last coordinate values, and np.arange excludes the stop.
# NOTE(review): isel() is positional, so this presumably relies on the x/y
# coordinate values coinciding with array positions -- confirm.
ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
    x=(np.arange(ds.x[0], ds.x[-1], 5)))

ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
    x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

# Reference year, taken before stacking so it still carries separate x and y
# dimensions; datasets_preparation() reads its time_counter, x and y.
ref = ds.sel(time_counter = slice('2007', '2007'))

# Collapse the two horizontal dimensions into one stacked dimension `z`.
ds = ds.stack(z=('x','y'))
ds2 = ds2.stack(z=('x','y'))

# Drop every 29 February. The mask is built from ds and applied to both
# datasets.
# NOTE(review): assumes ds2 shares ds's time axis -- confirm.
indx = ~((ds.time_counter.dt.month==2) & (ds.time_counter.dt.day==29))
ds = ds.sel(time_counter=indx)
ds2 = ds2.sel(time_counter=indx)


## Training

In [None]:
def training(name,table):
    """Fit and score every model variant for one target variable.

    Uses the module-level ``ds``, ``ds2`` and ``ref``. The 2007-2020 period
    is used for fitting; ``table`` is filled in place by the regressor and
    evaluation helpers.

    Parameters
    ----------
    name
        Key of the target variable in ``ds``.
    table
        2-D results array updated in place.
    """
    dataset = ds.sel(time_counter = slice('2007', '2020'))
    dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

    # datasets_preparation() takes the reference grid as its third argument;
    # leaving it out raised a TypeError before any model was fitted.
    inputs, targets, _ = datasets_preparation(dataset, dataset2, ref, name)

    # NOTE(review): evaluation() is declared with eight parameters (it also
    # needs the fitted input and target scalers), but only six are passed in
    # the two calls below. The scalers fitted inside regressor()/regressor2()
    # have to be threaded through to these calls before they can succeed.
    regr = regressor(inputs, targets, table)
    evaluation(ds,ds2,regr,name,table,1)

    regr2 = regressor2(inputs, targets, table)
    evaluation(ds,ds2,regr2,name,table,4)

    regressor3(inputs, targets, table)



## Main Body

In [None]:
# Rows of every results table.
criteria = ['r', 'rms', 'slope']

# Columns of every results table, in the order the helpers fill them:
# 0 regressor, 1 evaluation, 2-3 regressor2, 4 evaluation, 5-13 regressor3.
categories = (
    ['training with 100%', 'testing', 'training with 75%', 'testing with 25%', 'testing']
    + [f'{nth} fold train' for nth in ('1st', '2nd', '3rd', '4th')]
    + [f'{nth} fold test' for nth in ('1st', '2nd', '3rd', '4th')]
    + ['overall cross-val']
)

# One zero-filled table per target variable.
diat = np.zeros((len(criteria), len(categories)))
flag = np.zeros_like(diat)
diat_pr = np.zeros_like(diat)
flag_pr = np.zeros_like(diat)

training('Diatom', diat)
training('Flagellate', flag)
training('Diatom_Production_Rate', diat_pr)
training('Flagellate_Production_Rate', flag_pr)


## Printing (Results)

In [None]:
# Show the four results tables, each under its own title.
for _title, _results in (
    ('Diatom', diat),
    ('Flagellate', flag),
    ('Diatom production rate', diat_pr),
    ('Flagellate production rate', flag_pr),
):
    printing(_results, criteria, categories, _title)
