# Investigating training vs. testing performance with k-fold cross-validation - functional approach (issue: testing scores are suspiciously high when the testing data come from the same years as the training data)

## Note: a different regression model (`FPLSRegression`) was chosen here, due to an incompatibility with some sklearn functions

## Importing

In [1]:
import xarray as xr
import numpy as np
import pandas as pd

from sklearn.compose import make_column_transformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from skfda.representation.grid import FDataGrid
from skfda.ml.regression import FPLSRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate

from sklearn.metrics import root_mean_squared_error as rmse


## Datasets Preparation

In [2]:
def datasets_preparation(dataset, dataset2, name):
    """Build the (inputs, targets) arrays used for training and evaluation.

    Every variable is flattened to ``(time, grid boxes)``, the 29th of
    February is removed, and the years are then laid side by side so that
    every (year, grid box) pair becomes one sample (one column).

    Parameters
    ----------
    dataset : dataset holding the target variable ``name`` together with the
        ``time_counter``, ``x`` and ``y`` coordinates.
    dataset2 : dataset holding the five input variables read below.
        NOTE(review): each of them (including 'Latitude' and 'Longitude') is
        assumed to have time as its first axis - confirm against the file.
    name : str, name of the target variable inside ``dataset``.

    Returns
    -------
    inputs : ndarray, shape (5, days per year, kept samples)
    targets : ndarray, shape (days per year, kept samples)
    indx : tuple returned by ``np.where`` with the indices of the kept samples
        (positions in the concatenated year * grid-box axis).

    Notes
    -----
    ``np.split`` requires every year to contribute the same number of time
    steps once the 29th of February has been removed.
    """

    input_names = (
        'Summation_of_solar_radiation',
        'Mean_wind_speed',
        'Mean_air_temperature',
        'Latitude',
        'Longitude',
    )

    def _flatten_space(variable):
        # Keep the leading axis and collapse every remaining axis into one.
        values = variable.to_numpy()
        return values.reshape(values.shape[0], -1)

    n_years = len(np.unique(dataset.time_counter.dt.year))

    # Positions (along the time axis) of every 29th of February
    leap_days = np.where((dataset.time_counter.dt.month==2) & (dataset.time_counter.dt.day==29))

    targets = _flatten_space(dataset[name])
    inputs = np.stack([_flatten_space(dataset2[variable]) for variable in input_names])

    # Deleting 29 of February
    inputs = np.delete(inputs,leap_days,axis=1)
    targets = np.delete(targets,leap_days,axis=0)

    # Splitting in years, then grouping all the years along the sample axis
    # (amount of days for one year * amount of grid boxes)
    inputs = np.concatenate(np.split(inputs,n_years,axis=1),axis=2)
    targets = np.concatenate(np.split(targets,n_years,axis=0),axis=1)

    # x / y coordinate of every sample, in the same order as the flattened
    # (y, x) grid repeated once per year
    x = np.tile(dataset.x, n_years*len(dataset.y))
    y = np.tile(np.repeat(dataset.y, len(dataset.x)), n_years)

    # Keep samples without any missing target value that also lie inside the
    # region selected by the x / y thresholds.
    # NOTE(review): the thresholds 10, 100 and 880 are taken as-is; their
    # geographical meaning is not visible here.
    indx = np.where((~np.isnan(targets).any(axis=0))& (x>10) & ((x>100) | (y<880)))
    inputs = inputs[:,:,indx[0]]
    targets = targets[:,indx[0]]

    return(inputs, targets, indx)


## Scaling (Training)

In [3]:
def scaling_train(inputs,targets):
    """Fit the input and target scalers on the training data and apply them.

    Parameters
    ----------
    inputs : ndarray, shape (features, days, samples)
    targets : ndarray of any shape; all its values are standardised together
        with a single mean / standard deviation.

    Returns
    -------
    inputs : ndarray, shape (samples, days, features), standardised per feature
    scaler_inputs : fitted column transformer (one StandardScaler over all
        feature columns)
    targets : ndarray, same shape as the ``targets`` argument, standardised
    scaler_targets : fitted StandardScaler for the targets
    """

    n_features, n_days, n_samples = inputs.shape

    # Scaling the inputs: one row per (day, sample) pair, one column per feature
    flat = np.reshape(inputs,(n_features,n_days*n_samples)).transpose()
    # Scale every feature that is present instead of a hard-coded list of 5
    scaler_inputs = make_column_transformer((StandardScaler(), list(range(n_features))))
    flat = scaler_inputs.fit_transform(flat).transpose()
    inputs = np.reshape(flat,(n_features,n_days,n_samples))
    # Reorder to (samples, days, features)
    inputs = np.transpose(inputs,axes=(2,1,0))

    # Scaling the targets (as a single column, then back to the original shape)
    scaler_targets = StandardScaler()
    column = np.expand_dims(np.ravel(targets),-1)
    targets = scaler_targets.fit_transform(column).reshape(targets.shape)

    return(inputs,scaler_inputs,targets,scaler_targets)


## Scaling (Testing)

In [4]:
def scaling_test(regr,inputs,scaler_inputs,targets,scaler_targets):
    """Scale raw inputs, predict with ``regr`` and return unscaled predictions.

    Parameters
    ----------
    regr : fitted regressor exposing ``predict``
    inputs : ndarray, shape (features, days, samples), not yet scaled
    scaler_inputs : fitted input scaler (only ``transform`` is used)
    targets : only ``len(targets)`` is used, to build the grid points
    scaler_targets : fitted target scaler (only ``inverse_transform`` is used)

    Returns
    -------
    ndarray : predictions in the original target units, transposed with
        respect to what ``regr.predict`` returns.
    """

    n_features, n_days, n_samples = inputs.shape

    # Scaling the inputs: one row per (day, sample) pair, one column per feature
    flat = np.reshape(inputs,(n_features,n_days*n_samples)).transpose()
    flat = scaler_inputs.transform(flat).transpose()
    scaled = np.reshape(flat,(n_features,n_days,n_samples))

    # Functional representation expects (samples, days, features)
    curves = FDataGrid(
        data_matrix=np.transpose(scaled,axes=(2,1,0)),
        grid_points=np.arange(0,len(targets)),
    )

    predictions = regr.predict(curves)

    # Scaling the predictions back to the original units
    column = np.expand_dims(np.ravel(predictions),axis=-1)
    unscaled = scaler_targets.inverse_transform(column).reshape(predictions.shape)

    return(unscaled.transpose())


## Regressor (Training with all)

In [5]:
def regressor (inputs0, targets0, table):
    """Train on the whole training set and score the fit on that same data.

    Writes the correlation coefficient, RMSE and regression slope of the
    training predictions into column 0 of ``table`` (rows 0, 1 and 2), and
    returns ``(regr, scaler_inputs, scaler_targets)``.
    """

    scaled_inputs,scaler_inputs,scaled_targets,scaler_targets = scaling_train(inputs0,targets0)

    # Final transformations: one row per sample
    scaled_targets = scaled_targets.transpose()
    curves = FDataGrid(data_matrix=scaled_inputs, grid_points=np.arange(0,len(scaled_targets[0])))

    regr = FPLSRegression(n_components=35).fit(curves,scaled_targets)

    # Predictions on the training data itself, in the original units
    predictions = scaling_test(regr,inputs0,scaler_inputs,targets0,scaler_targets)

    predicted = np.ravel(predictions)
    observed = np.ravel(targets0)

    table[0,0] = np.round(np.corrcoef(predicted,observed)[0][1],3)
    table[1,0] = rmse(predicted,observed)
    slope,_ = np.polyfit(observed, predicted, deg=1)
    table[2,0] = np.round(slope,3)

    return(regr,scaler_inputs,scaler_targets)


## Regressor (Training with 75%, testing with 25%)

In [6]:
def regressor2 (inputs0, targets0, table, test_size=0.25, random_state=None):
    """Train on a random subset of the samples and score on the held-out rest.

    Parameters
    ----------
    inputs0 : ndarray, shape (features, days, samples), not yet scaled
    targets0 : ndarray, shape (days, samples), not yet scaled
    table : 2-D array; column 2 receives the training metrics and column 3
        the held-out metrics (rows: r, RMSE, slope).
    test_size : forwarded to ``train_test_split`` (default 0.25, as before).
    random_state : forwarded to ``train_test_split``. The default ``None``
        keeps the previous non-reproducible behaviour; pass an int to get the
        same split on every run.

    Returns
    -------
    (regr, scaler_inputs, scaler_targets)
    """

    # train_test_split works along the first axis, so samples go first
    X_train0, X_test0, y_train0, y_test0 = train_test_split(
        np.transpose(inputs0,(2,0,1)), targets0.transpose(),
        test_size=test_size, random_state=random_state)

    # Back to (features, days, samples) / (days, samples)
    X_train0 = np.transpose(X_train0,(1,2,0))
    y_train0 = y_train0.transpose()

    X_test0 = np.transpose(X_test0,(1,2,0))
    y_test0 = y_test0.transpose()

    # The scalers are fitted on the training part only
    X_train,scaler_inputs,y_train,scaler_targets = scaling_train(X_train0,y_train0)

    # Final transformations
    y_train = y_train.transpose()
    X_train = FDataGrid(data_matrix=X_train, grid_points=np.arange(0,len(y_train[0])))

    model = FPLSRegression(n_components=35)
    regr = model.fit(X_train,y_train)

    # Column 2: training part, column 3: held-out part
    for column, X_part, y_part in ((2, X_train0, y_train0), (3, X_test0, y_test0)):

        predictions = scaling_test(regr,X_part,scaler_inputs,y_part,scaler_targets)

        observed = np.ravel(y_part)
        predicted = np.ravel(predictions)

        table[0,column] = np.round(np.corrcoef(observed,predicted)[0][1],3)
        table[1,column] = rmse(observed,predicted)
        m,_ = np.polyfit(observed,predicted, deg=1)
        table[2,column] = np.round(m,3)

    return(regr,scaler_inputs,scaler_targets)


## Cross Validation (4 folds)

In [7]:
def _fill_metrics(table, column, predictions, observations):
    """Write r, RMSE and regression slope into rows 0-2 of ``table[:, column]``."""
    predicted = np.ravel(predictions)
    observed = np.ravel(observations)
    table[0,column] = np.round(np.corrcoef(predicted,observed)[0][1],3)
    table[1,column] = rmse(predicted,observed)
    slope,_ = np.polyfit(observed, predicted, deg=1)
    table[2,column] = np.round(slope,3)


def regressor3(inputs0, targets0, table, random_state=None):
    """4-fold cross-validation over the samples.

    Fills ``table`` as follows (rows: r, RMSE, slope):
      * columns 5-8  : training metrics of folds 1-4
      * columns 9-12 : testing metrics of folds 1-4
      * column 13    : metrics of the out-of-fold predictions of all samples

    Parameters
    ----------
    inputs0 : ndarray, shape (features, days, samples), not yet scaled
    targets0 : ndarray, shape (days, samples), not yet scaled
    table : 2-D array receiving the metrics
    random_state : forwarded to ``KFold``. The default ``None`` keeps the folds
        random on every run; pass an int for reproducible folds.

    Notes
    -----
    The scalers are fitted on all samples before the folds are formed, so the
    held-out samples of each fold contribute to the scaling statistics.
    """

    inputs,scaler_inputs,targets,scaler_targets = scaling_train(inputs0,targets0)

    # Final transformations
    targets = targets.transpose()
    inputs = FDataGrid(data_matrix=inputs, grid_points=np.arange(0,len(targets[0])))

    model = FPLSRegression(n_components=35)

    kf = KFold(n_splits=4,shuffle=True,random_state=random_state)
    # A shuffling KFold without a fixed random_state yields a different
    # partition every time split() is called. Materialising the folds once
    # guarantees that the overall score (column 13) and the per-fold scores
    # (columns 5-12) are computed on the same partition.
    folds = list(kf.split(targets))

    predictions = cross_val_predict(model, inputs, targets, cv=folds)

    # Scaling the predictions back to the original units
    temp = np.expand_dims(np.ravel(predictions),axis=-1)
    temp = scaler_targets.inverse_transform(temp)
    predictions = temp.reshape(predictions.shape).transpose()

    _fill_metrics(table, 13, predictions, targets0)

    scores = cross_validate(model, inputs, targets, cv=folds, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True, return_estimator=True, return_indices=True)

    for i, regr in enumerate(scores['estimator']):

        indx_train = scores['indices']['train'][i]
        indx_test = scores['indices']['test'][i]

        fold_train = scaling_test(regr,inputs0[:,:,indx_train],scaler_inputs,targets0[:,indx_train],scaler_targets)
        _fill_metrics(table, i+5, fold_train, targets0[:,indx_train])

        fold_test = scaling_test(regr,inputs0[:,:,indx_test],scaler_inputs,targets0[:,indx_test],scaler_targets)
        _fill_metrics(table, i+9, fold_test, targets0[:,indx_test])


## Evaluation (2021-2024)


In [8]:
def evaluation (ds, ds2, regr, scaler_inputs, scaler_targets, name, table, i):
    """Score an already fitted regressor on the 2021-2024 period.

    The correlation coefficient, RMSE and regression slope of the predictions
    against the targets are written into rows 0, 1 and 2 of column ``i`` of
    ``table``. Nothing is returned.
    """

    period = slice('2021', '2024')

    inputs, targets, _ = datasets_preparation(
        ds.sel(time_counter = period),
        ds2.sel(time_counter = period),
        name,
    )

    predictions = scaling_test(regr,inputs,scaler_inputs,targets,scaler_targets)

    predicted = np.ravel(predictions)
    observed = np.ravel(targets)

    table[0,i] = np.round(np.corrcoef(predicted,observed)[0][1],3)
    table[1,i] = rmse(predicted,observed)
    slope,_ = np.polyfit(observed, predicted, deg=1)
    table[2,i] = np.round(slope,3)


## Printing

In [9]:
def printing(table,criteria,categories,metric):
    """Show one results table: the title ``metric`` followed by a DataFrame
    with one row per category and one column per criterion."""

    frame = pd.DataFrame(data=table.T, index=categories, columns=criteria)

    print(metric)
    display(frame)
    print('\n')
    

## Training

In [10]:
def training(name,table,
             targets_file='/data/ibougoudis/MOAD/files/integrated_original.nc',
             inputs_file='/data/ibougoudis/MOAD/files/external_inputs.nc'):
    """Run the three experiments for the target variable ``name``.

    Parameters
    ----------
    name : str, target variable to read from ``targets_file``.
    table : 2-D array filled in place:
        columns 0-1  : training with all 2007-2020 samples / 2021-2024 score
        columns 2-4  : 75% training, 25% held-out / 2021-2024 score
        columns 5-13 : 4-fold cross-validation (see ``regressor3``)
    targets_file, inputs_file : paths of the two netCDF files. The defaults are
        the locations that were previously hard-coded.
    """

    # Both files are closed on exit; every array is extracted inside the block.
    with xr.open_dataset(targets_file) as ds_full, xr.open_dataset(inputs_file) as ds2_full:

        # Keep every 5th grid point in both directions.
        # NOTE(review): coordinate values are used as positional indices here,
        # which presumably relies on x / y being 0-based consecutive integers.
        ds = ds_full.isel(y=(np.arange(ds_full.y[0], ds_full.y[-1], 5)),
            x=(np.arange(ds_full.x[0], ds_full.x[-1], 5)))

        ds2 = ds2_full.isel(y=(np.arange(ds2_full.y[0], ds2_full.y[-1], 5)),
            x=(np.arange(ds2_full.x[0], ds2_full.x[-1], 5)))

        # Training period
        dataset = ds.sel(time_counter = slice('2007', '2020'))
        dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

        inputs, targets, _ = datasets_preparation(dataset, dataset2, name)

        regr,scaler_inputs,scaler_targets = regressor(inputs, targets, table)
        evaluation(ds, ds2, regr, scaler_inputs, scaler_targets, name, table, 1)

        regr2,scaler_inputs,scaler_targets = regressor2(inputs, targets, table)
        evaluation(ds, ds2, regr2, scaler_inputs, scaler_targets, name, table, 4)

        regressor3(inputs, targets, table)


## Main Body

In [11]:
# Rows of every results table
criteria = ['r','rms','slope']

# Columns of every results table, in the order in which training() fills them
categories = [
    'training with 100%', 'testing',
    'training with 75%', 'testing with 25%', 'testing',
    '1st fold train','2nd fold train', '3rd fold train','4th fold train',
    '1st fold test', '2nd fold test', '3rd fold test', '4th fold test',
    'overall cross-val',
]

# One independent (criteria x categories) table per target variable
table_shape = (len(criteria),len(categories))
diat, flag, diat_pr, flag_pr = (np.zeros(table_shape) for _ in range(4))

for variable, results in (
    ('Diatom', diat),
    ('Flagellate', flag),
    ('Diatom_Production_Rate', diat_pr),
    ('Flagellate_Production_Rate', flag_pr),
):
    training(variable, results)


## Printing (Results)

In [12]:
# Show the four results tables, one per target variable
for results, title in (
    (diat, 'Diatom'),
    (flag, 'Flagellate'),
    (diat_pr, 'Diatom production rate'),
    (flag_pr, 'Flagellate production rate'),
):
    printing(results, criteria, categories, title)


Diatom


Unnamed: 0,r,rms,slope
training with 100%,0.823,0.093249,0.677
testing,0.583,0.186537,0.751
training with 75%,0.823,0.093195,0.678
testing with 25%,0.822,0.093507,0.675
testing,0.577,0.185476,0.739
1st fold train,0.824,0.093324,0.678
2nd fold train,0.824,0.093093,0.678
3rd fold train,0.823,0.093052,0.678
4th fold train,0.822,0.093366,0.676
1st fold test,0.821,0.093196,0.683




Flagellate


Unnamed: 0,r,rms,slope
training with 100%,0.788,0.016969,0.62
testing,0.297,0.032863,0.274
training with 75%,0.788,0.016988,0.621
testing with 25%,0.786,0.016941,0.624
testing,0.296,0.032649,0.268
1st fold train,0.788,0.016963,0.621
2nd fold train,0.786,0.017041,0.618
3rd fold train,0.789,0.016925,0.622
4th fold train,0.789,0.016911,0.622
1st fold test,0.785,0.017026,0.612




Diatom production rate


Unnamed: 0,r,rms,slope
training with 100%,0.826,8.521344e-07,0.682
testing,0.446,1.945298e-06,0.475
training with 75%,0.825,8.53698e-07,0.681
testing with 25%,0.828,8.474255e-07,0.683
testing,0.444,1.945483e-06,0.476
1st fold train,0.825,8.530358e-07,0.681
2nd fold train,0.826,8.504693e-07,0.682
3rd fold train,0.825,8.529669e-07,0.681
4th fold train,0.827,8.506161e-07,0.683
1st fold test,0.826,8.514574e-07,0.681




Flagellate production rate


Unnamed: 0,r,rms,slope
training with 100%,0.854,2.088482e-07,0.729
testing,0.45,3.988428e-07,0.401
training with 75%,0.854,2.089309e-07,0.729
testing with 25%,0.854,2.090805e-07,0.732
testing,0.451,3.99792e-07,0.404
1st fold train,0.855,2.088131e-07,0.731
2nd fold train,0.853,2.094207e-07,0.728
3rd fold train,0.854,2.083048e-07,0.729
4th fold train,0.855,2.084455e-07,0.73
1st fold test,0.851,2.094496e-07,0.731




