# Investigating training - testing performance based on k-fold validation (issue with high testing results when testing is from the same years as training)

## Importing

In [11]:
import xarray as xr
import numpy as np
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn.metrics import root_mean_squared_error as rmse


## Datasets Preparation

In [12]:
def datasets_preparation(dataset, dataset2, name):
    
    x = np.tile(dataset.x, len(dataset.time_counter)*len(dataset.y))
    y = np.tile(np.repeat(dataset.y, len(dataset.x)), len(dataset.time_counter))
   
    inputs = np.stack([
        np.ravel(dataset2['Summation_of_solar_radiation']),
        np.ravel(dataset2['Mean_wind_speed']),
        np.ravel(dataset2['Mean_air_temperature']),
        np.ravel(dataset2['Latitude']),
        np.ravel(dataset2['Longitude']),
        np.repeat(dataset.time_counter.dt.dayofyear, len(dataset.x)*len(dataset.y)),
        ])

    targets = np.ravel(dataset[name])
    
    indx = np.where(np.isfinite(targets) & (x>10) & ((x>100) | (y<880)))
    inputs = inputs[:,indx[0]]
    targets = targets[indx[0]]

    inputs = inputs.transpose()

    return(inputs, targets, indx)


## Regressor (Training with all)

In [13]:
def regressor (inputs, targets, table):

    model = TransformedTargetRegressor(regressor=make_pipeline(ColumnTransformer(
        transformers=[('drivers', StandardScaler(), [0,1,2]), ('spatial', KBinsDiscretizer(n_bins=255,encode='ordinal',strategy='quantile'), [3,4])],remainder='passthrough'),
        HistGradientBoostingRegressor(categorical_features=[3,4,5])),
        transformer=StandardScaler())
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(inputs,targets)
    
    predictions = regr.predict(inputs)
    
    table[0,0] = np.round(np.corrcoef(predictions,targets)[0][1],3)
    table[1,0] = rmse(predictions,targets)
    m,_ = np.polyfit(targets, predictions, deg=1)
    table[2,0] = np.round(m,3)

    return(regr)


## Regressor (Training with 75%, testing with 25%)

In [14]:
def regressor2 (inputs, targets, table):

    model = TransformedTargetRegressor(regressor=make_pipeline(ColumnTransformer(
      transformers=[('drivers', StandardScaler(), [0,1,2]), ('spatial', KBinsDiscretizer(n_bins=255,encode='ordinal',strategy='quantile'), [3,4])],remainder='passthrough'),
        HistGradientBoostingRegressor(categorical_features=[3,4,5])),
        transformer=StandardScaler())
    
    X_train, X_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.25)
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4).fit(X_train,y_train)

    predictions = regr.predict(X_train)

    table[0,2] = np.round(np.corrcoef(y_train,predictions)[0][1],3)
    table[1,2] = rmse(y_train,predictions)
    m,_ = np.polyfit(y_train,predictions, deg=1)
    table[2,2] = np.round(m,3)
    
    predictions = regr.predict(X_test)

    table[0,3] = np.round(np.corrcoef(y_test,predictions)[0][1],3)
    table[1,3] = rmse(y_test,predictions)
    m,_ = np.polyfit(y_test,predictions, deg=1)
    table[2,3] = np.round(m,3)

    return(regr)


## Cross Validation (4 folds)

In [15]:
def regressor3 (inputs, targets, table):

    model = TransformedTargetRegressor(regressor=make_pipeline(ColumnTransformer(
        transformers=[('drivers', StandardScaler(), [0,1,2]), ('spatial', KBinsDiscretizer(n_bins=255,encode='ordinal',strategy='quantile'), [3,4])],remainder='passthrough'),
        HistGradientBoostingRegressor(categorical_features=[3,4,5])),
        transformer=StandardScaler())
    
    regr = BaggingRegressor(model, n_estimators=12, n_jobs=4)

    kf = KFold(n_splits=4,shuffle=True)
    predictions = cross_val_predict(regr, inputs, targets, cv=kf)
    scores = cross_validate(regr, inputs, targets, cv=kf, scoring=('r2', 'neg_root_mean_squared_error'), return_train_score=True)

    table[0,5:9] =  np.round(np.sqrt(np.abs(scores['train_r2'])),3)
    table[1,5:9] =  np.abs(scores['train_neg_root_mean_squared_error'])

    table[0,9:13] =  np.round(np.sqrt(np.abs(scores['test_r2'])),3)
    table[1,9:13] =  np.abs(scores['test_neg_root_mean_squared_error'])

    table[0,13] = np.round(np.corrcoef(predictions,targets)[0][1],3)
    table[1,13] = rmse(predictions,targets)
    m,_ = np.polyfit(targets, predictions, deg=1)
    table[2,13] = np.round(m,3)


## Evaluation (2021-2024)

In [16]:
def evaluation (ds,ds2,regr,name, table, i):

    dataset = ds.sel(time_counter = slice('2021', '2024'))
    dataset2 = ds2.sel(time_counter = slice('2021', '2024'))

    inputs, targets, _ = datasets_preparation(dataset, dataset2,name)

    predictions = regr.predict(inputs)

    table[0,i] = np.round(np.corrcoef(predictions,targets)[0][1],3)
    table[1,i] = rmse(predictions,targets)
    m,_ = np.polyfit(targets, predictions, deg=1)
    table[2,i] = np.round(m,3)


## Printing

In [17]:
def printing(table,criteria,categories,metric):

    temp = pd.DataFrame(table.transpose(),columns=criteria,index=categories)
    print(metric)
    display(temp)
    print ('\n')
    

## Training

In [18]:
def training(name,table):

    ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc')
    ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

    ds = ds.isel(y=(np.arange(ds.y[0], ds.y[-1], 5)), 
        x=(np.arange(ds.x[0], ds.x[-1], 5)))

    ds2 = ds2.isel(y=(np.arange(ds2.y[0], ds2.y[-1], 5)), 
        x=(np.arange(ds2.x[0], ds2.x[-1], 5)))

    dataset = ds.sel(time_counter = slice('2007', '2020'))
    dataset2 = ds2.sel(time_counter = slice('2007', '2020'))

    inputs, targets, _ = datasets_preparation(dataset, dataset2, name)

    regr = regressor(inputs, targets, table)
    evaluation(ds,ds2,regr,name,table,1)

    regr2 = regressor2(inputs, targets, table)
    evaluation(ds,ds2,regr2,name,table,4)

    regressor3(inputs, targets, table)


## Main Body

In [19]:
criteria = ['r','rms','slope']
categories = ['training with 100%', 'testing', 'training with 75%', 'testing with 25%', 'testing', '1st fold train','2nd fold train', '3rd fold train','4th fold train',
    '1st fold test', '2nd fold test', '3rd fold test', '4th fold test', 'overall cross-val']

diat = np.zeros((len(criteria),len(categories)))
flag = np.zeros((len(criteria),len(categories)))
diat_pr = np.zeros((len(criteria),len(categories)))
flag_pr = np.zeros((len(criteria),len(categories)))

training('Diatom',diat)
training('Flagellate',flag)
training('Diatom_Production_Rate',diat_pr)
training('Flagellate_Production_Rate',flag_pr)


## Printing (Results)

In [20]:
printing(diat,criteria, categories,'Diatom')
printing(flag,criteria, categories,'Flagellate')
printing(diat_pr,criteria, categories,'Diatom production rate')
printing(flag_pr,criteria, categories, 'Flagellate production rate')


Diatom


Unnamed: 0,r,rms,slope
training with 100%,0.767,0.105834,0.534
testing,0.612,0.130829,0.419
training with 75%,0.766,0.106092,0.532
testing with 25%,0.757,0.107838,0.523
testing,0.613,0.130663,0.418
1st fold train,0.762,0.106132,0.0
2nd fold train,0.764,0.105862,0.0
3rd fold train,0.763,0.10608,0.0
4th fold train,0.763,0.106015,0.0
1st fold test,0.756,0.107445,0.0




Flagellate


Unnamed: 0,r,rms,slope
training with 100%,0.896,0.012297,0.778
testing,0.87,0.014285,0.716
training with 75%,0.895,0.012309,0.778
testing with 25%,0.894,0.012379,0.777
testing,0.87,0.01429,0.716
1st fold train,0.895,0.012294,0.0
2nd fold train,0.895,0.012294,0.0
3rd fold train,0.895,0.012295,0.0
4th fold train,0.895,0.012307,0.0
1st fold test,0.893,0.012414,0.0




Diatom production rate


Unnamed: 0,r,rms,slope
training with 100%,0.91,6.267628e-07,0.807
testing,0.885,7.011508e-07,0.791
training with 75%,0.91,6.261213e-07,0.807
testing with 25%,0.908,6.337237e-07,0.805
testing,0.885,7.004662e-07,0.792
1st fold train,0.91,6.251628e-07,0.0
2nd fold train,0.91,6.265866e-07,0.0
3rd fold train,0.91,6.255357e-07,0.0
4th fold train,0.91,6.253508e-07,0.0
1st fold test,0.907,6.345531e-07,0.0




Flagellate production rate


Unnamed: 0,r,rms,slope
training with 100%,0.916,1.612731e-07,0.818
testing,0.882,1.885658e-07,0.825
training with 75%,0.916,1.61182e-07,0.818
testing with 25%,0.914,1.633268e-07,0.816
testing,0.882,1.888205e-07,0.826
1st fold train,0.916,1.61099e-07,0.0
2nd fold train,0.916,1.611959e-07,0.0
3rd fold train,0.916,1.610699e-07,0.0
4th fold train,0.916,1.613673e-07,0.0
1st fold test,0.913,1.63395e-07,0.0




