# Script to tune the hyperparameters of each model

## Importing

In [None]:
import xarray as xr
import numpy as np

from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

import xgboost as xgb


from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import root_mean_squared_error as rmse


from sklearn.model_selection import cross_val_score

## Datasets Preparation

In [None]:
def datasets_preparation(dataset, dataset2):
    """Flatten the gridded fields into a driver matrix and diatom targets.

    Ten rows are stacked, in this order: the four temperature/salinity
    variables of ``dataset``, the three atmospheric variables of
    ``dataset2``, then the y coordinate, the x coordinate and the day of
    year of every flattened grid point.

    The coordinate rows are laid out with time varying slowest and x
    fastest; this presumably matches variables stored as
    (time_counter, y, x) -- TODO confirm against the input files.

    Parameters
    ----------
    dataset : mapping with ``time_counter``, ``y`` and ``x`` attributes
        Source of the temperature, salinity and ``'Diatom'`` fields.
    dataset2 : mapping
        Source of the solar radiation, wind speed and air temperature
        fields.

    Returns
    -------
    tuple
        ``(drivers, diat, indx)``: the filtered driver matrix (one row per
        driver), the diatom values at the kept positions, and the tuple
        returned by ``np.where`` holding the kept flat indices.
    """
    n_time = len(dataset.time_counter)
    n_y = len(dataset.y)
    n_x = len(dataset.x)

    ocean_names = (
        'Temperature_(0m-15m)',
        'Temperature_(15m-100m)',
        'Salinity_(0m-15m)',
        'Salinity_(15m-100m)',
    )
    atmos_names = (
        'Summation_of_solar_radiation',
        'Mean_wind_speed',
        'Mean_air_temperature',
    )

    rows = [np.ravel(dataset[name]) for name in ocean_names]
    rows.extend(np.ravel(dataset2[name]) for name in atmos_names)
    # Coordinate rows: y (row 7), x (row 8) and day of year (row 9).
    rows.append(np.tile(np.repeat(dataset.y, n_x), n_time))
    rows.append(np.tile(dataset.x, n_time * n_y))
    rows.append(np.repeat(dataset.time_counter.dt.dayofyear, n_x * n_y))
    drivers = np.stack(rows)

    y_row = drivers[7]
    x_row = drivers[8]

    # Keep a column only if no driver is NaN, x > 10, and either x > 100
    # or y < 880.
    no_nan = ~np.isnan(drivers).any(axis=0)
    keep = no_nan & (x_row > 10) & ((x_row > 100) | (y_row < 880))
    indx = np.where(keep)
    kept = indx[0]

    drivers = drivers[:, kept]
    diat = np.ravel(dataset['Diatom'])[kept]

    return (drivers, diat, indx)

## Regressor

In [None]:
def regressor(inputs, targets, random_state=None):
    """Grid-search XGBoost hyperparameters and print the best combination.

    A random 20 % subsample of the data is drawn, min-max scaled, and used
    for a 3-fold cross-validated grid search scored with R^2.

    Parameters
    ----------
    inputs : ndarray of shape (n_features, n_samples)
        Driver matrix with one row per feature; it is transposed here to
        the (n_samples, n_features) layout scikit-learn expects.
    targets : ndarray of shape (n_samples,)
        Target values aligned with the columns of ``inputs``.
    random_state : int or None, optional
        Seed passed to the subsampling split and to the XGBoost model.
        The default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    int
        Always ``0``; the result is reported through ``print``.
    """
    # Hyperparameter grid searched exhaustively.
    params = {
        'eta': [0.1, 0.3],
        'max_depth': [10, 20, 30, 50],
        'subsample': [0.9],
        'colsample_bynode': [1],
    }

    inputs = inputs.transpose()
    X_train, _, y_train, _ = train_test_split(
        inputs, targets, train_size=0.20, random_state=random_state)

    # The scaler used to be fitted on the full input array *after* the
    # split and its output was discarded, so the search ran on unscaled
    # data. Fit it on the subsample that is actually used instead.
    scale = preprocessing.MinMaxScaler()
    X_train = scale.fit_transform(X_train)

    model = xgb.XGBRegressor(random_state=random_state)

    grid_search = GridSearchCV(
        estimator=model, param_grid=params, scoring='r2', cv=3, n_jobs=-1,
        verbose=3, pre_dispatch='2*n_jobs', return_train_score=True)

    grid_search.fit(X_train, y_train)

    print('\n', 'The best parameters are', grid_search.best_params_)

    return 0

## Training 

In [None]:
def _subsample(data, time_step=2, space_step=5):
    """Return ``data`` thinned along ``time_counter``, ``y`` and ``x``.

    Every ``time_step``-th time index is kept, and ``y``/``x`` are indexed
    with ``np.arange(first, last, space_step)`` built from the first and
    last coordinate values.
    """
    # NOTE(review): isel() takes positional indices, but the y/x ranges are
    # built from coordinate *values*; this presumably relies on the
    # coordinates being 0-based grid indices -- confirm against the files.
    return data.isel(
        time_counter=np.arange(0, len(data.time_counter), time_step),
        y=np.arange(data.y[0], data.y[-1], space_step),
        x=np.arange(data.x[0], data.x[-1], space_step))


# Open both input files and thin them to keep the grid search tractable.
ds = _subsample(
    xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_original.nc'))
ds2 = _subsample(
    xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc'))

# Restrict both datasets to the 2007-2020 period.
period = slice('2007', '2020')
dataset = ds.sel(time_counter=period)
dataset2 = ds2.sel(time_counter=period)

# Build the driver matrix / diatom targets and run the grid search.
drivers, diat, _ = datasets_preparation(dataset, dataset2)

regr = regressor(drivers, diat)

## Best parameters for each case (Diatom)

In [None]:
# model = MLPRegressor(alpha=0.001, learning_rate='invscaling', tol=1e-06, epsilon=1e-07, power_t=1)
# model = ExtraTreesRegressor(max_features='sqrt')
# model = GradientBoostingRegressor(criterion='squared_error',learning_rate=0.5,subsample=0.5,min_samples_split=5,min_samples_leaf=6,max_depth=8,max_features='log2')
# model = HistGradientBoostingRegressor(learning_rate=0.5, max_iter=400,max_leaf_nodes=None,min_samples_leaf=200,max_bins=100)
# model = DecisionTreeRegressor(min_samples_leaf=15,min_samples_split=10)
# model = KNeighborsRegressor(leaf_size=10, metric='cityblock', n_neighbors=3, p=1, weights='distance')
# model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.05, max_depth=10, eta=0.1, subsample=0.3, colsample_bytree=0.1,seed=1)