# Script to tune the hyperparameters of the model (Diatom)

## Importing

In [None]:
import numpy as np
import xarray as xr
from tqdm import tqdm

from sklearn import preprocessing
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import root_mean_squared_error as rmse
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Datasets Preparation

In [None]:
def datasets_preparation(dataset, dataset2):
    """Build the driver matrix and the diatom target with NaN samples removed.

    Seven variables are flattened and stacked row-wise: the two temperature
    and two salinity layers from ``dataset``, followed by solar radiation,
    wind speed and air temperature from ``dataset2``. Any sample (column)
    for which at least one driver is NaN is dropped, and the same samples
    are dropped from the flattened ``dataset['Diatom']``.

    Returns
    -------
    drivers : ndarray of shape (7, n_kept)
    diat : ndarray of shape (n_kept,)
    indx : tuple holding one index array with the flattened positions kept
    """
    # (source, variable name) pairs, in the row order of the driver matrix
    sources = (
        (dataset, 'Temperature_(0m-15m)'),
        (dataset, 'Temperature_(15m-100m)'),
        (dataset, 'Salinity_(0m-15m)'),
        (dataset, 'Salinity_(15m-100m)'),
        (dataset2, 'Summation_of_solar_radiation'),
        (dataset2, 'Mean_wind_speed'),
        (dataset2, 'Mean_air_temperature'),
    )
    drivers = np.stack([np.ravel(source[name]) for source, name in sources])

    # Flattened positions where none of the seven drivers is NaN
    has_nan = np.isnan(drivers).any(axis=0)
    indx = np.nonzero(~has_nan)
    keep = indx[0]

    drivers = drivers[:, keep]
    diat = np.ravel(dataset['Diatom'])[keep]

    return drivers, diat, indx

## Regressor

In [None]:
def regressor (inputs, targets):
    """Grid-search KNeighborsRegressor hyperparameters and print the best set.

    Parameters
    ----------
    inputs : array of shape (n_features, n_samples)
        Driver matrix; it is transposed here to (n_samples, n_features)
        before being handed to scikit-learn.
    targets : array of shape (n_samples,)
        Target values aligned with the columns of ``inputs``.

    Returns
    -------
    int
        Always 0; the result of the search is reported through ``print``.

    Notes
    -----
    Only a random 20% of the samples is used for the search. The scaler is
    part of the searched pipeline, so it is refit on the training part of
    every CV fold and the neighbours are computed on standardised features.
    """
    # Tuning of parameters
    params = {'n_neighbors':[3], 'weights':['distance'], 'leaf_size':[10], 'p':[1], 'metric':['cityblock']}

    inputs = inputs.transpose()
    X_train, _, y_train, _ = train_test_split(inputs, targets, train_size=0.20)

    # Scaling must happen inside the estimator: fitting a scaler on the full
    # data and discarding the result (as before) left X_train unscaled.
    pipeline = make_pipeline(preprocessing.StandardScaler(), KNeighborsRegressor())

    # GridSearchCV addresses pipeline parameters as '<step name>__<param>'
    step_prefix = pipeline.steps[-1][0] + '__'
    param_grid = {step_prefix + name: values for name, values in params.items()}

    random_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='r2',
        cv=3, n_jobs=-1, verbose=3, pre_dispatch='2*n_jobs', return_train_score=True)

    random_search.fit(X_train,y_train)

    # Report the parameters under their plain KNeighborsRegressor names
    best_params = {name[len(step_prefix):]: value
        for name, value in random_search.best_params_.items()}

    print('\n', 'The best parameters are', best_params)

    return (0)

## Training 

In [None]:
# Open the model output and the external forcing inputs
ds = xr.open_dataset('/data/ibougoudis/MOAD/files/integrated_model_var_old.nc')
ds2 = xr.open_dataset('/data/ibougoudis/MOAD/files/external_inputs.nc')

# Thin both datasets: every second time step, every fifth point along y and x.
# NOTE(review): isel selects by integer position, while the y/x ranges below
# are built from the coordinate values ds.y[0]..ds.y[-1]; the two agree only
# if those coordinates are 0-based consecutive integers - confirm. The end
# point of each y/x range is exclusive, so the last row/column is never taken.
ds = ds.isel({
    'time_counter': np.arange(0, len(ds.time_counter), 2),
    'y': np.arange(ds.y[0], ds.y[-1], 5),
    'x': np.arange(ds.x[0], ds.x[-1], 5),
})

ds2 = ds2.isel({
    'time_counter': np.arange(0, len(ds2.time_counter), 2),
    'y': np.arange(ds2.y[0], ds2.y[-1], 5),
    'x': np.arange(ds2.x[0], ds2.x[-1], 5),
})

# Restrict both datasets to the years 2007 through 2020
period = slice('2007', '2020')
dataset = ds.sel(time_counter=period)
dataset2 = ds2.sel(time_counter=period)

# The returned index positions are not needed in this cell
drivers, diat, _ = datasets_preparation(dataset, dataset2)

# regr = regressor(drivers, diat)

## Best parameters for each case

In [None]:
# model = MLPRegressor(alpha=0.001, learning_rate='invscaling', tol=1e-06, epsilon=1e-07, power_t=1)
# model = ExtraTreesRegressor(max_features='sqrt')
# model = GradientBoostingRegressor(criterion='squared_error',learning_rate=0.5,subsample=0.5,min_samples_split=5,min_samples_leaf=6,max_depth=8,max_features='log2')
# model = HistGradientBoostingRegressor(learning_rate=0.5, max_iter=400,max_leaf_nodes=None,min_samples_leaf=200,max_bins=100)
# model = DecisionTreeRegressor(min_samples_leaf=15,min_samples_split=10)
# model = KNeighborsRegressor(leaf_size=10, metric='cityblock', n_neighbors=3, p=1, weights='distance')

## Cross validation example

In [None]:

# Cross-validate the tuned MLP on a 20% training split, then score it once
# on the remaining held-out samples.
inputs = drivers.transpose()
targets = diat

X_train, X_test, y_train, y_test = train_test_split(inputs, targets, train_size=0.20)

# Scaling is done by the StandardScaler inside the pipeline, so the raw
# splits are passed through unchanged; scaling X_test by hand as well would
# standardise it twice before prediction.
model = MLPRegressor(alpha=0.001, learning_rate='invscaling', tol=1e-09, epsilon=1e-07)
model = make_pipeline(StandardScaler(), model)
scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1, verbose=3)
# regr = BaggingRegressor(model, n_estimators=12, n_jobs=8).fit(X_train,y_train)

# cross_val_score fits clones of the estimator and leaves `model` itself
# unfitted, so fit it explicitly before predicting.
model.fit(X_train, y_train)

predictions=model.predict(X_test)
RTWO=r2_score(y_test,predictions)
