In [None]:
import motrainer
import numpy as np
import xarray as xr

In [None]:
import dask
# Select dask's "synchronous" scheduler for every computation in this notebook.
# NOTE(review): presumably chosen to keep the example simple to run and debug — confirm.
dask.config.set(scheduler='synchronous')

## Load data

In [None]:
# Open the example Zarr store with xarray and display the resulting dataset.
ds = xr.open_zarr("../example/example1_data.zarr/")
ds

## Split per gridcell

In [None]:
# Ask motrainer whether `ds` can be split; the result is shown as the cell output.
# NOTE(review): the exact criteria are defined by motrainer — see its documentation.
motrainer.is_splitable(ds)

In [None]:
# Split `ds` using "space" as the split identifier and display the result.
# `bags` is used below through .map(), .pluck() and .compute(), i.e. as a dask-bag-like
# collection with one element per split.
bags = motrainer.dataset_split(ds, "space")
bags

In [None]:
# Materialize the bag so its elements can be inspected in the cell output.
bags.compute()

## Train Test Split

In [None]:
def to_dataframe(ds):
    """Return ``ds`` converted through its ``to_dask_dataframe()`` method.

    Thin wrapper so the conversion can be passed to ``bag.map``.
    """
    frame = ds.to_dask_dataframe()
    return frame

def chunk(ds, chunks):
    """Return ``ds`` rechunked by delegating to ``ds.chunk(chunks)``.

    Thin wrapper so the rechunking can be passed to ``bag.map``; ``chunks`` is
    forwarded unchanged.
    """
    rechunked = ds.chunk(chunks)
    return rechunked

In [None]:
# Train/test split: apply motrainer.train_test_split to every bag element,
# splitting on the "time" coordinate at 2016-01-01.
# Each resulting element is indexed below as [0] (train) and [1] (test).
train_test_bags = bags.map(
    motrainer.train_test_split, split={"time": np.datetime64("2016-01-01")}
)

In [None]:
# Both halves of each (train, test) pair go through the same pipeline:
# pick the half, rechunk it along "space", then convert it to a dask dataframe.
space_chunks = {"space": 500}
train_bags, test_bags = (
    train_test_bags.pluck(position).map(chunk, space_chunks).map(to_dataframe)
    for position in (0, 1)
)

## Setup Training

In [None]:
# Grid-search setup: the estimator, pipeline and CV splitter come from
# scikit-learn; the scaler and the search driver come from dask-ml.
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RepeatedKFold

from dask_ml.preprocessing import MinMaxScaler
from dask_ml.model_selection import GridSearchCV

# Pipeline: min-max scaling followed by support vector regression.
regSVR = make_pipeline(MinMaxScaler(), SVR())

# Candidate hyper-parameter values for the SVR step of the pipeline.
kernel = ["poly", "rbf", "sigmoid"]
C = [1, 0.1]
gamma = ["scale"]
grid = {"svr__kernel": kernel, "svr__C": C, "svr__gamma": gamma}

# Repeated K-fold with a fixed seed so the folds are reproducible.
cv = RepeatedKFold(n_splits=4, n_repeats=2, random_state=1)

# Two scores are tracked; the final refit uses "r2".
grid_search = GridSearchCV(
    estimator=regSVR,
    param_grid=grid,
    scoring=["r2", "neg_mean_squared_error"],
    refit="r2",
    cv=cv,
)

## Model Optimization

In [None]:
def optimize(df, grid_search, input_list, output_list):
    """Fit an independent copy of ``grid_search`` on one dataframe.

    The search object passed in is shared by every call made through
    ``bag.map``. Because ``fit`` returns the very object it was called on,
    fitting it in place would make every returned result the same instance,
    holding only the most recent fit. Each call therefore fits its own deep
    copy and leaves ``grid_search`` untouched.

    Parameters
    ----------
    df : dataframe-like
        Table holding the predictor and target columns. Rows with missing
        values are removed with ``df.dropna()`` before fitting.
    grid_search : estimator-like
        Object exposing ``fit(X, y)``; it is copied, never fitted in place.
    input_list : list of str
        Names of the predictor columns.
    output_list : list of str
        Names of the target columns.

    Returns
    -------
    object
        The value returned by ``fit`` on the copied search object (for
        scikit-learn style estimators this is the fitted copy itself).
    """
    import copy  # local import so this cell does not depend on another cell's imports

    df_clean = df.dropna()
    search = copy.deepcopy(grid_search)
    return search.fit(df_clean[input_list], df_clean[output_list])


# Predictor and target column names used for training and evaluation.
# NOTE(review): the list previously contained "BIOMA1" twice, which fed the same
# column to the model twice. "BIOMA2" is assumed to be the intended second
# predictor — confirm that it exists in the dataset.
input_list = ["BIOMA1", "BIOMA2", "TG1", "TG2", "TG3"]
output_list = ["slop"]

# Lazily schedule one grid search per training dataframe; nothing runs until
# the bag is computed.
optimazed_estimators = train_bags.map(
    optimize, grid_search=grid_search, input_list=input_list, output_list=output_list
)

In [None]:
# Trigger the lazy bag: this is where `optimize` actually runs for every element.
optimazed_estimators_realized = optimazed_estimators.compute()

## Save model

In [None]:
# To be replaced by "modelstore"
import pickle

# Write each fitted search object to "model<index>.pickle" in the working directory.
# enumerate() supplies the index directly and avoids shadowing the builtin `id`.
for model_idx, model in enumerate(optimazed_estimators_realized):
    name_model = f"model{model_idx}.pickle"
    with open(name_model, "wb") as f:
        pickle.dump(model, f)

## Model performance evaluation

In [None]:
# Load the models back.
# The number of files is taken from the number of models that were saved, rather
# than a hard-coded count, so it follows the number of bag elements.
# SECURITY: pickle.load can execute arbitrary code; only load files written by
# this notebook.
list_model = []
for model_idx in range(len(optimazed_estimators_realized)):
    with open(f"model{model_idx}.pickle", "rb") as f:
        list_model.append(pickle.load(f))
list_model

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Metric label -> scoring function; every scorer is called as scorer(Y_test, Y_eval).
scorers = {
    "MSE_SVR": mean_squared_error,
    "MAE_SVR": mean_absolute_error,
    "R_2": r2_score,
}

# TODO: this loop needs to be converted into a user-defined function.
# Each model is scored on the test dataframe at the same position in the bag.
list_metrics = []
for model, test_data in zip(list_model, test_bags.compute()):
    test_data = test_data.dropna()
    X_test, Y_test = test_data[input_list], test_data[output_list]
    Y_eval = model.predict(X_test)

    metrics = {label: scorer(Y_test, Y_eval) for label, scorer in scorers.items()}
    list_metrics.append(metrics)