In [None]:
from pathlib import Path
import datetime
import pandas as pd
import xarray as xr

In [None]:
# Data locations, relative to the notebook's working directory.
p_dssat = Path('./data/DSSAT')       # DSSAT pickles (model inputs)
p_s1 = Path('./data/Sentinel-1')     # Sentinel-1 pickles (model output)

# Cutoff date: the alignment step keeps only dates strictly before this day.
# (LAI is daily but AMP is not, so the usable dates are the ones both share.)
harvestdate = datetime.date(year=2017, month=10, day=1)

In [None]:
# Load inputs: one pickled DataFrame per DSSAT variable.
input_list = ['LAI', 'SWTD', 'SWTD6', 'CWAD']

# Read every frame first, in the same order as input_list ...
df_list = [pd.read_pickle(p_dssat / "brabant_{}.pkl".format(name)) for name in input_list]

# ... then replace each index by its `.date` values so it can be compared
# with plain `datetime.date` objects such as `harvestdate`.
# assumes the pickled frames carry a DatetimeIndex — TODO confirm
for frame in df_list:
    frame.index = frame.index.date

# Unpack positionally; the order must match input_list.
Brabant_LAI, Brabant_SWTD, Brabant_SWTD6, Brabant_CWAD = df_list

In [None]:
# Load outputs: a single Sentinel-1 variable ('CR') stored as a pickled DataFrame.
# NOTE(review): unlike the inputs, the index is used exactly as stored —
# presumably it already holds datetime.date values; verify before aligning.
output_list = ['CR']
df_list = [pd.read_pickle(p_s1 / "Amp_{}_New.pkl".format(name)) for name in output_list]
Amp_CR_New = df_list[0]

In [None]:
# Align the temporal index: dates present in both LAI and the amplitude data,
# restricted to those strictly before the harvest date.
common_dates = Brabant_LAI.index.intersection(Amp_CR_New.index)
idx_time = common_dates[common_dates < harvestdate]

# Align the spatial index: field IDs (columns) present in both frames.
idx_space = Brabant_LAI.columns.intersection(Amp_CR_New.columns)

print(idx_time)
print(idx_space)


In [None]:
# Create an empty xarray Dataset that only carries the shared coordinates;
# the data variables are attached in the next step.
ds = xr.Dataset(coords=dict(space=idx_space, time=idx_time))
ds

In [None]:
# Attach inputs and outputs as data variables on the ("time", "space") grid.
# The frame order must match `input_list + output_list`, because names and
# frames are paired positionally.
df_list = [Brabant_LAI, Brabant_SWTD, Brabant_SWTD6, Brabant_CWAD, Amp_CR_New]
variable_names = input_list + output_list

ds = ds.assign(
    {
        # Subset each frame to the common dates (rows) and fields (columns).
        name: (("time", "space"), frame.loc[idx_time, idx_space].values)
        for name, frame in zip(variable_names, df_list)
    }
)
ds

In [None]:
# Split training and testing along the "space" dimension (whole fields are
# held out, so no field contributes to both sets).
testing_split = 200  # number of entries along "space" reserved for testing

# Coordinates of the testing fields: the FIRST `testing_split` entries of
# ds.space. The constant is now actually used instead of a duplicated literal.
# NOTE(review): the previous comment said "last n" while the code has always
# taken the first n; the selection is left unchanged here — use
# ds.space[-testing_split:] if the last n was the real intent.
idx_testing = ds.space[:testing_split]

# Training dataset: everything except the testing fields
ds_training = ds.drop_sel(space=idx_testing)

# Testing dataset: only the testing fields
ds_testing = ds.sel(space=idx_testing)

In [None]:
# Build the training arrays: every (space, time) pair becomes one sample.
# Question: discontinuity caused by stacking?
sample_dims = ("space", "time")

# X: one row per sample, one column per entry of input_list.
inputs_stacked = ds_training[input_list].stack(data=sample_dims)
X = inputs_stacked.to_array().transpose("data", "variable").values

# Y: same sample order; squeeze() drops the length-1 "variable" axis that
# results from output_list holding a single name.
outputs_stacked = ds_training[output_list].stack(data=sample_dims)
Y = outputs_stacked.to_array().transpose("data", "variable").values.squeeze()

### GridSearchCV with Dask-ML

In [None]:
# Setup grid search
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR

# Pipeline: min-max scale the features, then fit a support vector regressor.
regSVR = make_pipeline(MinMaxScaler(), SVR())

# Hyper-parameter candidates. The "svr__" prefix routes each parameter to the
# SVR step of the pipeline (make_pipeline names steps after the lowercased class).
kernel = ["poly", "rbf", "sigmoid"]
C = [100, 10, 1, 0.1]
gamma = ["scale"]
grid = {"svr__kernel": kernel, "svr__C": C, "svr__gamma": gamma}

# 4-fold cross-validation repeated twice, with a fixed seed for reproducible folds.
cv = RepeatedKFold(n_splits=4, n_repeats=2, random_state=1)

# Two metrics are recorded; the final estimator is refit using the best "r2".
grid_search = GridSearchCV(
    estimator=regSVR,
    param_grid=grid,
    scoring=["r2", "neg_mean_squared_error"],
    refit="r2",
    cv=cv,
    n_jobs=-1,
)

In [None]:
# Plain fit with the default joblib backend (parallelism comes from the
# n_jobs setting on grid_search). The fitted search object is kept in grid_result.
grid_result = grid_search.fit(X, Y)

In [None]:
# Same grid search, but with joblib dispatching its tasks through Dask.
import joblib
from dask.distributed import Client

# Client() is called without an address.
# NOTE(review): presumably this spins up a local cluster — pass a scheduler
# address here to target a remote one; confirm against the deployment.
client = Client()

# Inside this context joblib uses the 'dask' backend, so the fits requested by
# grid_search are scheduled through the client created above.
with joblib.parallel_backend('dask'):
    grid_search.fit(X, Y)