In [None]:
def setup():
    colab = 'google.colab' in str(get_ipython())
    if not colab:
        print('Please run in Google Colab')
        return

    !rm -fr /content/metatipstar
    !git clone https://your-username:your-token@github.com/BigDataWUR/metatipstar.git
    !pip install --quiet "pytorch-lightning" "audtorch"
    inputdir = '/content/metatipstar/csv'
setup()

In [None]:
import torch
import pandas as pd
import os
import math

# `pl` has to be imported before it is used here: the only other import of
# pytorch_lightning sits in a later cell, so on a freshly started kernel this
# statement failed with NameError.
import pytorch_lightning as pl

# Fixed seed for reproducible runs.
pl.seed_everything(42)

# Soil profile numbers; the position of a number in this list is the integer
# class index stored in the categorical tensor.
soilnr = [
    2010, 2040, 2050, 2060, 2070, 2080, 4010, 4020,
    4031, 4040, 4041, 4050, 4070, 4090, 4130, 4140,
    4160, 8060, 8090, 8101, 8110, 8120, 10010, 10030,
    10061, 10080, 10190, 10191, 10240, 11030, 11040, 11050,
]
# soil profile number -> class index
dict_soilnr = dict(zip(soilnr, range(len(soilnr))))

# Inputs that are stored as one value per time step.
timeseries = ['irradiance', 'irrigation', 'mintemp', 'maxtemp', 'precipitation']
# Inputs that are stored as a class index.
categorical = ['soilprofile']
# Quantity the model is trained to predict.
target_variables = ['max_tuberfreshwt']
# All model inputs: time series first, then the continuous scalars, then the
# categorical input.
feature_variables = [
    *timeseries,
    'baseN', 'sidedressdoy', 'sidedressamount', 'sowdoy', 'maxRootDepthDueToSoil', 'Earliness',
    *categorical,
]

def kaiming_init(model):
    """Re-initialise the parameters of ``model`` in place.

    * every parameter whose name ends in ``.bias`` is set to zero;
    * every other parameter with exactly three dimensions is drawn from a normal
      distribution with mean 0 and standard deviation ``sqrt(2) / sqrt(shape[1])``;
    * all remaining parameters are left untouched.

    Args:
        model: object exposing ``named_parameters()`` (e.g. a ``torch.nn.Module``).
    """
    for name, param in model.named_parameters():
        if name.endswith(".bias"):
            param.data.fill_(0)
            continue
        if param.dim() != 3:
            # Only three-dimensional weights are re-drawn.
            continue
        param.data.normal_(0, math.sqrt(2) / math.sqrt(param.shape[1]))


def divisors(n):
    """Return the divisors of ``n`` as a sorted list of ints.

    ``1`` and ``n`` are always included; the remaining divisors are found by
    trial division up to ``sqrt(n)``.

    Args:
        n: non-negative integer.

    Returns:
        Sorted list of unique integer divisors.
    """
    divs = {1, n}
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            # Floor division keeps the cofactor an int; true division (`n / i`)
            # would put floats such as 6.0 into the result.
            divs.update((i, n // i))
    return sorted(divs)

def find_batch_size(n, k=500):
    """Return the largest divisor of ``n`` that is strictly smaller than ``k``.

    Using a divisor of the dataset size as batch size means every batch holds the
    same number of examples.

    Args:
        n: number of examples.
        k: exclusive upper bound for the batch size.

    Returns:
        The batch size as an int.

    Raises:
        ValueError: if no divisor of ``n`` is smaller than ``k``.
    """
    candidates = [d for d in divisors(n) if d < k]
    return int(max(candidates))


class NitrogenDataset(torch.utils.data.Dataset):
    """In-memory dataset holding time-series, continuous, categorical and target tensors.

    All four tensors are indexed by example along dimension 0; dimension 1 of
    each tensor must have one entry per name in the matching ``featurenames_*``
    list. The tensors are passed through ``torch.nan_to_num`` before being stored.

    Attributes are the four ``tensor_*`` tensors, the four ``featurenames_*``
    lists and ``experimentids`` (an empty list until a caller fills it in).
    """

    def __init__(self, tensor_timeseries, tensor_continuous, tensor_categorical, tensor_targets,
                 featurenames_timeseries, featurenames_continuous, featurenames_categorical, featurenames_targets):
        # Dimension 1 of every tensor has to line up with its list of names.
        assert tensor_timeseries.shape[1] == len(featurenames_timeseries)
        assert tensor_continuous.shape[1] == len(featurenames_continuous)
        assert tensor_categorical.shape[1] == len(featurenames_categorical)
        assert tensor_targets.shape[1] == len(featurenames_targets)

        self.tensor_timeseries = torch.nan_to_num(tensor_timeseries)
        self.tensor_continuous = torch.nan_to_num(tensor_continuous)
        self.tensor_categorical = torch.nan_to_num(tensor_categorical)
        self.tensor_targets = torch.nan_to_num(tensor_targets)

        self.featurenames_timeseries = featurenames_timeseries
        self.featurenames_continuous = featurenames_continuous
        self.featurenames_categorical = featurenames_categorical
        self.featurenames_targets = featurenames_targets

        # Quick hack to correct flaws in 'sidedressdoy': the whole column is zeroed.
        # A ValueError is raised when 'sidedressdoy' is not a continuous feature.
        sidedress_column = featurenames_continuous.index('sidedressdoy')
        self.tensor_continuous[:, sidedress_column] = 0.0

        self.experimentids = []

    def __len__(self):
        """Return the number of examples in the dataset."""
        return self.tensor_timeseries.shape[0]

    def __getitem__(self, index):
        """Return ``(timeseries, continuous, categorical, targets, index)`` for one example."""
        return (
            self.tensor_timeseries[index],
            self.tensor_continuous[index],
            self.tensor_categorical[index],
            self.tensor_targets[index],
            index,
        )


def create_torch_dataset(num_examples, length_time_series, feature_variables, inputdir, tag):
    """Load the CSV files of one dataset split and wrap them in a ``NitrogenDataset``.

    Files read from ``inputdir``:
      * ``<name>-<tag>.csv.gz`` for every requested time-series feature,
      * ``si-<tag>.csv.gz`` with the scalar (continuous and categorical) inputs,
      * ``response-<tag>.csv.gz`` with the target(s).

    Relies on the module-level ``timeseries``, ``categorical``,
    ``target_variables`` and ``dict_soilnr`` definitions.

    Args:
        num_examples: number of examples (rows) to allocate.
        length_time_series: number of time steps per series.
        feature_variables: names of the input features to load.
        inputdir: directory containing the CSV files.
        tag: split name used in the file names.

    Returns:
        A ``NitrogenDataset``; its ``experimentids`` is filled in when the scalar
        file has an ``exp`` column.
    """
    print(f'create dataset {tag} {num_examples}')

    # Split the requested features into time series, continuous scalars and categoricals.
    requested = set(feature_variables)
    names_timeseries = sorted(requested & set(timeseries))
    names_scalar = sorted(requested - set(names_timeseries))
    names_continuous = sorted(set(names_scalar) - set(categorical))

    # Pre-allocate the tensors that the CSV contents are copied into.
    tensor_timeseries = torch.zeros([num_examples, len(names_timeseries), length_time_series])
    tensor_continuous = torch.zeros([num_examples, len(names_continuous)])
    tensor_categorical = torch.zeros([num_examples, len(categorical)], dtype=int)
    tensor_targets = torch.zeros([num_examples, len(target_variables)])

    # One file per time-series feature; each file fills one channel.
    featurenames_timeseries = []
    for channel, name in enumerate(names_timeseries):
        ds_path = os.path.join(inputdir, f'{name}-{tag}.csv.gz')
        v_data = pd.read_csv(ds_path, sep=',')
        print(f'load {ds_path} {v_data.shape}')
        tensor_timeseries[:, channel, :] = torch.tensor(v_data.values)
        featurenames_timeseries.append(name)

    # Scalar inputs: continuous columns are copied as they are.
    ds_path = os.path.join(inputdir, f'si-{tag}.csv.gz')
    scalar_data = pd.read_csv(ds_path, sep=',')
    print(f'load {ds_path} {scalar_data.shape}')
    continuous_data = scalar_data[names_continuous]
    featurenames_continuous = list(continuous_data.columns)
    tensor_continuous[:, :] = torch.tensor(continuous_data.values)

    # Categorical inputs: among the columns whose name matches the categorical
    # name, take the one with the largest value per row, read the number after
    # the last '_' of its name and translate it to a class index.
    for column, pattern in enumerate(categorical):
        matching = scalar_data.filter(regex=pattern)
        soil_numbers = matching.idxmax(axis=1).map(lambda label: int(str(label).split('_')[-1]))
        class_indices = soil_numbers.map(lambda number: dict_soilnr[number])
        tensor_categorical[:, column] = torch.tensor(class_indices.values.astype(int))

    # Targets: the columns of the response file are renamed to the target names.
    ds_path = os.path.join(inputdir, f'response-{tag}.csv.gz')
    target_data = pd.read_csv(ds_path, sep=',')
    print(f'load {ds_path} {target_data.shape}')
    target_data.columns = target_variables
    target_data = target_data[target_variables]
    featurenames_targets = list(target_data.columns)
    tensor_targets[:, :] = torch.tensor(target_data.values)

    dataset = NitrogenDataset(
        tensor_timeseries, tensor_continuous, tensor_categorical, tensor_targets,
        featurenames_timeseries, featurenames_continuous, categorical, featurenames_targets,
    )
    if 'exp' in scalar_data:
        # The experiment id is the second-to-last piece of 'exp' when split on 'T'.
        dataset.experimentids = scalar_data['exp'].map(lambda value: str(value).split('T')[-2])
    return dataset

In [None]:
# Names of the dataset splits. The first assignment lists every available split
# but is immediately overridden: only 'train' and 'obs' are loaded.
sets = ['train', 'val', 'test', 'obs']
sets = ['train', 'obs']
inputdir = '/content/metatipstar/csv'
# split name -> NitrogenDataset
datasets = {}
for s in sets:
    # The irradiance file is read here only to learn the number of examples
    # (rows) and the length of the time series (columns) of this split.
    ds_path = os.path.join(inputdir,f'irradiance-{s}.csv.gz')
    num_examples, length_timeseries = pd.read_csv(ds_path, sep=',').shape
    dataset = create_torch_dataset(num_examples, length_timeseries, feature_variables, inputdir=inputdir, tag=s)
    datasets[s] = dataset

In [None]:
%%script echo skipping
# Disabled cell: the `%%script echo skipping` magic above hands the cell body to
# `echo` instead of Python, so nothing below is executed.
# Purpose: for every split, plot all time-series channels of one randomly
# chosen example, one subplot per split.
import matplotlib.pyplot as plt
import numpy as np
from random import randrange
fig, axes = plt.subplots(len(sets),1, sharex=True, figsize=(15,20)) #, subplot_kw=dict(box_aspect=1)
for i,s in enumerate(sets):
    ax = axes[i]
    # x holds the time-step index once per channel. The hard-coded 5 has to equal
    # the number of time-series channels (the `timeseries` list has five names).
    x=np.tile(range(datasets[s].tensor_timeseries.shape[2]),(5,1)).T
    sample=randrange(datasets[s].tensor_timeseries.shape[0])
    y=datasets[s].tensor_timeseries[sample,:,:].T
    ax.step(x, y, where='post',label=datasets[s].featurenames_timeseries)
    ax.set_title(f'{s} (i={sample})')
    # NOTE(review): the fixed y-range presumably assumes inputs scaled to [0, 1] - confirm.
    ax.set_ylim((0,1))
fig.legend(datasets['train'].featurenames_timeseries) #, bbox_to_anchor=(0.9,0.9), loc="upper right")
fig.tight_layout()
plt.show()

In [None]:
%%script echo skipping
# Disabled cell (body is handed to `echo` by the `%%script` magic above).
# Purpose: one subplot per time-series channel showing, for every split, the
# mean over all examples at each time step.
import matplotlib.pyplot as plt
import numpy as np
# NOTE(review): `s` is not defined in this cell; it is whatever value a loop in
# a previously executed cell left behind.
nvar = datasets[s].tensor_timeseries.shape[1]
nx = datasets[s].tensor_timeseries.shape[2]
fig, axes = plt.subplots(nvar,1, sharex=True, figsize=(15,20))
for v,v_name in enumerate(datasets['train'].featurenames_timeseries):  
  ax = axes[v]
  # One column per split, one row per time step.
  plot_data = np.zeros((nx,len(sets)))
  for i,s in enumerate(sets):
    y = datasets[s].tensor_timeseries[:,v,:].numpy().squeeze()
    # Average over the examples (axis 0) to get one value per time step.
    y = np.mean(y,axis=0).T
    plot_data[:,i] = y
  x = np.tile(range(nx),(len(sets),1)).T
  ax.step(x, plot_data, where='post',label=s,zorder=0)
  ax.set_ylim((0,1))
  ax.grid()
  ax.set_title(f'{v_name}')
fig.legend(sets) #, bbox_to_anchor=(0.9,0.9), loc="upper right")
fig.tight_layout()
plt.show()

In [None]:
%%script echo skipping
# Disabled cell (body is handed to `echo` by the `%%script` magic above).
# Purpose: per scalar feature and target, compare the value distribution of
# the splits in one histogram.
# NOTE(review): `plt` is used below but not imported in this cell.
import seaborn as sns
import pandas as pd

features = datasets['train'].featurenames_continuous + datasets['train'].featurenames_categorical + datasets['train'].featurenames_targets
 
fig, axes = plt.subplots(len(features),1, figsize=(20,20))
for i,featurename in enumerate(features):
    print(f'{i}/{len(features)} feature: {featurename}')
    ax_left=axes[i]
    df_list = []
    for s in sets:
      # Pick the tensor (categorical, target or continuous) that holds this feature.
      if featurename in categorical:
        f = datasets['train'].featurenames_categorical.index(featurename)
        # Class indices are divided by the number of soil classes before plotting.
        y = datasets[s].tensor_categorical[:,f].detach()/len(dict_soilnr)
      elif featurename in target_variables:
        f = datasets['train'].featurenames_targets.index(featurename)
        y = datasets[s].tensor_targets[:,f].detach()
      else:
        f = datasets['train'].featurenames_continuous.index(featurename)
        y = datasets[s].tensor_continuous[:,f].detach()
      df = pd.DataFrame(data=y,columns=[featurename])
      df['dataset'] = s
      df_list.append(df)
    # Long format: one row per example with its split name in 'dataset'.
    df_long = pd.concat(df_list, sort=False,ignore_index=True)
    sns.histplot(data=df_long, x=featurename, hue='dataset', stat="probability", common_norm=False, ax=ax_left, legend=False, multiple='dodge')
    ax_left.set_xlabel('')
    ax_left.set_title(f'{featurename}')
plt.show()

In [None]:
import pytorch_lightning as pl
from audtorch.metrics.functional import pearsonr
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Factor by which model outputs and targets are multiplied before RMSE is
# computed and before plotting (the plot axes are labelled 'fresh ton/ha').
# NOTE(review): presumably the value used to normalise the target, divided by
# 1000 to change units - confirm where 107827.207557798 comes from.
scale_factor_target = 107827.207557798 / 1000.0

class Model1(pl.LightningModule):
    """Regression network combining time series, continuous scalars and a categorical input.

    Three branches are concatenated and passed to a small fully connected head
    that outputs one value per example:

    * ``cnn_features`` + ``cnn_linear``: 1-D convolutions that reduce the time
      series to a single channel at a quarter of the input length (two average
      poolings with stride 2), followed by a linear layer;
    * ``cnn_scalar``: a linear layer over the continuous features;
    * ``embed``: an embedding of the categorical class index.

    Args:
        n_timeseries: number of time-series channels.
        l_timeseries: number of time steps per series.
        n_continuous: number of continuous scalar features.
        n_categories: number of classes of the categorical input.
        do_log: when True, metrics are reported through ``self.log``.
    """

    def __init__(self, n_timeseries=6, l_timeseries=213, n_continuous=5, n_categories=32, do_log=True):
        super(Model1, self).__init__()
        n_hidden_timeseries = 15
        n_hidden_continuous = 15
        n_embedded = 5
        self.do_log = do_log
        self.cnn_features = nn.Sequential(
            nn.Conv1d(n_timeseries, 3, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(3),
            nn.ReLU(inplace=True),

            nn.AvgPool1d(kernel_size=2, stride=2),
            nn.Conv1d(3, 2, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(2),
            nn.ReLU(inplace=True),
            nn.Conv1d(2, 1, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(1),
            nn.ReLU(inplace=True),
            nn.AvgPool1d(kernel_size=2, stride=2),
        )
        self.cnn_linear = nn.Sequential(
            # After two stride-2 poolings the single remaining channel has
            # floor(l_timeseries / 4) time steps.
            nn.Linear(int(l_timeseries/4), n_hidden_timeseries),
            nn.ReLU(inplace=True),
        )
        self.cnn_scalar = nn.Sequential(
            nn.Linear(n_continuous, n_hidden_continuous),
            nn.ReLU(inplace=True),
        )
        self.embed = nn.Embedding(n_categories, n_embedded)

        self.combine = nn.Sequential(
            nn.BatchNorm1d(n_hidden_timeseries + n_hidden_continuous + n_embedded),
            nn.Linear(n_hidden_timeseries + n_hidden_continuous + n_embedded, 5),
            nn.ReLU(inplace=True),
            nn.Linear(5, 1)
        )

    @staticmethod
    def _gather(step_outputs, key):
        """Concatenate the per-batch tensors stored under ``key`` into one flat vector.

        ``torch.cat`` (instead of ``torch.stack``) also works when the last batch
        is smaller than the others; for equally sized batches the element order
        is the same as stacking and flattening.
        """
        return torch.cat([output[key] for output in step_outputs]).reshape(-1)

    def forward(self, timeseries, continuous, categories):
        """Return one prediction per example (second dimension of size 1)."""
        x1 = self.cnn_features(timeseries)
        x1 = torch.flatten(x1, 1)
        x1 = self.cnn_linear(x1)

        x2 = self.cnn_scalar(continuous)
        # The embedding adds a trailing dimension; dim 1 (one categorical
        # feature per example) is squeezed away.
        x3 = torch.squeeze(self.embed(categories), dim=1)
        x = torch.cat((x1, x2, x3), dim=1)
        x = self.combine(x)
        return x

    def training_step(self, batch, batch_idx):
        """Compute the MSE loss of one batch.

        Returns:
            dict with ``loss`` (attached to the graph), and ``predictions`` and
            ``targets`` for the epoch-end metrics.
        """
        inputs_timeseries, inputs_continuous, inputs_categorical, targets, index = batch
        x = self.forward(inputs_timeseries, inputs_continuous, inputs_categorical)
        loss = F.mse_loss(x, targets)
        if self.do_log:
            self.log("train_loss", loss.item())
        # Predictions are only used for metrics, so they are detached to avoid
        # keeping the autograd graph of every batch alive until the epoch ends.
        return {"loss": loss, "predictions": x.detach(), "targets": targets}

    def training_epoch_end(self, training_step_outputs):
        """Compute Pearson r and rescaled RMSE over all training batches of the epoch.

        Returns:
            dict with ``r``, ``rmse`` and the rescaled, detached ``targets`` and
            ``predictions``. The hand-written ``TrainingEpochLoop`` in this
            notebook consumes this value.

        NOTE(review): Lightning 1.x is documented to expect ``None`` from this
        hook when it is driven by ``Trainer.fit`` - confirm against the installed
        version.
        """
        predictions = self._gather(training_step_outputs, 'predictions')
        targets = self._gather(training_step_outputs, 'targets')
        r = pearsonr(predictions, targets)
        criterion = nn.MSELoss()
        predictions_rescaled = torch.mul(predictions, scale_factor_target)
        targets_rescaled = torch.mul(targets, scale_factor_target)
        rmse = torch.sqrt(criterion(predictions_rescaled, targets_rescaled))
        if self.do_log:
            self.log("R-train", r)
            self.log("rmse-train", rmse)
        return {"r": r.item(), "rmse": rmse.item(), "targets": targets_rescaled.detach(), "predictions": predictions_rescaled.detach()}

    def validation_step(self, batch, batch_idx):
        """Compute the MSE loss of one validation batch and return it with predictions and targets."""
        inputs_timeseries, inputs_continuous, inputs_categorical, targets, index = batch
        x = self.forward(inputs_timeseries, inputs_continuous, inputs_categorical)
        loss = F.mse_loss(x, targets)
        if self.do_log:
            self.log("validation_loss", loss)
        return {"loss": loss, "predictions": x, "targets": targets}

    def validation_epoch_end(self, validation_step_outputs):
        """Log the Pearson r over all validation batches; returns nothing."""
        predictions = self._gather(validation_step_outputs, 'predictions')
        targets = self._gather(validation_step_outputs, 'targets')
        r = pearsonr(predictions, targets)
        if self.do_log:
            self.log("R-val", r)

    def predict_step(self, batch, batch_idx):
        """Return the unscaled ``predictions`` and ``targets`` of one batch as a dict."""
        inputs_timeseries, inputs_continuous, inputs_categorical, targets, index = batch
        x = self.forward(inputs_timeseries, inputs_continuous, inputs_categorical)
        return_values = {
            "predictions": x,
            "targets": targets,
        }
        return return_values

    def test_step(self, batch, batch_idx):
        """Same output as ``predict_step``."""
        return_values = self.predict_step(batch, batch_idx)
        return return_values

    def test_epoch_end(self, test_step_outputs):
        """Return the flattened ``predictions`` and ``groundtruth`` of all test batches."""
        predictions = self._gather(test_step_outputs, 'predictions')
        targets = self._gather(test_step_outputs, 'targets')
        return_values = {
            "predictions": predictions,
            "groundtruth": targets,
        }
        return return_values

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate 1e-3."""
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

# Sanity check: pull one small batch from the training data, derive the input
# dimensions from it and run a single forward pass through a fresh model.
dataloader_train = DataLoader(datasets['train'], batch_size=4, shuffle=True)
inputs_timeseries, inputs_continuous, inputs_categories, targets, indices = next(iter(dataloader_train))
# Dimensions reused by later cells whenever a Model1 is constructed.
min_days = inputs_timeseries.shape[2]
n_channels_timeseries = inputs_timeseries.shape[1]
n_features_continuous = inputs_continuous.shape[1]

model = Model1(n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr))
output = model(inputs_timeseries, inputs_continuous, inputs_categories)

In [None]:
# Model that is fitted on the 'train' split further down in this cell.
model1 = Model1(n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr))

# One DataLoader per split. The batch size is a divisor of the split size, so
# every batch holds the same number of examples; only 'train' is shuffled.
dataloaders = {}
for s in sets:
  # The dataset length is read directly; building a throwaway DataLoader
  # (batch size 1) just to call len() on it gives the same number.
  n_samples = len(datasets[s])
  batch_size = find_batch_size(n_samples)
  print(f'{s}: n={n_samples} batch_size={batch_size}')
  dataloaders[s] = DataLoader(datasets[s], batch_size=batch_size, shuffle=(s == "train"))

# Mount Google Drive so that the TensorBoard logs survive the Colab session.
from google.colab import drive
drive.mount('/content/drive')

log_dir = '/content/drive/MyDrive/WUR/metatipstar/log'
!mkdir -p $log_dir 

%load_ext tensorboard
%tensorboard --logdir $log_dir

# Fit for a single epoch on the 'train' split, validating on the 'obs' split.
from pytorch_lightning import loggers as pl_loggers
logger = pl_loggers.TensorBoardLogger(log_dir, name="synthetic-model")
trainer = pl.Trainer(devices=1, max_epochs=1, logger=logger, accelerator="auto") 
trainer.fit(model=model1, train_dataloaders=dataloaders['train'], val_dataloaders=dataloaders['obs'])

# NOTE(review): saving the checkpoint is commented out, while the next cell loads
# 'synthetic-model.ckpt' from /content/metatipstar/log - presumably a checkpoint
# that ships with the cloned repository; confirm.
#model_path = os.path.join(log_dir,f'synthetic-model.ckpt')
#trainer.save_checkpoint(model_path)

In [None]:
from torch.utils.data import SubsetRandomSampler, Subset, Dataset, DataLoader
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning.loops import Loop
import numpy as np
import logging
import os

# Only let warnings and errors from the "pytorch_lightning" logger through.
logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)

class TrainingEpochLoop(Loop):
    """Hand-written loop that trains ``model`` for one pass over ``dataloader``.

    Each ``advance`` performs one optimisation step on one batch; ``on_run_end``
    hands the collected step outputs to ``model.training_epoch_end`` and returns
    its result.

    Args:
        model: object providing ``training_step(batch, batch_idx)`` (returning a
            dict with a ``'loss'`` tensor) and ``training_epoch_end(outputs)``.
        optimizer: optimizer updating the parameters of ``model``.
        dataloader: iterable of batches that supports ``len()``.
    """

    def __init__(self, model, optimizer, dataloader):
        super().__init__()
        self.model = model
        self.optimizer = optimizer
        self.dataloader = dataloader
        self.batch_idx = 0
        self._results = []

    @property
    def done(self):
        """True once every batch of the dataloader has been processed."""
        return self.batch_idx >= len(self.dataloader)

    def reset(self) -> None:
        """Start a new pass: fresh iterator, batch counter at zero, no stored outputs."""
        self.dataloader_iter = iter(self.dataloader)
        self.batch_idx = 0
        self._results = []

    def advance(self, *args, **kwargs) -> None:
        """Run one optimisation step on the next batch."""
        batch = next(self.dataloader_iter)
        self.optimizer.zero_grad()
        step_output = self.model.training_step(batch, self.batch_idx)
        step_output['loss'].backward()
        self.optimizer.step()
        self._results.append(step_output)
        # Without this increment `done` could never become True and every
        # training_step received batch index 0; the loop could then only end by
        # `next()` running off the end of the iterator.
        self.batch_idx += 1

    def on_run_end(self):
        """Return the epoch summary computed by ``model.training_epoch_end``."""
        return self.model.training_epoch_end(self._results)


def scatterplot(targets, predictions, metrics={}, ax=None):
  """Draw predicted against observed yield on ``ax`` with a grey 1:1 line.

  Args:
    targets: observed values (x axis).
    predictions: predicted values (y axis).
    metrics: shown, converted with ``str``, in the plot title.
    ax: matplotlib axes to draw on; must be supplied by the caller.
  """
  lower = -0.2*scale_factor_target
  upper = scale_factor_target
  diagonal = [lower, upper]

  ax.scatter(targets, predictions, zorder=1)
  ax.set_xlim(0.0, upper)
  # The y axis extends below zero so that negative predictions remain visible.
  ax.set_ylim(lower, upper)
  ax.plot(diagonal, diagonal, color='grey')
  ax.grid()
  ax.set_title(f'{str(metrics)} n={len(predictions)}')
  ax.set_xlabel('Observed yield (fresh ton/ha)')
  ax.set_ylabel('Predicted yield (fresh ton/ha)')

def compute_metrics(targets, predictions):
  """Return ``{'rmse': ..., 'r': ...}`` for the given tensors.

  ``rmse`` is the root of the mean squared error rounded to 2 decimals, ``r``
  the Pearson correlation rounded to 3 decimals.
  """
  mse = nn.MSELoss()(predictions, targets)
  rmse = torch.sqrt(mse).item()
  r = pearsonr(predictions, targets).item()
  return {'rmse': round(rmse, 2), 'r': round(r, 3)}

# Mount Google Drive; the TensorBoard event files of this experiment are written there.
from google.colab import drive
drive.mount('/content/drive')

log_dir = '/content/drive/MyDrive/WUR/metatipstar/log-try'
!mkdir -p $log_dir 


from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter(log_dir=f'{log_dir}/9-June')

%load_ext tensorboard
%tensorboard --logdir $log_dir

num_epochs = 4000
# Checkpoint of the model trained on synthetic data.
model_path = os.path.join('/content/metatipstar/log',f'synthetic-model.ckpt')
model_synthetic = Model1.load_from_checkpoint(model_path, n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr))
model_synthetic.do_log = False

# All further work in this cell uses the 'obs' split.
dataset = datasets['obs']

# Baseline: metrics of the synthetic-data model on the 'obs' split, without any
# further training. They are logged next to the other curves as '*-synthetic'.
test_loader = DataLoader(dataset, batch_size=find_batch_size(len(dataset)))
trainer = pl.Trainer(logger=False, accelerator="auto", enable_model_summary=False, enable_progress_bar=False)
results_synthetic = trainer.predict(model_synthetic, test_loader)
# NOTE(review): only element [0] of the predict output is read. That covers the
# whole split only if the loader yields a single batch - confirm.
predictions = torch.reshape(results_synthetic[0]['predictions'],(-1,))
targets = torch.reshape(results_synthetic[0]['targets'],(-1,))
predictions = torch.mul(predictions, scale_factor_target)
targets = torch.mul(targets, scale_factor_target)
metrics_synthetic = compute_metrics(targets, predictions)

# Experiment ids; each id defines one fold in which that experiment is held out.
expids = ['DRV00', 'DRV96', 'DRV97', 'DRV98', 'DRV99', 'KB009035', 'KB019045', 'KB029055',
 'KB039074', 'KB961083', 'KB971106', 'KB981118', 'KB981119', 'KB981120',
 'KB981121', 'KB991139', 'KB991140', 'KB999019', 'KP009059', 'KP029112',
 'KP039147', 'KP940316', 'KP950000', 'KP960366', 'KP970384', 'KP980407',
 'KP980408', 'KP980411', 'KP990436', 'KP990437', 'KP999038', 'Kp980415',
 'kb009036', 'kb999020', 'kp009060', 'kp999039']

# experiment id -> fold number
dict_exp = {nr : i for i, nr in enumerate(expids)}

# NOTE(review): `untrained_model` is not used anywhere else in this cell.
untrained_model = Model1(n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr))
untrained_model.do_log = False

# Two models (each with its own optimizer) per fold:
#   models[fold]          - trained from a fresh initialisation,
#   models_transfer[fold] - loaded from the synthetic checkpoint, with only the
#                           `combine` head left trainable.
models_transfer = {}
optimizers_transfer = {}
models = {}
optimizers = {}
for fold, id in enumerate(dict_exp):
  models[fold] = Model1(n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr), do_log=False)
  # Re-seeding before the initialisation gives every fold the same random draws.
  torch.random.manual_seed(42)
  kaiming_init(models[fold])
  optimizers[fold] = models[fold].configure_optimizers()
  model_transfer = Model1.load_from_checkpoint(model_path, n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr), do_log=False)
  # Freeze everything, then re-enable gradients for the `combine` head only.
  model_transfer.freeze()
  for param in model_transfer.combine.parameters(): param.requires_grad = True  
  models_transfer[fold] = model_transfer
  optimizers_transfer[fold] = models_transfer[fold].configure_optimizers()


#fig_train, axes_train = plt.subplots(len(expids), num_epochs, sharex = True, sharey = True, figsize=(35,100))

# Leave-one-experiment-out training. In every epoch each fold's two models are
# trained for one pass on all experiments except the fold's own. Every 25th
# epoch the held-out experiment is predicted BEFORE that epoch's training pass,
# and the pooled predictions of all folds are logged to TensorBoard.
for epoch in range(num_epochs):
  do_log = (epoch % 25 == 0)
  if do_log:
    # Held-out predictions/targets of all folds for this logging epoch.
    predictions_storage, targets_storage = [], []
    predictions_transfer_storage, targets_transfer_storage = [], []

  for fold, id in enumerate(dict_exp):

    if do_log:
      # Rows that belong to the held-out experiment of this fold.
      val_idx = np.where(dataset.experimentids == id)[0]
      test_set = Subset(dataset, val_idx)
      test_loader = DataLoader(test_set, batch_size=find_batch_size(len(val_idx)))
    
      trainer = pl.Trainer(logger=False, accelerator="auto", enable_model_summary=False, enable_progress_bar=False)
      test_results = trainer.predict(models[fold], test_loader)
      # NOTE(review): only element [0] of the predict output is read; this covers
      # the whole held-out set only if the loader yields a single batch - confirm.
      preds = torch.mul(test_results[0]['predictions'].cpu(), scale_factor_target)
      targs = torch.mul(test_results[0]['targets'].cpu(), scale_factor_target)
      predictions_storage.append(preds)
      targets_storage.append(targs)

      trainer_transfer = pl.Trainer(logger=False, accelerator="auto", enable_model_summary=False, enable_progress_bar=False)
      test_results_transfer = trainer_transfer.predict(models_transfer[fold], test_loader)
      preds_transfer = torch.mul(test_results_transfer[0]['predictions'].cpu(), scale_factor_target)
      targs_transfer = torch.mul(test_results_transfer[0]['targets'].cpu(), scale_factor_target)
      predictions_transfer_storage.append(preds_transfer)
      targets_transfer_storage.append(targs_transfer)

    # One training pass for both models on every experiment except the held-out one.
    train_idx = np.where(dataset.experimentids != id)[0]
    train_set = Subset(dataset, train_idx)
    train_loader = DataLoader(train_set, batch_size = find_batch_size(len(train_idx)), shuffle = True)
    train_metrics = TrainingEpochLoop(models[fold], optimizers[fold], train_loader).run()
    train_metrics_transfer = TrainingEpochLoop(models_transfer[fold], optimizers_transfer[fold], train_loader).run()

    # Training curves and scatter plots are logged for the first fold only.
    if do_log and fold == 0:
      metrics = compute_metrics(train_metrics['targets'], train_metrics['predictions'])
      metrics_transfer = compute_metrics(train_metrics_transfer['targets'], train_metrics_transfer['predictions'])
      writer.add_scalars("rmse-train", {'rmse':metrics['rmse'], 'rmse-transfer':metrics_transfer['rmse'], 'rmse-synthetic':metrics_synthetic['rmse']}, epoch)
      writer.add_scalars("r-train", {'r':metrics['r'], 'r-transfer':metrics_transfer['r'], 'r-synthetic':metrics_synthetic['r']}, epoch)

      fig_tb, ax_tb = plt.subplots(1,1)
      scatterplot(train_metrics['targets'], train_metrics['predictions'], metrics, ax_tb)
      writer.add_figure("train", fig_tb, epoch)

      fig_tb, ax_tb = plt.subplots(1,1)
      scatterplot(train_metrics_transfer['targets'], train_metrics_transfer['predictions'], metrics_transfer, ax_tb)
      writer.add_figure("train-transfer", fig_tb, epoch)    


  if do_log == False: continue 
  # Pool the held-out predictions of all folds and compute one set of metrics.
  predictions = torch.reshape(torch.cat(predictions_storage),(-1,))
  targets = torch.reshape(torch.cat(targets_storage),(-1,))
  metrics = compute_metrics(targets, predictions)
  metrics['epoch'] = epoch

  predictions_transfer = torch.reshape(torch.cat(predictions_transfer_storage),(-1,))
  targets_transfer = torch.reshape(torch.cat(targets_transfer_storage),(-1,))
  metrics_transfer = compute_metrics(targets_transfer, predictions_transfer)
  metrics_transfer['epoch'] = epoch

  # The synthetic-model baseline is constant; it is logged as a reference line.
  writer.add_scalars("rmse-test", {'rmse':metrics['rmse'], 'rmse-transfer':metrics_transfer['rmse'], 'rmse-synthetic':metrics_synthetic['rmse']}, epoch)
  writer.add_scalars("r-test", {'r':metrics['r'], 'r-transfer':metrics_transfer['r'], 'r-synthetic':metrics_synthetic['r']}, epoch)

  fig_tb, ax_tb = plt.subplots(1,1)
  scatterplot(targets, predictions, metrics, ax_tb)
  writer.add_figure("validation", fig_tb, epoch)

  fig_tb, ax_tb = plt.subplots(1,1)
  scatterplot(targets_transfer, predictions_transfer, metrics_transfer, ax_tb)
  writer.add_figure("validation-transfer", fig_tb, epoch)
  writer.flush()

#fig_train.show()


In [None]:
%%script echo skipping
# Disabled cell (body is handed to `echo` by the `%%script` magic above).
# Purpose: histogram of the rescaled 'obs' targets, and the RMSE obtained when
# the mean target is used as the prediction for every example.
import numpy as np
obs_yield = torch.mul(datasets['obs'].tensor_targets, scale_factor_target).numpy()
fig, ax = plt.subplots(1,1)
plt.hist(obs_yield)
plt.show()
mean_predictions = np.mean(obs_yield)
rmse = np.sqrt(np.mean((mean_predictions-obs_yield)**2))
print(rmse)

In [None]:
%%script echo skipping
# Disabled cell (body is handed to `echo` by the `%%script` magic above).
# Purpose: leave-one-experiment-out cross-validation driven by pl.Trainer. Per
# fold a fresh model is fitted for 500 epochs and evaluated on the held-out
# experiment, next to the model loaded from the synthetic checkpoint.
from torch.utils.data import SubsetRandomSampler
from torch.utils.data import Dataset, DataLoader
from pytorch_lightning import loggers as pl_loggers
import logging
import numpy as np
logging.getLogger("pytorch_lightning").setLevel(logging.WARNING)

dataset = datasets['obs']
model1 = Model1(n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr), do_log=False)

expids = ['DRV00', 'DRV96', 'DRV97', 'DRV98', 'DRV99', 'KB009035', 'KB019045', 'KB029055',
 'KB039074', 'KB961083', 'KB971106', 'KB981118', 'KB981119', 'KB981120',
 'KB981121', 'KB991139', 'KB991140', 'KB999019', 'KP009059', 'KP029112',
 'KP039147', 'KP940316', 'KP950000', 'KP960366', 'KP970384', 'KP980407',
 'KP980408', 'KP980411', 'KP990436', 'KP990437', 'KP999038', 'Kp980415',
 'kb009036', 'kb999020', 'kp009060', 'kp999039']

# experiment id -> fold number
dict_exp = {nr : i for i, nr in enumerate(expids)}

# fold number -> output of trainer.predict on the held-out experiment
results_datamodel = {}
results_synthetic = {}

for fold,id in enumerate(dict_exp):
  train_idx = np.where(dataset.experimentids != id)[0]
  val_idx = np.where(dataset.experimentids ==id)[0]
  print(f'{fold} {id} {len(train_idx)} {len(val_idx)}')
  logger = pl_loggers.TensorBoardLogger(log_dir, name=f'data-model-{fold}')
  # Samplers restrict the loaders to the rows of the respective subset.
  train_sampler = SubsetRandomSampler(train_idx)
  test_sampler = SubsetRandomSampler(val_idx)
  train_loader = DataLoader(dataset, batch_size=find_batch_size(len(train_idx)), sampler=train_sampler)
  test_loader = DataLoader(dataset, batch_size=find_batch_size(len(val_idx)), sampler=test_sampler)
  trainer = pl.Trainer(devices=1, max_epochs=500, logger=logger, accelerator="auto", enable_model_summary=False, enable_progress_bar=False)
  model_cv = Model1(n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr), do_log=False)
  trainer.fit(model=model_cv, train_dataloaders=train_loader, val_dataloaders=test_loader)
  results_datamodel[fold] = trainer.predict(model_cv, test_loader)
  # The synthetic-data model is reloaded from its checkpoint in every fold.
  model_path = os.path.join(log_dir,f'synthetic-model.ckpt')
  model_synthetic = Model1.load_from_checkpoint(model_path, n_timeseries=n_channels_timeseries, l_timeseries=min_days, n_continuous=n_features_continuous, n_categories=len(dict_soilnr))
  results_synthetic[fold] = trainer.predict(model_synthetic, test_loader)


  del model_cv

In [None]:
%%script echo skipping
# Disabled cell (body is handed to `echo` by the `%%script` magic above).
# NOTE(review): `scatterplot` and `compute_metrics` are also defined in an
# active cell earlier in the notebook; this `scatterplot` differs in its y-axis
# range and in how the diagonal is drawn. Running this cell would replace the
# earlier definitions.
def scatterplot(targets, predictions, metrics={}, ax=None):
  """Draw predicted against observed yield on `ax` with a grey diagonal."""
  ax.scatter(targets, predictions, zorder=1)
  ax.set_xlim(0.0, scale_factor_target)
  ax.set_ylim(0.0, scale_factor_target)
  # Diagonal in axes coordinates: from the lower-left to the upper-right corner.
  ax.plot([0, 1], [0, 1], transform=ax.transAxes, color='grey')
  ax.grid()
  ax.set_title(f'{str(metrics)} n={len(predictions)}')
  ax.set_xlabel('Observed yield (fresh ton/ha)')
  ax.set_ylabel('Predicted yield (fresh ton/ha)')

def compute_metrics(targets, predictions):
  """Return a dict with 'rmse' (rounded to 2 decimals) and Pearson 'r' (rounded to 3)."""
  metrics = {}
  criterion = nn.MSELoss()
  metrics['rmse'] = round(torch.sqrt(criterion(predictions, targets)).item(),2)
  metrics['r'] = round(pearsonr(predictions, targets).item(),3)
  return metrics

# Left panel: per-fold fitted models; right panel: the synthetic-data model.
fig, axes = plt.subplots(1, 2, figsize=(20,10))

for i, results in enumerate ([results_datamodel, results_synthetic]):
  ax = axes[i]
  # Only folds numbered below 30 are included.
  results = {key: value for key, value in results.items() if key <30}
  # x[0] is the first batch of a fold's predict output.
  predictions = torch.reshape(torch.cat([x[0]['predictions'] for x in results.values()]),(-1,))
  targets = torch.reshape(torch.cat([x[0]['targets'] for x in results.values()]),(-1,))
  #predictions = torch.reshape(results_synthetic[0]['predictions'],(-1,))
  #targets = torch.reshape(results_synthetic[0]['targets'],(-1,))
  predictions_rescaled = torch.mul(predictions, scale_factor_target)
  targets_rescaled = torch.mul(targets, scale_factor_target)
  metrics = compute_metrics(targets_rescaled, predictions_rescaled)
  scatterplot(targets_rescaled, predictions_rescaled, metrics, ax)

plt.show()