## Tasks 
* The concept of a task comes from the meta-learning literature.
* Set of observations 
* Context_sets 
* Target_sets
* Tasks

In [2]:
import logging

logging.captureWarnings(True)

import deepsensor.torch
from deepsensor.data import DataProcessor
from deepsensor.data.sources import get_ghcnd_station_data, get_era5_reanalysis_data, get_earthenv_auxiliary_data, get_gldas_land_mask

import matplotlib.pyplot as plt

# Keeping these settings unchanged lets the loaders reuse pre-downloaded
# cached data instead of fetching it again.
data_range = ("2016-06-25", "2016-06-30")
extent = "europe"
station_var_IDs = ["TAVG", "PRCP"]
era5_var_IDs = [
    "2m_temperature",
    "10m_u_component_of_wind",
    "10m_v_component_of_wind",
]
auxiliary_var_IDs = ["elevation", "tpi"]
cache_dir = "mycache"

# Raw inputs: station observations, ERA5 reanalysis, auxiliary fields and a
# land mask, all requested for the same extent and cached under `cache_dir`.
station_raw_df = get_ghcnd_station_data(
    station_var_IDs,
    extent,
    date_range=data_range,
    cache=True,
    cache_dir=cache_dir,
)
era5_raw_ds = get_era5_reanalysis_data(
    era5_var_IDs,
    extent,
    date_range=data_range,
    cache=True,
    cache_dir=cache_dir,
)
auxiliary_raw_ds = get_earthenv_auxiliary_data(
    auxiliary_var_IDs,
    extent,
    "10KM",  # presumably the grid resolution - confirm against the deepsensor docs
    cache=True,
    cache_dir=cache_dir,
)
land_mask_raw_ds = get_gldas_land_mask(
    extent,
    cache=True,
    cache_dir=cache_dir,
)

# One DataProcessor handles every source. The auxiliary data and the land mask
# are processed together with method="min_max"; the ERA5 and station data use
# the processor's default method.
data_processor = DataProcessor(x1_name="lat", x2_name="lon")
era5_ds = data_processor(era5_raw_ds)
aux_ds, land_mask_ds = data_processor(
    [auxiliary_raw_ds, land_mask_raw_ds],
    method="min_max",
)
station_df = data_processor(station_raw_df)

100%|██████████| 3133/3133 [04:39<00:00, 11.23it/s]


In [7]:
# Display the ERA5 dataset returned by `data_processor`.
era5_ds

In [4]:
# Display the land-mask dataset returned by `data_processor` (processed with method="min_max").
land_mask_ds

In [8]:
# Display the raw station DataFrame from `get_ghcnd_station_data`, i.e. before `data_processor` is applied.
station_raw_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,PRCP,TAVG
time,lat,lon,station,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-06-25,35.0170,-1.4500,AGM00060531,0.0,26.1
2016-06-25,35.1000,-1.8500,AGE00147716,0.0,23.3
2016-06-25,35.1170,36.7500,SYM00040030,,33.7
2016-06-25,35.1670,2.3170,AGM00060514,0.0,31.8
2016-06-25,35.2000,-0.6170,AGM00060520,0.0,30.6
...,...,...,...,...,...
2016-06-30,54.0731,12.3256,GME00123058,0.1,
2016-06-30,37.8500,27.8500,TUM00017234,,30.2
2016-06-30,45.5300,8.6690,ITM00016064,0.3,25.8
2016-06-30,69.9775,23.3581,NOE00133110,0.0,


In [55]:
# (rows, columns) of the raw station DataFrame.
station_raw_df.shape

(16556, 2)

In [37]:
# Display the auxiliary dataset returned by `data_processor` (processed with method="min_max").
aux_ds

In [34]:
from deepsensor.data import TaskLoader

# Three context sets (ERA5, land mask, auxiliary data) and one target set
# (station observations).
task_loader = TaskLoader(context=[era5_ds, land_mask_ds, aux_ds], target=station_df)

# Integer sampling strategies draw that many random observations from each
# set: one integer per context set, in the order passed to TaskLoader, plus one
# for the target set. `seed_override` fixes the random draw so the same task is
# produced on every Restart & Run All.
task = task_loader(
    "2016-06-25",
    context_sampling=[52, 112, 40],
    target_sampling=245,
    seed_override=42,
)

In [35]:
# List the entries stored in the task.
task.keys()

dict_keys(['time', 'ops', 'X_c', 'Y_c', 'X_t', 'Y_t'])

In [36]:
# Printing a task gives a summary that shows array shapes instead of raw values.
print(task)

time: 2016-06-25 00:00:00
ops: []
X_c: [(2, 52), (2, 112), (2, 40)]
Y_c: [(3, 52), (1, 112), (2, 40)]
X_t: [(2, 245)]
Y_t: [(2, 245)]



### Task structure
A Task typically contains at least the following entries:

"time": timestamp that was used for slicing the spatiotemporal data.

"ops": list of processing operations that have been applied to the data (more on this shortly).

"X_c" and "Y_c": length-$N_C$ lists of context set observations $\mathbf{X}^{(c)}_i$ and $\mathbf{Y}^{(c)}_i$, with one list entry per context set.

"X_t" and "Y_t": as above, but for the target sets. In the example above, the target observations are known, so this Task may be used for training.

### Gridded data in tasks.
* Data that lies on a regular grid is given a compact tuple representation for the "X" entries:

In [52]:
# "all" keeps every observation of a context set, so gridded context data stays
# on its grid. The 245 target points are still drawn at random, so
# `seed_override` pins the draw for reproducible re-runs.
task_with_gridded_data = task_loader(
    "2016-06-25",
    context_sampling=["all", "all", "all"],
    target_sampling=245,
    seed_override=42,
)

In [53]:
# Gridded context sets are summarised with a tuple of two coordinate shapes in "X_c".
print(task_with_gridded_data)

time: 2016-06-25 00:00:00
ops: []
X_c: [((1, 141), (1, 221)), ((1, 140), (1, 220)), ((1, 420), (1, 660))]
Y_c: [(3, 141, 221), (1, 140, 220), (2, 420, 660)]
X_t: [(2, 245)]
Y_t: [(2, 245)]



In [54]:
# First element of the coordinate tuple for the first context set
# (presumably the x1 coordinates of the ERA5 grid - confirm).
task_with_gridded_data["X_c"][0][0]

array([[0.6363636 , 0.6318182 , 0.6272727 , 0.6227273 , 0.6181818 ,
        0.6136364 , 0.6090909 , 0.6045455 , 0.6       , 0.5954546 ,
        0.59090906, 0.5863636 , 0.58181816, 0.5772727 , 0.57272726,
        0.5681818 , 0.56363636, 0.5590909 , 0.55454546, 0.55      ,
        0.54545456, 0.5409091 , 0.53636366, 0.5318182 , 0.5272727 ,
        0.52272725, 0.5181818 , 0.51363635, 0.5090909 , 0.50454545,
        0.5       , 0.49545455, 0.4909091 , 0.48636365, 0.48181817,
        0.47727272, 0.47272727, 0.46818182, 0.46363637, 0.45909092,
        0.45454547, 0.45      , 0.44545454, 0.4409091 , 0.43636364,
        0.4318182 , 0.42727274, 0.4227273 , 0.4181818 , 0.41363636,
        0.4090909 , 0.40454546, 0.4       , 0.39545456, 0.3909091 ,
        0.38636363, 0.38181818, 0.37727273, 0.37272727, 0.36818182,
        0.36363637, 0.3590909 , 0.35454544, 0.35      , 0.34545454,
        0.3409091 , 0.33636364, 0.3318182 , 0.3272727 , 0.32272726,
        0.3181818 , 0.31363636, 0.3090909 , 0.30

### Task Methods.
* Removing NaNs
* Adding Batch dimensions.
* These operations will be recorded in the order they were applied in the "ops" entry of the task.


In [5]:
# Chain two task methods: add a batch dimension, then convert the arrays to tensors.
# NOTE(review): the saved output below lists only two context sets, while `task`
# is built with three, and this cell's execution count (In [5]) predates the cell
# that creates `task` (In [34]). Re-run the notebook top to bottom to refresh it.
print(task.add_batch_dim().convert_to_tensor())

time: 2016-06-25 00:00:00
ops: ['batch_dim', 'tensor']
X_c: [torch.Size([1, 2, 52]), torch.Size([1, 2, 112])]
Y_c: [torch.Size([1, 3, 52]), torch.Size([1, 1, 112])]
X_t: [torch.Size([1, 2, 245])]
Y_t: [torch.Size([1, 2, 245])]



### Gridded data can be flattened using `.flatten_gridded_data()`.

In [6]:
# Flatten the gridded context sets; the printed "X_c" shapes become (2, N) per set.
# NOTE(review): the saved output below lists only two context sets, while
# `task_with_gridded_data` is built with three (In [52]); this cell was last run
# earlier (In [6]). Re-run the notebook top to bottom to refresh it.
print(task_with_gridded_data.flatten_gridded_data())

time: 2016-06-25 00:00:00
ops: ['gridded_data_flattened']
X_c: [(2, 31161), (2, 30800)]
Y_c: [(3, 31161), (1, 30800)]
X_t: [(2, 245)]
Y_t: [(2, 245)]

