# `DataLoaders` for feeding data into models

In [None]:
import numpy as np
import pandas as pd

import fastai
from fastai.tabular.core import Normalize
from fastai.tabular.core import FillMissing
from fastai.tabular.core import TabularPandas
from fastai.tabular.core import IndexSplitter
from fastcore.transform import Pipeline

from src.setup_logging import *

from vaep.io.datasplits import DataSplits
from vaep.models import ae


# Show which fastai version this notebook is running against.
print(f"fastai version: {fastai.__version__}")

Create data

- train data without missing values
- validation and test data with missing values

Could be adapted to have more or fewer missing values in the training, validation or test data. This setup was chosen because, in the current version, the validation data cannot contain features with missing values which were not also missing in the training data.

In [None]:
# N: number of rows (samples) of the base frame, M: number of columns (features).
N, M = 150, 15

def create_df(N:int, M:int, scaling_factor:float=30.0, prop_na:float=0.0, start_idx:int=0):
    """Create an (N x M) DataFrame of scaled uniform random values.

    Parameters
    ----------
    N : int
        Number of rows (samples).
    M : int
        Number of columns (features).
    scaling_factor : float
        Factor the random values drawn from [0, 1) are multiplied with.
    prop_na : float
        If strictly between 0 and 1, every entry drawn below this value is
        replaced by NaN. Any other value leaves the data complete.
    start_idx : int
        Number of the first sample; rows are labelled
        `sample_<start_idx>` ... `sample_<start_idx + N - 1>`.

    Returns
    -------
    pd.DataFrame
        Frame with index labels `sample_<i>` and column labels `feat_<j>`,
        zero-padded to the number of digits of N and M respectively.
    """
    values = np.random.rand(N, M)

    if 0.0 < prop_na < 1.0:
        # entries drawn below the threshold become missing values
        values = np.where(values < prop_na, np.nan, values)

    values = values * scaling_factor

    width_samples = len(str(N))
    width_feats = len(str(M))
    sample_names = [f'sample_{i:0{width_samples}}' for i in range(start_idx, start_idx+N)]
    feat_names = [f'feat_{i:0{width_feats}}' for i in range(M)]
    return pd.DataFrame(values, index=sample_names, columns=feat_names)

# Training samples: complete data without missing values.
X = create_df(N, M)
# Validation samples: roughly 10% missing values, stacked below the training rows.
# `DataFrame.append` was removed in pandas 2.0, `pd.concat` is the supported equivalent.
X = pd.concat([X, create_df(int(N*0.3), M, prop_na=.1, start_idx=len(X))])

idx_val = X.index[N:] # RandomSplitter could be used, but IndexSplitter is used to show its usage with Tabular

# Test samples: separate frame, numbered after the training and validation samples.
X_test = create_df(int(N*0.3), M, prop_na=.1, start_idx=len(X))

# data = DataSplits.from_folder(folder='data/msinstrument_in_QE4', use_wide_format=True)
data = DataSplits(train_X=X.loc[X.index.difference(idx_val)],
                  val_X=X.loc[idx_val],
                  test_X=X_test)

# Show only the validation rows and columns containing at least one missing value.
data.val_X.loc[data.val_X.isna().any(axis=1), data.val_X.isna().any(axis=0)]

## Collab

## Denoising Autoencoder

In [None]:
from fastai.tabular.core import Normalize
from fastai.tabular.core import FillMissing
from fastai.tabular.core import TabularPandas

In [None]:
from fastai.tabular.core import IndexSplitter

### DataSet `Tabular`

- `fastai.tabular.core.Tabular`


Adding procs / transforms manually

```python
cont_names = list(splits.train_X.columns)
to = TabularPandas(splits.train_X, cont_names=cont_names, do_setup=False)

tf_norm = NORMALIZER()
tf_fillna = FillMissing(add_col=True)

_ = tf_norm.setups(to)  # returns to
_ = tf_fillna.setup(to)
```

Now added in a manual pipeline. See the [opened issue](https://github.com/fastai/fastai/issues/3530) on `Tabular` behaviour.
Setting transformations (procs) in the constructor is somehow not persistent, although very similar code is called.

```
# not entirely empty, but to.procs.fs needs to be populated
type(to.procs), to.procs.fs # __call__, setup, decode, fs
```

In [None]:
# Stack training and validation samples into one frame.
# `DataFrame.append` was removed in pandas 2.0, `pd.concat` is the supported equivalent.
X = pd.concat([data.train_X, data.val_X])

splits = X.index.get_indexer(data.val_X.index) # In Tabular iloc is used, not loc, for splitting: positions are needed
splits = IndexSplitter(splits)(X) # splits are now two lists of integer indices (for iloc)

procs = [Normalize, FillMissing] # defined for reference only, not passed to TabularPandas below

to = TabularPandas(X, procs=None, cont_names=X.columns.to_list(), splits=splits) # to = tabular object
print("Tabular object:", type(to))

In [None]:
from fastcore.transform import Pipeline

from fastcore.basics import store_attr
class FillMissingKeepAll(FillMissing):
    """`FillMissing` variant which stores a fill value for every continuous feature.

    Fill values are computed for all continuous columns of the object passed to
    `setups`, so features without missing values in the training data are
    covered as well, in case they are missing in the validation or test data.
    """
    def setups(self, to):
        # one fill value per continuous column, whether or not it contains missing values
        fill_values = {}
        for name in to.conts.keys():
            fill_values[name] = self.fill_strategy(to[name], self.fill_vals[name])
        store_attr(but='to', na_dict=fill_values)
        # afterwards only the name of the strategy is kept on the instance
        self.fill_strategy = self.fill_strategy.__name__
        
# Build the tabular object without procs (instead of [Normalize, FillMissing]);
# the transforms are set up manually afterwards.
to = TabularPandas(
    X,
    procs=None,
    cont_names=X.columns.to_list(),
    splits=splits,
)  # to = tabular object

# Pipeline of the transforms, set up on the tabular object.
procs = Pipeline([Normalize, FillMissingKeepAll])
procs.setup(to)

print("Tabular object:", type(to))
to.items.head()

Test data with procs

#### Transform test data

In [None]:
# Wrap the test data in its own tabular object (no procs, no splits) ...
to_test = TabularPandas(
    data.test_X,
    procs=None,
    cont_names=data.test_X.columns.to_list(),
    splits=None,
    do_setup=True,
)
# ... and apply the pipeline set up before; the return value is not needed.
_ = procs(to_test) # inplace operation
to_test.items.head()

#### DataLoader

In [None]:
# Create the dataloaders from the tabular object using a batch size of 4
# and display one batch.
dls = to.dataloaders(bs=4)
dls.show_batch()

#### Feeding one batch to the model

In [None]:
# Fetch a single batch. Judging by the names, the three parts are the categorical
# inputs, the continuous inputs and the targets -- TODO confirm against fastai docs.
cats, conts, ys =  dls.one_batch()

In [None]:
# Autoencoder for M input features; n_neurons is set to half the number of
# features (truncated to an int).
model = ae.Autoencoder(
    n_features=M,
    n_neurons=int(M/2),
    last_decoder_activation=None,
    dim_latent=10,
)
model

The forward pass just uses the conts features

In [None]:
# Forward pass of the model on the continuous features of the batch.
model(conts)

#### target
- The missing puzzle piece is a `callable` y-block which transforms part of the input. In principle it could be the same as the continuous features.

### PyTorch Dataset

## Variational Autoencoder

## FastAi version