# FASTAI implementation

In [None]:
from pathlib import Path

from src import config
from src.analyzers import *
from vaep.transform import StandardScaler, get_df_fitted_mean_std

In [None]:
import logging
from src.logging import setup_logger

# Raise the root logger's threshold to CRITICAL and remove its handlers, then
# configure the package logger separately below.
logger = logging.getLogger()  # returns root-logger
logger.setLevel(logging.CRITICAL)  # silence for everything else
logger.handlers = []  # drop every handler currently attached to the root logger


# Rebind `logger` to whatever the project helper `setup_logger` returns for
# the 'vaep' logger; all later `logger.info` calls in this notebook use it.
logger = setup_logger(logger=logging.getLogger('vaep'))
logger.info("Experiment 01")

## Load data

- 1000 features (most abundant peptides)
- later a subset of samples is selected

In [None]:
# Passed as `nrows` to AnalyzePeptides — presumably `None` means "load all
# rows"; TODO confirm against AnalyzePeptides.
N_SAMPLES_TO_LOAD = None
# Location of the peptide intensity table inside the configured data folder.
FN_PEPTIDE_INTENSITIES = config.FOLDER_DATA / 'df_intensities_N07813_M01000'
analysis = AnalyzePeptides(
    fname=FN_PEPTIDE_INTENSITIES, nrows=N_SAMPLES_TO_LOAD, index_col=0)
analysis.df = analysis.df.sort_index()  # sort by date
# Samples are selected via the index further down, so duplicates are rejected here.
assert analysis.df.index.is_unique, "Non-unique training samples"
analysis  # last expression of the cell: display the analysis object

### Select consecutive samples for training

In [None]:
import random

# Number of consecutive samples kept for this experiment; also used as the
# default window length of `get_consecutive_data_indices` and for the split.
N_SAMPLES = 1000
logger.info(f"Selected {N_SAMPLES}")
analysis.N_SAMPLES = N_SAMPLES  # keep the setting on the analysis object


def get_consecutive_data_indices(index, n_samples=N_SAMPLES):
    """Return a randomly placed window of consecutive entries of `index`.

    Parameters
    ----------
    index
        Sliceable sequence supporting `len` (in this notebook the index of
        the intensity DataFrame).
    n_samples : int
        Length of the window; defaults to the notebook-level `N_SAMPLES`.

    Returns
    -------
    The slice `index[start:start + n_samples]`, where `start` is drawn
    uniformly from all start positions that leave room for a full window.

    Raises
    ------
    ValueError
        If `n_samples` is negative or larger than `len(index)`.
    """
    n_total = len(index)
    # Validate up front: without this check `random.randint` fails with an
    # opaque "empty range" error when the window does not fit, and a negative
    # window length would silently produce a meaningless slice.
    if not 0 <= n_samples <= n_total:
        raise ValueError(
            f"n_samples must be between 0 and len(index)={n_total}, "
            f"got {n_samples}")
    # `randint` includes both end points, so the last valid start is allowed.
    start = random.randint(0, n_total - n_samples)
    return index[start:start + n_samples]


# Draw one random window of consecutive samples from the sorted index.
# NOTE(review): no random seed is set in the visible cells, so the selected
# window differs between runs.
indices_selected = get_consecutive_data_indices(analysis.df.index)
analysis.samples = indices_selected  # remember which samples were picked
analysis.df = analysis.df.loc[indices_selected]  # restrict the data to the window

FRACTION = 0.9  # share of the selected samples that goes into the training split


class Indices(SimpleNamespace):
    """Plain attribute container holding the sample indices of each data split."""

# The leading int(FRACTION * N_SAMPLES) selected samples form the training
# split, the remaining ones the validation split.
indices = Indices(
    train=indices_selected[:int(FRACTION * N_SAMPLES)],
    valid=indices_selected[int(FRACTION * N_SAMPLES):],
)
analysis.indices = indices

# Row subsets of the intensity table for both splits.
analysis.df_train = analysis.df.loc[indices.train]
analysis.df_valid = analysis.df.loc[indices.valid]

analysis.df

## Fastai Dataloader

> fastai includes a replacement for Pytorch's DataLoader which is largely API-compatible, and adds a lot of useful functionality and flexibility. Before we look at the class, there are a couple of helpers we'll need to define. [[link](https://docs.fast.ai/data.load.html)]

In [None]:
# import fastai.tabular.all as tab
from fastcore.transform import Transform

from fastai.tabular.data import TabularDataLoaders

Create dataloaders using an appropriate factory method of the `TabularDataLoaders` class, here [`from_df`](https://docs.fast.ai/tabular.data.html#TabularDataLoaders.from_df)

In [None]:
# DataFrame is shuffled
# NOTE(review): the note above presumably refers to shuffling done by the
# training DataLoader; `analysis.df` itself was sorted and then restricted to
# consecutive samples. `valid_idx` below takes the first N_VAL row positions,
# whereas `indices.valid` holds the last samples of the selection — confirm
# which validation split is intended.
N_VAL = 100
valid_idx = list(range(N_VAL))
# All columns are passed as continuous features; no categorical columns,
# no target and no preprocessing procs are configured here.
dls = TabularDataLoaders.from_df(df=analysis.df, valid_idx=valid_idx, bs=64, 
                                 cat_names=None,
                                 cont_names=list(analysis.df.columns),
                                 y_names=None,
                                 procs=None, # add options
                                )
analysis.dls = dls  # keep the DataLoaders on the analysis object

In [None]:
dls.show_batch()  # loses object index attribute

In [None]:
dls.valid.show_batch()  # preview a batch taken from the validation DataLoader

In [None]:
# Print only the first training batch, then stop iterating.
for batch in dls.train:
    print(batch) # cat_names, cont_names, y_names
    break

## Transforms

- procs ?
- can be applied to a `pd.DataFrame`

In [None]:
class Normalize(Transform):
    """Standardise data with a mean and standard deviation fitted by `setup`.

    `encodes` computes `(x - mean) / std`; `decodes` applies the inverse
    `std * x_enc + mean`. `setup` has to be called before either of them,
    as it creates the `mean` and `std` attributes.
    """

    def setup(self, array):
        """Fit `mean` and `std` on `array`, reducing along axis 0.

        Parameters
        ----------
        array
            Object offering `mean(axis=0)` and `std(axis=0)`; the notebook
            passes a pandas DataFrame. Rows are assumed to be samples, so
            the statistics are computed per feature.
        """
        # Reduce explicitly over the samples. The pandas DataFrame default is
        # already axis 0, but numpy arrays would otherwise be reduced over all
        # elements and yield a single scalar.
        self.mean = array.mean(axis=0)
        # NOTE: the degrees of freedom used by `std` depend on the type of
        # `array` (pandas defaults to ddof=1); ddof=0 in scikit-learn.
        self.std = array.std(axis=0)

    def encodes(self, x):
        """Return `x` shifted by the fitted mean and scaled by the fitted std."""
        return (x - self.mean) / self.std

    def decodes(self, x_enc):
        """Invert `encodes`: rescale `x_enc` by the std and add the mean back."""
        return (self.std * x_enc) + self.mean


tf_norm = Normalize()
tf_norm.setup(analysis.df_train)  # fit mean and std on the training split only

Compare results to scikit learn implementation of [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html).

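Differences seem to arise from the iterative computation of mean and standard deviation in scikit-learn, see [`_incremental_mean_and_var`](https://github.com/scikit-learn/scikit-learn/blob/15a949460dbf19e5e196b8ef48f9712b72a3b3c3/sklearn/utils/extmath.py#L792). In addition, pandas' `DataFrame.std` defaults to `ddof=1` (sample standard deviation), whereas scikit-learn uses `ddof=0`, which may also contribute to differences in the standard deviations.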

In [None]:
M = 5  # number of leading features shown in the comparisons
scaler = StandardScaler().fit(analysis.df_train)  # fitted on the same training split as tf_norm
# Side-by-side table of the fitted statistics of both implementations for the
# first M features; the tuple keys are (implementation, statistic).
pd.DataFrame(
    {
        ('Transform', 'mean'): tf_norm.mean[:M],
        ('Transform', 'std'): tf_norm.std[:M],
        ('StandardScaler', 'mean'): scaler.mean_[:M],
        ('StandardScaler', 'std'): scaler.scale_[:M]
    }
)

In [None]:
N = 10  # number of leading training samples shown
# Apply the custom transform to the first N samples and show the first M features
# (M is defined in the previous cell).
tf_norm(analysis.df_train.iloc[:N]).iloc[:, :M]

In [None]:
# Same slice (first N samples, first M features) transformed by the fitted scaler for comparison.
scaler.transform(analysis.df_train.iloc[:N]).iloc[:, :M]

### TabularDataLoader with procs

- ToDo: Replace Normalize with custom Normalize

In [None]:
# NOTE: this import rebinds the name `Normalize` to fastai's tabular proc and
# thereby shadows the custom `Normalize` Transform class defined earlier in
# this notebook.
from fastai.tabular.core import Normalize, FillMissing
Normalize, FillMissing  # display both classes as cell output

In [None]:
# DataFrame is shuffled
# NOTE(review): as in the DataLoaders cell without procs, `valid_idx` takes
# the first N_VAL row positions of `analysis.df`, while `indices.valid` holds
# the last samples of the selection — confirm which split is intended.
N_VAL = 100
valid_idx = list(range(N_VAL))
# Same setup as before, now with fastai's Normalize and FillMissing procs.
dls = TabularDataLoaders.from_df(df=analysis.df, valid_idx=valid_idx, bs=64, 
                                 cat_names=None,
                                 cont_names=list(analysis.df.columns),
                                 y_names=None,
                                 procs=[Normalize, FillMissing], # add options
                                )
# analysis.dls = dls

In [None]:
sample = dls.one_batch()
sample # cat, cont, target

In [None]:
# Rebinds `sample` to the result of decoding the batch drawn above.
sample = dls.decode(sample) # DataLoader with decoded
sample.cats  # categorical part of the decoded batch (cat_names=None was passed)

In [None]:
sample.conts  # continuous part of the decoded batch

### Transformation in Pipeline

In [None]:
procs = dls.procs  # the procs object attached to the DataLoaders
type(procs)  # inspect what kind of container holds the procs

In [None]:
type(procs.normalize)  # a single proc is reachable as an attribute of the procs object

Initialized procs/transforms are stored in the `fs` attribute:

In [None]:
[type(x) for x in procs.fs]  # types of all entries in the procs' `fs` attribute

### loss function

So now the `loss_func` signature and the `NN_Module` forward path have to be adapted. Unsure how to do this in plain PyTorch yet. So we only use the dataloader for now.

- Callback needed to set `xb` to `yb`, see [callback-attributes](https://docs.fast.ai/callback.core.html#Attributes-available-to-callbacks) and [example](https://github.com/dhuynh95/fastai_autoencoder/blob/bc357927f26273d676dca9a41018411408b97430/fastai_autoencoder/callback.py#L16)

In [None]:
# loss_function(recon_x=batch_recon, x=batch, mask=mask, mu=mu, logvar=logvar)
# learn = Learner(dls, NN_Module, opt_func=SGD, loss_func=mnist_loss, metrics=batch_accuracy)

## Scrap Code

- understand the many ways to interact with data in FastAi's library

- `ItemTransform`: is it performed on some data in a DataFrame? (Do the names "to dataloaders" and "to datasets" signal an intermediate object?)

In [None]:
# # #source: https://nbviewer.jupyter.org/github/EtienneT/TabularVAE/blob/master/TabularAE.ipynb
# from fastai.tabular.all import *

# class ReadTabBatchIdentity(ItemTransform):
#     "Read a batch of data and return the inputs as both `x` and `y`"
#     def __init__(self, to): store_attr()

#     def encodes(self, to):
#         if not to.with_cont: res = (tensor(to.cats).long(),) + (tensor(to.cats).long(),)
#         else: res = (tensor(to.cats).long(),tensor(to.conts).float()) + (tensor(to.cats).long(), tensor(to.conts).float())
#         if to.device is not None: res = to_device(res, to.device)
#         return res
    
# class TabularPandasIdentity(TabularPandas): pass

# @delegates()
# class TabDataLoaderIdentity(TabDataLoader):
#     "A transformed `DataLoader` for AutoEncoder problems with Tabular data"
#     do_item = noops
#     def __init__(self, dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, **kwargs):
#         if after_batch is None: after_batch = L(TransformBlock().batch_tfms)+ReadTabBatchIdentity(dataset)
#         super().__init__(dataset, bs=bs, shuffle=shuffle, after_batch=after_batch, num_workers=num_workers, **kwargs)

#     def create_batch(self, i): return self.dataset.iloc[i]

# TabularPandasIdentity._dl_type = TabDataLoaderIdentity

[Tabular dataset in numpy](https://muellerzr.github.io/fastblog/2020/04/22/TabularNumpy.html#Bringing-in-NumPy)
  - `DataSet`: PyTorch interface
  - `DataLoader` customization
      1. `create_item`
      2. `create_batch`
      3. `get_idxs`
      4. `shuffle_ds`

The example is built on top of the `TabularPandas` dataset.

Custom [`TransformBlock`](https://github.com/fastai/fastai/blob/master/fastai/data/block.py#L13)

```python
class TransformBlock():
    "A basic wrapper that links defaults transforms for the data block API"
    def __init__(self, type_tfms=None, item_tfms=None, batch_tfms=None, dl_type=None, dls_kwargs=None):
        self.type_tfms  =            L(type_tfms)
        self.item_tfms  = ToTensor + L(item_tfms)
        self.batch_tfms =            L(batch_tfms)
        self.dl_type,self.dls_kwargs = dl_type,({} if dls_kwargs is None else dls_kwargs)
```

## TabularPandas - Applying transforms

1. Transforms are applied in place
2. Order of procs is ignored. Instead a pre-defined order is applied

In [None]:
import pandas as pd
import numpy as np
from fastai.tabular.all import *

In [None]:
# Toy data: 100 rows x 10 columns of uniform random values scaled to [24, 26).
# NOTE(review): no random seed is set in this cell, so the values change between runs.
df = pd.DataFrame(np.random.rand(100, 10)* 2 + 24 )
# Mask entries where a second random draw exceeds 0.9 (about 10% of the
# cells), so that they become missing values. This happens before the columns
# are renamed, while both frames still share the default integer labels.
df = df.mask(cond=pd.DataFrame(np.random.rand(100,10)>0.9))
# NOTE(review): 'H' comes last, so the labels are not in alphabetical order —
# presumably unintentional; harmless for this experiment.
df.columns = list('ABCDEFGIJH')
df.describe()

In [None]:
# Procs are handed to TabularPandas directly: `Normalize` as a class,
# `FillMissing` as a configured instance.
procs = [Normalize, FillMissing(add_col=True)]
cont_names = list(df.columns)  # every column is treated as continuous

to = TabularPandas(df, procs=procs, cont_names=cont_names)
print("Tabular object:", type(to))

to.items # items reveals data in DataFrame

In [None]:
# Manual variant: build TabularPandas without procs and invoke the procs by
# hand, Normalize first and FillMissing second.
cont_names = list(df.columns)
to_manuel = to = TabularPandas(df, cont_names=cont_names)  # both names refer to the same object
# NOTE(review): Normalize is called via `.setups(...)` while FillMissing is
# called via `.setup(...)` — confirm the difference is intended. The list
# holds whatever these calls return.
procs = [Normalize().setups(to_manuel), FillMissing(add_col=True).setup(to_manuel)]
to_manuel.items  # inspect the underlying data after the calls above

In [None]:
to_manuel.items.describe()  # summary statistics after the manual proc calls of the previous cell

In [None]:
# Same manual experiment with the order reversed: FillMissing first, Normalize second.
cont_names = list(df.columns)
to_manuel = to = TabularPandas(df, cont_names=cont_names)  # fresh object; both names refer to it
# NOTE(review): as in the previous experiment, FillMissing uses `.setup(...)`
# and Normalize uses `.setups(...)` — confirm the difference is intended.
procs = [FillMissing(add_col=True).setup(to_manuel), Normalize().setups(to_manuel)]
to_manuel.items  # inspect the underlying data after the calls above

In [None]:
to_manuel.items.describe()  # summary statistics for the reversed proc order