# Datasets

Datasets are `Iterable` (through their `__getitem__` and `__len__` methods).
Datasets are provided to `DataLoaders`, which perform the aggregation into batches.

In [None]:
import random
import numpy as np
import pandas as pd
import vaep.io.datasets as datasets
import vaep.utils as test_data

In [None]:
# Dimensions handed to the random-data helper
# (presumably N samples x M features -- TODO confirm in vaep.utils).
N = 15
M = 7
data = test_data.create_random_missing_data(N, M, prop_missing=0.4)

## Datasets

- `PeptideDatasetInMemory`
- `PeptideDatasetInMemoryMasked`
- `PeptideDatasetInMemoryNoMissings`

## `DatasetWithMaskAndNoTarget`

In [None]:
# Wrap the toy data in a DataFrame, build the dataset and peek at its first
# item, which unpacks into a (mask, array) pair.
dataset = datasets.DatasetWithMaskAndNoTarget(df=pd.DataFrame(data))
_mask, _array = next(iter(dataset))
_array, _mask

###  `PeptideDatasetInMemory`

- with duplicated target in memory

In [None]:
# Build the in-memory dataset straight from the array and peek at its first
# item, which unpacks into (array, mask, target).
dataset = datasets.PeptideDatasetInMemory(data)
_array, _mask, _target = next(iter(dataset))
_array, _mask, _target

In [None]:
# Object identities of the three items, to see which of them are the same object.
tuple(map(id, (_array, _mask, _target)))

In [None]:
# Identity check (`is`, not `==`): per the original note this should be True,
# i.e. input and target are expected to be the very same object.
_array is _target

In [None]:
# Fresh random data; missing entries are presumably replaced by `fill_na`
# inside the dataset -- TODO confirm in vaep.io.datasets.
data = test_data.create_random_missing_data(N, M, prop_missing=0.3)
dataset = datasets.PeptideDatasetInMemoryMasked(
    df=pd.DataFrame(data),
    fill_na=25.0,
)

# Show the first item whose mask contains at least one truthy entry.
for _array, _mask in dataset:
    if not any(_mask):
        continue
    print(_array, _mask)
    break

### `DatasetWithTarget`

In [None]:
# Fresh random data wrapped in a DataFrame for the dataset with targets.
data = test_data.create_random_missing_data(N, M, prop_missing=0.3)
dataset = datasets.DatasetWithTarget(df=pd.DataFrame(data))

# Items unpack into (mask, array, target); show the first item whose mask
# contains at least one truthy entry, one component per line.
for _mask, _array, target in dataset:
    if not any(_mask):
        continue
    print(_array, _mask, target, sep='\n')
    break

### `DatasetWithTargetSpecifyTarget`

In [None]:
# Draw new random data (prop_missing=0.2) and wrap it in a DataFrame.
data = test_data.create_random_missing_data(N, M, prop_missing=0.2)

df = pd.DataFrame(data)

# Reshape to long format, then sample a fraction of 0.2 of the entries within
# each group of the first index level (the rows of `df`).
val_y = df.stack().groupby(level=0).sample(frac=0.2)
# Alternative kept for reference: additionally sort the row index.
# targets = val_y.unstack().sort_index()
# Back to wide format: only the sampled entries carry values.
targets = val_y.unstack()

# Blank the sampled entries in `df`, so that a value is held either by `df`
# or by `targets`, never by both.
df[targets.notna()] = pd.NA
df

The targets are complementary to the data: every entry present in `targets` has been set to missing in `df`.

In [None]:
# Display the sampled target values.
targets

In [None]:
# Dataset built from the data frame and the separately specified targets.
dataset = datasets.DatasetWithTargetSpecifyTarget(df=df, targets=targets)

# Items unpack into (mask, array, target); show the first item whose mask
# contains at least one truthy entry, one component per line.
for _mask, _array, target in dataset:
    if not any(_mask):
        continue
    print(_mask, _array, target, sep='\n')
    break

In [None]:
# Pick a random valid position and fetch that item by index.
row = random.randrange(len(dataset))
print(f"{row = }")
dataset[row]

### `PeptideDatasetInMemoryNoMissings`

In [None]:
# Original note: `data` and the DataFrame built from it share the same memory.
# NOTE(review): `data` was created with missing values, so this presumably
# trips an assertion in the no-missings dataset -- confirm in vaep.io.datasets.
# Any AssertionError raised while building or iterating is printed, not raised.
try:
    dataset = datasets.PeptideDatasetInMemoryNoMissings(data)
    # Print only the first item.
    for _array in dataset:
        print(_array)
        break
except AssertionError as e:
    print(e)

## DataLoaders

FastAI `DataLoaders` accept PyTorch datasets.

In [None]:
from fastai.collab import CollabDataLoaders
# , MSELossFlat, Learner
# from fastai.collab import EmbeddingDotBias

from vaep.io.datasplits import long_format


# Name both axes of the wide table; the same labels are later passed as
# `user_name` / `item_name` when building the CollabDataLoaders.
data = pd.DataFrame(data)
data.index.name = 'Sample ID'
data.columns.name = 'peptide'

# Convert to long format and turn the index into regular columns.
df_long = long_format(pd.DataFrame(data))
df_long.reset_index(inplace=True)
df_long.head()

In [None]:
# Build the DataLoaders from the long-format table: 15% validation split and
# a batch size of 4.
dls = CollabDataLoaders.from_df(
    df_long,
    valid_pct=0.15,
    user_name='Sample ID',
    item_name='peptide',
    rating_name='intensity',
    bs=4,
)
# An instance has no `__mro__`; it lives on the class: `type(dls.dataset).__mro__`.
type(dls.dataset), dls.dataset._dl_type

Iterating over the dataset gives the column names

In [None]:
# Print every element produced by iterating over the dataset object.
for x in dls.dataset:
    print(x)

The training DataFrame is stored in the `items` attribute.

In [None]:
# Display the `items` attribute of the dataset.
dls.dataset.items

In [None]:
# Print only the first element produced by iterating over the training dataset.
x = next(iter(dls.train_ds))
print(x)

In [None]:
# Display the representation of the training dataset object.
dls.train_ds

Iterating over the dataset returns columns, not single rows

In [None]:
# IPython `??` introspection: show the source of the dataset's `__getitem__`.
dls.train_ds.__getitem__??

In [None]:
# Select the 'Sample ID' column from the training items.
dls.train_ds.items['Sample ID']

But the `DataLoader` returns the numeric representation in batches:

In [None]:
# Take the first batch from the training DataLoader (`dls.train`), not from the
# underlying dataset (`dls.train_ds`), to inspect what the loader yields.
for batch in dls.train:
    break
batch

In [None]:
# IPython `??` introspection: show the source of the training loader's `__iter__`.
dls.train.__iter__??

In [None]:
from torch.utils.data.dataloader import _SingleProcessDataLoaderIter
# IPython `??` introspection: show the source of the imported iterator class.
_SingleProcessDataLoaderIter??

So... it seems too complicated:
- the `_collate_fn` seems to aggregate the data from the DataFrame
- it should be possible to keep track of that

In [None]:
# Fetch only the first element produced by iterating over the dataset.
next(iter(dls.dataset))