# Datasets

Datasets are `Iterable` (through their `__getitem__` and `__len__` attribute).
Datasets are provided to `DataLoaders` which perform the aggreation to batches.

In [1]:
import numpy as np
import pandas as pd
import vaep.io.datasets as datasets
import vaep.tests.helpers as test_data

In [2]:
N, M = 15, 7
data = test_data.create_random_missing_data(N, M)

## Datasets

- `PeptideDatasetInMemory`
- `PeptideDatasetInMemoryMasked`
- `PeptideDatasetInMemoryNoMissings`

## `DatasetWithMaskAndNoTarget`

In [3]:
dataset = datasets.DatasetWithMaskAndNoTarget(df=pd.DataFrame(data))
for _mask, _array in dataset:
    break
_array, _mask

(tensor([24.2070, 23.3077, 21.1193, 22.3930, 21.7593, 26.0015,     nan],
        dtype=torch.float64),
 tensor([False, False, False, False, False, False,  True]))

###  `PeptideDatasetInMemory`

- with duplicated target in memory

In [4]:
dataset = datasets.PeptideDatasetInMemory(data)
for _array, _mask, _target in dataset:
    break
_array, _mask, _target

(tensor([24.2070, 23.3077, 21.1193, 22.3930, 21.7593, 26.0015,  0.0000]),
 tensor([ True,  True,  True,  True,  True,  True, False]),
 tensor([24.2070, 23.3077, 21.1193, 22.3930, 21.7593, 26.0015,     nan]))

In [5]:
id(_array), id(_mask), id(_target) 

(1887755107280, 1887755189840, 1887755190160)

In [6]:
_array is _target # should be true

False

In [7]:
data = test_data.create_random_missing_data(N, M, prop_missing=0.3)
dataset = datasets.PeptideDatasetInMemoryMasked(df=pd.DataFrame(data), fill_na=25.0)

for _array, _mask in dataset:
    if any(_mask):
        print(_array, _mask)
        break

tensor([False, False,  True, False, False, False,  True]) tensor([27.2777, 24.2437, 25.0000, 25.9134, 25.5872, 24.8014, 25.0000],
       dtype=torch.float64)


### `DatasetWithTarget`

In [12]:
data = test_data.create_random_missing_data(N, M, prop_missing=0.3)
dataset = datasets.DatasetWithTarget(df=pd.DataFrame(data))

for _mask, _array, target in dataset:
    if any(_mask):
        print(_array, _mask, target, sep='\n')
        break

tensor([25.9559, 22.4594, 28.5827, 24.1366,     nan, 27.6225, 22.9695],
       dtype=torch.float64)
tensor([False, False, False, False,  True, False, False])
tensor([25.9559, 22.4594, 28.5827, 24.1366,     nan, 27.6225, 22.9695],
       dtype=torch.float64)


### `PeptideDatasetInMemoryNoMissings`

In [None]:
# data and pd.DataFrame.data share the same memory
try:
    dataset = datasets.PeptideDatasetInMemoryNoMissings(data)
    for _array in dataset:
        print(_array)
        break
except AssertionError as e:
    print(e)

## DataLoaders

FastAI DataLoaders accept pytorch datasets

In [None]:
from fastai.collab import CollabDataLoaders
# , MSELossFlat, Learner
# from fastai.collab import EmbeddingDotBias

from vaep.io.datasplits import long_format



df_long = long_format(pd.DataFrame(data))
df_long.index.names = ('Sample ID', 'peptide')
df_long.reset_index(inplace=True)
df_long.head()

In [None]:
dls = CollabDataLoaders.from_df(df_long,  valid_pct=0.15, 
                                user_name='Sample ID', item_name='peptide', rating_name='intensity',
                               bs=4)
type(dls.dataset), dls.dataset._dl_type # no __mro__?

Iterating over the dataset gives the column names

In [None]:
for x in dls.dataset:
    print(x)

Training DataFrame is hidden under items

In [None]:
dls.dataset.items

In [None]:
for x in dls.train_ds:
    print(x)
    break

In [None]:
dls.train_ds

Iterating over the dataset returns columns, not single rows

In [None]:
dls.train_ds.__getitem__??

In [None]:
dls.train_ds.items['Sample ID']

But the `DataLoader` return the numeric representation in batches:

In [None]:
for batch in dls.train:
    break
batch

In [None]:
dls.train.__iter__??

In [None]:
from torch.utils.data.dataloader import _SingleProcessDataLoaderIter
_SingleProcessDataLoaderIter??

So.. It seems too complicated
- the `_collate_fn` seems to aggrete the data from the DataFrame
- should be possible to keep track of that 

In [None]:
next(iter(dls.dataset))
