# Create a PyTorch Dataset for RIR predictions

Tengo 100 ids, del 0 al 99.
Los micrófonos 1, 7 y 34 forman el grupo de micros de referencia,
y el resto los quiero usar como targets en un dataset de PyTorch.
En un pase forward, cogería la posición de uno de los micrófonos del grupo target, y los micros del grupo de referencia, para predecir la señal en el micro target.
¿Cómo haría el Dataset? ¿Es recomendable usar "yield"?
ChatGPT Plus

Tu planteamiento para crear un dataset personalizado en PyTorch tiene mucho sentido, y no, normalmente no se recomienda usar yield dentro de un Dataset de PyTorch porque:

    PyTorch espera que el __getitem__ devuelva datos directamente (no un generador).

    El sistema de DataLoader usa indexación aleatoria y múltiples workers, y yield no es compatible con eso directamente.

## Dataset that returns positions


### Tests

In [2]:
# export
from exp.nb_rirDatasets import MeshRIR
from torch.utils.data import Dataset


In [3]:


# Load the "S1" MeshRIR data for source 0, requesting signal_size=512
# starting at start_signal=0.
ds = MeshRIR(
    root="data",
    download=False,
    dataset="S1",
    source_id=0,
    start_signal=0,
    signal_size=512,
)

# Number of microphones available in the loaded dataset
ds.get_nmics()

3969

In [4]:
# ids of microphones used for the interpolation (the "environment")
ids_env = [7, 100, 1002, 2958, 2643]

# ids of microphones used for the prediction: every id that is not in the environment
n_mics = ds.get_nmics()
env_id_set = set(ids_env)
ids_pred_mics = [i for i in range(n_mics) if i not in env_id_set]

print(f"Length of All mics: {n_mics}")
print(f"Length of prediction mics: {len(ids_pred_mics)}")

# Per-microphone data of the environment, one list entry per id in ids_env
env = {
    'signal': [ds.get_mic(i) for i in ids_env],
    'time': [ds.get_time(i) for i in ids_env],
    'position': [ds.get_pos(i) for i in ids_env],
}




Length of All mics: 3969
Length of prediction mics: 3964


In [5]:
import numpy as np
import torch

# Stack each per-microphone list into a single float32 tensor
for field in ('signal', 'time', 'position'):
    stacked = np.stack(env[field]).astype(np.float32)
    env[field] = torch.from_numpy(stacked)



In [6]:

# Show the shape of every environment tensor
for field in ('signal', 'time', 'position'):
    print(env[field].shape)



torch.Size([5, 512])
torch.Size([5, 512])
torch.Size([5, 3])


In [7]:
# Build one target sample straight from the dataset (values are used as returned)
idx = 4
target = {
    'signal': ds.get_mic(idx),
    'time': ds.get_time(idx),
    'position': ds.get_pos(idx),
}
for field in ('signal', 'time', 'position'):
    print(target[field].shape)


(512,)
(512,)
(3,)


### Dataset Class with a constant environment

In [8]:
# export
from exp.nb_rirDatasets import DB_microphones
from typing import List

class DSRirFixedEnv(Dataset):
    """ 
    Dataset with a fixed environment.

    The environment (reference) microphones are chosen once, at construction
    time, and kept as float32 tensors. Indexing the dataset returns only the
    target microphone. In this version the target microphone can be any of
    the microphones in the data (including those labeled as environment).

    Args:
        mic_dataset: microphone database; it must provide ``get_nmics()``,
            ``get_mic(i)``, ``get_time(i)`` and ``get_pos(i)``.
        ids_env: ids of the microphones that form the environment.
    """
    def __init__(self, 
                 mic_dataset: "DB_microphones", 
                 ids_env: List[int],
                 ):
        super().__init__()
        self.dataset = mic_dataset
        self.ids_env = ids_env

        # Environment microphones.
        # The data must come from the dataset passed to this instance: reading
        # a module-level `ds` here would give every instance the environment of
        # whatever dataset that global happens to point to.
        def stack_as_tensor(getter):
            # One row per environment microphone, converted to a float32 tensor
            rows = [getter(i) for i in ids_env]
            return torch.from_numpy(np.stack(rows).astype(np.float32))

        self.env = {
            'signal': stack_as_tensor(mic_dataset.get_mic),
            'time': stack_as_tensor(mic_dataset.get_time),
            'position': stack_as_tensor(mic_dataset.get_pos),
        }

    def __len__(self):
        """Number of target candidates: every microphone in the dataset."""
        return self.dataset.get_nmics()
    
    def __getitem__(self, idx):
        """
        Return the target microphone `idx`.

        The environment is fixed, so only the target is returned here: a dict
        with keys 'signal', 'time' and 'position', holding the values exactly
        as the underlying dataset returns them (no tensor conversion).
        """       
        return dict(signal=self.dataset.get_mic(idx),
                    time=self.dataset.get_time(idx), 
                    position=self.dataset.get_pos(idx))
 
    def get_env(self):
        """
        Return the environment: a dict with keys 'signal', 'time' and
        'position', each a float32 tensor with one row per id in `ids_env`.
        """
        return self.env


In [9]:
dsrir = DSRirFixedEnv(ds, ids_env=[100, 300, 500])

# Accessing an element: square brackets and an explicit __getitem__ call are equivalent
print(f"Length of dataset: {len(dsrir)}")
print("Target at index 1. ")
via_brackets = dsrir[1]
print(f"using list indexing:   {via_brackets['position']} ") 
via_dunder = dsrir.__getitem__(1)
print(f"and using __getitem__: {via_dunder['position']} ")

# Print the environment
print()
print("Environment \nPositions:")
env_positions = dsrir.get_env()['position']
print(env_positions)

Length of dataset: 3969
Target at index 1. 
using list indexing:   [-0.45 -0.5  -0.2 ] 
and using __getitem__: [-0.45 -0.5  -0.2 ] 

Environment 
Positions:
tensor([[ 0.3000, -0.3000, -0.2000],
        [-0.2000,  0.2000, -0.2000],
        [ 0.3500, -0.4000, -0.1500]])


### Datasets with random environments in each sample

In [10]:
#export
import math 
import random
        
class DS_random_pick(torch.utils.data.Dataset):
    """
    Dataset whose samples are random groups of microphones.

    Every access draws ``n_ref_mics + 1`` distinct microphone ids at random:
    the first one is the target and the remaining ones form the environment.
    The index passed to ``__getitem__`` only bounds the dataset; it does not
    select a particular combination, so repeated accesses with the same index
    generally return different samples.

    Args:
        mic_dataset: microphone database; it must provide ``get_nmics()``,
            ``get_mic(i)``, ``get_time(i)`` and ``get_pos(i)``.
        n_ref_mics: number of microphones picked as the environment to
            interpolate from.
        max_combinations: upper bound on the reported dataset length.

    Raises:
        ValueError: if the dataset has fewer than ``n_ref_mics + 1``
            microphones (a sample could never be drawn) or ``n_ref_mics`` is
            negative.
    """
    def __init__(
        self,
        mic_dataset: "DB_microphones", 
        n_ref_mics: int = 4,  # number of mics I will pick as my environment to interpolate
        max_combinations: int = 1000,  # number of maximum combinations
    ):
        super().__init__()
        self.dataset = mic_dataset
        self.n_ref_mics = n_ref_mics
        self.max_combinations = max_combinations

        n = self.dataset.get_nmics()
        r = self.n_ref_mics
        if r + 1 > n:
            raise ValueError(
                f"Need at least n_ref_mics + 1 = {r + 1} microphones, "
                f"but the dataset only has {n}"
            )
        # Number of combinations without replacement of n elements in groups
        # of r: n!/(r!*(n-r)!). math.comb works on exact integers, so it does
        # not overflow the way a float division of factorials does.
        n_comb = math.comb(n, r)
        self.len_comb_dataset = min(n_comb, self.max_combinations)

    def __len__(self):
        """Number of combinations, capped at `max_combinations`."""
        return self.len_comb_dataset

    def __getitem__(self, idx):
        """
        Draw a random (environment, target) pair.

        Returns:
            A tuple ``(env, target)`` of dicts with keys 'signal', 'time' and
            'position'. `env` stacks the `n_ref_mics` environment microphones
            along the first axis; `target` holds a single microphone. All
            values are float32 tensors.

        Raises:
            IndexError: if `idx` is outside ``[-len(self), len(self))``.
        """
        # Without this bound, plain iteration (`for x in ds`, `list(ds)`)
        # would never stop, because it relies on IndexError to terminate.
        size = len(self)
        if not -size <= idx < size:
            raise IndexError(f"index {idx} is out of range for a dataset of length {size}")

        ids = random.sample(range(self.dataset.get_nmics()), self.n_ref_mics + 1)

        signals = [self.dataset.get_mic(i) for i in ids]
        positions = [self.dataset.get_pos(i) for i in ids]
        times = [self.dataset.get_time(i) for i in ids]

        def many_to_tensor(arrays):
            # Stack several microphones into one float32 tensor
            return torch.from_numpy(np.stack(arrays).astype(np.float32))

        def one_to_tensor(array):
            # Single microphone: convert directly instead of stacking its elements
            return torch.from_numpy(np.asarray(array).astype(np.float32))

        # ids[0] is the target, ids[1:] the environment
        env = dict(
             signal=many_to_tensor(signals[1:]),
             time=many_to_tensor(times[1:]),
             position=many_to_tensor(positions[1:]),
             )
        
        target = dict(
             signal=one_to_tensor(signals[0]),
             time=one_to_tensor(times[0]),
             position=one_to_tensor(positions[0]),
             )
                   
        return env, target


In [11]:
# Random-environment dataset over `ds`: 4 reference mics per sample, length capped at 100
dsrir2 = DS_random_pick(
    mic_dataset=ds,
    n_ref_mics=4,
    max_combinations=100,
)

In [12]:
type(dsrir2[0])  # value is discarded: it is not the last expression of the cell
# Each item unpacks into an (environment, target) pair
env2, target2 = dsrir2[0]
for part in (env2, target2):
    print(part['position'])

tensor([[-0.2000, -0.3500,  0.1500],
        [ 0.1500, -0.5000,  0.2000],
        [-0.2500,  0.1500,  0.2000],
        [ 0.1500, -0.4500, -0.1000]])
tensor([ 0.2000, -0.3500,  0.2000])


### Datamodules

In [13]:
#export
import torch
import lightning.pytorch as L
from torch.utils.data import random_split, ConcatDataset, DataLoader
from typing import List

def ensure_list(x):
    """
    Normalize `x` to a list of datasets.

    - ``None`` becomes an empty list.
    - a single ``Dataset`` is wrapped in a one-element list.
    - a ``list`` is returned as is (the same object, not a copy).

    Raises:
        TypeError: for any other type.
    """
    if x is None:
        return []
    if isinstance(x, Dataset):
        return [x]
    if isinstance(x, list):
        return x
    raise TypeError(f"Expected Dataset or list of Datasets, got {type(x)}")
    
class DM_PL_DataModule(L.LightningDataModule):
    """
    Lightning data module built from lists of datasets.

    The training datasets are concatenated and randomly split 80% / 20% into
    a train and a validation subset. The test datasets are concatenated as
    they are.

    Args:
        ls_datasets_train: a Dataset, a list of Datasets, or None (no
            datasets) used for the train/validation split.
        ls_datasets_test: a Dataset, a list of Datasets, or None (no
            datasets) used for testing.
        batch_size: batch size of every dataloader.
        num_workers: number of worker processes of every dataloader.
    """
    def __init__(self, 
                 ls_datasets_train: List[torch.utils.data.Dataset] = None, 
                 ls_datasets_test: List[torch.utils.data.Dataset] = None,
                 batch_size: int = 64, num_workers: int = 0, 
                 ):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        # None defaults (instead of a shared `[]`) are turned into fresh empty
        # lists by ensure_list, so instances never share a default list.
        self.ls_datasets_train = ensure_list(ls_datasets_train) 
        # The test datasets must come from `ls_datasets_test`; using the train
        # argument here would silently evaluate on the training data.
        self.ls_datasets_test = ensure_list(ls_datasets_test)

    def setup(self, stage):
        """
        Create the datasets needed for `stage`.

        "fit" creates `ds_train` and `ds_val`; "test" creates `ds_test`.
        Any other stage creates nothing.
        """
        # NOTE(review): a "validate" stage run without a previous "fit" leaves
        # `ds_val` undefined, so val_dataloader would fail — confirm whether
        # that stage needs to be supported.
        if stage == "fit":
            self.ds_train, self.ds_val = random_split( ConcatDataset(self.ls_datasets_train), 
                                                        [0.8, 0.2])

        # Assign test dataset for use in dataloader(s)
        if stage == "test":
            self.ds_test = ConcatDataset(self.ls_datasets_test)

    def train_dataloader(self):
        """Shuffled dataloader over the train split."""
        return DataLoader(self.ds_train, batch_size=self.batch_size, shuffle=True,
            num_workers=self.num_workers, pin_memory=False, collate_fn=None)

    def val_dataloader(self):
        """Dataloader over the validation split (not shuffled)."""
        return DataLoader(self.ds_val, batch_size=self.batch_size, num_workers=self.num_workers)

    def test_dataloader(self):
        """Dataloader over the test datasets (not shuffled)."""
        # Uses the configured number of workers, like the other dataloaders
        return DataLoader(self.ds_test, batch_size=self.batch_size, num_workers=self.num_workers)

In [14]:
nsize = 128
# Both recordings are loaded with the same root / download / signal-window settings
shared_args = dict(root="data", download=False, start_signal=0, signal_size=nsize)
ds1 = MeshRIR(dataset="S1", source_id=0, **shared_args)

ds2 = MeshRIR(dataset="S32", source_id=3, **shared_args)

# Fixed-environment datasets, one per recording
dsrir1 = DSRirFixedEnv(ds1, ids_env=[100, 300, 500, 1000])
dsrir2 = DSRirFixedEnv(ds2, ids_env=[101, 301, 501, 1001])

# Random-environment datasets, one per recording
dsrir3 = DS_random_pick(ds1, n_ref_mics=4, max_combinations=1000)
dsrir4 = DS_random_pick(ds2, n_ref_mics=4, max_combinations=1000)


In [15]:
# Data module built from a single training dataset (not wrapped in a list) and no test datasets
dm = DM_PL_DataModule( dsrir1, [], batch_size=12, num_workers=0)
# NOTE(review): this line was marked "error". The constructor passes its argument
# through ensure_list, so ls_datasets_train should already be a list here — confirm
# whether the error still occurs.
CD = ConcatDataset(dm.ls_datasets_train)

# CD = ConcatDataset(dsrir1) # error
# CD = ConcatDataset([dsrir1]) # Ok
# If you apply list() to something that is not iterable, such as an int or an object that does not define __iter__, you get an error:
# list(123)  # ❌ TypeError: 'int' object is not iterable
# ds = MyDataset()
# list(ds)  # returns a list of samples if MyDataset is iterable



In [17]:
# One data module per dataset family: fixed environment (dm1) and random environment (dm2)
loader_args = dict(batch_size=12, num_workers=0)
dm1 = DM_PL_DataModule(dsrir1, dsrir2, **loader_args)
dm2 = DM_PL_DataModule(dsrir3, dsrir4, **loader_args)


In [18]:
# Run the "fit" setup on both data modules and report the size of each train split
for dm in (dm1, dm2):
    dm.setup(stage='fit')
    n_train = len(dm.ds_train)
    print(f"Length ds_train: {n_train}")

    

Length ds_train: 3176
Length ds_train: 800


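I selected 80% of the samples for training and 20% for validation, so for a dataset of 1000 samples, 800 are in the train dataset and 200 are in the validation dataset. Likewise, the 3969-sample dataset yields 3176 training samples, leaving 793 for validation.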