In [None]:
#| hide 
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| default_exp datasets/mics_databases

#  Class for RIR measurement databases
> Class to get the acoustic time-series and other meta-data of RIR acoustic measurements 

In [None]:
#| export
import os
from pathlib import Path
import numpy as np
from torchvision.datasets.utils import download_url
# For testing and adding methods to a class as patches
from fastcore.all import patch, test_eq
# For abstract base classes
from abc import ABC, abstractmethod
# For type hinting
from typing import Optional, List, Union, Tuple, ClassVar
from urllib.error import URLError
from scipy.io import loadmat



## Helper functions

We will define many class properties with ``@property``. To make sure the underlying attributes are initialized before they are read, we define the following helper function:

In [None]:
#| hide
from nbdev.showdoc import show_doc

In [None]:
#| exporti 
#| hide 
def checked_property(attr_name: str, # Name of the backing attribute read by the property (e.g. ``"_fs"``)
                     attr_type: type = object, # Expected type of the value. Informational only: it is not checked at runtime
                     ):
    """ 
    Ensures that the attribute is initialized before accessing it. 

    Returns a read-only ``property`` whose getter returns the backing attribute,
    and raises ``ValueError`` when that attribute is ``None`` or does not exist yet.
    """
    def getter(self):
        # A backing attribute that was never assigned (e.g. ``__init__`` did not run)
        # is reported exactly like one that is still ``None``.
        value = getattr(self, attr_name, None)
        if value is None:
            raise ValueError(f"Attribute '{attr_name}' is not initialized.")
        return value
    return property(getter)


In [None]:
show_doc(checked_property)

---

[source](https://github.com/Ramon-PR/DataScience_exploration/blob/main/DataScience_exploration/datasets/mics_databases.py#L25){target="_blank" style="float:right; font-size:smaller"}

### checked_property

>      checked_property (attr_name:str, attr_type:type=<class 'object'>)

*Ensures that the attribute is initialized before accessing it.*

Example of use:  


In [None]:
class Mics(ABC):
    """ Minimal example class showing how ``checked_property`` guards an attribute. """
    # Backing attribute; it keeps the value ``None`` until a sampling frequency is assigned.
    _fs: Optional[float] = None
    # Read-only accessor: reading ``fs`` raises ``ValueError`` while ``_fs`` is ``None``.
    fs = checked_property('_fs', float)

In [None]:
mic = Mics()
# The backing attribute can be read directly; it still holds its default value.
print(mic._fs)
try:
    # ❌ Reading through the property is refused while ``_fs`` is None.
    print(mic.fs)
except ValueError as err:
    print(f"Caught ValueError: {err}")


None
Caught ValueError: Attribute '_fs' is not initialized.


## Database for microphones
> The base class to handle RIR measurements.



This class defines common properties and methods for the different RIR databases that will inherit from it.
The class DB_microphones will be an abstract class (from abc import ABC, abstractmethod) 

+ **ABC**: base class to declare an **A**bstract **B**ase **C**lass  
+ **abstractmethod**: it is a decorator to indicate which methods have to be implemented by the subclasses  

This is useful because the base class cannot be instantiated, and it forces the subclasses to implement the methods marked with `abstractmethod`.



Inspired by MNIST dataset, we will download the data in a folder structure like `./root/class_name/raw`.

+ **root**: is a parameter passed to the class
+ **class_name**: is the name of the class used to download the database  
+ **raw**: is the subfolder where the raw data is downloaded  

and we will include a `mirrors` list with the URLs where the data can be downloaded from, and a `resources` list with the name of each file to download and its md5 checksum.



In [None]:
#| export
class DB_microphones(ABC):
    """
    Base class for microphone databases.

    The ``@property`` accessors are defined here, so they do not have to be redefined in the subclasses.
    Subclasses provide the class attributes ``mirrors`` and ``resources`` and implement
    ``load_data``, ``get_mic`` and ``get_pos``.
    """

    # ClassVar tells Pylance that these are class variables, not instance variables.
    # They default to empty lists; concrete subclasses are expected to override them.
    mirrors: ClassVar[list[str]] = [] # List of urls to download the data from.
    resources: ClassVar[list[tuple[str, str]]] = [] # List with tuples (filename, md5) for the files to download.

    # This method is called when a subclass is defined. It is meant to ensure that the subclass has the required class attributes.
    # NOTE(review): because the defaults above are inherited, ``hasattr`` is always True here,
    # so these checks cannot fail as written. Behaviour is kept unchanged; confirm the intended rule before tightening.
    def __init_subclass__(cls, **kwargs):
        super().__init_subclass__(**kwargs)
        if not hasattr(cls, 'resources'):
            raise NotImplementedError(f"{cls.__name__} must define class attribute 'resources'")
        if not hasattr(cls, 'mirrors'):
            raise NotImplementedError(f"{cls.__name__} must define class attribute 'mirrors'")


    _fs: Optional[float] # Using Optional to indicate that these attributes can be None until initialized
    _nmics: Optional[int]
    _nt: Optional[int]
    _n_sources: Optional[int]
    _source_id: Optional[int]
    _signal_size: Optional[int]
    _signal_start: Optional[int]

    def __init__(self,
                 root: str = "./data", # Path to the root directory of the database, where the data will be downloaded
                 dataname: str = "RIR", # Name of the dataset, used to select the resources to download and load
                 signal_start: int = 0, # Start index of the signal in the data
                 signal_size: Optional[int] = None, # int or None. Size of the signal to be extracted from the data, if None, the whole signal will be loaded.
                 ):
        """ Store the requested signal window and create the root directory. No data is loaded here. """

        self.root = root
        self.dataname = dataname
        self._signal_start = signal_start
        self._signal_size = signal_size

        # Left as None until a subclass fills them in (typically in ``load_data``);
        # the checked properties below raise ``ValueError`` while they are None.
        self._fs = None
        self._nmics = None
        self._nt = None
        self._n_sources = None
        self._source_id = None

        # Create the root directory if it does not exist
        Path(self.root).mkdir(parents=True, exist_ok=True)

    @abstractmethod
    def load_data(self, filepath: str):
        """ Load the data from the given filepath."""
        pass

    # Validated properties via helper: each one raises ValueError while its backing attribute is None.
    fs = checked_property("_fs", float)
    n_mics = checked_property("_nmics", int)
    nt = checked_property("_nt", int)
    n_sources = checked_property("_n_sources", int)
    source_id = checked_property("_source_id", int)
    signal_size = checked_property("_signal_size", int)
    signal_start = checked_property("_signal_start", int)

    @property
    def raw_folder(self) -> str:
        """ Returns the path to the raw data folder: <root>/<class_name>/raw """
        return os.path.join(self.root, self.__class__.__name__, "raw")

    @property
    def dt(self) -> float:
        """ Sampling period, computed as ``1 / fs``. """
        return 1.0 / self.fs

    @abstractmethod
    def get_mic(self, imic: int, start: int, size: int) -> np.ndarray:
        """ Return ``size`` samples of microphone ``imic`` starting at sample ``start``. """
        pass

    @abstractmethod
    def get_pos(self, imic: int) -> np.ndarray:
        """ Return the position of microphone ``imic``. """
        pass

    def get_time(self, start: int, size: int) -> np.ndarray:
        """ Return the time stamps ``(start + [0, ..., size-1]) * dt`` of ``size`` consecutive samples. """
        return (start + np.arange(size)) * self.dt

    def _matching_resources(self,
                         pattern: str, # pattern to look for in resource names
                         ) -> list:
        """ Return the ``(filename, md5)`` resources whose filename contains ``pattern`` (case-insensitive). """

        if not hasattr(self, 'resources'):
            print("No resources found.")
            return []

        # ``self.resources`` is a list of tuples (filename, md5)
        needle = pattern.lower()
        matches = [(res, md5) for res, md5 in self.resources if needle in res.lower()]

        return matches


    def _download_resource(self,
                           resource_name: str, # name (or pattern) of the resource to download
                            ) -> None:
        """
        Download into ``raw_folder`` every resource matching ``resource_name``.

        Files already present in ``raw_folder`` are skipped and are not re-checked against their md5.
        Each missing file is tried on every mirror in order until one succeeds.
        Raises ``RuntimeError`` when no mirror could provide a missing file.
        """

        if not hasattr(self, 'resources'):
            print("No resources found.")
            return

        # Check the matching resources
        down_resources = self._matching_resources(pattern = resource_name)
        if not down_resources:
            print(f"No resources found matching '{resource_name}'.")
            return

        for file, md5 in down_resources:
            # Nothing to do for a file that is already on disk, whatever the mirrors are.
            if os.path.isfile(os.path.join(self.raw_folder, file)):
                continue

            errors = []
            for mirror in self.mirrors:
                # URLs always use '/', so the OS path separator (os.path.join) must not be used here.
                url = f"{mirror.rstrip('/')}/{file}"
                try:
                    print(f"Downloading {file} from {mirror}")
                    download_url(url=url, root=self.raw_folder, filename=file, md5=md5)

                except URLError as e:
                    # Remember the failure and try the next mirror
                    errors.append(e)
                    continue
                break
            else:
                # The loop ended without ``break``: no mirror delivered the file
                s = f"Error downloading {file}:\n"
                for mirror, err in zip(self.mirrors, errors):
                    s += f"Tried {mirror}, got:\n{str(err)}\n"
                raise RuntimeError(s)


    @classmethod
    def print_resources(cls):
        """ Print the file names listed in the class attribute ``resources``. """
        print(f"Resources for class {cls.__name__}:")
        for name, _ in cls.resources:
            print(f"- {name} ")

### Zea database
> Database from [Elias Zea](https://www.sciencedirect.com/science/article/abs/pii/S0022460X19304316). It will inherit from `DB_microphones`.

This is one of the RIR databases. It will have to implement its own attributes:  
    + `mirrors`  
    + `resources`  
    + `microphone spacing`  

And the methods:  
    + To check what resource to load  
    + To download the resources  
    + To unpack the downloaded resources  
    + To load the selected resource (database/dataname)  
    + To get the different attributes in the database: `dx`, `dt`, `fs`, `n_mics`, `n_sources`  
    + And also the data related with the microphone recordings: `imic`, `position`, `time_samples`, `signal`  
     

In [None]:
#| export
class ZeaRIR(DB_microphones):
    """
    ZeaRIR database.

    On construction, the resources whose file name matches ``dataname`` are downloaded
    (if they are not in the raw folder yet) and the first match is loaded.
    """

    mirrors = [
            "https://raw.githubusercontent.com/eliaszea/RIRIS/main/dependencies/measurementData/"
        ]

    resources = [
            ("BalderRIR.mat", "bc904010041dc18e54a1a61b23ee3f99"),
            ("FrejaRIR.mat", "1dedf2ab190ad48fbfa9403409418a1d"),
            ("MuninRIR.mat", "5c90de0cbbc61128de332fffc64261c9"),
        ]

    _dx = 3e-2  # Distance between microphones in meters, as per the database documentation.

    def __init__(self,
                 root: str = "./data", # Path to the root directory of the database, where the data will be downloaded
                 dataname: str = "Balder", # String matching the name of the resources to download and load. (if several resources are available, all will be downloaded but only the first one will be loaded). 
                 signal_start: int = 0, # Start index of the signal to load.
                 signal_size: Optional[int] = None, # Size of the signal to load. If None, the whole signal will be loaded.
                 ):
        super().__init__(root, dataname, signal_start, signal_size)

        # Detect the matching resource
        matched_res = self._matching_resources(dataname)
        if not matched_res:
            raise ValueError(f"No resources found matching '{dataname}'.")

        print("Matched resources to download:")
        for res, _ in matched_res:
            print(f"- {res}")

        # Download the resource if it does not exist in the raw folder 
        self._download_resource(resource_name=dataname)

        # Extract data if compressed
        self._unpack_resource()

        # Load the data from the first matching resource.
        # From here on ``dataname`` holds the resource file name, not the pattern passed by the user.
        self.dataname = matched_res[0][0]
        self.load_data(os.path.join(self.raw_folder, self.dataname))


    def load_data(self, filepath: str):
        """
        Loads the Matlab data from the given filepath and keeps the requested signal window.

        Reads the fields ``fs``, ``T``, ``M`` and ``image`` of the ``out`` struct and sets
        ``_fs``, ``_nmics``, ``_nt``, ``_n_sources``, ``_source_id`` and ``_RIR``.
        Raises ``AssertionError`` when the window ``[signal_start, signal_start + signal_size)``
        is empty or does not fit in the ``T`` samples of the file.
        """
        print(f"Loading the resource {filepath} ...")
        _rawdata = loadmat(filepath, simplify_cells=True)
        out = _rawdata['out']
        self._fs = out['fs']

        T = out['T']  # presumably the number of time samples in the file (used as such below) - TODO confirm
        M = out['M']  # stored as the number of microphones

        assert self._signal_start is not None
        start_sample = self._signal_start
        if self._signal_size is None:
            # No size requested: take everything from ``start_sample`` to the end
            self._signal_size = T - start_sample

        assert self._signal_size is not None
        last_sample = start_sample + self._signal_size

        assert (start_sample >= 0 and start_sample < T), f"The start_signal should be in [0, {T-1}]."
        # The size itself must be at least 1: checking only ``last_sample > 0`` let zero or
        # negative sizes through whenever ``start_sample > 0`` and produced an empty signal.
        assert (self._signal_size >= 1 and last_sample <= T), f"The size_signal should be in [1, {T-start_sample}]."

        # No transpose is applied: rows are time samples, columns are microphones (see ``get_mic``).
        self._RIR = out['image'][start_sample:last_sample, :]

        self._nmics = M
        self._nt = self.signal_size
        self._n_sources = 1
        self._source_id = 0


    def get_mic(self, imic: int, start: int, size: int) -> np.ndarray:
        """
        Return ``size`` samples of microphone ``imic``.

        ``start`` is relative to the loaded window (``_RIR`` is already cut at ``signal_start``).
        Indices are not validated here; numpy slicing/indexing rules apply.
        """
        return self._RIR[start:start + size, imic]


    def get_pos(self, imic: int) -> np.ndarray:
        """ Return the position of microphone ``imic`` as a 1-element array: ``imic * _dx``. """
        assert 0 <= imic < self.n_mics, f"Microphone index {imic} out of range [0, {self.n_mics - 1}]"
        return imic * self._dx * np.array([1])

    def _unpack_resource(self):
        """ Unpack the resource if it is compressed. """
        # For now, I assume the resources are not compressed, but this can be extended later.
        pass

#### Checks that Zea database works

In [None]:
# "RIR" is contained in the three resource names, so all of them are downloaded (if missing)
# and only the first match is loaded; the window is the first 128 samples.
db = ZeaRIR(root="./data", dataname="RIR", signal_start=0, signal_size=128)


Matched resources to download:
- BalderRIR.mat
- FrejaRIR.mat
- MuninRIR.mat
Loading the resource ./data/ZeaRIR/raw/BalderRIR.mat ...


In [None]:
# Summary of the loaded database, read through the public (checked) properties.
print(f"Database: {db.__class__.__name__}")
print(f"Room: {db.dataname}")
print(f"Sampling frequency: {db.fs} Hz")
print(f"Number of microphones: {db.n_mics}")
print(f"Number of time samples selected: {db.nt}")
print(f"Number of sources: {db.n_sources}")
print(f"Signal start: {db.signal_start}")
print(f"Signal size: {db.signal_size}")
print(f"Source ID: {db.source_id}")


Database: ZeaRIR
Room: BalderRIR.mat
Sampling frequency: 11250 Hz
Number of microphones: 100
Number of time samples selected: 128
Number of sources: 1
Signal start: 0
Signal size: 128
Source ID: 0


Testing 

In [None]:

# Private attributes read directly, followed by the accessor methods.
print(db._signal_size)
print(db._signal_start)
print(db._fs)
print(db._RIR.shape)  # (time samples, microphones)
print(db.get_mic(imic=0, start=0, size=4))  # Get the first 4 samples of the first microphone
print(db.get_pos(imic=1))  # Get the position of the second microphone
print(db.get_time(start=0, size=4))  # Get the time for the first 4 samples

128
0
11250
(128, 100)
[ 0.00041836  0.0001148  -0.00129174  0.00162724]
[0.03]
[0.00000000e+00 8.88888889e-05 1.77777778e-04 2.66666667e-04]


This is how I have calculated the MD5 of each file in resources to add it in the class definition 

In [None]:
from torchvision.datasets.utils import calculate_md5, check_md5

In [None]:

db = ZeaRIR(root="./data")
for file, md5_class in db.resources:
    # URLs always use '/', so they are not built with os.path.join (OS-dependent separator).
    url = f"{db.mirrors[0].rstrip('/')}/{file}"
    local_path = os.path.join(db.raw_folder, file)
    download_url(url, root=db.raw_folder, filename=file)
    md5 = calculate_md5(local_path)
    print(f"File: {file}, MD5: {md5}")
    # The checksum of the downloaded file must match the one stored in the class definition.
    assert check_md5(local_path, md5_class), (
        f"Check the MD5 of the resource '{file}' for the class '{db.__class__.__name__}' "
    )

Matched resources to download:
- BalderRIR.mat
Loading the resource ./data/ZeaRIR/raw/BalderRIR.mat ...
File: BalderRIR.mat, MD5: bc904010041dc18e54a1a61b23ee3f99
File: FrejaRIR.mat, MD5: 1dedf2ab190ad48fbfa9403409418a1d
File: MuninRIR.mat, MD5: 5c90de0cbbc61128de332fffc64261c9


Here we implement the option to print the resources that can be downloaded.  
I use `@patch` from `fastcore` to add this function to the class after the class has already been defined.  
Since we just want to print the resources (class attributes), it is a class method, so it does not need an instance of the class.  



::: {.callout-note}
Pylance linting does not like `patch` and will underline it as a possible error.  
I have added it directly to the class (the following code is just for testing purposes).
([This is a callout from Quarto](https://quarto.org/docs/authoring/callouts.html#callout-types))
:::


In [None]:
@patch(cls_method=True)
def print_resources(cls: DB_microphones):
    """ Print the file names listed in the class attribute ``resources``. """
    header = f"Resources for class {cls.__name__}:"
    print(header)
    for resource_name, _md5 in cls.resources:
        print(f"- {resource_name} ")


In [None]:
ZeaRIR.print_resources()

Now let's implement a method to recognize if a string pattern provided as dataname matches any resources

> Downloading:  

We give the option to give a string pattern to download several resources,  
but each instance of the class should be used to provide measurements of only one of the resources

In [None]:
# The match is case-insensitive, so "balder" finds "BalderRIR.mat".
# The md5 is bound to ``_md5`` rather than ``_``: at notebook top level, assigning to ``_``
# shadows IPython's "last output" variable.
res, _md5 = db._matching_resources("balder")[0]
pathfname = os.path.join(db.raw_folder, res)
print(f"Path to resource: {pathfname}")



In [None]:

# The constructor already matches and downloads the resources; the two calls below
# only show that the helpers can also be invoked explicitly.
db = ZeaRIR(root="./data", dataname="RIR")
db._matching_resources("RIR")  # result is discarded here (not the last expression of the cell)
db._download_resource("RIR")  # skips the files already present in the raw folder




In [None]:
#| hide
from nbdev import show_doc

In [None]:
3


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()