In [None]:
import os
import pandas as pd
from ketos.data_handling import selection_table as sl
from ketos.data_handling.database_interface import AudioWriter, create_database
from ketos.data_handling.parsing import load_audio_representation
import ketos.audio.audio_loader as al
import tarfile

In [None]:
# Project root, given relative to the current working directory.
# Absolute alternatives that have been used on other machines:
#   root_path='/home/sadmans/KW_detector_multiclass/'  # Orca-VM
#   root_path='/home/sadman/KW_detector_multiclass/'  # DL Training pc
#   root_path='/home/sadman/projects/ctb-ruthjoy/sadman/Projects/KW_detector_multiclass/'  # Cedar
root_path = '../../'
cedar_username = 'sadman'

# Audio representation (the "spectrogram" entry of the JSON config) used for every database entry.
spec_cfg = load_audio_representation(f'{root_path}code/create_db/spec_config.json', name="spectrogram")
print("spec_cfg:", spec_cfg)

# Input/output locations used by the cells below.
# NOTE(review): the output file is named 'test_ds_...' although the training annotations
# are the ones read further down -- confirm that this is intended.
path_dict = dict(
    train_annot=f'{root_path}annotations/train/',
    test_annot=f'{root_path}annotations/test/',
    audio_data_dir='/data/audio',
    database_save_filename=f'{root_path}saved_database/test_ds_multiclass_original.h5',
)

# Selection windows advance by one full segment duration and require full overlap (1.0).
select_step = spec_cfg['duration']
select_min_overlap = 1.0

In [None]:
# Load the DFO training annotation table (CSV) for the chosen dataset and preview it
dataset_name = 'dfo_no_overlap'
train_annot_csv = path_dict['train_annot'] + dataset_name + '_train_multiclass.csv'
annot_df = pd.read_csv(train_annot_csv, sep=',')
annot_df.head()

In [None]:
# Standardize the annotation table and keep the label-name -> integer mapping
signal_labels = ['KW', 'HB', 'D', 'OTHER']
std_annot_df, label_dict = sl.standardize(
    table=annot_df,
    signal_labels=signal_labels,
    trim_table=False,
    return_label_dict=True,
)
# presumably {'KW': 1, 'HB': 2, 'D': 3, 'OTHER': 4} -- confirm against the printed output
print("label_dict:", label_dict)

# Cut the annotations into fixed-length selections of one spectrogram duration each
sel_annot_df = sl.select(
    annotations=std_annot_df,
    length=spec_cfg['duration'],
    step=select_step,
    min_overlap=select_min_overlap,
    center=False,
)


def _selections_with_label(label_id):
    """Return the rows of `sel_annot_df` that carry the integer label `label_id`."""
    return sl.query(sel_annot_df, annotations=None, filename=None, label=label_id, start=None, end=None)


# One selection table per class, using the integer labels 1..4
kw_sel_annot_df, hb_sel_annot_df, d_sel_annot_df, other_sel_annot_df = [
    _selections_with_label(label_id) for label_id in (1, 2, 3, 4)
]
print(len(kw_sel_annot_df), len(hb_sel_annot_df), len(d_sel_annot_df), len(other_sel_annot_df))
print("Adding positive samples from ", dataset_name)

In [None]:
# Open the DFO tar archive that holds the audio files and list its member names.
# The archive handle is kept open because later cells pass `tar` to create_database.
dfo_tar_path = f'/home/{cedar_username}/projects/ctb-ruthjoy/SRKW/DFO/DFO.tar'
tar = tarfile.open(dfo_tar_path)
all_tar_names = tar.getnames()

In [None]:
from tqdm import tqdm

def _remove_extracted_file(file_path, verbose=True):
    """ Delete a temporarily extracted audio file without ever raising.

        A failed deletion (including a file that is already gone) is reported as a
        warning when `verbose` is True, so that one bad file cannot abort the
        database creation.
    """
    if not file_path:
        return
    if verbose:
        print("removing", file_path)
    try:
        os.remove(file_path)
    except OSError as e:
        if verbose:
            print("Warning: while removing {0}, Message: {1}".format(file_path, e.strerror))


def create_database(output_file, data_dir, selections, channel=0, 
    audio_repres={'type': 'Waveform'}, annotations=None, dataset_name=None,
    max_size=None, verbose=True, progress_bar=True, discard_wrong_shape=False, 
    allow_resizing=1, include_source=True, include_label=True, 
    include_attrs=False, attrs=None, data_name=None, index_cols=None,
    mode='a', tar_file=None, tar_extract_location=root_path+'saved_database/'):

    """ Create a database from a selection table where audio files are saved within a .tar file

        Note that this definition shadows the `create_database` function imported from 
        `ketos.data_handling.database_interface` at the top of the notebook.

        If 'dataset_name' is not specified, the name of the folder containing the audio 
        files ('data_dir') will be used.
        
        If the method encounters problems loading/writing a sound clip, it continues 
        while printing a warning.

        For every selection, the audio file it refers to is extracted from `tar_file` 
        (unless it already exists under `tar_extract_location`). Once a selection that 
        refers to a different file is reached, the previously used file is deleted. 
        After the last selection, the last file and its (then empty) folder are deleted 
        and the writer is closed, also if an error interrupts the loop.
    
        Args:
            output_file:str
                The name of the HDF5 file in which the data will be stored.
                Can include the path (e.g.:'/home/user/data/database_abc.h5').
                If the file does not exist, it will be created.
                If the file already exists, new data will be appended to it.
            data_dir:str
                Path to folder containing *.wav files.
            selections: pandas DataFrame
                Selection table
            channel: int
                For stereo recordings, this can be used to select which channel to read from
            audio_repres: dict or list(dict)
                A dictionary containing the parameters used to generate the spectrogram or waveform
                segments. See :class:~ketos.audio.audio_loader.AudioLoader for details on the 
                required and optional fields for each type of signal. It is also possible to specify 
                multiple audio representations as a list.
            annotations: pandas DataFrame
                Annotation table. Optional.
            dataset_name:str
                Name of the node (HDF5 group) within the database (e.g.: 'train')
                Under this node, two datasets will be created: 'data' and 'data_annot',
                containing the data (spectrograms or waveforms) and the annotations for each
                entry in the selections_table.                
            max_size: int
                Maximum size of output database file in bytes.
                If file exceeds this size, it will be split up into several 
                files with _000, _001, etc, appended to the filename.
                Default is None, in which case no restriction is imposed on the 
                file size (i.e. the file is never split).
            verbose: bool
                Print relevant information during execution such as no. of files written to disk,
                extraction/removal messages and warnings.
            progress_bar: bool
                Show progress bar.  
            discard_wrong_shape: bool
                Discard objects that do not have the same shape as previously saved objects. Default is False.
            allow_resizing: int
                If the object shape differs from previously saved objects, the object 
                will be resized using the resize method of the scikit-image package, provided the mismatch 
                is no greater than allow_resizing in either dimension. 
            include_source: bool
                If True, the name of the wav file from which the waveform or 
                spectrogram was generated and the offset within that file, is 
                saved to the table. Default is True.
            include_label: bool
                Include integer label column in data table. Only relevant for weakly annotated samples. Default is True.
            include_attrs: bool
                If True, load data from attribute columns in the selection table. Default is False.
            attrs: list(str)
                Specify the names of the attribute columns that you wish to load data from. 
                Overwrites include_attrs if specified. If None, all columns will be loaded provided that 
                include_attrs=True.
            data_name: str or list(str) 
                Name(s) of the data columns. If None is specified, the data column is named 'data', 
                or 'data0', 'data1', ... if the table contains multiple data columns.
            index_cols: str or list(str)
                Create indices for the specified columns in the data table to allow for faster queries.
                For example, `index_cols="filename"` or `index_cols=["filename", "label"]`
            mode: str
                The mode to open the file. It can be one of the following:
                    'w': Write; a new file is created (an existing file with the same name would be deleted). 
                    'a': Append; an existing file is opened for reading and writing, and if the file does not exist it is created. This is the default.
                    'r+': It is similar to 'a', but the file must already exist.
            tar_file: tarfile.TarFile
                The opened tar archive which contains the audio files. If None, nothing is 
                extracted and only files that already exist on disk can be loaded.
            tar_extract_location: str 
                Path where the audio files will be extracted temporarily and then deleted later.

        Returns:
            None
    """    
    
    loader = al.AudioSelectionLoader(path=data_dir, selections=selections, channel=channel, 
        repres=audio_repres, annotations=annotations, include_attrs=include_attrs, attrs=attrs)

    writer = AudioWriter(output_file=output_file, max_size=max_size, verbose=verbose, mode=mode,
        discard_wrong_shape=discard_wrong_shape, allow_resizing=allow_resizing, 
        include_source=include_source, include_label=include_label, data_name=data_name, index_cols=index_cols)
    
    if dataset_name is None: dataset_name = os.path.basename(data_dir)
    path_to_dataset = dataset_name if dataset_name.startswith('/') else '/' + dataset_name

    # Member names are taken from the archive that was actually passed in (not from a
    # notebook-level global) and stored in a set for constant-time membership tests.
    tar_member_names = set(tar_file.getnames()) if tar_file is not None else set()
    
    last_processed_filename = ''
    loader_filepath = ''
    try:
        for i in tqdm(range(loader.num()), disable = not progress_bar):
            loader_file_full_path = loader.sel_gen.get_selection(id=i)['filename']
            loader_filepath, loader_filename = os.path.split(loader_file_full_path)
            extract_foldername = os.path.basename(loader_filepath)

            # Tar member names always use '/', independent of the OS path separator.
            member_name = extract_foldername+'/'+loader_filename if extract_foldername else loader_filename
            filename_to_extract = os.path.join(tar_extract_location, extract_foldername, loader_filename)

            if not os.path.exists(filename_to_extract):
                if verbose:
                    print("Not found, extracting . . .")
                    print("Member:", member_name)
                    print("path:", tar_extract_location)
                if tar_file is not None and member_name in tar_member_names:
                    if verbose:
                        print("Member found . . . ", member_name)
                    tar_file.extract(member=member_name, path=tar_extract_location)
                elif verbose:
                    print("Warning: could not extract {0} (no tar file given or member missing)".format(member_name))

            # Moving on to a different audio file: the previous one is no longer needed.
            if last_processed_filename and last_processed_filename != loader_file_full_path:
                _remove_extracted_file(last_processed_filename, verbose)

            # Record the current file *before* loading, so that it is still cleaned up
            # (and the previous file is not removed a second time) if loading fails.
            last_processed_filename = loader_file_full_path

            try:
                x = next(loader)
            except Exception as e:
                if(verbose):
                    print("Warning: while loading {0}, Message: {1}".format(loader_filename, str(e)))
                continue
            
            try:
                writer.write(x=x, path=path_to_dataset, name='data')
            except Exception as e:
                if(verbose):
                    print("Warning: while writing {0}, Message: {1}".format(loader_filename, str(e)))
    finally:
        try:
            # Nothing to clean up when the selection table was empty.
            if last_processed_filename:
                _remove_extracted_file(last_processed_filename, verbose)
                if loader_filepath:
                    try:
                        os.rmdir(loader_filepath)
                    except OSError as e:
                        print("Error while deleting audio folder location: %s : %s" % (loader_filepath, e.strerror))
        finally:
            # Always release the HDF5 file, even if the loop was interrupted.
            writer.close()

In [None]:
# Write one dataset (HDF5 group) per class label into the same database file
class_selections = [
    ('kw', kw_sel_annot_df),
    ('hb', hb_sel_annot_df),
    ('dolphin', d_sel_annot_df),
    ('other', other_sel_annot_df),
]
for group_name, class_sel_df in class_selections:
    create_database(
        output_file=path_dict['database_save_filename'],
        data_dir=path_dict['audio_data_dir'],
        dataset_name=group_name,
        selections=class_sel_df,
        audio_repres=spec_cfg,
        tar_file=tar,
    )