# Seconds to Frames

In [1]:
import os

import numpy as np
import pandas as pd
import torch
import torchaudio
import yaml

In [2]:
def load_desc_file(_desc_file,__class_labels):
    _desc_dict = dict()
    for i,line in enumerate(open(_desc_file)):
        if i >0:
            words = line.strip().split(',')
            name = words[0]
            if name not in _desc_dict:
                _desc_dict[name] = list()
            _desc_dict[name].append([float(words[-3]), float(words[-2]), __class_labels[words[-1]]])
    return _desc_dict

def transformation(signal,SAMPLE_RATE,N_FFT,HOP,N_MELS,trans="logmel"):
    if trans == "mel":
        transf = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP ,
        n_mels=N_MELS
        )
        signal = transf(signal)
    if trans == "logmel":
        transf = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP ,
        n_mels=N_MELS,
        normalized = True
        )
        signal = transf(signal)
        signal = torch.log(signal+1e-5)
    return signal

def get_audio_sample_label(signal,target_sample_rate,hop, audio_sample_path,annotations_file,labels):
    label = np.zeros((len(labels), signal.shape[1]))
    filenames = load_desc_file(annotations_file,labels)
    tmp_data = np.array(filenames[audio_sample_path])
    frame_start = np.floor(tmp_data[:, 0] * target_sample_rate / hop).astype(int)
    frame_end = np.ceil(tmp_data[:, 1] * target_sample_rate / hop).astype(int)
    se_class = tmp_data[:, 2].astype(int)
    for ind, val in enumerate(se_class):
        label[val, frame_start[ind]:frame_end[ind]] = 1
    return label

In [3]:
# Input files, given as relative paths.
taxonomy_path = 'taxonomy.yml'  # YAML file parsed with yaml.safe_load in a later cell
labels_path = 'metadata_jak_en.csv'  # annotation CSV read by pd.read_csv and load_desc_file

In [4]:
# Load the annotation table; the bare expression on the last line displays it.
df_labels = pd.read_csv(labels_path)
df_labels

Unnamed: 0,Filename,Wetland,Day,Month,Year,Hours,Start,End,Clase
0,audio1500.wav,angachilla,29,10,2020,11:00,0.000,299.996,blown
1,audio1500.wav,angachilla,29,10,2020,11:00,0.317,299.996,rain_medium
2,audio1500.wav,angachilla,29,10,2020,11:00,0.000,299.996,other_birds
3,audio1500.wav,angachilla,29,10,2020,11:00,10.454,21.858,Vanellus_chilensis
4,audio1500.wav,angachilla,29,10,2020,11:00,20.000,23.300,dog
...,...,...,...,...,...,...,...,...,...
20582,audio1209.wav,miraflores,20,7,2020,08:00,175.138,176.418,other_birds
20583,audio1209.wav,miraflores,20,7,2020,08:00,172.078,288.436,motor
20584,audio1209.wav,miraflores,20,7,2020,08:00,248.797,251.377,dog
20585,audio1209.wav,miraflores,20,7,2020,08:00,259.377,260.977,dog


In [5]:
# Parse the taxonomy YAML, then keep its top-level 'taxonomy' entry for later use.
with open(taxonomy_path) as taxonomy_file:
    taxonomy = yaml.safe_load(taxonomy_file)
taxo = taxonomy['taxonomy']

In [6]:
# Audio and spectrogram configuration shared by the cells below.
SAMPLE_RATE = 44100                  # passed as sample_rate to the mel transform
LEN_SEC = 300                        # clip duration
LEN_SAMPLES = SAMPLE_RATE * LEN_SEC  # clip duration expressed in samples
N_FFT = 2048
HOP = N_FFT // 2                     # hop length is half the FFT size
N_MELS = 40

In [7]:
# Constant placeholder signal of full clip length, filled with 50.
signal_toy = torch.full((LEN_SAMPLES,), 50.0)

In [8]:
# One entry per distinct value of the Filename column.
audio_samples_path = df_labels["Filename"].unique()

In [9]:
# Log-mel transform of the placeholder signal; its shape is displayed below
# and its second dimension sets the frame count of the label matrices.
sigmel = transformation(
    signal_toy,
    SAMPLE_RATE,
    N_FFT,
    HOP,
    N_MELS,
    trans="logmel",
)
sigmel.shape

torch.Size([40, 12920])

In [12]:
# Build the frame-level label matrix of every annotated recording, keep it in
# memory keyed by the filename without extension, and write it to data/<name>.pt.
os.makedirs("data", exist_ok=True)  # torch.save does not create a missing directory

annotations_frames = {}
for a in audio_samples_path:
    # splitext drops only the final extension, so filenames with extra dots
    # (or none at all) no longer break the unpacking that split('.') required.
    n = os.path.splitext(a)[0]
    an_a = get_audio_sample_label(sigmel,SAMPLE_RATE,HOP, a,labels_path,taxo)
    annotations_frames[n]=an_a
    # an_a is the NumPy array returned by get_audio_sample_label; it is stored as-is.
    torch.save(an_a,"data/"+n+".pt")