In [1]:
import csv
import json
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional
import torchaudio
from torch.utils.data import Dataset

In [2]:
# Dataset CSV: build class_labels_indices.csv, the id -> class-index table read by make_index_dict().
df = pd.read_csv('samples.csv')
# Sort the unique ids so every run assigns the same index to the same id. Iterating a bare
# set of strings depends on hash randomization, so the order can differ between kernel restarts.
# key=str keeps the sort well-defined even if the column holds non-string values.
labels = sorted(set(df['sample_track_id']), key=str)
# NOTE(review): the JSON cell of this notebook writes labels taken from `original_track_id`
# (plus the 'T000' placeholder), while this table is keyed by `sample_track_id`. Any label
# missing from this table raises KeyError in AudiosetDataset.__getitem__ -- confirm which
# column is intended.
d = {
    'index': list(range(len(labels))),
    'mid': labels,
    # display_name is not used by this notebook; every row gets the literal two-quote string.
    'display_name': ["''"] * len(labels),
}
df = pd.DataFrame(data=d)
df.to_csv('class_labels_indices.csv', index=False)

In [3]:
#JSON: build train_data.json, one {"wav", "labels"} record per audio file in ./canciones.
directory = 'canciones'
df = pd.read_csv('samples.csv')
diccionarios = []
# Iterate over the directory in sorted order: os.listdir returns entries in arbitrary
# order, which would make the generated JSON differ from run to run.
for filename in sorted(os.listdir(directory)):
    direccion = os.path.join(directory, filename)
    if not os.path.isfile(direccion):
        continue
    # Strip the file extension to obtain the track id. splitext only touches the final
    # suffix, unlike str.replace, which would also remove ".flac" from the middle of a name.
    original = os.path.splitext(filename)[0]
    # Rows of samples.csv whose sample_track_id is this song; their original_track_id
    # values become the song's labels (de-duplicated, sorted for a stable string).
    sampled_tracks = df.loc[df['sample_track_id'] == original, 'original_track_id']
    etiquetas = ','.join(sorted({str(track_id) for track_id in sampled_tracks}))
    # Placeholder label when the song has no matching row in samples.csv.
    # NOTE(review): 'T000' must also exist in the class-label CSV, otherwise
    # AudiosetDataset.__getitem__ fails on lookup -- confirm.
    if etiquetas == '':
        etiquetas = 'T000'
    diccionario = {
        "wav": direccion,
        "labels": etiquetas,
    }
    diccionarios.append(diccionario)

data = {
    "data": diccionarios
}
json_object = json.dumps(data, indent=4)
# Write the dataset description to disk
with open("train_data.json", "w") as outfile:
    outfile.write(json_object)

In [4]:
def make_index_dict(label_csv):
    """Build a lookup from class identifier to class index.

    :param label_csv: path to a CSV file whose header row contains at least
        the columns ``mid`` and ``index``.
    :returns: dict mapping each row's ``mid`` value to its ``index`` value.
        The values are the strings read from the file (not converted to int);
        a file with only a header row yields an empty dict.
    :raises KeyError: if a row lacks the ``mid`` or ``index`` column.
    """
    # newline='' is the open mode the csv module documents for files passed to a reader.
    with open(label_csv, 'r', newline='') as f:
        return {row['mid']: row['index'] for row in csv.DictReader(f)}

def preemphasis(signal,coeff=0.97):
    """Apply a first-order pre-emphasis filter to a signal.

    The first sample is passed through unchanged; every later sample has
    ``coeff`` times its predecessor subtracted from it.

    :param signal: The samples to filter (must contain at least one sample).
    :param coeff: The pre-emphasis coefficient; 0 leaves the signal unchanged. Default 0.97.
    :returns: the filtered signal as produced by ``np.append``.
    """
    head = signal[0]
    # Difference between each sample and its scaled predecessor.
    emphasized_tail = signal[1:] - coeff * signal[:-1]
    return np.append(head, emphasized_tail)

class AudiosetDataset(Dataset):
    """Map-style dataset yielding (filterbank features, multi-hot label vector) per audio file."""

    # Settings used for any key the caller's ``audio_conf`` does not provide.
    # Only 'num_mel_bins' and 'target_length' are read by the methods of this class;
    # the remaining keys are carried along unchanged.
    DEFAULT_AUDIO_CONF = {'num_mel_bins': 128, 'target_length': 1024, 'freqm': 24, 'timem': 192, 'mixup': 0.5}

    def __init__(self, dataset_json_file, audio_conf, label_csv=None):
        """
        Dataset that manages audio recordings

        :param dataset_json_file: path to a JSON file of the form
            ``{"data": [{"wav": <audio path>, "labels": <comma-separated ids>}, ...]}``
        :param audio_conf: dictionary containing the audio loading and preprocessing
            settings; its entries override ``DEFAULT_AUDIO_CONF`` key by key.
            ``None`` or an empty dict selects the defaults.
        :param label_csv: path to the class-label CSV (columns ``mid`` and ``index``).
            It is required in practice: the ``None`` default is passed straight to
            ``open()`` by ``make_index_dict`` and fails there.
        """
        self.datapath = dataset_json_file
        with open(dataset_json_file, 'r') as fp:
            data_json = json.load(fp)

        self.data = data_json['data']
        # Honor the caller's settings instead of discarding them; defaults fill the gaps.
        self.audio_conf = {**self.DEFAULT_AUDIO_CONF, **(audio_conf or {})}
        self.melbins = self.audio_conf.get('num_mel_bins')
        self.index_dict = make_index_dict(label_csv)
        self.label_num = len(self.index_dict)
        print('number of classes is {:d}'.format(self.label_num))

    def __len__(self):
        """Return the number of records in the dataset (needed by samplers and DataLoader)."""
        return len(self.data)

    def _wav2fbank(self, filename):
        """Load one audio file and convert it to a fixed-length mel filterbank.

        :param filename: path of the audio file to load.
        :returns: tensor with ``target_length`` rows (frames) and ``num_mel_bins``
            columns; shorter clips are zero-padded at the end, longer ones truncated.
        """
        waveform, sr = torchaudio.load(filename)
        # Remove the DC offset before feature extraction.
        waveform = waveform - waveform.mean()

        fbank = torchaudio.compliance.kaldi.fbank(waveform, htk_compat=True, sample_frequency=sr, use_energy=False,
                                                  window_type='hanning', num_mel_bins=self.melbins, dither=0.0, frame_shift=10)
        target_length = self.audio_conf.get('target_length')
        n_frames = fbank.shape[0]
        p = target_length - n_frames
        # cut and pad along the time axis (dimension 0)
        if p > 0:
            # (left, right, top, bottom): append p all-zero frames after the last one.
            m = torch.nn.ZeroPad2d((0, 0, 0, p))
            fbank = m(fbank)
        elif p < 0:
            fbank = fbank[0:target_length, :]
        return fbank

    def __getitem__(self, index):
        """
        Return the features and target for one record.

        :param index: position of the record in ``self.data``.
        :returns: ``(fbank, label_indices)`` where ``fbank`` is the output of
            ``_wav2fbank`` with shape [time_frame_num, frequency_bins], e.g. [1024, 128],
            and ``label_indices`` is a float tensor of length ``label_num`` holding
            1.0 at the index of every label of the record and 0.0 elsewhere.
        :raises KeyError: if a label of the record is absent from the class-label CSV.
        """
        datum = self.data[index]
        fbank = self._wav2fbank(datum['wav'])

        label_indices = torch.zeros(self.label_num, dtype=torch.float32)
        for label_str in datum['labels'].split(','):
            try:
                class_index = int(self.index_dict[label_str])
            except KeyError:
                raise KeyError(
                    'label {!r} of {!r} is not listed in the class label CSV'.format(label_str, datum['wav'])
                ) from None
            label_indices[class_index] = 1.0

        # the output fbank shape is [time_frame_num, frequency_bins], e.g., [1024, 128]
        return fbank, label_indices