This tool converts a folder of samples to a big rectangular matrix with one mono sample per row.

Samples should be placed in `data/mydataset/samples/`. They could be `.mp3`, `.wav`, or anything else that ffmpeg can work with. They may be all in one folder, or in nested sub-folders.

Change the path below to point to the root directory, e.g., `data/mydataset/`.

The samplerate `sr` is not necessarily the native samplerate of the samples, it's the samplerate you want to load them at.

The output of this notebook is:
* `data/mydataset/durations.txt`
* `data/mydataset/filenames.txt`
* `data/mydataset/samples.npy`

In [2]:
data_root = '/home/tracek/Data/Birdman/'
sr = 16000
max_length = sr * 4 # ignore samples longer than 4 seconds
fixed_length = sr // 4 # trim all samples to 250 milliseconds
limit = None # set this to 100 to only load the first 100 samples

In [3]:
import numpy as np
from os.path import join
from multiprocessing import Pool
from utils import list_all_files, ffmpeg_load_audio

In [4]:
files = list(list_all_files(join(data_root, 'samples'), ['.wav']))
len(files)

142

In [5]:
def load_sample(fn, sr=None,
                max_length=None, fixed_length=None, normalize=True):
    if fn == '': # ignore empty filenames
        return None
    audio, _ = ffmpeg_load_audio(fn, sr, mono=True)
    duration = len(audio)
    if duration == 0: # ignore zero-length samples
        return None
    if max_length and duration >= max_length: # ignore long samples
        return None
    if fixed_length:
        audio.resize(fixed_length)
    max_val = np.abs(audio).max()
    if max_val == 0: # ignore completely silent sounds
        return None
    if normalize:
        audio /= max_val
    return (fn, audio, duration)

In [7]:
def job(fn):
    return load_sample(fn, sr=sr,
                       max_length=max_length, fixed_length=fixed_length)
pool = Pool()
%time results = pool.map(job, files[:limit])
print('Processed', len(results), 'samples')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 1.31 s
Processed 142 samples


In [10]:
valid = filter(None, results)
filenames = [x[0] for x in valid]
samples = [x[1] for x in valid]
durations = [x[2] for x in valid]
samples = np.asarray(samples)
np.savetxt(join(data_root, 'filenames.txt'), filenames, fmt='%s')
np.savetxt(join(data_root, 'durations.txt'), durations, fmt='%i')
%time np.save(join(data_root, 'samples.npy'), samples)
print('Saved', len(valid), 'samples')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 357 µs


TypeError: object of type 'filter' has no len()

In [11]:
results

[('/home/tracek/Data/Birdman/samples/STHELENA-02_20140605_200000_1_16k_081.740_081.844.wav',
  array([ 0.03625674,  0.01518863, -0.02155806, ...,  0.        ,
          0.        ,  0.        ], dtype=float32),
  1675),
 ('/home/tracek/Data/Birdman/samples/STHELENA-02_20140605_200000_1_16k_082.187_082.937.wav',
  array([ 0.03028798,  0.02805362,  0.04046673, ...,  0.00148957,
          0.00049652, -0.01812314], dtype=float32),
  12000),
 ('/home/tracek/Data/Birdman/samples/STHELENA-02_20140605_200000_1_16k_010.456_010.567.wav',
  array([-0.12734452, -0.05100362,  0.03553801, ...,  0.        ,
          0.        ,  0.        ], dtype=float32),
  1780),
 ('/home/tracek/Data/Birdman/samples/STHELENA-02_20140605_200000_1_16k_068.037_068.787.wav',
  array([ 0.01075269, -0.15591398, -0.01612903, ...,  0.31182796,
         -0.2311828 ,  0.0483871 ], dtype=float32),
  12000),
 ('/home/tracek/Data/Birdman/samples/STHELENA-02_20140605_200000_1_16k_008.123_008.444.wav',
  array([-0.09440559, -0.