In [1]:
# Step up one directory so the relative paths used in later cells (for example
# the manifest path) resolve, then echo the resulting working directory.
# NOTE(review): `%cd ..` is not idempotent -- re-running this cell climbs one
# more level each time; confirm the kernel starts in the expected directory.
%cd ..
%pwd

/workspace/DeepSpeech-2.x


'/workspace/DeepSpeech-2.x'

In [2]:
import argparse
import functools

from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments

# Command-line style options for the mean/std (CMVN) statistics computation.
# Every option is registered through `add_arguments`, pre-bound to `parser`.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples',      int,    -1,    "# of samples to for statistics.")
add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc, fbank.",
        choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim',    int, 13, "Audio feature dim.")
add_arg('delta_delta',    bool,
        False,
        "Audio feature with delta delta.")
add_arg('stride_ms',    float, 10.0,  "stride length in ms.")
# Fixed: this help text used to repeat the stride description verbatim.
add_arg('window_ms',    float, 20.0,  "window length in ms.")
add_arg('sample_rate',    int, 16000,  "target sample rate.")
add_arg('manifest_path',    str,
        'examples/aishell/s1/data/manifest.train.raw',
        "Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('num_workers',
                        default=1,
                        type=int,
                        help='num of subprocess workers for processing')
add_arg('output_path',    str,
        'data/librispeech/mean_std.npz',
        "Filepath of write mean and stddev to (.npz).")
# yapf: enable
# An empty argv is parsed on purpose: inside the notebook every option simply
# takes the default registered above.
args = parser.parse_args([])
print(args)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  def convert_to_list(value, n, name, dtype=np.int):
register user softmax to paddle, remove this when fixed!
register user log_softmax to paddle, remove this when fixed!
register user sigmoid to paddle, remove this when fixed!
register user log_sigmoid to paddle, remove this when fixed!
register user relu to paddle, remove this when fixed!
override cat of paddle if exists or register, remove this when fixed!
override item of paddle.Tensor if exists or register, remove this when fixed!
override long of paddle.Tensor if exists or register, remove this when fixed!
override new_full of paddle.Tensor if exists or register, remove this when fixed!
override eq of paddle.Tensor if exists or register, remove this when fixed!
override eq of paddle if exists or register, remove this when fixed!
override contiguous of paddle.Tensor if exists or register, remove this when fixed

Namespace(delta_delta=False, feat_dim=13, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=1, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='linear', stride_ms=10.0, window_ms=20.0)


In [5]:
import random

import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.io import Dataset

from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.utility import load_cmvn
from deepspeech.frontend.utility import read_manifest

class CollateFunc(object):
    """Collate callable that reduces a batch of feature matrices to statistics.

    Calling an instance with an iterable of 2-D arrays returns a tuple
    ``(frame_count, sum_stat, square_sum_stat)``:

    * ``frame_count`` -- the ``shape[1]`` of every array, added together;
    * ``sum_stat`` -- the ``axis=1`` sums of every array, added together;
    * ``square_sum_stat`` -- the ``axis=1`` sums of the element-wise squares,
      added together.

    For an empty batch the result is ``(0, None, None)``.
    """

    def __call__(self, batch):
        frame_count = 0
        total = None
        total_sq = None
        for feat in batch:
            row_sum = np.sum(feat, axis=1)
            row_sq_sum = np.sum(np.square(feat), axis=1)
            if total is None:
                # First item: its freshly computed sums seed both accumulators.
                total, total_sq = row_sum, row_sq_sum
            else:
                # Later items are folded in with in-place additions.
                total += row_sum
                total_sq += row_sq_sum
            frame_count += feat.shape[1]
        return frame_count, total, total_sq


class AudioDataset(Dataset):
    """Dataset that loads the audio named by a manifest entry and featurizes it.

    Args:
        manifest_path: path handed to ``read_manifest``.
        feature_func: callable applied to each loaded ``AudioSegment``; its
            result is what ``__getitem__`` returns.
        num_samples: how many manifest entries to draw with ``rng.sample``;
            ``-1`` keeps the whole manifest.
        rng: object exposing ``sample(population, k)``. When ``None`` a fresh
            ``random.Random()`` is used, so sub-sampling works without the
            caller having to supply a generator.
    """

    def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):
        self.feature_func = feature_func
        # Previously a missing rng stayed None and any num_samples other than
        # -1 failed with AttributeError on `None.sample`.
        self._rng = rng if rng is not None else random.Random()
        manifest = read_manifest(manifest_path)
        if num_samples == -1:
            sampled_manifest = manifest
        else:
            sampled_manifest = self._rng.sample(manifest, num_samples)
        self.items = sampled_manifest

    def __len__(self):
        """Return the number of retained manifest entries."""
        return len(self.items)

    def __getitem__(self, idx):
        """Load the audio referenced by entry ``idx`` and return its features."""
        key = self.items[idx]['feat']
        audioseg = AudioSegment.from_file(key)
        feat = self.feature_func(audioseg)  #(D, T)
        return feat

In [6]:

# Pipeline built from the literal config string '{}'.
# NOTE(review): presumably an empty config means no augmentation is applied -- confirm.
augmentation_pipeline = AugmentationPipeline('{}')

# Feature extractor driven by the parsed options; n_fft and max_freq are left
# as None and dB normalization is switched on with a target of -20.
audio_featurizer = AudioFeaturizer(
    target_sample_rate=args.sample_rate,
    specgram_type=args.specgram_type,
    feat_dim=args.feat_dim,
    delta_delta=args.delta_delta,
    window_ms=args.window_ms,
    stride_ms=args.stride_ms,
    max_freq=None,
    n_fft=None,
    target_dB=-20,
    use_dB_normalization=True)


def augment_and_featurize(audio_segment):
    """Run the augmentation pipeline on ``audio_segment``, then featurize it.

    The return value of ``transform_audio`` is ignored; the same segment
    object is then handed to the featurizer, whose result is returned.
    """
    augmentation_pipeline.transform_audio(audio_segment)
    features = audio_featurizer.featurize(audio_segment)
    return features


# Each batch is reduced to per-batch statistics by the collate callable.
collate_func = CollateFunc()

# Dataset over the configured manifest; every item is passed through
# augment_and_featurize.
dataset = AudioDataset(
    manifest_path=args.manifest_path,
    feature_func=augment_and_featurize,
    num_samples=args.num_samples)

# Number of utterances handed to the collate callable at a time.
batch_size = 20
data_loader = DataLoader(
    dataset,
    collate_fn=collate_func,
    num_workers=args.num_workers,
    shuffle=False,
    batch_size=batch_size)

# Accumulate the per-batch statistics produced by the collate callable into
# totals over the whole dataset, then package them as `cmvn_info`.
# NOTE(review): the output recorded for this cell shows the DataLoader worker
# failing with "TypeError: 'int' object is not iterable"; presumably the plain
# int frame count returned by the collate callable is what the worker cannot
# iterate -- confirm against the installed Paddle version.
with paddle.no_grad():
    all_mean_stat = None
    all_var_stat = None
    all_number = 0
    wav_number = 0
    for batch in data_loader():
        number, mean_stat, var_stat = batch
        # The first batch seeds the accumulators. The old `if i == 0` test
        # read a loop index that is never defined in this cell.
        if all_mean_stat is None:
            all_mean_stat = mean_stat
            all_var_stat = var_stat
        else:
            all_mean_stat += mean_stat
            all_var_stat += var_stat
        all_number += number
        # The last batch may hold fewer than batch_size items, so clamp the
        # running count to the dataset size instead of overcounting.
        wav_number = min(wav_number + batch_size, len(dataset))

        if wav_number % 1000 == 0:
            print('process {} wavs,{} frames'.format(wav_number,
                                                           all_number))

# Fail with an explicit message rather than an AttributeError on None when
# the loader produced nothing.
if all_mean_stat is None:
    raise ValueError('no statistics were accumulated: the data loader '
                     'yielded no batches')

cmvn_info = {
    'mean_stat': list(all_mean_stat.tolist()),
    'var_stat': list(all_var_stat.tolist()),
    'frame_num': all_number
}

Process Process-2:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 463, in _worker_loop
    six.reraise(*sys.exc_info())
  File "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/six.py", line 703, in reraise
    raise value
  File "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 446, in _worker_loop
    for s in sample:
TypeError: 'int' object is not iterable
2021-04-20 07:43:09,866 - ERROR - DataLoader reader thread raised an exception!


SystemError: (Fatal) Blocking queue is killed because the data reader raises an exception.
  [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)


Exception in thread Thread-5:
Traceback (most recent call last):
  File "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 684, in _get_data
    data = self._data_queue.get(timeout=self._timeout)
  File "/usr/local/lib/python3.7/multiprocessing/queues.py", line 105, in get
    raise Empty
_queue.Empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/threading.py", line 917, in _bootstrap_inner
    self.run()
  File "/usr/local/lib/python3.7/threading.py", line 865, in run
    self._target(*self._args, **self._kwargs)
  File "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 616, in _thread_loop
    batch = self._get_data()
  File "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py", line 700, in _get_data
    "pids