Improve audio featurizer and add shift augmentor for DS2. #114

Merged: 2 commits, Jun 26, 2017
4 changes: 2 additions & 2 deletions deep_speech_2/README.md
@@ -51,13 +51,13 @@ python compute_mean_std.py --help
For GPU Training:

```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python train.py
```

For CPU Training:

```
python train.py --trainer_count 8 --use_gpu False
python train.py --use_gpu False
```

More help for arguments:
157 changes: 91 additions & 66 deletions deep_speech_2/data_utils/audio.py
@@ -66,6 +66,54 @@ def from_file(cls, file):
samples, sample_rate = soundfile.read(file, dtype='float32')
return cls(samples, sample_rate)

@classmethod
def slice_from_file(cls, file, start=None, end=None):
xinghai-sun (Contributor, Author) commented on Jun 21, 2017:
@reviewers: slice_from_file and make_silence are unchanged; they have only been re-ordered.

"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.

:param file: Input audio filepath or file object.
:type file: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: AudioSegment instance of the specified slice of the input
audio file.
:rtype: AudioSegment
:raise ValueError: If start or end is incorrectly set, e.g. out of
bounds in time.
"""
sndfile = soundfile.SoundFile(file)
sample_rate = sndfile.samplerate
duration = float(len(sndfile)) / sample_rate
start = 0. if start is None else start
end = duration if end is None else end
if start < 0.0:
start += duration
if end < 0.0:
end += duration
if start < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start)
if end < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end)
if start > end:
raise ValueError("The slice start position (%f s) is later than "
"the slice end position (%f s)." % (start, end))
if end > duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end, duration))
start_frame = int(start * sample_rate)
end_frame = int(end * sample_rate)
sndfile.seek(start_frame)
data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
return cls(data, sample_rate)
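
For illustration, a minimal usage sketch of the new slice_from_file classmethod; the audio path is hypothetical and the import path follows the other files in this PR:

```
from data_utils.audio import AudioSegment

# Read only the span from 1.0 s to 3.0 s of a (hypothetical) recording,
# without loading the whole file into memory.
clip = AudioSegment.slice_from_file("sample.wav", start=1.0, end=3.0)

# Negative offsets wrap around from the end: the half second that starts
# 1.0 s before the end of the file.
tail = AudioSegment.slice_from_file("sample.wav", start=-1.0, end=-0.5)
```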

@classmethod
def from_bytes(cls, bytes):
"""Create audio segment from a byte string containing audio samples.
@@ -105,6 +153,20 @@ def concatenate(cls, *segments):
samples = np.concatenate([seg.samples for seg in segments])
return cls(samples, sample_rate)

@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent audio segment of the given duration and sample rate.

:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silent AudioSegment instance of the given duration.
:rtype: AudioSegment
"""
samples = np.zeros(int(duration * sample_rate))
return cls(samples, sample_rate)
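
A short sketch of make_silence combined with the existing concatenate classmethod; the file path is hypothetical, and concatenate assumes all segments share the same sample rate:

```
from data_utils.audio import AudioSegment

# Half a second of digital silence at 16 kHz.
silence = AudioSegment.make_silence(duration=0.5, sample_rate=16000)

# Surround a (hypothetical) 16 kHz utterance with silence on both sides.
utterance = AudioSegment.from_file("utterance.wav")
padded = AudioSegment.concatenate(silence, utterance, silence)
```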

def to_wav_file(self, filepath, dtype='float32'):
"""Save audio segment to disk as wav file.

@@ -130,68 +192,6 @@ def to_wav_file(self, filepath, dtype='float32'):
format='WAV',
subtype=subtype_map[dtype])

@classmethod
def slice_from_file(cls, file, start=None, end=None):
"""Loads a small section of an audio without having to load
the entire file into the memory which can be incredibly wasteful.

:param file: Input audio filepath or file object.
:type file: basestring|file
:param start: Start time in seconds. If start is negative, it wraps
around from the end. If not provided, this function
reads from the very beginning.
:type start: float
:param end: End time in seconds. If end is negative, it wraps around
from the end. If not provided, the default behavior is
to read to the end of the file.
:type end: float
:return: AudioSegment instance of the specified slice of the input
audio file.
:rtype: AudioSegment
:raise ValueError: If start or end is incorrectly set, e.g. out of
bounds in time.
"""
sndfile = soundfile.SoundFile(file)
sample_rate = sndfile.samplerate
duration = float(len(sndfile)) / sample_rate
start = 0. if start is None else start
end = duration if end is None else end
if start < 0.0:
start += duration
if end < 0.0:
end += duration
if start < 0.0:
raise ValueError("The slice start position (%f s) is out of "
"bounds." % start)
if end < 0.0:
raise ValueError("The slice end position (%f s) is out of bounds." %
end)
if start > end:
raise ValueError("The slice start position (%f s) is later than "
"the slice end position (%f s)." % (start, end))
if end > duration:
raise ValueError("The slice end position (%f s) is out of bounds "
"(> %f s)" % (end, duration))
start_frame = int(start * sample_rate)
end_frame = int(end * sample_rate)
sndfile.seek(start_frame)
data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
return cls(data, sample_rate)

@classmethod
def make_silence(cls, duration, sample_rate):
"""Creates a silent audio segment of the given duration and sample rate.

:param duration: Length of silence in seconds.
:type duration: float
:param sample_rate: Sample rate.
:type sample_rate: float
:return: Silent AudioSegment instance of the given duration.
:rtype: AudioSegment
"""
samples = np.zeros(int(duration * sample_rate))
return cls(samples, sample_rate)

def superimpose(self, other):
"""Add samples from another segment to those of this segment
(sample-wise addition, not segment concatenation).
@@ -225,7 +225,7 @@ def to_bytes(self, dtype='float32'):
samples = self._convert_samples_from_float32(self._samples, dtype)
return samples.tostring()

def apply_gain(self, gain):
def gain_db(self, gain):
"""Apply gain in decibels to samples.

Note that this is an in-place transformation.
@@ -278,7 +278,7 @@ def normalize(self, target_db=-20, max_gain_db=300.0):
"Unable to normalize segment to %f dB because the "
"the probable gain have exceeds max_gain_db (%f dB)" %
(target_db, max_gain_db))
self.apply_gain(min(max_gain_db, target_db - self.rms_db))
self.gain_db(min(max_gain_db, target_db - self.rms_db))
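
With apply_gain renamed to gain_db, in-place level adjustment would look like the sketch below (the input file is hypothetical):

```
from data_utils.audio import AudioSegment

segment = AudioSegment.from_file("sample.wav")  # hypothetical recording

# Attenuate by 6 dB in place (this method was previously named apply_gain).
segment.gain_db(-6.0)

# Or bring the segment to a target RMS level of -20 dB; normalize()
# computes the required gain and applies it through gain_db().
segment.normalize(target_db=-20)
```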

def normalize_online_bayesian(self,
target_db,
@@ -319,7 +319,7 @@ def normalize_online_bayesian(self,
rms_estimate_db = 10 * np.log10(mean_squared_estimate)
# Compute required time-varying gain.
gain_db = target_db - rms_estimate_db
self.apply_gain(gain_db)
self.gain_db(gain_db)

def resample(self, target_sample_rate, quality='sinc_medium'):
"""Resample the audio to a target sample rate.
@@ -366,6 +366,31 @@ def pad_silence(self, duration, sides='both'):
raise ValueError("Unknown value for the sides %s" % sides)
self._samples = padded._samples

def shift(self, shift_ms):
"""Shift the audio in time. If `shift_ms` is positive, shift with time
advance; if negative, shift with time delay. Silence are padded to
keep the duration unchanged.

Note that this is an in-place transformation.

:param shift_ms: Shift time in milliseconds. If positive, shift with
time advance; if negative, shift with time delay.
:type shift_ms: float
:raises ValueError: If shift_ms is longer than audio duration.
"""
if abs(shift_ms) / 1000.0 > self.duration:
raise ValueError("Absolute value of shift_ms should be smaller "
"than audio duration.")
shift_samples = int(shift_ms * self._sample_rate / 1000)
if shift_samples > 0:
# time advance
self._samples[:-shift_samples] = self._samples[shift_samples:]
self._samples[-shift_samples:] = 0
elif shift_samples < 0:
# time delay
self._samples[-shift_samples:] = self._samples[:shift_samples]
self._samples[:-shift_samples] = 0
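
A brief sketch of the new shift method; the input file is hypothetical:

```
from data_utils.audio import AudioSegment

segment = AudioSegment.from_file("sample.wav")  # hypothetical recording

# Advance the audio by 5 ms: samples move toward the start and the last
# 5 ms is filled with silence, so the duration stays the same.
segment.shift(shift_ms=5.0)

# Delay by 5 ms: samples move toward the end and the first 5 ms
# becomes silence.
segment.shift(shift_ms=-5.0)
```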

def subsegment(self, start_sec=None, end_sec=None):
"""Cut the AudioSegment between given boundaries.

@@ -505,7 +530,7 @@ def add_noise(self,
noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
noise_new = copy.deepcopy(noise)
noise_new.random_subsegment(self.duration, rng=rng)
noise_new.apply_gain(noise_gain_db)
noise_new.gain_db(noise_gain_db)
self.superimpose(noise_new)

@property
3 changes: 3 additions & 0 deletions deep_speech_2/data_utils/augmentor/augmentation.py
@@ -6,6 +6,7 @@
import json
import random
from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor


class AugmentationPipeline(object):
@@ -76,5 +77,7 @@ def _get_augmentor(self, augmentor_type, params):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
else:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
2 changes: 1 addition & 1 deletion deep_speech_2/data_utils/augmentor/volume_perturb.py
@@ -36,5 +36,5 @@ def transform_audio(self, audio_segment):
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegment|SpeechSegment
"""
gain = self._rng.uniform(min_gain_dBFS, max_gain_dBFS)
gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
audio_segment.apply_gain(gain)
7 changes: 6 additions & 1 deletion deep_speech_2/data_utils/data.py
@@ -45,6 +45,9 @@ class DataGenerator(object):
:type max_freq: None|float
:param specgram_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str
:param use_dB_normalization: Whether to normalize the audio to -20 dB
before extracting the features.
:type use_dB_normalization: bool
:param num_threads: Number of CPU threads for processing data.
:type num_threads: int
:param random_seed: Random seed.
@@ -61,6 +64,7 @@ def __init__(self,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count(),
random_seed=0):
self._max_duration = max_duration
@@ -73,7 +77,8 @@ def __init__(self,
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq)
max_freq=max_freq,
use_dB_normalization=use_dB_normalization)
self._num_threads = num_threads
self._rng = random.Random(random_seed)
self._epoch = 0
42 changes: 40 additions & 2 deletions deep_speech_2/data_utils/featurizer/audio_featurizer.py
@@ -24,26 +24,64 @@ class AudioFeaturizer(object):
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Audio is resampled (if upsampling or
downsampling is allowed) to this sample rate
before extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibel value before extracting the features.

Reviewer (Contributor): Better to change "decibels" to "dB" for consistency.
Author (xinghai-sun): In comments, the full word "decibels" is used for clarity, while the short name "dB" is used in argument names. I think it makes sense?

:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.
:type target_dB: float
"""

def __init__(self,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None):
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._specgram_type = specgram_type
self._stride_ms = stride_ms
self._window_ms = window_ms
self._max_freq = max_freq
self._target_sample_rate = target_sample_rate
self._use_dB_normalization = use_dB_normalization
self._target_dB = target_dB

def featurize(self, audio_segment):
def featurize(self,
audio_segment,
allow_downsampling=True,
allow_upsampling=True):
"""Extract audio features from AudioSegment or SpeechSegment.

:param audio_segment: Audio/speech segment to extract features from.
:type audio_segment: AudioSegment|SpeechSegment
:param allow_downsampling: Whether to allow audio downsampling before
featurizing.
:type allow_downsampling: bool
:param allow_upsampling: Whether to allow audio upsampling before
featurizing.
:type allow_upsampling: bool
:return: Spectrogram audio feature in 2darray.
:rtype: ndarray
:raises ValueError: If audio sample rate is not supported.
"""
# upsampling or downsampling
if ((audio_segment.sample_rate > self._target_sample_rate and
allow_downsampling) or
(audio_segment.sample_rate < self._target_sample_rate and
allow_upsampling)):
audio_segment.resample(self._target_sample_rate)
if audio_segment.sample_rate != self._target_sample_rate:
raise ValueError("Audio sample rate is not supported. "
"Turn allow_downsampling or allow up_sampling on.")
# decibel normalization

Reviewer (Contributor): "dB" better?
Author (xinghai-sun): In comments, the full word "decibels" is used for clarity, while "dB" is used in argument names.

if self._use_dB_normalization:
audio_segment.normalize(target_db=self._target_dB)
# extract spectrogram
return self._compute_specgram(audio_segment.samples,
audio_segment.sample_rate)

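Putting the new constructor arguments and featurize flags together, a minimal sketch (the audio path is hypothetical):

```
from data_utils.audio import AudioSegment
from data_utils.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    specgram_type='linear',
    stride_ms=10.0,
    window_ms=20.0,
    target_sample_rate=16000,
    use_dB_normalization=True,
    target_dB=-20)

audio = AudioSegment.from_file("sample.wav")  # hypothetical 8 kHz recording
# Resampled to 16 kHz (upsampling is allowed by default), normalized to
# -20 dB, then converted to a linear spectrogram (2-D ndarray).
spectrogram = featurizer.featurize(audio, allow_upsampling=True)
```
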
24 changes: 21 additions & 3 deletions deep_speech_2/data_utils/featurizer/speech_featurizer.py
@@ -29,16 +29,34 @@ class SpeechFeaturizer(object):
corresponding to frequencies between [0, max_freq] are
returned.
:type max_freq: None|float
:param target_sample_rate: Speech is resampled (if upsampling or
downsampling is allowed) to this sample rate
before extracting spectrogram features.
:type target_sample_rate: float
:param use_dB_normalization: Whether to normalize the audio to a certain
decibel value before extracting the features.
:type use_dB_normalization: bool
:param target_dB: Target audio decibels for normalization.

Reviewer (Contributor): "dB" better?
Author (xinghai-sun): In comments, the full word "decibels" is used for clarity, while "dB" is used in argument names.

:type target_dB: float
"""

def __init__(self,
vocab_filepath,
specgram_type='linear',
stride_ms=10.0,
window_ms=20.0,
max_freq=None):
self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms,
window_ms, max_freq)
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
self._audio_featurizer = AudioFeaturizer(
specgram_type=specgram_type,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
target_sample_rate=target_sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB)
self._text_featurizer = TextFeaturizer(vocab_filepath)

def featurize(self, speech_segment):
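SpeechFeaturizer simply forwards the new arguments to its internal AudioFeaturizer; a construction sketch with a hypothetical vocabulary path:

```
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer

featurizer = SpeechFeaturizer(
    vocab_filepath='datasets/vocab/eng_vocab.txt',  # hypothetical path
    specgram_type='linear',
    stride_ms=10.0,
    window_ms=20.0,
    target_sample_rate=16000,
    use_dB_normalization=True,
    target_dB=-20)
```
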
2 changes: 1 addition & 1 deletion deep_speech_2/infer.py
@@ -56,7 +56,7 @@
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='./params.tar.gz',
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(