1. Fix incorrect decoder result printing.
2. Fix incorrect batch-norm usage in RNN.
3. Fix overlapping train/dev/test manifests.
4. Update README.md and requirements.txt.
5. Expose more arguments to users in argparser.
6. Update all other details.
xinghai-sun committed Jun 2, 2017
1 parent f6d820e commit 5de8e43
Showing 8 changed files with 280 additions and 137 deletions.
55 changes: 52 additions & 3 deletions deep_speech_2/README.md
@@ -1,9 +1,58 @@
# Deep Speech 2 on PaddlePaddle

## Quick Start

### Installation

Please replace `$PADDLE_INSTALL_DIR` with your PaddlePaddle installation directory.

```
pip install -r requirements.txt
export LD_LIBRARY_PATH=$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib:$LD_LIBRARY_PATH
```

On some machines, `libsndfile1` also needs to be installed. Details to be added.

### Preparing Dataset(s)

```
sh requirements.sh
python librispeech.py
python train.py
```

Please add the warp-ctc library path (usually `$PADDLE_INSTALL_DIR/Paddle/third_party/install/warpctc/lib`) to `LD_LIBRARY_PATH`.
For more help on arguments:

```
python librispeech.py --help
```

### Training

For GPU Training:

```
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --trainer_count 4
```

For CPU Training:

```
python train.py --trainer_count 8 --use_gpu False
```

For more help on arguments:

```
python train.py --help
```

### Inference

```
python infer.py
```

For more help on arguments:

```
python infer.py --help
```
2 changes: 1 addition & 1 deletion deep_speech_2/audio_data_utils.py
@@ -16,7 +16,7 @@

class DataGenerator(object):
    """
-    DataGenerator provides basic audio data preprocessing pipeline, and offer
+    DataGenerator provides basic audio data preprocessing pipeline, and offers
    both instance-level and batch-level data reader interfaces.
    Normalized FFT are used as audio features here.
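A note on the docstring above: the "normalized FFT" features it mentions are spectrogram-style features. Below is a rough sketch of the idea, assuming 16 kHz audio and the `window_ms=20` / `stride_ms=10` values used in `infer.py` further down; it is an illustration, not the actual `DataGenerator` code. A 20 ms window at 16 kHz is 320 samples, and the real FFT of a 320-sample frame has 161 bins, which matches the `height=161` input layer in `infer.py`.

```
import numpy as np

def spectrogram_features(samples, sample_rate=16000, window_ms=20, stride_ms=10):
    # Illustrative sketch only -- not the real DataGenerator implementation.
    window_size = int(sample_rate * window_ms / 1000)  # 320 samples per frame
    stride_size = int(sample_rate * stride_ms / 1000)  # 160 samples per hop
    frames = np.array([
        samples[i:i + window_size]
        for i in range(0, len(samples) - window_size + 1, stride_size)
    ])
    # Real FFT of a 320-sample frame yields 161 frequency bins (height=161).
    feat = np.log(
        np.abs(np.fft.rfft(frames * np.hanning(window_size), axis=1)).T + 1e-14)
    # "Normalized": zero mean / unit variance; the real pipeline estimates the
    # statistics from a normalizer manifest (see normalizer_manifest_path).
    return (feat - feat.mean()) / (feat.std() + 1e-14)
```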
83 changes: 59 additions & 24 deletions deep_speech_2/infer.py
@@ -4,9 +4,10 @@

import paddle.v2 as paddle
from itertools import groupby
+import distutils.util
import argparse
import gzip
-import audio_data_utils
+from audio_data_utils import DataGenerator
from model import deep_speech2

parser = argparse.ArgumentParser(
@@ -15,15 +16,42 @@
    "--num_samples",
    default=10,
    type=int,
-    help="Number of samples for inference.")
+    help="Number of samples for inference. (default: %(default)s)")
parser.add_argument(
-    "--num_conv_layers", default=2, type=int, help="Convolution layer number.")
+    "--num_conv_layers",
+    default=2,
+    type=int,
+    help="Convolution layer number. (default: %(default)s)")
+parser.add_argument(
+    "--num_rnn_layers",
+    default=3,
+    type=int,
+    help="RNN layer number. (default: %(default)s)")
+parser.add_argument(
+    "--rnn_layer_size",
+    default=512,
+    type=int,
+    help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--use_gpu",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
-    "--num_rnn_layers", default=3, type=int, help="RNN layer number.")
+    "--normalizer_manifest_path",
+    default='./manifest.libri.train-clean-100',
+    type=str,
+    help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
-    "--rnn_layer_size", default=512, type=int, help="RNN layer cell number.")
+    "--decode_manifest_path",
+    default='./manifest.libri.test-clean',
+    type=str,
+    help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
-    "--use_gpu", default=True, type=bool, help="Use gpu or not.")
+    "--model_filepath",
+    default='./params.tar.gz',
+    type=str,
+    help="Model filepath. (default: %(default)s)")
args = parser.parse_args()
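A side note on the `--use_gpu` change above: the old `type=bool` is a classic argparse pitfall — `bool("False")` is `True` because any non-empty string is truthy, so `--use_gpu False` would silently still enable the GPU. `distutils.util.strtobool` actually parses the string. A minimal demonstration:

```
import argparse
import distutils.util

parser = argparse.ArgumentParser()
parser.add_argument("--buggy", default=True, type=bool)
parser.add_argument("--fixed", default=True, type=distutils.util.strtobool)
args = parser.parse_args(["--buggy", "False", "--fixed", "False"])

print(args.buggy)  # True -- bool("False") is True: non-empty strings are truthy
print(args.fixed)  # 0    -- strtobool("False") returns 0, i.e. falsy, as intended
```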


@@ -39,18 +67,27 @@ def remove_duplicate_and_blank(id_list, blank_id):
    return [id for id in id_list if id != blank_id]


-def max_infer():
+def best_path_decode():
    """
    Max-ctc-decoding for DeepSpeech2.
    """
+    # initialize data generator
+    data_generator = DataGenerator(
+        vocab_filepath='eng_vocab.txt',
+        normalizer_manifest_path=args.normalizer_manifest_path,
+        normalizer_num_samples=200,
+        max_duration=20.0,
+        min_duration=0.0,
+        stride_ms=10,
+        window_ms=20)
    # create network config
-    _, vocab_list = audio_data_utils.get_vocabulary()
-    dict_size = len(vocab_list)
+    dict_size = data_generator.vocabulary_size()
+    vocab_list = data_generator.vocabulary_list()
    audio_data = paddle.layer.data(
        name="audio_spectrogram",
        height=161,
-        width=1000,
-        type=paddle.data_type.dense_vector(161000))
+        width=2000,
+        type=paddle.data_type.dense_vector(322000))
    text_data = paddle.layer.data(
        name="transcript_text",
        type=paddle.data_type.integer_value_sequence(dict_size))
@@ -64,19 +101,17 @@ def max_infer():

    # load parameters
    parameters = paddle.parameters.Parameters.from_tar(
-        gzip.open("params.tar.gz"))
+        gzip.open(args.model_filepath))

    # prepare infer data
-    feeding = {
-        "audio_spectrogram": 0,
-        "transcript_text": 1,
-    }
-    test_batch_reader = audio_data_utils.padding_batch_reader(
-        paddle.batch(
-            audio_data_utils.reader_creator(
-                manifest_path="./libri.manifest.test", sort_by_duration=False),
-            batch_size=args.num_samples),
-        padding=[-1, 1000])
+    feeding = data_generator.data_name_feeding()
+    test_batch_reader = data_generator.batch_reader_creator(
+        manifest_path=args.decode_manifest_path,
+        batch_size=args.num_samples,
+        padding_to=2000,
+        flatten=True,
+        sort_by_duration=False,
+        shuffle=False)
    infer_data = test_batch_reader().next()
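Why `padding_to=2000` and `flatten=True`: the `audio_spectrogram` input above is a fixed-size `dense_vector(322000)`, i.e. 161 frequency bins × 2000 time frames, so every spectrogram in a batch must be zero-padded along the time axis and flattened. A rough sketch of the idea (illustrative only, not the actual `batch_reader_creator` internals):

```
import numpy as np

def pad_and_flatten(spectrogram, padding_to=2000):
    # Pad a (161, num_frames) spectrogram with zeros up to `padding_to`
    # frames, then flatten it for a fixed-size dense_vector input layer.
    num_bins, num_frames = spectrogram.shape
    assert num_frames <= padding_to, "clip longer than the padded width"
    padded = np.zeros((num_bins, padding_to), dtype=spectrogram.dtype)
    padded[:, :num_frames] = spectrogram
    return padded.flatten()  # length 161 * 2000 = 322000
```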

# run max-ctc-decoding
@@ -89,7 +124,7 @@
    # postprocess
    instance_length = len(max_id_results) / args.num_samples
    instance_list = [
-        max_id_results[i:i + instance_length]
+        max_id_results[i * instance_length:(i + 1) * instance_length]
        for i in xrange(0, args.num_samples)
    ]
    for i, instance in enumerate(instance_list):
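This one-line change is the "incorrect decoder result printing" fix from the commit message: the old slice starts at index `i` instead of `i * instance_length`, so consecutive samples overlap rather than partitioning the flat result list. A quick check:

```
# flat results for num_samples=2, instance_length=3
max_id_results = [11, 12, 13, 21, 22, 23]
instance_length = 3

old = [max_id_results[i:i + instance_length] for i in range(2)]
new = [max_id_results[i * instance_length:(i + 1) * instance_length]
       for i in range(2)]

print(old)  # [[11, 12, 13], [12, 13, 21]] -- overlapping, wrong
print(new)  # [[11, 12, 13], [21, 22, 23]] -- disjoint per-sample slices
```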
@@ -102,7 +137,7 @@

def main():
    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
-    max_infer()
+    best_path_decode()


if __name__ == '__main__':
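The rename from `max_infer` to `best_path_decode` names the algorithm precisely: best-path (greedy) CTC decoding takes the argmax label at each frame, collapses consecutive repeats, then removes blanks — the latter two steps being what `remove_duplicate_and_blank` implements. A self-contained sketch, with blank id 0 and a letter-index vocabulary assumed purely for illustration:

```
from itertools import groupby

def best_path_decode(frame_ids, blank_id=0):
    # Greedy CTC decoding: collapse repeated ids, then drop blanks.
    collapsed = [k for k, _ in groupby(frame_ids)]
    return [i for i in collapsed if i != blank_id]

# Per-frame argmax ids spelling "hh-ee-l--l-oo" with blank=0 and a=1..z=26.
frames = [8, 8, 5, 5, 12, 0, 0, 12, 12, 12, 0, 15, 15]
print(best_path_decode(frames))  # [8, 5, 12, 12, 15] -> "hello"
```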
80 changes: 56 additions & 24 deletions deep_speech_2/librispeech.py
@@ -7,46 +7,64 @@
"""

import paddle.v2 as paddle
+from paddle.v2.dataset.common import md5file
import os
import wget
import tarfile
import argparse
import soundfile
import json

-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+DATA_HOME = os.path.expanduser('~/.cache2/paddle/dataset/speech')

-URL_TEST = "http://www.openslr.org/resources/12/test-clean.tar.gz"
-URL_DEV = "http://www.openslr.org/resources/12/dev-clean.tar.gz"
-URL_TRAIN = "http://www.openslr.org/resources/12/train-clean-100.tar.gz"
+URL_ROOT = "http://www.openslr.org/resources/12"
+URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz"
+URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz"
+URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz"
+URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz"
+URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz"
+URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz"
+URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz"

+MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9"
+MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1"
+MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522"
+MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa"
+MD5_TRAIN_CLEAN_500 = "d1a0fd59409feb2c614ce4d30c387708"

parser = argparse.ArgumentParser(
    description='Downloads and prepare LibriSpeech dataset.')
parser.add_argument(
    "--target_dir",
    default=DATA_HOME + "/Libri",
    type=str,
-    help="Directory to save the dataset.")
+    help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
-    "--manifest",
-    default="./libri.manifest",
+    "--manifest_prefix",
+    default="manifest.libri",
    type=str,
-    help="Filepath prefix for output manifests.")
+    help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()


-def download(url, target_dir):
-    if not os.path.exists(target_dir):
-        os.makedirs(target_dir)
+def download(url, md5sum, target_dir):
+    """
+    Download file from url to target_dir, and check md5sum.
+    """
+    if not os.path.exists(target_dir): os.makedirs(target_dir)
    filepath = os.path.join(target_dir, url.split("/")[-1])
-    if not os.path.exists(filepath):
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
        print("Downloading %s ..." % url)
        wget.download(url, target_dir)
-        print("")
+        print("\nMD5 Checksum %s ..." % filepath)
+        assert md5file(filepath) == md5sum, "MD5 checksum failed."
    return filepath


def unpack(filepath, target_dir):
+    """
+    Unpack the file to the target_dir.
+    """
    print("Unpacking %s ..." % filepath)
    tar = tarfile.open(filepath)
    tar.extractall(target_dir)
@@ -55,6 +73,14 @@ def unpack(filepath, target_dir):


def create_manifest(data_dir, manifest_path):
+    """
+    Create a manifest file summarizing the dataset (list of filepath and meta
+    data).
+    Each line of the manifest contains one audio clip filepath, its
+    transcription text string, and its duration. The manifest file serves as
+    a unified interface to organize data sets.
+    """
+    print("Creating manifest %s ..." % manifest_path)
    json_lines = []
    for subfolder, _, filelist in os.walk(data_dir):
@@ -81,25 +107,31 @@ def create_manifest(data_dir, manifest_path):
        out_file.write(line + '\n')


-def prepare_dataset(url, target_dir, manifest_path):
-    filepath = download(url, target_dir)
+def prepare_dataset(url, md5sum, target_dir, manifest_path):
+    """
+    Download, unpack and create summary manifest file.
+    """
+    filepath = download(url, md5sum, target_dir)
    unpacked_dir = unpack(filepath, target_dir)
    create_manifest(unpacked_dir, manifest_path)


def main():
    prepare_dataset(
-        url=URL_TEST,
-        target_dir=os.path.join(args.target_dir),
-        manifest_path=args.manifest + ".test")
+        url=URL_TEST_CLEAN,
+        md5sum=MD5_TEST_CLEAN,
+        target_dir=os.path.join(args.target_dir, "test-clean"),
+        manifest_path=args.manifest_prefix + ".test-clean")
    prepare_dataset(
-        url=URL_DEV,
-        target_dir=os.path.join(args.target_dir),
-        manifest_path=args.manifest + ".dev")
+        url=URL_DEV_CLEAN,
+        md5sum=MD5_DEV_CLEAN,
+        target_dir=os.path.join(args.target_dir, "dev-clean"),
+        manifest_path=args.manifest_prefix + ".dev-clean")
    prepare_dataset(
-        url=URL_TRAIN,
-        target_dir=os.path.join(args.target_dir),
-        manifest_path=args.manifest + ".train")
+        url=URL_TRAIN_CLEAN_100,
+        md5sum=MD5_TRAIN_CLEAN_100,
+        target_dir=os.path.join(args.target_dir, "train-clean-100"),
+        manifest_path=args.manifest_prefix + ".train-clean-100")


if __name__ == '__main__':
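For reference, the manifest described in the new `create_manifest` docstring is a text file with one JSON object per line giving an audio clip's filepath, transcription, and duration. A hypothetical line follows; the exact key names live in the elided body of `create_manifest` and are assumed here purely for illustration:

```
import json

# Hypothetical manifest line -- key names and values are illustrative only.
line = json.dumps({
    "audio_filepath": "/path/to/LibriSpeech/test-clean/xx/yy/xx-yy-0000.flac",
    "duration": 10.13,
    "text": "some transcription text",
})
print(line)
```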
