Merge pull request #1712 from yt605155624/add_cnndecoder_onnx

[TTS]add fastspeech2 cnndecoder onnx model
PaddlePaddle · Apr 18, 2022 · b78bc63 · b78bc63
2 parents 0cde9f8 + da93f94
commit b78bc63
Show file tree

Hide file tree

Showing 17 changed files with 907 additions and 200 deletions.
diff --git a/examples/csmsc/tts2/local/inference.sh b/examples/csmsc/tts2/local/inference.sh
@@ -30,21 +30,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --tones_dict=dump/tone_id_map.txt
 fi
 
-# style melgan
-# style melgan's Dygraph to Static Graph is not ready now
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/../inference.py \
-        --inference_dir=${train_output_path}/inference \
-        --am=speedyspeech_csmsc \
-        --voc=style_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=${train_output_path}/pd_infer_out \
-        --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
-fi
-
 # hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${BIN_DIR}/../inference.py \
         --inference_dir=${train_output_path}/inference \
         --am=speedyspeech_csmsc \

diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
@@ -231,14 +231,19 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
 The static model can be downloaded here:
 - [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
 - [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
+- [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)
 
 The ONNX model can be downloaded here:
 - [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
+- [fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)
 
 Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss 
 :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
 default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
 conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
+cnndecoder| 1(gpu) x 153000|1.1153|0.61475|0.03380|0.30414|0.14707|
 
 FastSpeech2 checkpoint contains files listed below.
 ```text

diff --git a/examples/csmsc/tts3/local/inference.sh b/examples/csmsc/tts3/local/inference.sh
@@ -5,6 +5,7 @@ train_output_path=$1
 stage=0
 stop_stage=0
 
+# pwgan
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     python3 ${BIN_DIR}/../inference.py \
         --inference_dir=${train_output_path}/inference \
@@ -27,20 +28,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --phones_dict=dump/phone_id_map.txt
 fi
 
-# style melgan
-# style melgan's Dygraph to Static Graph is not ready now
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/../inference.py \
-        --inference_dir=${train_output_path}/inference \
-        --am=fastspeech2_csmsc \
-        --voc=style_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=${train_output_path}/pd_infer_out \
-        --phones_dict=dump/phone_id_map.txt
-fi
 
 # hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     python3 ${BIN_DIR}/../inference.py \
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_csmsc \
@@ -51,7 +41,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
 fi
 
 # wavernn
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../inference.py \
         --inference_dir=${train_output_path}/inference \
         --am=fastspeech2_csmsc \

diff --git a/examples/csmsc/tts3/local/inference_streaming.sh b/examples/csmsc/tts3/local/inference_streaming.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+# Streaming inference: runs inference_streaming.py on the models under <train_output_path>/inference_streaming.
+train_output_path=$1
+# only stages in [stage, stop_stage] run; the defaults below execute stage 0 alone
+stage=0
+stop_stage=0
+
+# stage 0: pwgan vocoder (pwgan_csmsc)
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../inference_streaming.py \
+        --inference_dir=${train_output_path}/inference_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
+# for more GAN Vocoders (same acoustic model and output dir, only --voc changes)
+# stage 1: multi band melgan vocoder (mb_melgan_csmsc)
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    python3 ${BIN_DIR}/../inference_streaming.py \
+        --inference_dir=${train_output_path}/inference_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=mb_melgan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
+# stage 2: hifigan vocoder (hifigan_csmsc)
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    python3 ${BIN_DIR}/../inference_streaming.py \
+        --inference_dir=${train_output_path}/inference_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --text=${BIN_DIR}/../sentences.txt \
+        --output_dir=${train_output_path}/pd_infer_out_streaming \
+        --phones_dict=dump/phone_id_map.txt \
+        --am_streaming=True
+fi
+
diff --git a/examples/csmsc/tts3/local/ort_predict_streaming.sh b/examples/csmsc/tts3/local/ort_predict_streaming.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Streaming ONNX Runtime inference: runs ort_predict_streaming.py on the models
+# read from <train_output_path>/inference_onnx_streaming.
+# Usage: ./local/ort_predict_streaming.sh <train_output_path>   (BIN_DIR is expected from the environment)
+
+train_output_path=$1
+
+# only stages in [stage, stop_stage] run; the defaults below execute stage 0 alone
+stage=0
+stop_stage=0
+
+# e2e, synthesize from text (fastspeech2_csmsc acoustic model + hifigan_csmsc vocoder, CPU with 2 threads)
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    python3 ${BIN_DIR}/../ort_predict_streaming.py \
+        --inference_dir=${train_output_path}/inference_onnx_streaming \
+        --am=fastspeech2_csmsc \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_csmsc \
+        --output_dir=${train_output_path}/onnx_infer_out_streaming \
+        --text=${BIN_DIR}/../csmsc_test.txt \
+        --phones_dict=dump/phone_id_map.txt \
+        --device=cpu \
+        --cpu_threads=2 \
+        --am_streaming=True
+fi
diff --git a/examples/csmsc/tts3/local/synthesize_streaming.sh b/examples/csmsc/tts3/local/synthesize_streaming.sh
@@ -88,5 +88,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --text=${BIN_DIR}/../sentences.txt \
         --output_dir=${train_output_path}/test_e2e_streaming \
         --phones_dict=dump/phone_id_map.txt \
-        --am_streaming=True
+        --am_streaming=True \
+        --inference_dir=${train_output_path}/inference_streaming
 fi
diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh
@@ -31,18 +31,75 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
+# synthesize_e2e non-streaming
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # synthesize_e2e, vocoder is pwgan
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
+# inference non-streaming
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     # inference with static model
     CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
 fi
 
+# synthesize_e2e streaming
 if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
     # synthesize_e2e, vocoder is pwgan
     CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
 fi
 
+# stage 6: inference streaming, via local/inference_streaming.sh
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # inference with static model; CUDA_VISIBLE_DEVICES is set to ${gpus}, and a failing script aborts this run
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference_streaming.sh ${train_output_path} || exit -1
+fi
+
+# stage 7: paddle2onnx non streaming (fastspeech2_csmsc and hifigan_csmsc)
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    # install paddle2onnx 0.9.4 unless `pip list` already reports exactly that version
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+
+
+# stage 8: onnxruntime non streaming, via local/ort_predict.sh
+# inference with onnxruntime, use fastspeech2 + hifigan by default
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    # install onnxruntime 1.10.0 unless `pip list` already reports it; NOTE(review): grep also matches names like onnxruntime-gpu
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict.sh ${train_output_path}
+fi
+
+# stage 9: paddle2onnx streaming (inference_streaming -> inference_onnx_streaming)
+if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+    # install paddle2onnx 0.9.4 unless `pip list` already reports exactly that version
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    # streaming acoustic model, converted as three separate models: am_encoder_infer, am_decoder, am_postnet
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_postnet
+    # vocoder (hifigan_csmsc)
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming hifigan_csmsc
+fi
+
+# stage 10: onnxruntime streaming, via local/ort_predict_streaming.sh
+if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
+    # install onnxruntime 1.10.0 unless `pip list` already reports it; NOTE(review): grep also matches names like onnxruntime-gpu
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict_streaming.sh ${train_output_path}
+fi
+
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
@@ -14,92 +14,17 @@
 import argparse
 from pathlib import Path
 
-import numpy
 import soundfile as sf
-from paddle import inference
 from timer import timer
 
+from paddlespeech.t2s.exps.syn_utils import get_am_output
 from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_predictor
 from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import get_voc_output
 from paddlespeech.t2s.utils import str2bool
 
 
-def get_predictor(args, filed='am'):
-    full_name = ''
-    if filed == 'am':
-        full_name = args.am
-    elif filed == 'voc':
-        full_name = args.voc
-    model_name = full_name[:full_name.rindex('_')]
-    config = inference.Config(
-        str(Path(args.inference_dir) / (full_name + ".pdmodel")),
-        str(Path(args.inference_dir) / (full_name + ".pdiparams")))
-    if args.device == "gpu":
-        config.enable_use_gpu(100, 0)
-    elif args.device == "cpu":
-        config.disable_gpu()
-    config.enable_memory_optim()
-    predictor = inference.create_predictor(config)
-    return predictor
-
-
-def get_am_output(args, am_predictor, frontend, merge_sentences, input):
-    am_name = args.am[:args.am.rindex('_')]
-    am_dataset = args.am[args.am.rindex('_') + 1:]
-    am_input_names = am_predictor.get_input_names()
-    get_tone_ids = False
-    get_spk_id = False
-    if am_name == 'speedyspeech':
-        get_tone_ids = True
-    if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
-        get_spk_id = True
-        spk_id = numpy.array([args.spk_id])
-    if args.lang == 'zh':
-        input_ids = frontend.get_input_ids(
-            input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
-        phone_ids = input_ids["phone_ids"]
-    elif args.lang == 'en':
-        input_ids = frontend.get_input_ids(
-            input, merge_sentences=merge_sentences)
-        phone_ids = input_ids["phone_ids"]
-    else:
-        print("lang should in {'zh', 'en'}!")
-
-    if get_tone_ids:
-        tone_ids = input_ids["tone_ids"]
-        tones = tone_ids[0].numpy()
-        tones_handle = am_predictor.get_input_handle(am_input_names[1])
-        tones_handle.reshape(tones.shape)
-        tones_handle.copy_from_cpu(tones)
-    if get_spk_id:
-        spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
-        spk_id_handle.reshape(spk_id.shape)
-        spk_id_handle.copy_from_cpu(spk_id)
-    phones = phone_ids[0].numpy()
-    phones_handle = am_predictor.get_input_handle(am_input_names[0])
-    phones_handle.reshape(phones.shape)
-    phones_handle.copy_from_cpu(phones)
-
-    am_predictor.run()
-    am_output_names = am_predictor.get_output_names()
-    am_output_handle = am_predictor.get_output_handle(am_output_names[0])
-    am_output_data = am_output_handle.copy_to_cpu()
-    return am_output_data
-
-
-def get_voc_output(args, voc_predictor, input):
-    voc_input_names = voc_predictor.get_input_names()
-    mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
-    mel_handle.reshape(input.shape)
-    mel_handle.copy_from_cpu(input)
-
-    voc_predictor.run()
-    voc_output_names = voc_predictor.get_output_names()
-    voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
-    wav = voc_output_handle.copy_to_cpu()
-    return wav
-
-
 def parse_args():
     parser = argparse.ArgumentParser(
         description="Paddle Infernce with acoustic model & vocoder.")
@@ -204,7 +129,7 @@ def main():
                 merge_sentences=merge_sentences,
                 input=sentence)
             wav = get_voc_output(
-                args, voc_predictor=voc_predictor, input=am_output_data)
+                voc_predictor=voc_predictor, input=am_output_data)
         speed = wav.size / t.elapse
         rtf = fs / speed
         print(
@@ -224,7 +149,7 @@ def main():
                 merge_sentences=merge_sentences,
                 input=sentence)
             wav = get_voc_output(
-                args, voc_predictor=voc_predictor, input=am_output_data)
+                voc_predictor=voc_predictor, input=am_output_data)
 
         N += wav.size
         T += t.elapse