PaddlePaddle · zh794390558 · Nov 18, 2022 · Nov 10, 2022 · Nov 10, 2022 · Nov 10, 2022
diff --git a/paddlespeech/s2t/exps/whisper/test_wav.py b/paddlespeech/s2t/exps/whisper/test_wav.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from Whisper (https://github.com/openai/whisper/whisper/)
+import os.path
+import sys
+
+import paddle
+import soundfile
+
+from paddlespeech.s2t.models.whisper import _download
+from paddlespeech.s2t.models.whisper import ModelDimensions
+from paddlespeech.s2t.models.whisper import transcribe
+from paddlespeech.s2t.models.whisper import utils
+from paddlespeech.s2t.models.whisper import Whisper
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+def load_model(model_file):
+    """Download (if necessary) and load a Whisper model.
+
+    Args:
+        model_file: Model key handed to ``_download`` to select which
+            checkpoint to fetch (the ``--model_file`` option defaults to
+            ``"large"``).
+
+    Returns:
+        A ``Whisper`` instance built from the checkpoint's ``"dims"`` entry
+        with the checkpoint dictionary loaded into it.
+    """
+    logger.info("download and loading the model file......")
+    # NOTE(review): when XDG_CACHE_HOME is set it is used as the download
+    # directory as-is, without a "whisper" sub-directory -- confirm intended.
+    download_root = os.getenv(
+        "XDG_CACHE_HOME",
+        os.path.join(os.path.expanduser("~"), ".cache", "whisper"))
+    # Use the function argument, not the script-global ``args``, so the
+    # function also works when it is imported and called from other code.
+    model_path = _download(model_file, download_root, in_memory=False)
+    model_dict = paddle.load(model_path)
+    dims = ModelDimensions(**model_dict["dims"])
+    model = Whisper(dims)
+    model.load_dict(model_dict)
+    return model
+
+
+def check(audio_file: str):
+    """Validate the input audio file before transcription.
+
+    The file must exist, be readable by ``soundfile`` and have a sample rate
+    of 16000. On any failure the problem is logged and the process is
+    terminated with ``sys.exit(-1)``.
+
+    Args:
+        audio_file: Path of the audio file to validate.
+    """
+    if not os.path.isfile(audio_file):
+        logger.error("Please input the right audio file path")
+        sys.exit(-1)
+
+    logger.info("checking the audio file format......")
+    try:
+        # Only the sample rate is needed; the decoded signal is discarded.
+        _, sample_rate = soundfile.read(audio_file)
+    except Exception as e:
+        logger.error(str(e))
+        logger.error(
+            "can not open the wav file, please check the audio file format")
+        sys.exit(-1)
+    logger.info("The sample rate is %d" % sample_rate)
+    # Explicit check instead of ``assert``: assertions are stripped when
+    # Python runs with -O, which would silently skip this validation.
+    if sample_rate != 16000:
+        logger.error("The sample rate of the audio file must be 16000")
+        sys.exit(-1)
+    logger.info("The audio file format is right")
+
+
+if __name__ == "__main__":
+    # Command-line entry point: validate the audio file, select the device,
+    # load the Whisper model and transcribe the audio.
+    parser = default_argument_parser()
+
+    parser.add_argument(
+        "--result_file", type=str, help="path of save the asr result")
+    parser.add_argument(
+        "--audio_file", type=str, help="path of the input audio file")
+    parser.add_argument(
+        "--model_file",
+        default="large",
+        type=str,
+        help="path of the input model file")
+    parser.add_argument("--beam_size", type=utils.optional_int, default=5)
+    parser.add_argument("--verbose", type=utils.str2bool, default=True)
+    parser.add_argument("--device", default="gpu")
+
+    args = parser.parse_args()
+
+    check(args.audio_file)
+
+    # Honour an explicit CPU request even when no GPU is present; previously
+    # that case fell through to ``set_device("gpu")``.
+    if args.device == "cpu":
+        if "gpu:" in paddle.get_device():
+            # ``warnings`` is not imported in this file, so report through
+            # the module logger instead.
+            logger.warning(
+                "Performing inference on CPU when CUDA is available")
+        paddle.set_device("cpu")
+    else:
+        paddle.set_device("gpu")
+
+    model = load_model(args.model_file)
+
+    # NOTE(review): --result_file is parsed but the transcription result is
+    # not written anywhere yet -- confirm the intended output format.
+    result = transcribe(
+        model,
+        args.audio_file,
+        beam_size=args.beam_size,
+        fp16=False,
+        verbose=args.verbose)
diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py
@@ -0,0 +1,84 @@
+# MIT License, Copyright (c) 2022 OpenAI.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
+import hashlib
+import io
+import os
+import urllib
+import warnings
+from typing import List
+from typing import Optional
+from typing import Union
+
+import paddle
+from more_itertools import padded
+from tqdm import tqdm
+
+from paddlespeech.s2t.models.whisper.audio import log_mel_spectrogram
+from paddlespeech.s2t.models.whisper.audio import pad_or_trim
+from paddlespeech.s2t.models.whisper.decoding import decode
+from paddlespeech.s2t.models.whisper.decoding import DecodingOptions
+from paddlespeech.s2t.models.whisper.decoding import DecodingResult
+from paddlespeech.s2t.models.whisper.decoding import detect_language
+from paddlespeech.s2t.models.whisper.model import ModelDimensions
+from paddlespeech.s2t.models.whisper.model import Whisper
+from paddlespeech.s2t.models.whisper.transcribe import transcribe
+
+# Download URL of each available checkpoint, keyed by model name.
+_MODELS = {
+    "large":
+    "https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221108/large.model.pdparams"
+}
+# Expected SHA256 hex digest of each checkpoint file, keyed by model name;
+# _download compares the downloaded file against this value.
+_MODELS_sha256 = {
+    "large": "589a2229582cc9173091f2481bba2cc8228997502ac75cbb0be6d874e8433d0f"
+}
+
+
+def _sha256_of_file(path: str, chunk_size: int=1 << 20) -> str:
+    """Return the SHA256 hex digest of the file at ``path``, read in chunks."""
+    digest = hashlib.sha256()
+    with open(path, "rb") as stream:
+        for chunk in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(chunk)
+    return digest.hexdigest()
+
+
+def _load_if_valid(path: str, expected_sha256: str,
+                   in_memory: bool) -> Optional[Union[bytes, str]]:
+    """Return the file's bytes (or its path) if the checksum matches, else None."""
+    if in_memory:
+        # The caller wants the bytes anyway, so read once and hash them.
+        with open(path, "rb") as stream:
+            model_bytes = stream.read()
+        if hashlib.sha256(model_bytes).hexdigest() == expected_sha256:
+            return model_bytes
+        return None
+    # Hash in chunks so a large checkpoint is never fully held in memory.
+    return path if _sha256_of_file(path) == expected_sha256 else None
+
+
+def _download(model_key: str, root: str, in_memory: bool) -> Union[bytes, str]:
+    """Fetch a model checkpoint into ``root`` and verify its SHA256 checksum.
+
+    A file already present in ``root`` is reused when its checksum matches;
+    otherwise it is downloaded again.
+
+    Args:
+        model_key: Key into ``_MODELS`` / ``_MODELS_sha256``.
+        root: Directory in which the checkpoint is stored (created if missing).
+        in_memory: If true, return the file content; otherwise return its path.
+
+    Returns:
+        The checkpoint bytes when ``in_memory`` is true, else the file path.
+
+    Raises:
+        KeyError: If ``model_key`` is not a known model.
+        RuntimeError: If the target path exists but is not a regular file, or
+            if the downloaded file fails the checksum verification.
+    """
+    # ``import urllib`` alone does not guarantee that the ``request``
+    # submodule is loaded, so import it explicitly.
+    import urllib.request
+
+    os.makedirs(root, exist_ok=True)
+
+    expected_sha256 = _MODELS_sha256[model_key]
+    url = _MODELS[model_key]
+    download_target = os.path.join(root, os.path.basename(url))
+
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(
+            f"{download_target} exists and is not a regular file")
+
+    if os.path.isfile(download_target):
+        cached = _load_if_valid(download_target, expected_sha256, in_memory)
+        if cached is not None:
+            return cached
+        warnings.warn(
+            f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file"
+        )
+
+    with urllib.request.urlopen(url) as source, open(download_target,
+                                                     "wb") as output:
+        # The server may omit Content-Length; tqdm accepts total=None.
+        content_length = source.info().get("Content-Length")
+        total = int(content_length) if content_length is not None else None
+        with tqdm(
+                total=total,
+                ncols=80,
+                unit='iB',
+                unit_scale=True,
+                unit_divisor=1024) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
+
+    downloaded = _load_if_valid(download_target, expected_sha256, in_memory)
+    if downloaded is None:
+        raise RuntimeError(
+            "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model."
+        )
+
+    return downloaded
+
+
+def available_models() -> List[str]:
+    """List the model names that have a registered download URL."""
+    return list(_MODELS)