Add recognize_azure() due to Bing Speech API deprecation

Uberi · Oct 24, 2018 · 036a53c · 036a53c
1 parent d9e6e3a
commit 036a53c
Showing 1 changed file with 94 additions and 0 deletions.
diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -1015,6 +1015,100 @@ def recognize_wit(self, audio_data, key, show_all=False):
         if "_text" not in result or result["_text"] is None: raise UnknownValueError()
         return result["_text"]
 
+    def recognize_azure(self, audio_data, key, language="en-US", result_format="simple", profanity="masked", location="westus", show_all=False):
+        """
+        Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Azure Speech API.
+
+        The Microsoft Azure Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.
+
+        To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Speech" > "Create", and fill in the form to make a "Speech" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the ``key`` parameter. Microsoft Azure Speech API keys are 32-character lowercase hexadecimal strings.
+
+        The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".
+
+        The response format is selected by ``result_format``, which is sent as the ``format`` query parameter and defaults to ``"simple"``. Profanity handling is selected by ``profanity``, which is sent as the ``profanity`` query parameter and defaults to ``"masked"``. The Azure region is given by ``location``, defaulting to ``"westus"``; it is used as the subdomain of both the access token endpoint and the recognition endpoint, so it should match the region of the "Speech" resource that ``key`` belongs to.
+
+        Returns the most likely transcription if ``show_all`` is false (the default): the ``DisplayText`` field of the response if present, otherwise the ``Display`` field of the first ``NBest`` entry (as returned for ``result_format="detailed"``). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
+
+        When a monotonic clock is available, the access token obtained from ``key`` is cached on this recognizer instance for 10 minutes and reused by later calls made with the same ``key`` and ``location``.
+
+        Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
+        """
+        assert isinstance(audio_data, AudioData), "Data must be audio data"
+        assert isinstance(key, str), "``key`` must be a string"
+        assert isinstance(result_format, str), "``result_format`` must be a string"
+        assert isinstance(language, str), "``language`` must be a string"
+
+        # a token is requested with one subscription key from one regional endpoint, so a cached token may only be reused for that same pair
+        cache_id = (key, location)
+        access_token, expire_time = getattr(self, "azure_cached_access_token", None), getattr(self, "azure_cached_access_token_expiry", None)
+        if getattr(self, "azure_cached_access_token_id", None) != cache_id:
+            expire_time = None  # nothing cached for this key/region, force a new credential request
+        allow_caching = True
+        try:
+            from time import monotonic  # we need monotonic time to avoid being affected by system clock changes, but this is only available in Python 3.3+
+        except ImportError:
+            try:
+                from monotonic import monotonic  # use time.monotonic backport for Python 2 if available (from https://pypi.python.org/pypi/monotonic)
+            except (ImportError, RuntimeError):
+                expire_time = None  # monotonic time not available, don't cache access tokens
+                allow_caching = False  # don't allow caching, since monotonic time isn't available
+        if expire_time is None or monotonic() > expire_time:  # caching not enabled, first credential request for this key/region, or the access token from the previous one expired
+            # get an access token using OAuth
+            credential_url = "https://" + location + ".api.cognitive.microsoft.com/sts/v1.0/issueToken"
+            credential_request = Request(credential_url, data=b"", headers={
+                "Content-type": "application/x-www-form-urlencoded",
+                "Content-Length": "0",
+                "Ocp-Apim-Subscription-Key": key,
+            })
+
+            if allow_caching:
+                start_time = monotonic()  # measure validity from before the request is sent, so the cached expiry is never later than the real one
+
+            try:
+                credential_response = urlopen(credential_request, timeout=60)  # credential response can take longer, use longer timeout instead of default one
+            except HTTPError as e:
+                raise RequestError("credential request failed: {}".format(e.reason))
+            except URLError as e:
+                raise RequestError("credential connection failed: {}".format(e.reason))
+            access_token = credential_response.read().decode("utf-8")
+
+            if allow_caching:
+                # save the token for the duration it is valid for, along with the key/region it was issued for
+                self.azure_cached_access_token = access_token
+                self.azure_cached_access_token_expiry = start_time + 600  # according to https://docs.microsoft.com/en-us/azure/cognitive-services/Speech-Service/rest-apis#authentication, the token expires in exactly 10 minutes
+                self.azure_cached_access_token_id = cache_id
+
+        wav_data = audio_data.get_wav_data(
+            convert_rate=16000,  # audio samples must be 8kHz or 16 kHz
+            convert_width=2  # audio samples should be 16-bit
+        )
+
+        url = "https://" + location + ".stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?{}".format(urlencode({
+            "language": language,
+            "format": result_format,
+            "profanity": profanity
+        }))
+
+        # the headers are the same for both ways of sending the body, only the body encoding differs
+        request_headers = {
+            "Authorization": "Bearer {}".format(access_token),
+            "Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
+            "Transfer-Encoding": "chunked",
+        }
+        if sys.version_info >= (3, 6):  # chunked-transfer requests are only supported in the standard library as of Python 3.6+, use it if possible
+            request_body = io.BytesIO(wav_data)
+        else:  # fall back on manually formatting the POST body as a chunked request (one chunk holding all the audio, then the terminating zero-length chunk)
+            ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8")
+            request_body = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n"
+        request = Request(url, data=request_body, headers=request_headers)
+
+        try:
+            response = urlopen(request, timeout=self.operation_timeout)
+        except HTTPError as e:
+            raise RequestError("recognition request failed: {}".format(e.reason))
+        except URLError as e:
+            raise RequestError("recognition connection failed: {}".format(e.reason))
+        response_text = response.read().decode("utf-8")
+        result = json.loads(response_text)
+
+        # return results
+        if show_all: return result
+        if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success": raise UnknownValueError()
+        if "DisplayText" in result: return result["DisplayText"]
+        # the detailed format carries the text in the ``NBest`` alternatives instead of a top-level ``DisplayText``; use the first alternative
+        alternatives = result["NBest"] if "NBest" in result else None
+        if alternatives and "Display" in alternatives[0]: return alternatives[0]["Display"]
+        raise UnknownValueError()
+
     def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
         """
         Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.