Permalink
Browse files

Fix Bing Speech API, which was broken due to Bing Speech API backwards incompatibility; also remove auto-segmentation from IBM recognizer
  • Loading branch information...
Uberi committed Jun 26, 2017
1 parent d3e96f0 commit 54dbf48dafd8c6912581f17411c1a12b7d365e62
Showing with 25 additions and 26 deletions.
  1. +1 −1 .travis.yml
  2. +5 −5 reference/library-reference.rst
  3. +19 −20 speech_recognition/__init__.py
View
@@ -26,7 +26,7 @@ sudo: false # this allows TravisCI to use the fast Docker build environment rath
env:
global:
- secure: "jFHi/NK+hkf8Jw/bA06utypMRAzOcpeKPEZz/P2U79c70aIcmeAOGNUG6t5x2hmaeNpaP1STDtOLVdDawLY904rv/2sAhdMExlLUYubVQrJumvfgwyHRep0NLxrWV/Sf7y6FBPsvS0We29sn5HeEUlSzFwLrANyagpZYGeeWI3SGfdseDK/n4SlD436i7n5jM0Vlbmo07JDtdTN5Ov17APtuqy0ZViNhhTG+wvU8RCd/0/1IvstaaOhSa/82jABXNzH12hY4ynSuK75EVdVLj/WstSmH90r+8TS+YHH1D68yFeoub8kjTzZirqDuwb1s0nGOzx3VAC03+Fb48jHNfz2X0LJEj6gOpaaxgXOr4qkb1+Bx4L1bUkMk3ywjKoXFF0BU/haZfPbzG0fFUDubEXYjhC88gM1CR0LrFf4qtIqFcdM4sjasfv7TD2peiuWqVRZeHzjcvQVC8aDxVFFbTF+Cx1xZ1qLxAY5iJ/dUPWpOVcSs0GIJaJw7LQJU5uQbiU0vg17k9QcVYbASJu0cFAt/OsWGDZp/uArSWrMcSoexe8wI8/k5u9XFnOmlEu5kUJXOrZANjniUk5ilFUe+lag2Zl/ZasNtW16qke+vaWfBnpKl7NOoQemWNdYOxgyc/4x9B3x8gryf5XAmfBeqneh7k10O18u6GYpt33r0zuQ=" # encrypted version of "WIT_AI_KEY=(my key)"
- secure: "NUTqadJCac2g6n44Phw6qsKSB+cGGPYDQI0nB+jJ1p+R4K7SYo46ECU0Xs+UexNwcZbmUxxAwekXyVS8Rd7GGadsM4rw0wPmZ//ul24dg+ek8/tDZ96U85yGvcSWoPTYsBbwqvKqxFfWZNLLwkdX38zIeBchDwfseVrPgFMjNuh2gOikszNbJcom23FzlOkR1kcfW2nepsLTK+u1AUB+S8FmvUXsE3oiLEA605FQu+hblXaltu1CeCGmci8cMIP7XX0VdpPlrO4kGodRvZmE/KmNt1fTjESYN1td9PZ3ZhO6ZBJf6TP0VfB6qXz/efVTdeVenxu5J9BpXvcDkEgEa/7PwQc0FeK0eaOmAUSKgjRQTjf1A3vTfvSBseDpGZasCQ3YCS2wF5qBtTBiKlzKyC6xio7fW+DEt9L3IC6gW/hzA5fiXe4ZZygofvNY4u2kVRT7C9GM7UgGwT8nSapqWdr898NfM9goPfNAV5PFsNgNGx8n7659Q4lCmymVPhxD76449kOCNlWozzdCmjnsGrK4JzleTTFXCWpsQFsahJSqQ/W6CNm667zvUnlFKs1/edPOwsnYLSnU1jtJyxVlAX4wUFUKYzYehyv49+DtX0CAMeV149hhINJPzBDc4YAsNxrLnuqjTYm555s0jDz+itmoYv3BEA4wEQp1j0EMJMM=" # encrypted version of "BING_KEY=(my key)"
- secure: "ZKs+ywhJett8CpA24wR8js3C5B0uzaXMFIaiWBgkQfVhwbwkecCjG2HbLaJ1ncXP5VZnrXF6Ym4pZm87q0mIp/S0dMS7ZC5Jikowc3Bdyph9L49MDubZL0SO98+YR9j0QeKw8wxiVP6kv9cw12uVWn4VNgGcuW6AYZ0AqzdvUfW4+zby+Ua9U8LC0RcDKY3GR4Svq6dUjNFtJmI5uJ129UFO4oujCzuHNZL3KSSUJVt1KelVX+1eUNJ67sN3AvoMfx86jXNtN0kS12lZ+dP4YDo+lCtViG/W1dHCCdBmnUZsPE4Bc+Uyvg/BeKZaL1hgrNb6QHCNWmZC7jGxzkP2akwX5PxmKW7ClXn/79c7e84YUiRHlYQgL0qP+kZ7WDG6nJyKqLNFAtTHAw5F++5cpomNThYoCJeQOmkhi+KLEs9KMmn4d/bOLzW1RCeuf0bhjOOCI89+761aqJ1ky8UHJUZCKjYegHLM/bZ9LkKnUi+d+KYNQB8qpluZSLqObknCczh6ekKt/1FdrC+FBbFmpkTCuru1K9APdz01+ipVV8Av6NB+ax0+KPlKp49TA9uzANKWyLRkW9j6LD67MGF6SH/h8t5OeNZXdmf4DGjqv1erbKZeW+y25Hw7lVbqEo1m4T9wn1lmA1nse0kBrqGF+kQ4mNdfNSmWGWKxj+gFuxA=" # encrypted version of "BING_KEY=(my key)"
- secure: "JEtMaAhDglqRrHdKZapxIaY0zlCohsepgxfRckhuCB3RZljeIKjt15Q/8LzFcx0ZdQV2thOQ/2oA0WpnfTckEnh42X+Ki0AUlezjXXYII2DenCs9q7jXxuOYK5AjxcNzyfeh7NnI2R3jdAyf49FdnoOa/OdEZq7aYRouP0yZtVKK/eMueURfr7JMsTrmuYoy1LXkF/yEyxns9HiaSebn7YqeQ7cb9Q5LcSigM6kCXZrtG1K4MqWGrvnqGeabE6xoZVxkf+az6fMv91oZ4spZRfjjlFpGx050gP4SCpk8XQqVS2HAtzVSFBdnLld4ydRoGVHVMAOmvQY5xbk5y9REVj4EVdfeErOhaEz6CfFqZi9UpAS0Zza/7khGDDWkHmfg4O4CzrVLkfdcPIgIKcz9TT9zP+wPVCYmfN2Qq0XB+PJkewjmgPuWZnUyBb402iPs1hWEze8oK6Yk5K3OnBuSqeE4EtvpT/SUrLtroSNcWJJ7i585cqgNB5KwzDDKNnyn0zteQQTj+fUzrumQ+/FTYjaafOVZ6ZAiZ+xvgge0+foB94GCoV/8LUm5rVTtk8vV3c3oJu9jdzsyiOSargYPSYg7iy1kzkC/eQ12rX89EWLGjoP+mveLGBpUebQNbB8vxaVRd8uaozW/G3Vwgelqg7gzrvmwkaYK3g6a1TAVpcs=" # encrypted version of "HOUNDIFY_CLIENT_ID=(my client ID)"
- secure: "izFPobia0Luga6mL0pXDBmp/V1/kzZdFc09PbYUBNoyx63DPmDwP8dtSFy9ynEERJg4HQ6KeQzsPED3ZhnYO3C3lD3y078+k6Ryl15aONLrou6jzDiYMw6KV1CQ6V1OIz3tLwZoS7wwWdr0ZYdMEklYVVVu8wJOzl6aZ8gtp8Y3woev6qrxFeXhkkNZOybtQ8ugV6a5EypVEVQ2IGTEVvA6A8oSGDd8BDOSYyKPQ3LXPx7imA6freqio/b5HaACkBIidFRykly3xkBib2phhww2D18Zdu5imJtCmHxFQ3V+N5ZzlUkgmR9gyvdblQgJ7sCwpQAC/Mb0KWqUDar59nRA1WmY+onVN/t7sjBBCPjS0Ddu5Ls3X9Qdh3rflQ2Fc7nSi8iVITAHFreUKEW/jgJyBnFuau0Cu5DNcZYy24W+GBzwks1g/uoy4vWVbijaIzSEXu352CqClSJpBTltp3z0KZ/9D9VRB1tFoFmlVWkW39bBBqpJy/49mGVlbrG2J+hyCW+J+BQFpTcjXSd+JS57XXYKcm3QXnNxxnIQ5lw/6t92SbWWP+IeJB9fJENFLteE5XjtQWQ7gHbb7hP0iH9u92mJbehzvdo9KwePlIeWFC1Wyw3ZHrLa56DykfPNg9kYcuJdTwLMRxI4X5aG/e1QBVAwM8tii6Zrjag684iM=" # encrypted version of "HOUNDIFY_CLIENT_KEY=(my client key)"
- secure: "uj5LUKDtf214EZPqsjpy6tk8iXEfydC3z/px98xbXa/H6PVN6wMPTHsF1DuuTWCbLrqNyi9/rMbpJFiNuqMm+q0LarrvvuTKHA9JFe/ZA11R1w3WI2ZMTvub6vzCbmcznIkjq981BjFWz5aCazPXhLt18e0iMit2FR+D6jwZ4al8TIo9i6RjkJ3MimH2/Sgm2BnXZ7qHsmDlG+4VsABiPiH0SPzrxqJJ4WSOb8EnNkNcOujiHuYvDNR+6R566bXjV1x+z2ewKb2nae5LOEl8L+6B/CsNT2cyeds2imYWAw9vTZoTajXf2u21M3pqRINQ67CuWhGFOdUXiEd6E/jTQFcsE4GuB7eMIYcHCmPzhhHn1b6XzNJtf923+YlSnayf63Y5jHjeSWSWs6pjJOUjJquuXS8vQYuJYX4n8sXDeEsZg0yD2jdxFMqMmjZoKKJzWPTPUkDTLawZdZs2q6bOF+xBQysUPozgSnxe3koCMFLeA1cU6fUkXWWIFDuAehR0JqYQHaUovoO0ZYx8Env0Ojhl6IZclONxaLVA41CbzkSUC1pg0k/VeMiv6YB2SQsFxV1riKM/OPDxq7AAuUuNVDCj/SGya4BJEYrxtagtmq0em8Q8SJzLq7IFNBNq5pO8IaqA0JO/tieSIsutrhdRzVMI35apuwbE+5jxoDmiGW0=" # encrypted version of "IBM_USERNAME=(my username)"
@@ -239,15 +239,15 @@ Raises a ``speech_recognition.UnknownValueError`` exception if the speech is uni
``recognizer_instance.recognize_bing(audio_data, key, language = "en-US", show_all = False)``
---------------------------------------------------------------------------------------------
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Voice Recognition API.
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.
The Microsoft Bing Voice Recognition API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__ with Microsoft Cognitive Services.
The Microsoft Bing Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.
To get the API key, go to the `Microsoft Cognitive Services subscriptions overview <https://www.microsoft.com/cognitive-services/en-us/subscriptions>`__, go to the entry titled "Speech", and look for the key under the "Keys" column. Microsoft Bing Voice Recognition API keys are 32-character lowercase hexadecimal strings.
To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API" > "Create", and fill in the form to make a "Bing Speech API" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the ``key`` parameter. Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings.
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/api-reference-rest/BingVoiceRecognition#SupLocales>`__.
The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/api-reference-rest/BingVoiceRecognition#user-content-3-voice-recognition-responses>`__ as a JSON dictionary.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
@@ -890,15 +890,15 @@ def recognize_wit(self, audio_data, key, show_all=False):
def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
"""
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Voice Recognition API.
Performs speech recognition on ``audio_data`` (an ``AudioData`` instance), using the Microsoft Bing Speech API.
The Microsoft Bing Voice Recognition API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__ with Microsoft Cognitive Services.
The Microsoft Bing Speech API key is specified by ``key``. Unfortunately, these are not available without `signing up for an account <https://azure.microsoft.com/en-ca/pricing/details/cognitive-services/speech-api/>`__ with Microsoft Azure.
To get the API key, go to the `Microsoft Cognitive Services subscriptions overview <https://www.microsoft.com/cognitive-services/en-us/subscriptions>`__, go to the entry titled "Speech", and look for the key under the "Keys" column. Microsoft Bing Voice Recognition API keys are 32-character lowercase hexadecimal strings.
To get the API key, go to the `Microsoft Azure Portal Resources <https://portal.azure.com/>`__ page, go to "All Resources" > "Add" > "See All" > Search "Bing Speech API" > "Create", and fill in the form to make a "Bing Speech API" resource. On the resulting page (which is also accessible from the "All Resources" page in the Azure Portal), go to the "Show Access Keys" page, which will have two API keys, either of which can be used for the ``key`` parameter. Microsoft Bing Speech API keys are 32-character lowercase hexadecimal strings.
The recognition language is determined by ``language``, an RFC5646 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/api-reference-rest/BingVoiceRecognition#SupLocales>`__.
The recognition language is determined by ``language``, a BCP-47 language tag like ``"en-US"`` (US English) or ``"fr-FR"`` (International French), defaulting to US English. A list of supported language values can be found in the `API documentation <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#recognition-language>`__ under "Interactive and dictation mode".
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://www.microsoft.com/cognitive-services/en-us/speech-api/documentation/api-reference-rest/BingVoiceRecognition#user-content-3-voice-recognition-responses>`__ as a JSON dictionary.
Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response <https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition#sample-responses>`__ as a JSON dictionary.
Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection.
"""
@@ -939,27 +939,26 @@ def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
if allow_caching:
# save the token for the duration it is valid for
self.bing_cached_access_token = access_token
self.bing_cached_access_token_expiry = start_time + 600 # according to https://www.microsoft.com/cognitive-services/en-us/Speech-api/documentation/API-Reference-REST/BingVoiceRecognition, the token expires in exactly 10 minutes
self.bing_cached_access_token_expiry = start_time + 600 # according to https://docs.microsoft.com/en-us/azure/cognitive-services/speech/api-reference-rest/bingvoicerecognition, the token expires in exactly 10 minutes
wav_data = audio_data.get_wav_data(
convert_rate=16000, # audio samples must be 8kHz or 16 kHz
convert_width=2 # audio samples should be 16-bit
)
url = "https://speech.platform.bing.com/recognize/query?{}".format(urlencode({
"version": "3.0",
"requestid": uuid.uuid4(),
"appID": "D4D52672-91D7-4C74-8AD8-42B1D98141A5",
"format": "json",
# chunked-transfer encoding is only supported in the standard library for Python 3.6+, so we manually format the POST data as if it was a chunked request
ascii_hex_data_length = "{:X}".format(len(wav_data)).encode("utf-8")
chunked_transfer_encoding_data = ascii_hex_data_length + b"\r\n" + wav_data + b"\r\n0\r\n\r\n"
url = "https://speech.platform.bing.com/speech/recognition/interactive/cognitiveservices/v1?{}".format(urlencode({
"language": language,
"locale": language,
"device.os": "wp7",
"scenarios": "ulm",
"instanceid": uuid.uuid4(),
"result.profanitymarkup": "0",
"requestid": uuid.uuid4(),
}))
request = Request(url, data=wav_data, headers={
request = Request(url, data=chunked_transfer_encoding_data, headers={
"Authorization": "Bearer {}".format(access_token),
"Content-Type": "audio/wav; samplerate=16000; sourcerate={}; trustsourcerate=true".format(audio_data.sample_rate),
"Content-type": "audio/wav; codec=\"audio/pcm\"; samplerate=16000",
"Transfer-Encoding": "chunked",
})
try:
response = urlopen(request, timeout=self.operation_timeout)
@@ -972,8 +971,8 @@ def recognize_bing(self, audio_data, key, language="en-US", show_all=False):
# return results
if show_all: return result
if "header" not in result or "lexical" not in result["header"]: raise UnknownValueError()
return result["header"]["lexical"]
if "RecognitionStatus" not in result or result["RecognitionStatus"] != "Success" or "DisplayText" not in result: raise UnknownValueError()
return result["DisplayText"]
def recognize_houndify(self, audio_data, client_id, client_key, show_all=False):
"""
@@ -1050,8 +1049,8 @@ def recognize_ibm(self, audio_data, username, password, language="en-US", show_a
)
url = "https://stream.watsonplatform.net/speech-to-text/api/v1/recognize?{}".format(urlencode({
"profanity_filter": "false",
"continuous": "true",
"model": "{}_BroadbandModel".format(language),
"inactivity_timeout": -1, # don't stop recognizing when the audio stream activity stops
}))
request = Request(url, data=flac_data, headers={
"Content-Type": "audio/x-flac",

0 comments on commit 54dbf48

Please sign in to comment.