Uberi · Uberi · Oct 8, 2017 · Apr 25, 2017 · May 9, 2017 · May 9, 2017
diff --git a/README.rst b/README.rst
@@ -32,6 +32,7 @@ Speech recognition engine/API support:
 * `Microsoft Bing Voice Recognition <https://www.microsoft.com/cognitive-services/en-us/speech-api>`__
 * `Houndify API <https://houndify.com/>`__
 * `IBM Speech to Text <http://www.ibm.com/smarterplanet/us/en/ibmwatson/developercloud/speech-to-text.html>`__
+* `Snowboy Hotword Detection <https://snowboy.kitt.ai/>`__ (works offline)
 
 **Quickstart:** ``pip install SpeechRecognition``. See the "Installing" section for more details.
 
@@ -344,6 +345,7 @@ Authors
     sbraden <braden.sarah@gmail.com> (Sarah Braden)
     tb0hdan (Bohdan Turkynewych)
     Thynix <steve@asksteved.com> (Steve Dougherty)
+    beeedy <broderick.carlin@gmail.com> (Broderick Carlin)
 
 Please report bugs and suggestions at the `issue tracker <https://github.com/Uberi/speech_recognition/issues>`__!
 

diff --git a/setup.py b/setup.py
@@ -49,7 +49,7 @@ def run(self):
     description=speech_recognition.__doc__,
     long_description=open("README.rst").read(),
     license=speech_recognition.__license__,
-    keywords="speech recognition voice sphinx google wit bing api houndify ibm",
+    keywords="speech recognition voice sphinx google wit bing api houndify ibm snowboy",
     url="https://github.com/Uberi/speech_recognition#readme",
     classifiers=[
         "Development Status :: 5 - Production/Stable",

diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
@@ -19,6 +19,8 @@
 import hmac
 import time
 import uuid
+import sys
+import struct
 
 __author__ = "Anthony Zhang (Uberi)"
 __version__ = "3.6.5"
@@ -515,7 +517,97 @@ def adjust_for_ambient_noise(self, source, duration=1):
             target_energy = energy * self.dynamic_energy_ratio
             self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
 
-    def listen(self, source, timeout=None, phrase_time_limit=None):
+    def __wait_for_hot_word(self, snowboy_location, hot_words, source, timeout=None):
+        """
+        Blocks until a hot word (sometimes referred to as a wake word) is found in the audio input, or until ``timeout`` seconds of audio have been read.
+
+        Intended to be used as a means to limit network traffic and reduce the cost of online speech-to-text services. Currently utilizes the Snowboy service, which is free for hobbyists with a paid option for commercial use.
+
+        ``snowboy_location`` is the local top level directory containing the compiled Snowboy files (it is expected to contain ``snowboydetect.py`` and ``resources/common.res``).
+
+        ``hot_words`` is a sequence (e.g. a list) of local file paths of models provided by the Snowboy service, in either .pmdl or .umdl format.
+
+        ``source`` is the audio input, an ``AudioSource`` instance that has already been entered. ``timeout`` is the maximum number of seconds of audio to wait through, or ``None`` (or 0) to wait indefinitely.
+
+        Returns a tuple ``(audio, elapsed_time)``: ``audio`` is the buffered raw input audio as bytes if a hot word was detected, or ``None`` if the timeout elapsed or the stream ended; ``elapsed_time`` is the accumulated duration, in seconds, of the chunks counted while waiting.
+        """
+        assert isinstance(source, AudioSource), "Source must be an audio source"
+        assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
+        assert snowboy_location is not None, "Need to specify snowboy_location argument if using hot words"
+        assert os.path.isfile(os.path.join(snowboy_location, "snowboydetect.py")), "Can not find snowboydetect.py. Make sure snowboy_location is pointed at the root directory"
+        for f in hot_words: assert os.path.isfile(f), "Unable to locate file with given path: {}".format(f)
+
+        # make the Snowboy bindings importable, without growing ``sys.path`` on every call
+        if snowboy_location not in sys.path:
+            sys.path.append(snowboy_location)
+        import snowboydetect
+
+        models = ",".join(hot_words)
+        # get file path to needed resource file
+        resource = os.path.join(snowboy_location, "resources", "common.res")
+        detector = snowboydetect.SnowboyDetect(resource_filename=resource.encode(), model_str=models.encode())
+        detector.SetAudioGain(1.0)
+        sensitivity = [0.4] * len(hot_words)
+        sensitivity_str = ",".join(str(t) for t in sensitivity)
+        detector.SetSensitivity(sensitivity_str.encode())
+
+        # create a deque to store our raw mic input data and one to store snowboy downsampled data, each capped at 5 * sample rate elements
+        # NOTE: ``maxlen`` counts deque elements, which are single bytes here, so with multi-byte samples each buffer holds less than 5 seconds of audio
+        mic_buffer = collections.deque(maxlen=(source.SAMPLE_RATE * 5))
+        sb_buffer = collections.deque(maxlen=(detector.SampleRate() * 5))
+
+        # snowboy requires a specific sample rate that it provides, to avoid a ripple of issues we will just downsample momentarily by this amount
+        resample_ratio = float(source.SAMPLE_RATE) / float(detector.SampleRate())
+        resample_count = 0
+
+        seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
+        elapsed_time = 0
+
+        while True:
+            # give up once more than ``timeout`` seconds of audio have been counted (no limit when ``timeout`` is ``None`` or 0)
+            elapsed_time += seconds_per_buffer
+            if timeout and elapsed_time > timeout:
+                break
+
+            buffer = source.stream.read(source.CHUNK)
+            if len(buffer) == 0: break  # reached end of the stream
+
+            # record mic data for use later
+            mic_buffer.extend(buffer)
+
+            # convert bytes into ints so we can downsample; ``//`` keeps the sample count an int (``/`` yields a float on Python 3, which cannot repeat a string); the ``h`` format assumes 16-bit signed samples
+            int_data = struct.unpack('<' + ('h' * (len(buffer) // source.SAMPLE_WIDTH)), buffer)
+            ds_data = []
+
+            # rough downsampling, can handle downsampling by non-integer values
+            for i, sample in enumerate(int_data):
+                if resample_count <= 0:
+                    # grab the previous sample too, falling back to the current one at the start of the chunk
+                    prev_sample = int_data[i - 1] if i != 0 else sample
+
+                    # get a number between 0 and 1, this is used to linearly interpolate between the two samples we have
+                    ratio = 0.0 - resample_count
+                    fab_sample = int((1.0 - ratio) * sample + (ratio) * prev_sample + 0.5)
+                    ds_data.append(fab_sample)
+                    resample_count += resample_ratio
+
+                resample_count -= 1.0
+
+            # convert back into bytes so we can feed it into snowboy
+            sb_buffer.extend(struct.pack('<' + ('h' * len(ds_data)), *ds_data))
+
+            # actually run the snowboy detector
+            ans = detector.RunDetection(bytes(bytearray(sb_buffer)))
+            assert ans != -1, "Error initializing streams or reading audio data"
+
+            # if ans is greater than 0, we found a wake word! return the buffered audio
+            # (convert via ``bytearray`` like the detector input above; plain ``bytes(deque)`` is ``str(deque)`` on Python 2)
+            if ans > 0:
+                return bytes(bytearray(mic_buffer)), elapsed_time
+        # timed out or reached the end of the stream: return no audio, but still report the time spent waiting
+        return None, elapsed_time
+
+    def listen(self, source, timeout=None, phrase_time_limit=None, hot_words=[], snowboy_location=None, wait_for_hot_word=False):
         """
         Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
 
@@ -531,6 +623,10 @@ def listen(self, source, timeout=None, phrase_time_limit=None):
         assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
         assert self.pause_threshold >= self.non_speaking_duration >= 0
 
+        # just make sure hot_words is iterable
+        if not hasattr(hot_words, '__iter__'):
+            hot_words = [hot_words]
+
         seconds_per_buffer = (source.CHUNK + 0.0) / source.SAMPLE_RATE
         pause_buffer_count = int(math.ceil(self.pause_threshold / seconds_per_buffer))  # number of buffers of non-speaking audio during a phrase, before the phrase should be considered complete
         phrase_buffer_count = int(math.ceil(self.phrase_threshold / seconds_per_buffer))  # minimum number of buffers of speaking audio before we consider the speaking audio a phrase
@@ -568,6 +664,13 @@ def listen(self, source, timeout=None, phrase_time_limit=None):
             # read audio input until the phrase ends
             pause_count, phrase_count = 0, 0
             phrase_start_time = elapsed_time
+
+            if wait_for_hot_word:
+                audio_data, delta_time = self.__wait_for_hot_word(snowboy_location, hot_words, source, timeout)
+                elapsed_time += delta_time
+                if audio_data is None:
+                    continue
+                frames.append(audio_data)
             while True:
                 # handle phrase being too long by cutting off the audio
                 elapsed_time += seconds_per_buffer