From 3b75304f38def66410222f8394f7f39e5ef1aee3 Mon Sep 17 00:00:00 2001 From: Christian Date: Thu, 30 Nov 2017 10:39:54 +0100 Subject: [PATCH 1/9] first working tensorflow version --- speech_recognition/__init__.py | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 26ec8118..644a19de 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1201,6 +1201,54 @@ def recognize_ibm(self, audio_data, username, password, language="en-US", show_a transcription.append(hypothesis["transcript"]) return "\n".join(transcription) + def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_actions_frozen.pb', tensor_label='tensorflow-data/conv_actions_labels.txt', show_all=False): + """ + Performs speech recognition on ``audio_data`` (an ``AudioData`` instance). + + Tensor loaded from ``tensor_graph``. + + Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. + + Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + """ + assert isinstance(audio_data, AudioData), "Data must be audio data" + assert isinstance(tensor_graph, str), "``tensor_graph`` must be a string" + assert isinstance(tensor_label, str), "``tensor_label`` must be a string" + + import tensorflow as tf + from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio + + wav_data = audio_data.get_wav_data( + convert_rate=16000, convert_width=2 + ) + # load graph + with tf.gfile.FastGFile(tensor_graph, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + + labels = [line.rstrip() for line in tf.gfile.GFile(tensor_label)] + + with tf.Session() as sess: + input_layer_name = 'wav_data:0' + output_layer_name = 'labels_softmax:0' + num_top_predictions = 1 + # Feed the audio data as input to the graph. + # predictions will contain a two-dimensional array, where one + # dimension represents the input image count, and the other has + # predictions per class + softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name) + predictions, = sess.run(softmax_tensor, {input_layer_name: wav_data}) + + # Sort to show labels in order of confidence + top_k = predictions.argsort()[-num_top_predictions:][::-1] + for node_id in top_k: + human_string = labels[node_id] + score = predictions[node_id] + print('%s (score = %.5f)' % (human_string, score)) + return human_string + + def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" From d46c28855c4df951405be348c28e6073e0c18f12 Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 1 Dec 2017 12:06:53 +0100 Subject: [PATCH 2/9] tensorflow returns only best label --- speech_recognition/__init__.py | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 644a19de..31887f4b 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1201,22 +1201,23 @@ def recognize_ibm(self, audio_data, username, password, language="en-US", show_a transcription.append(hypothesis["transcript"]) return "\n".join(transcription) - def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_actions_frozen.pb', tensor_label='tensorflow-data/conv_actions_labels.txt', show_all=False): + def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_actions_frozen.pb', tensor_label='tensorflow-data/conv_actions_labels.txt'): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance). - Tensor loaded from ``tensor_graph``. + Path to Tensor loaded from ``tensor_graph``. You can download a model here: http://download.tensorflow.org/models/speech_commands_v0.01.zip - Returns the most likely transcription if ``show_all`` is false (the default). Otherwise, returns the `raw API response `__ as a JSON dictionary. - - Raises a ``speech_recognition.UnknownValueError`` exception if the speech is unintelligible. Raises a ``speech_recognition.RequestError`` exception if the speech recognition operation failed, if the key isn't valid, or if there is no internet connection. + Path to Tensor Labels file loaded from ``tensor_label``. """ assert isinstance(audio_data, AudioData), "Data must be audio data" assert isinstance(tensor_graph, str), "``tensor_graph`` must be a string" assert isinstance(tensor_label, str), "``tensor_label`` must be a string" - import tensorflow as tf - from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio + try: + import tensorflow as tf + from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio + except ImportError: + raise RequestError("missing tensorflow module: ensure that tensorflow is set up correctly.") wav_data = audio_data.get_wav_data( convert_rate=16000, convert_width=2 @@ -1226,30 +1227,22 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac graph_def = tf.GraphDef() graph_def.ParseFromString(f.read()) tf.import_graph_def(graph_def, name='') - + # load labels labels = [line.rstrip() for line in tf.gfile.GFile(tensor_label)] with tf.Session() as sess: input_layer_name = 'wav_data:0' output_layer_name = 'labels_softmax:0' num_top_predictions = 1 - # Feed the audio data as input to the graph. - # predictions will contain a two-dimensional array, where one - # dimension represents the input image count, and the other has - # predictions per class softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name) predictions, = sess.run(softmax_tensor, {input_layer_name: wav_data}) - # Sort to show labels in order of confidence - top_k = predictions.argsort()[-num_top_predictions:][::-1] + # Sort labels in order of confidence + top_k = predictions.argsort()[-1:][::-1] for node_id in top_k: human_string = labels[node_id] - score = predictions[node_id] - print('%s (score = %.5f)' % (human_string, score)) return human_string - - def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" flac_converter = shutil_which("flac") # check for installed version first From 890f1f4c488cfc3f4933b31f217c0f32c8c39421 Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 1 Dec 2017 12:25:36 +0100 Subject: [PATCH 3/9] remove unused imports and vars --- speech_recognition/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 31887f4b..fb4f27f6 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1215,7 +1215,6 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac try: import tensorflow as tf - from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio except ImportError: raise RequestError("missing tensorflow module: ensure that tensorflow is set up correctly.") @@ -1233,7 +1232,6 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac with tf.Session() as sess: input_layer_name = 'wav_data:0' output_layer_name = 'labels_softmax:0' - num_top_predictions = 1 softmax_tensor = sess.graph.get_tensor_by_name(output_layer_name) predictions, = sess.run(softmax_tensor, {input_layer_name: wav_data}) From b4507e6c7f4c9fe321dcab722341c424661ccba5 Mon Sep 17 00:00:00 2001 From: Christian Date: Fri, 1 Dec 2017 14:57:36 +0100 Subject: [PATCH 4/9] unused import is needed --- speech_recognition/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index fb4f27f6..86850305 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1215,6 +1215,7 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac try: import tensorflow as tf + from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio except ImportError: raise RequestError("missing tensorflow module: ensure that tensorflow is set up correctly.") From f5e72fb1a994f8df9484edb2da1700dc053e4e19 Mon Sep 17 00:00:00 2001 From: Christian Date: Tue, 5 Dec 2017 11:04:26 +0100 Subject: [PATCH 5/9] load graph once --- speech_recognition/__init__.py | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index 86850305..a67fd249 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1201,6 +1201,9 @@ def recognize_ibm(self, audio_data, username, password, language="en-US", show_a transcription.append(hypothesis["transcript"]) return "\n".join(transcription) + lasttfgraph = '' + tflabels = None + def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_actions_frozen.pb', tensor_label='tensorflow-data/conv_actions_labels.txt'): """ Performs speech recognition on ``audio_data`` (an ``AudioData`` instance). @@ -1219,16 +1222,20 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac except ImportError: raise RequestError("missing tensorflow module: ensure that tensorflow is set up correctly.") + if not (tensor_graph == self.lasttfgraph): + self.lasttfgraph = tensor_graph + + # load graph + with tf.gfile.FastGFile(tensor_graph, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + # load labels + self.tflabels = [line.rstrip() for line in tf.gfile.GFile(tensor_label)] + wav_data = audio_data.get_wav_data( convert_rate=16000, convert_width=2 ) - # load graph - with tf.gfile.FastGFile(tensor_graph, 'rb') as f: - graph_def = tf.GraphDef() - graph_def.ParseFromString(f.read()) - tf.import_graph_def(graph_def, name='') - # load labels - labels = [line.rstrip() for line in tf.gfile.GFile(tensor_label)] with tf.Session() as sess: input_layer_name = 'wav_data:0' @@ -1239,7 +1246,7 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac # Sort labels in order of confidence top_k = predictions.argsort()[-1:][::-1] for node_id in top_k: - human_string = labels[node_id] + human_string = self.tflabels[node_id] return human_string def get_flac_converter(): From 06c9353e78a26cc071d9ce577aaea81b26793fd1 Mon Sep 17 00:00:00 2001 From: Christian Date: Wed, 6 Dec 2017 15:38:07 +0100 Subject: [PATCH 6/9] removed spaces in blank lines --- speech_recognition/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index f7a1eeae..ca101dfe 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1200,7 +1200,7 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac if not (tensor_graph == self.lasttfgraph): self.lasttfgraph = tensor_graph - + # load graph with tf.gfile.FastGFile(tensor_graph, 'rb') as f: graph_def = tf.GraphDef() @@ -1208,7 +1208,7 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac tf.import_graph_def(graph_def, name='') # load labels self.tflabels = [line.rstrip() for line in tf.gfile.GFile(tensor_label)] - + wav_data = audio_data.get_wav_data( convert_rate=16000, convert_width=2 ) @@ -1225,6 +1225,7 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac human_string = self.tflabels[node_id] return human_string + def get_flac_converter(): """Returns the absolute path of a FLAC converter executable, or raises an OSError if none can be found.""" flac_converter = shutil_which("flac") # check for installed version first From 5b2f5208a30a8f9ef412fffa75e03152acbf7592 Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 11 Dec 2017 11:18:27 +0100 Subject: [PATCH 7/9] no unused import --- speech_recognition/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py index ca101dfe..fbf1ccfb 100644 --- a/speech_recognition/__init__.py +++ b/speech_recognition/__init__.py @@ -1194,7 +1194,6 @@ def recognize_tensorflow(self, audio_data, tensor_graph='tensorflow-data/conv_ac try: import tensorflow as tf - from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio except ImportError: raise RequestError("missing tensorflow module: ensure that tensorflow is set up correctly.") From 513959cf85c5439ef1ceca279006abdc9044f759 Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 11 Dec 2017 11:19:52 +0100 Subject: [PATCH 8/9] added example for tensorflow --- examples/tensorflow_commands.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 examples/tensorflow_commands.py diff --git a/examples/tensorflow_commands.py b/examples/tensorflow_commands.py new file mode 100644 index 00000000..5028f5d8 --- /dev/null +++ b/examples/tensorflow_commands.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python3 +import time +import speech_recognition as sr +from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio + +# obtain audio from the microphone +r = sr.Recognizer() +m = sr.Microphone() + +with m as source: + r.adjust_for_ambient_noise(source) + +def callback(recognizer, audio): + try: + # You can download the data here: http://download.tensorflow.org/models/speech_commands_v0.01.zip + spoken = recognizer.recognize_tensorflow(audio, tensor_graph='speech_recognition/tensorflow-data/conv_actions_frozen.pb', tensor_label='speech_recognition/tensorflow-data/conv_actions_labels.txt') + print(spoken) + except sr.UnknownValueError: + print("Tensorflow could not understand audio") + except sr.RequestError as e: + print("Could not request results from Tensorflow service; {0}".format(e)) + +stop_listening = r.listen_in_background(m, callback, phrase_time_limit=0.6) +time.sleep(100) \ No newline at end of file From 0a7bf7cb2b3ad3af6635535670f2f5a639211fc3 Mon Sep 17 00:00:00 2001 From: Christian Date: Mon, 11 Dec 2017 11:35:09 +0100 Subject: [PATCH 9/9] remove linter errors --- examples/tensorflow_commands.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/tensorflow_commands.py b/examples/tensorflow_commands.py index 5028f5d8..50306c6d 100644 --- a/examples/tensorflow_commands.py +++ b/examples/tensorflow_commands.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import time import speech_recognition as sr -from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio +from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio # noqa # obtain audio from the microphone r = sr.Recognizer() @@ -10,6 +10,7 @@ with m as source: r.adjust_for_ambient_noise(source) + def callback(recognizer, audio): try: # You can download the data here: http://download.tensorflow.org/models/speech_commands_v0.01.zip @@ -20,5 +21,6 @@ def callback(recognizer, audio): except sr.RequestError as e: print("Could not request results from Tensorflow service; {0}".format(e)) + stop_listening = r.listen_in_background(m, callback, phrase_time_limit=0.6) -time.sleep(100) \ No newline at end of file +time.sleep(100)