From fbc88dbb5f82909da655c05958c36e85e3351ebe Mon Sep 17 00:00:00 2001
From: RandoInternetPreson
Date: Mon, 1 Jul 2024 15:48:34 -0400
Subject: [PATCH] Update script.py

A JS implementation of voice recording. The Gradio microphone element
was causing different errors in different browsers and would eventually
crash them.

https://github.com/oobabooga/text-generation-webui/pull/5929#issuecomment-2198588327

This code and the accompanying script.js file were originally taken from:

https://github.com/oobabooga/text-generation-webui/commit/8ad2b6585a08edcd2574e5662741530966192ce9#diff-e2fbfb497940c19efe0964ba8d6656b6edad2ae579e155c8e294ec7850c76276

and then edited with the help of multiple LLMs until I had something
working well.
---
 extensions/whisper_stt/script.py | 136 +++++++++++++++++++++++--------
 1 file changed, 101 insertions(+), 35 deletions(-)

diff --git a/extensions/whisper_stt/script.py b/extensions/whisper_stt/script.py
index efc62f41e3..5bc277e589 100644
--- a/extensions/whisper_stt/script.py
+++ b/extensions/whisper_stt/script.py
@@ -1,5 +1,10 @@
 import gradio as gr
 import speech_recognition as sr
+import numpy as np
+import base64
+import os
+import whisper
+
 
 from modules import shared
 
@@ -8,14 +13,12 @@
     'value': ["", ""]
 }
 
-# parameters which can be customized in settings.json of webui
 params = {
     'whipser_language': 'english',
     'whipser_model': 'small.en',
     'auto_submit': True
 }
 
-
 def chat_input_modifier(text, visible_text, state):
     global input_hijack
     if input_hijack['state']:
@@ -24,48 +27,111 @@ def chat_input_modifier(text, visible_text, state):
     else:
         return text, visible_text
 
-
-def do_stt(audio, whipser_model, whipser_language):
-    transcription = ""
-    r = sr.Recognizer()
-
-    # Convert to AudioData
-    audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)
-
+def do_stt(audio, whisper_model, whisper_language):
+    print(f"Attempting to transcribe with model {whisper_model} and language {whisper_language}")
+
     try:
-        transcription = r.recognize_whisper(audio_data, language=whipser_language, model=whipser_model)
-    except sr.UnknownValueError:
-        print("Whisper could not understand audio")
-    except sr.RequestError as e:
-        print("Could not request results from Whisper", e)
-
-    return transcription
-
-
-def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
-    if audio is None:
+        # Load Whisper model
+        print("Loading Whisper model...")
+        model = whisper.load_model(whisper_model)
+        print("Whisper model loaded successfully")
+
+        # Convert audio data to the format Whisper expects
+        audio_np = audio[1].astype(np.float32) / 32768.0
+
+        print(f"Audio data shape: {audio_np.shape}, Sample rate: {audio[0]}")
+
+        # Transcribe
+        print("Starting Whisper transcription...")
+        result = model.transcribe(audio_np, language=whisper_language, fp16=False)
+        transcription = result["text"]
+        print("Whisper transcription completed")
+
+        return transcription
+    except Exception as e:
+        print(f"Error in do_stt: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return ""
+
+def auto_transcribe(audio_base64, auto_submit, whipser_model, whipser_language):
+    print("auto_transcribe called")
+    print(f"auto_submit: {auto_submit}")
+    print(f"whipser_model: {whipser_model}")
+    print(f"whipser_language: {whipser_language}")
+
+    if audio_base64 is None or audio_base64 == "":
+        print("No audio data received")
         return "", ""
-    transcription = do_stt(audio, whipser_model, whipser_language)
-    if auto_submit:
-        input_hijack.update({"state": True, "value": [transcription, transcription]})
-
-    return transcription, None
-
+
+    try:
+        # Decode base64 audio data
+        audio_bytes = base64.b64decode(audio_base64.split(',')[1])
+        print(f"Decoded audio bytes length: {len(audio_bytes)}")
+
+        print("Processing audio...")
+        # Convert WebM to PCM using ffmpeg
+        import subprocess
+
+        command = ['ffmpeg', '-i', 'pipe:0', '-ar', '16000', '-ac', '1', '-f', 's16le', '-loglevel', 'error', '-']
+        process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        output, error = process.communicate(input=audio_bytes)
+
+        if error:
+            print("FFmpeg error:", error.decode())
+
+        # Convert to numpy array
+        audio_np = np.frombuffer(output, dtype=np.int16)
+
+        # Use 16kHz sample rate for Whisper
+        sample_rate = 16000
+        audio = (sample_rate, audio_np)
+
+        transcription = do_stt(audio, whipser_model, whipser_language)
+        print(f"Transcription: {transcription}")
+
+        if auto_submit:
+            input_hijack.update({"state": True, "value": [transcription, transcription]})
+
+        return transcription, None
+    except Exception as e:
+        print(f"Error in auto_transcribe: {str(e)}")
+        import traceback
+        traceback.print_exc()
+        return "", None
+
 def ui():
     with gr.Accordion("Whisper STT", open=True):
-        with gr.Row():
-            audio = gr.Audio(source="microphone")
+        audio_base64 = gr.Textbox(elem_id="audio-base64", visible=False)
         with gr.Row():
             with gr.Accordion("Settings", open=False):
                 auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
-                whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
-                whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])
-
-        audio.stop_recording(
-            auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
-            None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")
+                whipser_model = gr.Dropdown(
+                    label='Whisper Model',
+                    value=params['whipser_model'],
+                    choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"]
+                )
+                whipser_language = gr.Dropdown(
+                    label='Whisper Language',
+                    value=params['whipser_language'],
"catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"] # Add more languages as needed + ) + + audio_base64.change( + auto_transcribe, + inputs=[audio_base64, auto_submit, whipser_model, whipser_language], + outputs=[shared.gradio['textbox'], audio_base64] + ).then( + None, auto_submit, None, + _js="(check) => {if (check) { document.getElementById('Generate').click() }}" + ) whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None) whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None) auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None) + +def custom_js(): + js_file_path = os.path.join(os.path.dirname(__file__), "script.js") + with open(js_file_path, "r") as js_file: + return js_file.read()