Skip to content

Commit

Permalink
Update script.py
Browse files Browse the repository at this point in the history
A js implementation of voice recording. The microphone gradio element was causing different errors in different browsers, causing them to eventually crash.

oobabooga#5929 (comment)


This code and the accompanying script.js files were originally gotten from:

oobabooga@8ad2b65#diff-e2fbfb497940c19efe0964ba8d6656b6edad2ae579e155c8e294ec7850c76276

And edited by multiple LLMs until I got something working well.
  • Loading branch information
RandomInternetPreson committed Jul 1, 2024
1 parent 2f71515 commit fbc88db
Showing 1 changed file with 101 additions and 35 deletions.
136 changes: 101 additions & 35 deletions extensions/whisper_stt/script.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import gradio as gr
import speech_recognition as sr
import numpy as np
import base64
import os
import whisper


from modules import shared

Expand All @@ -8,14 +13,12 @@
'value': ["", ""]
}

# parameters which can be customized in settings.json of webui
params = {
'whipser_language': 'english',
'whipser_model': 'small.en',
'auto_submit': True
}


def chat_input_modifier(text, visible_text, state):
global input_hijack
if input_hijack['state']:
Expand All @@ -24,48 +27,111 @@ def chat_input_modifier(text, visible_text, state):
else:
return text, visible_text


def do_stt(audio, whipser_model, whipser_language):
transcription = ""
r = sr.Recognizer()

# Convert to AudioData
audio_data = sr.AudioData(sample_rate=audio[0], frame_data=audio[1], sample_width=4)

def do_stt(audio, whisper_model, whisper_language):
print(f"Attempting to transcribe with model {whisper_model} and language {whisper_language}")

try:
transcription = r.recognize_whisper(audio_data, language=whipser_language, model=whipser_model)
except sr.UnknownValueError:
print("Whisper could not understand audio")
except sr.RequestError as e:
print("Could not request results from Whisper", e)

return transcription


def auto_transcribe(audio, auto_submit, whipser_model, whipser_language):
if audio is None:
# Load Whisper model
print("Loading Whisper model...")
model = whisper.load_model(whisper_model)
print("Whisper model loaded successfully")

# Convert audio data to the format Whisper expects
audio_np = audio[1].astype(np.float32) / 32768.0

print(f"Audio data shape: {audio_np.shape}, Sample rate: {audio[0]}")

# Transcribe
print("Starting Whisper transcription...")
result = model.transcribe(audio_np, language=whisper_language, fp16=False)
transcription = result["text"]
print("Whisper transcription completed")

return transcription
except Exception as e:
print(f"Error in do_stt: {str(e)}")
import traceback
traceback.print_exc()
return ""

def auto_transcribe(audio_base64, auto_submit, whipser_model, whipser_language):
print("auto_transcribe called")
print(f"auto_submit: {auto_submit}")
print(f"whipser_model: {whipser_model}")
print(f"whipser_language: {whipser_language}")

if audio_base64 is None or audio_base64 == "":
print("No audio data received")
return "", ""
transcription = do_stt(audio, whipser_model, whipser_language)
if auto_submit:
input_hijack.update({"state": True, "value": [transcription, transcription]})

return transcription, None


try:
# Decode base64 audio data
audio_bytes = base64.b64decode(audio_base64.split(',')[1])
print(f"Decoded audio bytes length: {len(audio_bytes)}")

print("Processing audio...")
# Convert WebM to PCM using ffmpeg
import subprocess

command = ['ffmpeg', '-i', 'pipe:0', '-ar', '16000', '-ac', '1', '-f', 's16le', '-loglevel', 'error', '-']
process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = process.communicate(input=audio_bytes)

if error:
print("FFmpeg error:", error.decode())

# Convert to numpy array
audio_np = np.frombuffer(output, dtype=np.int16)

# Use 16kHz sample rate for Whisper
sample_rate = 16000
audio = (sample_rate, audio_np)

transcription = do_stt(audio, whipser_model, whipser_language)
print(f"Transcription: {transcription}")

if auto_submit:
input_hijack.update({"state": True, "value": [transcription, transcription]})

return transcription, None
except Exception as e:
print(f"Error in auto_transcribe: {str(e)}")
import traceback
traceback.print_exc()
return "", None


def ui():
with gr.Accordion("Whisper STT", open=True):
with gr.Row():
audio = gr.Audio(source="microphone")
audio_base64 = gr.Textbox(elem_id="audio-base64", visible=False)
with gr.Row():
with gr.Accordion("Settings", open=False):
auto_submit = gr.Checkbox(label='Submit the transcribed audio automatically', value=params['auto_submit'])
whipser_model = gr.Dropdown(label='Whisper Model', value=params['whipser_model'], choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"])
whipser_language = gr.Dropdown(label='Whisper Language', value=params['whipser_language'], choices=["chinese", "german", "spanish", "russian", "korean", "french", "japanese", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"])

audio.stop_recording(
auto_transcribe, [audio, auto_submit, whipser_model, whipser_language], [shared.gradio['textbox'], audio]).then(
None, auto_submit, None, js="(check) => {if (check) { document.getElementById('Generate').click() }}")
whipser_model = gr.Dropdown(
label='Whisper Model',
value=params['whipser_model'],
choices=["tiny.en", "base.en", "small.en", "medium.en", "tiny", "base", "small", "medium", "large"]
)
whipser_language = gr.Dropdown(
label='Whisper Language',
value=params['whipser_language'],
choices=["english", "german", "french", "japanese", "spanish", "russian", "chinese", "korean", "portuguese", "turkish", "polish", "catalan", "dutch", "arabic", "swedish", "italian", "indonesian", "hindi", "finnish", "vietnamese", "hebrew", "ukrainian", "greek", "malay", "czech", "romanian", "danish", "hungarian", "tamil", "norwegian", "thai", "urdu", "croatian", "bulgarian", "lithuanian", "latin", "maori", "malayalam", "welsh", "slovak", "telugu", "persian", "latvian", "bengali", "serbian", "azerbaijani", "slovenian", "kannada", "estonian", "macedonian", "breton", "basque", "icelandic", "armenian", "nepali", "mongolian", "bosnian", "kazakh", "albanian", "swahili", "galician", "marathi", "punjabi", "sinhala", "khmer", "shona", "yoruba", "somali", "afrikaans", "occitan", "georgian", "belarusian", "tajik", "sindhi", "gujarati", "amharic", "yiddish", "lao", "uzbek", "faroese", "haitian creole", "pashto", "turkmen", "nynorsk", "maltese", "sanskrit", "luxembourgish", "myanmar", "tibetan", "tagalog", "malagasy", "assamese", "tatar", "hawaiian", "lingala", "hausa", "bashkir", "javanese", "sundanese"] # Add more languages as needed
)

audio_base64.change(
auto_transcribe,
inputs=[audio_base64, auto_submit, whipser_model, whipser_language],
outputs=[shared.gradio['textbox'], audio_base64]
).then(
None, auto_submit, None,
_js="(check) => {if (check) { document.getElementById('Generate').click() }}"
)

whipser_model.change(lambda x: params.update({"whipser_model": x}), whipser_model, None)
whipser_language.change(lambda x: params.update({"whipser_language": x}), whipser_language, None)
auto_submit.change(lambda x: params.update({"auto_submit": x}), auto_submit, None)

def custom_js():
js_file_path = os.path.join(os.path.dirname(__file__), "script.js")
with open(js_file_path, "r") as js_file:
return js_file.read()

0 comments on commit fbc88db

Please sign in to comment.