1. Installing Libraries

In [1]:
!pip install SpeechRecognition pydub
!apt-get install ffmpeg -y
!pip install sounddevice scipy

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Collecting sounddevice
  Downloading sounddevice-0.5.2-py3-none-any.whl.metadata (1.6 kB)
Downloading sounddevice-0.5.2-py3-none-any.whl (32 kB)
Installing collected packages: sounddevice
Successfully installed sounddevice-0.5.2


2. Upload an Audio File

In [2]:
from google.colab import files

# files.upload() opens the browser file picker; the result is a mapping whose
# keys are the uploaded file names.
uploaded = files.upload()

# If the mapping comes back empty (e.g. the dialog was dismissed), stop here with
# a clear message instead of a bare IndexError from indexing an empty list.
if not uploaded:
    raise ValueError("No file was uploaded - re-run this cell and choose an audio file.")

# Only the first uploaded file is used by the rest of the notebook.
audio_file = next(iter(uploaded))
print(f"Uploaded file: {audio_file}")


Saving harvard.wav to harvard.wav
Uploaded file: harvard.wav


3. Convert MP3 to WAV

In [3]:
from pydub import AudioSegment

# Compare the extension case-insensitively so uploads named e.g. "CLIP.MP3" are
# converted as well; a plain endswith('.mp3') silently skips them.
if audio_file.lower().endswith('.mp3'):
    sound = AudioSegment.from_mp3(audio_file)
    # From here on the notebook works with the converted copy, not the upload.
    audio_file = "converted.wav"
    sound.export(audio_file, format="wav")
    print("Converted MP3 to WAV for recognition")


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


4. Recognize Speech from Audio File

In [4]:
import speech_recognition as sr

r = sr.Recognizer()

# Load the whole file into memory; the file is closed again as soon as the
# `with` block ends, before the recognition request is made.
with sr.AudioFile(audio_file) as source:
    print("Processing audio...")
    audio_data = r.record(source)

try:
    text = r.recognize_google(audio_data)
except sr.UnknownValueError:
    # Raised when the recognizer could not make sense of the audio.
    print("Sorry, could not understand the audio")
except sr.RequestError as e:
    # Raised when the recognition request itself could not be completed.
    print(f"Could not request results; {e}")
else:
    print("Recognized Text:")
    print(text)



Processing audio...
Recognized Text:
the stale smell of old beer lingers it takes heat to bring out the odor a cold dip restores health and zest a salt pickle taste fine with ham tacos al pastor are my favorite a zestful food is the hot cross bun


5. Record Audio in Colab

In [6]:
from IPython.display import Javascript, display
from base64 import b64decode
import io
import wave

def record_audio(filename="recorded.wav", duration=5):
    """Record from the browser microphone and pass the result to the kernel.

    Displays a JavaScript snippet in the cell output. The snippet captures
    audio with ``MediaRecorder`` for ``duration`` seconds, base64-encodes the
    recording and invokes the kernel callback registered under
    ``'notebook.save_audio'`` with ``[base64data, filename]``.

    The function returns as soon as the script has been displayed; it does not
    wait for the recording to finish. The callback must therefore already be
    registered, and the output file only exists once the callback has run.

    Args:
        filename: Name forwarded to the callback as the target file. It is
            JSON-encoded before being embedded in the script, so quotes and
            backslashes in the name cannot break the generated JavaScript.
        duration: Recording length in seconds. Fractional values are honoured
            (converted to whole milliseconds).

    Note:
        ``MediaRecorder`` is created without an explicit mime type, so the
        recorded bytes are in whatever container the browser chooses; a
        ``.wav`` file name does not make the content WAV-encoded.
    """
    import json  # local import: only needed to build the script below

    duration_ms = int(float(duration) * 1000)
    js_filename = json.dumps(str(filename))  # safe JS string literal, quotes included

    display(Javascript("""
    async function recordAudio() {
        const stream = await navigator.mediaDevices.getUserMedia({audio:true});
        const mediaRecorder = new MediaRecorder(stream);
        let chunks = [];
        mediaRecorder.ondataavailable = e => chunks.push(e.data);
        // Attach the stop handler before stop() is called so the event cannot be missed.
        const stopped = new Promise(resolve => mediaRecorder.onstop = resolve);
        mediaRecorder.start();
        await new Promise(resolve => setTimeout(resolve, %d));
        mediaRecorder.stop();
        await stopped;
        // Release the microphone; otherwise the capture stays active after recording.
        stream.getTracks().forEach(track => track.stop());
        let blob = new Blob(chunks);
        let reader = new FileReader();
        reader.onloadend = () => {
            const base64data = reader.result.split(',')[1];
            google.colab.kernel.invokeFunction('notebook.save_audio', [base64data, %s], {});
        };
        reader.readAsDataURL(blob);
    }
    recordAudio();
    """ % (duration_ms, js_filename)))

from google.colab import output

def save_audio(base64data, filename):
    """Decode a base64 payload and write the resulting bytes to ``filename``.

    Args:
        base64data: Base64-encoded content to store.
        filename: Path of the file to create or overwrite (opened in binary mode).

    Prints a confirmation line naming the file once the write has completed.
    """
    # Decode before opening the target so a malformed payload never truncates it.
    decoded = b64decode(base64data)
    with open(filename, "wb") as sink:
        sink.write(decoded)
    print(f"Saved audio to {filename}")

# The browser-side recorder delivers its result by invoking this named kernel
# callback, so it has to be registered before a recording is started.
output.register_callback('notebook.save_audio', save_audio)

# Start a 5-second capture that will be stored as recorded.wav.
record_audio(filename="recorded.wav", duration=5)


<IPython.core.display.Javascript object>

Saved audio to recorded.wav


6. Fix the Recorded Audio (Re-encode It as a Proper WAV File)

In [7]:
import os

from pydub import AudioSegment

# The recording is written by a browser callback some time after record_audio()
# returns, so the file may not exist yet when this cell runs. Check for it
# explicitly so the message says what to do instead of reporting a decoder error.
if not os.path.exists("recorded.wav"):
    print("Error converting audio: recorded.wav not found. "
          "Wait for the 'Saved audio to recorded.wav' message, then run this cell again.")
else:
    try:
        # Re-encode whatever the browser produced into a real WAV file.
        sound = AudioSegment.from_file("recorded.wav")
        sound.export("recorded_fixed.wav", format="wav")
        print("Converted to proper WAV format: recorded_fixed.wav")
    except Exception as e:
        # Best-effort cell: report the problem and keep the notebook running.
        print("Error converting audio:", e)


Converted to proper WAV format: recorded_fixed.wav


7. Recognize Speech from Recorded Audio

In [8]:
import speech_recognition as sr

# Work on the re-encoded recording produced by the previous step.
audio_file = "recorded_fixed.wav"
r = sr.Recognizer()

# Read the complete recording into memory, then release the file.
with sr.AudioFile(audio_file) as source:
    print("Processing audio...")
    audio_data = r.record(source)

try:
    text = r.recognize_google(audio_data)
except sr.UnknownValueError:
    # Raised when the recognizer could not make sense of the audio.
    print("Sorry, could not understand the audio")
except sr.RequestError as e:
    # Raised when the recognition request itself could not be completed.
    print(f"API Error: {e}")
else:
    print("Recognized Text:")
    print(text)


Processing audio...
Recognized Text:
hello hello


8. Launch the App

In [None]:
import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
import tempfile
import os


def transcribe_audio(audio_path):
    """Transcribe an audio file and return the text (or a readable error message).

    The input is first re-encoded to a temporary WAV file with pydub, then read
    with ``sr.AudioFile`` and sent to ``recognize_google``. The temporary file
    is removed before returning, whichever path is taken.

    Args:
        audio_path: Path of the audio file to transcribe. An empty value
            (``None`` or ``""``) is reported as "no audio" rather than being
            passed to the decoder.

    Returns:
        The recognized text on success, otherwise one of the user-facing
        messages below (no audio, conversion failure, unintelligible audio,
        API error). Errors are returned as strings because the result is shown
        directly in a text box.
    """
    # Presumably what the UI passes when the button is clicked with nothing
    # recorded or uploaded; answer directly instead of surfacing a decoder error.
    if not audio_path:
        return "No audio provided. Please record or upload a clip first."

    wav_path = None
    try:
        try:
            sound = AudioSegment.from_file(audio_path)
            # Only the unique name is needed; close the handle right away so it
            # is not leaked and the exporter can reopen the path itself.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_wav:
                wav_path = temp_wav.name
            sound.export(wav_path, format="wav")
        except Exception as e:
            return f"Audio conversion failed: {e}"

        recognizer = sr.Recognizer()
        try:
            with sr.AudioFile(wav_path) as source:
                audio_data = recognizer.record(source)
            return recognizer.recognize_google(audio_data)
        except sr.UnknownValueError:
            return "Sorry, could not understand the audio"
        except sr.RequestError as e:
            return f"API Error: {e}"
    finally:
        # Covers every exit path, including a failed export that already
        # created the temporary file.
        if wav_path and os.path.exists(wav_path):
            os.remove(wav_path)


def _transcription_tab(tab_title, audio_label, button_label):
    """Build one tab: an audio input, a submit button and a result text box.

    The button is wired to ``transcribe_audio`` with the audio component as
    input and the text box as output. Returns ``(audio, button, textbox)``.
    """
    with gr.Tab(tab_title):
        audio_in = gr.Audio(type="filepath", label=audio_label)
        submit = gr.Button(button_label)
        text_out = gr.Textbox(label="Recognized Text", lines=5)

        submit.click(fn=transcribe_audio, inputs=audio_in, outputs=text_out)
    return audio_in, submit, text_out


with gr.Blocks() as demo:
    gr.Markdown("##                                              Speech Recognition System ")

    # Both tabs share the same layout and handler; only the labels differ.
    mic_audio, mic_submit, mic_output = _transcription_tab(
        "🎤 Record & Transcribe",
        "Record your voice",
        "📝 Transcribe Recorded Audio",
    )
    upload_audio, upload_submit, upload_output = _transcription_tab(
        "📁 Upload Audio File",
        "Upload Audio (MP3/WAV)",
        "📝 Transcribe Uploaded Audio",
    )

# debug=True keeps the cell running so errors and logs stay visible in the output.
demo.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b59a336aa243221f10.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
