In [None]:
pip install openai-whisper vosk SpeechRecognition pydub ffmpeg-python




In [None]:
import os
import sys
import json
import wave

import speech_recognition as sr
import whisper
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment

# ============================================================
# SET YOUR VOSK MODEL FOLDER PATH HERE
# This must be the path of the unpacked Vosk model *directory*;
# load_vosk_model() exits the program if it is not an existing directory.
# Example (Windows):
# VOSK_MODEL_PATH = r"C:\Users\YourName\Downloads\vosk-model-en-us-daanzu-20200905"
# ============================================================
VOSK_MODEL_PATH = r"/content/drive/MyDrive/vosk-model-en-us-daanzu-20200905"
# ============================================================


# ---------- Utility: Convert any audio to mono 16kHz WAV ----------
def convert_to_wav(input_path: str, output_path: str) -> str:
    """
    Re-encode an audio file as a single-channel, 16 kHz WAV file.

    The file at ``input_path`` is loaded with pydub's ``AudioSegment`` and the
    converted audio is written to ``output_path``. The Vosk and Google
    recognizers in this file both read the converted file.

    Returns ``output_path`` on success. If anything goes wrong the error is
    printed and ``None`` is returned instead (note: despite the ``str``
    return annotation), so callers must check for ``None``.
    """
    try:
        segment = AudioSegment.from_file(input_path)
        # Down-mix to one channel, then resample to 16 kHz.
        mono_16k = segment.set_channels(1).set_frame_rate(16000)
        mono_16k.export(output_path, format="wav")
    except Exception as err:
        print(f"ERROR converting audio to WAV: {err}")
        return None
    return output_path


# ---------------- Whisper (offline) ----------------
def load_whisper_model():
    """Load Whisper's "base" model, printing a message before and after."""
    model_size = "base"  # alternatives: small / medium / large
    print("Loading Whisper model (offline)...")
    loaded = whisper.load_model(model_size)
    print("Whisper model loaded.\n")
    return loaded


def recognize_with_whisper(model, audio_path: str) -> str:
    """
    Transcribe ``audio_path`` with an already-loaded Whisper model.

    ``model`` only needs a ``transcribe(path)`` method returning a mapping
    with an optional ``"text"`` entry.

    Returns the stripped transcript, a fixed "could not recognize" message
    when the transcript is empty, or a ``"Whisper error: ..."`` message if
    transcription raises. This function never raises for ordinary errors.
    """
    nothing_heard = "Whisper could not recognize any speech."
    try:
        print("[Whisper] Recognizing...")
        transcript = model.transcribe(audio_path).get("text", "").strip()
    except Exception as err:
        return f"Whisper error: {str(err)}"

    if transcript:
        return transcript
    return nothing_heard


# ---------------- Vosk (offline) ----------------
def load_vosk_model():
    """
    Load the Vosk model stored in the directory ``VOSK_MODEL_PATH``.

    Returns the loaded ``Model``. If ``VOSK_MODEL_PATH`` is not an existing
    directory, an error is printed and the process exits with status 1.
    """
    if os.path.isdir(VOSK_MODEL_PATH):
        print("Loading Vosk model (offline)...")
        vosk_model = Model(VOSK_MODEL_PATH)
        print("Vosk model loaded.\n")
        return vosk_model

    # No usable model directory: report and stop.
    print(f"ERROR: Vosk model not found at: {VOSK_MODEL_PATH}")
    sys.exit(1)


def recognize_with_vosk(model, original_audio_path: str) -> str:
    """
    Transcribe an audio file with Vosk.

    The input is first converted to a temporary mono 16 kHz WAV file
    ("temp_vosk.wav" in the current directory) via ``convert_to_wav``; the
    WAV is then streamed to a ``KaldiRecognizer`` in 4000-frame chunks.

    Args:
        model: A loaded Vosk ``Model`` (e.g. from ``load_vosk_model``).
        original_audio_path: Path of the audio file to transcribe.

    Returns:
        The recognized text, with the per-segment results joined by single
        spaces; a fixed "could not recognize" message when nothing was
        recognized; or a ``"Vosk error: ..."`` message if conversion or
        recognition fails. Ordinary errors are reported via the return
        value rather than raised.
    """
    temp_wav = "temp_vosk.wav"
    wav_path = convert_to_wav(original_audio_path, temp_wav)
    if wav_path is None:
        return "Vosk error: Could not convert audio to WAV."

    try:
        # The context manager closes the WAV reader on every exit path, and
        # does so before the ``finally`` below deletes the temporary file.
        with wave.open(wav_path, "rb") as wf:
            rec = KaldiRecognizer(model, wf.getframerate())
            rec.SetWords(True)

            print("[Vosk] Recognizing...")
            segments = []
            while True:
                data = wf.readframes(4000)
                if not data:
                    break
                # AcceptWaveform() returning true means a finished segment
                # is available from Result().
                if rec.AcceptWaveform(data):
                    segments.append(json.loads(rec.Result()).get("text", ""))

            # Collect whatever is still buffered in the recognizer.
            segments.append(json.loads(rec.FinalResult()).get("text", ""))

        # Skip empty segments so the transcript has no doubled spaces.
        text = " ".join(part.strip() for part in segments if part.strip())
        return text if text else "Vosk could not recognize any speech."
    except Exception as e:
        return f"Vosk error: {str(e)}"
    finally:
        if os.path.exists(temp_wav):
            os.remove(temp_wav)


# ---------------- Google Speech API (online) ----------------
def recognize_with_google(original_audio_path: str) -> str:
    """
    Transcribe an audio file through ``speech_recognition``'s Google backend.

    The input is converted to a temporary WAV file ("temp_google.wav" in the
    current directory) via ``convert_to_wav``, read in full, and sent to
    ``recognize_google``. The temporary file is removed afterwards.

    Returns the recognized text, or a human-readable message when the
    conversion fails, the audio is not understood, the request fails, or
    any other error occurs.
    """
    recognizer = sr.Recognizer()
    temp_wav = "temp_google.wav"
    converted = convert_to_wav(original_audio_path, temp_wav)
    if converted is None:
        return "Google API error: Could not convert audio to WAV."

    try:
        # Read the whole converted file into memory before recognition.
        with sr.AudioFile(converted) as wav_source:
            captured = recognizer.record(wav_source)
        print("[Google API] Recognizing...")
        return recognizer.recognize_google(captured)
    except sr.UnknownValueError:
        return "Google could not understand the audio. Try speaking more clearly."
    except sr.RequestError:
        return "Google API unavailable. Check your internet connection."
    except Exception as err:
        return f"Google API error: {str(err)}"
    finally:
        if os.path.exists(temp_wav):
            os.remove(temp_wav)


# ---------------- Notes on Accuracy ----------------
def get_accuracy_notes(audio_type: str) -> str:
    """
    Look up the pre-written 'Notes on Accuracy' paragraph for an audio type.

    ``audio_type`` is matched after trimming whitespace and lower-casing, and
    may be either a description (e.g. "fast speech") or the menu number
    ("1" to "5"). Anything unrecognized gets a generic note.
    """
    # Each entry pairs the accepted spellings with the note they select.
    notes_by_alias = (
        (
            ("clear male voice", "male", "1"),
            "For clear male speech, Whisper produced almost perfect transcription with "
            "very few or no word errors. Vosk also recognized the sentence correctly "
            "but sometimes missed or merged small words. Google Speech API was close "
            "to Whisper, with only minor differences in wording or punctuation. "
            "Overall, all three methods work well for clear male voice, with Whisper "
            "and Google slightly ahead of Vosk in accuracy.",
        ),
        (
            ("clear female voice", "female", "2"),
            "For clear female speech, all three models gave good results. Whisper "
            "again produced the most accurate and fluent sentence. Google Speech API "
            "captured the main meaning correctly with only small differences. Vosk "
            "occasionally dropped short function words but the output was still "
            "understandable. Overall accuracy is high for this case, with Whisper "
            "performing best, followed by Google and then Vosk.",
        ),
        (
            ("fast speech", "fast", "3"),
            "With fast speech, Whisper handled the speed best and still produced a "
            "readable transcript, although a few words were merged or guessed. "
            "Google Speech API missed some words and occasionally skipped parts of "
            "the sentence. Vosk struggled more with fast speaking rate, causing "
            "several incorrect or missing words. Fast speech is challenging for all "
            "models, but Whisper clearly performs the best among the three.",
        ),
        (
            ("noisy background", "noisy", "noise", "4"),
            "In noisy background conditions, all models showed reduced accuracy. "
            "Whisper was the most robust and could still capture the main keywords "
            "and intent of the sentence. Google Speech API sometimes confused noise "
            "with speech and produced partial or slightly jumbled text. Vosk gave "
            "more fragmented results with several wrong or missing words. Accuracy "
            "drops noticeably for all systems in noise, with Whisper giving the most "
            "useful output overall.",
        ),
        (
            ("soft voice", "low volume", "soft", "5"),
            "For soft or low-volume speech, Whisper and Google Speech API recognized "
            "parts of the sentence but missed some words, especially at the beginning "
            "or end. Vosk struggled more and sometimes returned very short or almost "
            "empty output. All three models require reasonably loud and clear audio "
            "for best performance, and soft voice clearly reduces recognition "
            "accuracy across the board.",
        ),
    )

    # Used when the user typed a free-form description.
    generic_note = (
        "Accuracy depends on recording quality and speaking style. In general, "
        "Whisper gives the most accurate results, Google Speech API performs well "
        "when internet connectivity is good, and Vosk provides reasonable offline "
        "performance but is more sensitive to noise, speed and low volume."
    )

    wanted = audio_type.strip().lower()
    for aliases, note in notes_by_alias:
        if wanted in aliases:
            return note
    return generic_note


# ---------------- Main ----------------
def main():
    """
    Interactive entry point: transcribe one audio file with all three engines.

    Loads the Whisper and Vosk models, prompts for an audio type and an audio
    file path, runs Whisper, Vosk and Google recognition in that order, and
    prints the results as labelled rows. Exits with status 1 if the audio
    file does not exist.
    """
    # Both offline models are loaded a single time, before prompting.
    whisper_model = load_whisper_model()
    vosk_model = load_vosk_model()

    menu_lines = (
        "=== Speech-to-Text Comparison (Whisper, Vosk, Google) ===",
        "Choose Audio Type for the table:",
        "  1. Clear male voice",
        "  2. Clear female voice",
        "  3. Fast speech",
        "  4. Noisy background",
        "  5. Soft voice",
        "  Or type your own description",
    )
    for menu_line in menu_lines:
        print(menu_line)

    audio_type = input("\nEnter Audio Type (text or option number): ").strip()
    audio_path = input("Enter full path of the audio file (.wav/.mp3/.m4a/.flac): ").strip()

    if not os.path.exists(audio_path):
        print("ERROR: Audio file not found. Check the path.")
        sys.exit(1)

    print("\nRecognizing with all models... Please wait...\n")

    whisper_out = recognize_with_whisper(whisper_model, audio_path)
    vosk_out = recognize_with_vosk(vosk_model, audio_path)
    google_out = recognize_with_google(audio_path)

    notes = get_accuracy_notes(audio_type)

    # ---------- Final formatted output: one padded label per table row ----------
    table_rows = (
        ("Audio Type                     ", audio_type),
        ("Whisper Output                 ", whisper_out),
        ("Vosk Output                    ", vosk_out),
        ("Google API Output              ", google_out),
        ("Any other python libraries     ", "N/A"),
        ("Notes on Accuracy              ", notes),
    )
    print("============== COPY THIS INTO YOUR TABLE ==============")
    for label, value in table_rows:
        print(f"{label}: {value}")
    print("=======================================================")


# Run the interactive comparison only when this module is the entry point,
# not when it is imported.
if __name__ == "__main__":
    main()


Loading Whisper model (offline)...
Whisper model loaded.

Loading Vosk model (offline)...
Vosk model loaded.

=== Speech-to-Text Comparison (Whisper, Vosk, Google) ===
Choose Audio Type for the table:
  1. Clear male voice
  2. Clear female voice
  3. Fast speech
  4. Noisy background
  5. Soft voice
  Or type your own description

Enter Audio Type (text or option number): 4
Enter full path of the audio file (.wav/.mp3/.m4a/.flac): /content/classroom-sounds-98343.mp3

Recognizing with all models... Please wait...

[Whisper] Recognizing...
[Vosk] Recognizing...
[Google API] Recognizing...
Audio Type                     : 4
Whisper Output                 : 와, très稀
Vosk Output                    : Vosk could not recognize any speech.
Google API Output              : Google could not understand the audio. Try speaking more clearly.
Any other python libraries     : N/A
Notes on Accuracy              : In noisy background conditions, all models showed reduced accuracy. Whisper was the most 