In [1]:
# Install the SpeechRecognition wrapper and PyAudio.
# `%pip` (instead of `!pip`) guarantees the packages land in the environment
# of the running kernel rather than whichever `pip` happens to be first on PATH.
%pip install SpeechRecognition
%pip install pyaudio



In [2]:
# Audio-processing and ASR dependencies used by the pipeline below.
# `%pip` targets the running kernel's environment; scipy is listed explicitly
# because the notebook imports `scipy.io.wavfile` directly.
%pip install SpeechRecognition openai-whisper pydub numpy scipy noisereduce



In [3]:
# NOTE(review): openai-whisper is already installed by the previous cell, so
# this cell is redundant; kept as a no-op, switched to `%pip` so it targets
# the running kernel's environment.
%pip install openai-whisper



In [4]:
# Presumably installed to obtain an ffmpeg binary for pydub / Whisper
# NOTE(review): verify that the bundled binary is actually discoverable.
# The requirement is quoted so the shell cannot glob-expand the `[ffmpeg]`
# extras marker, and `%pip` targets the running kernel's environment.
%pip install "imageio[ffmpeg]"



In [5]:
# Hugging Face Transformers + torchaudio, used by the Wav2Vec2 cell.
# `%pip` installs into the running kernel's environment.
%pip install transformers torchaudio



In [6]:
# PocketSphinx backs `recognizer.recognize_sphinx` (offline ASR).
# `%pip` installs into the running kernel's environment.
%pip install pocketsphinx



In [18]:
import os
import speech_recognition as sr
import whisper
import numpy as np
import scipy.io.wavfile as wav
import noisereduce as nr
from pydub import AudioSegment

# Locate the input recording and stop immediately when it is missing
audio_path = "audio.wav"  # Change this to your actual file
audio_is_present = os.path.exists(audio_path)
if not audio_is_present:
    raise FileNotFoundError(f" Audio file '{audio_path}' not found!")

print("Audio file found. Processing...")

# Normalise the recording to a mono, 16 kHz, 2-bytes-per-sample WAV file
converted_audio_path = "converted_audio.wav"
audio = (
    AudioSegment.from_file(audio_path)
    .set_channels(1)
    .set_frame_rate(16000)
    .set_sample_width(2)
)
audio.export(converted_audio_path, format="wav")
print("Audio converted successfully!")

# Reduce Noise
rate, data = wav.read(converted_audio_path)
data = data.astype(np.float32)  # hand floating-point samples to the denoiser
reduced_noise = nr.reduce_noise(y=data, sr=rate)

# Save Cleaned Audio
cleaned_audio_path = "cleaned_audio.wav"
# The denoised signal is floating point and is not guaranteed to stay inside
# the int16 range. A bare `.astype(np.int16)` would wrap (or be undefined for)
# out-of-range samples and truncate toward zero, so round to the nearest
# integer and clip to the valid range before converting.
int16_limits = np.iinfo(np.int16)
cleaned_samples = np.clip(
    np.rint(reduced_noise), int16_limits.min, int16_limits.max
).astype(np.int16)
wav.write(cleaned_audio_path, rate, cleaned_samples)
print("Noise reduction applied!")

# Initialize Recognizer
recognizer = sr.Recognizer()

# Load Cleaned Audio for Speech Recognition.
# A dedicated name is used so the recorded audio data does not shadow the
# pydub segment that the conversion step stored in `audio`.
with sr.AudioFile(cleaned_audio_path) as source:
    cleaned_audio_data = recognizer.record(source)

# Recognize Speech using Google API.
# `google_text` is always defined afterwards ("N/A" on failure), matching the
# convention used by the short-segment cells further down the notebook.
try:
    google_text = recognizer.recognize_google(cleaned_audio_data)
    print("Google Speech Recognition Output:\n", google_text)
except sr.UnknownValueError:
    google_text = "N/A"
    print("Google Speech Recognition could not understand the audio.")
except sr.RequestError as err:
    google_text = "N/A"
    # Surface the underlying reason instead of discarding it.
    print(f"Google API request failed: {err}")

# Transcribe the cleaned recording with the "base" OpenAI Whisper checkpoint
print("Running Whisper Model...")
whisper_model = whisper.load_model("base")

# fp16=False requests full-precision decoding (the original note says this
# avoids an FP16 error); it does not by itself choose the device.
whisper_result = whisper_model.transcribe(cleaned_audio_path, fp16=False)
whisper_text = whisper_result["text"]

print("OpenAI Whisper Output:\n", whisper_text)
print("Speech recognition complete!")

Audio file found. Processing...
Audio converted successfully!
Noise reduction applied!
Google Speech Recognition Output:
 many animals of even complex structure which live parasitically within others are wholly devoid of an alimentary cavity
Running Whisper Model...
OpenAI Whisper Output:
  Many animals of even complex structure which live parasitically within others are wholly devoid of an elementary cavity.
Speech recognition complete!


In [8]:
from pydub import AudioSegment
import os

# Input: the denoised recording. Output: a clip holding its first 5 seconds.
cleaned_audio_path = "cleaned_audio.wav"
short_audio_path = "short_audio.wav"

# Fail fast when the denoised recording has not been produced yet
cleaned_audio_missing = not os.path.exists(cleaned_audio_path)
if cleaned_audio_missing:
    raise FileNotFoundError(f" File '{cleaned_audio_path}' not found! Ensure it was created properly.")

print(" Extracting first 5 seconds of audio...")

# The slice bounds are milliseconds, so 0:5000 keeps the first 5 seconds
audio = AudioSegment.from_wav(cleaned_audio_path)
segment = audio[0:5000]

# Write the clip to disk for the recognisers in the following cells
segment.export(short_audio_path, format="wav")
print("Short segment saved as 'short_audio.wav'.")


 Extracting first 5 seconds of audio...
Short segment saved as 'short_audio.wav'.


In [9]:
import speech_recognition as sr

# Read the whole short clip into memory with a fresh recognizer
recognizer = sr.Recognizer()
with sr.AudioFile(short_audio_path) as source:
    short_audio = recognizer.record(source)

# Google Speech Recognition on the clip; "N/A" marks a failed transcription
try:
    segment_text_google = recognizer.recognize_google(short_audio)
except sr.UnknownValueError:
    print("Google Speech API could not understand the short segment.")
    segment_text_google = "N/A"
except sr.RequestError:
    print(" Google Speech API request failed.")
    segment_text_google = "N/A"
else:
    print("Google Speech Recognition (First 5 Secs):\n", segment_text_google)

Google Speech Recognition (First 5 Secs):
 many animals of even complex structure which live parasitically within other


In [10]:
import whisper

# Transcribe the short clip with the "base" Whisper checkpoint
print("Running Whisper on short segment...")
whisper_model = whisper.load_model("base")

# Whisper reads `short_audio.wav` straight from disk; keep only the text field
short_segment_result = whisper_model.transcribe(short_audio_path, fp16=False)
whisper_text_segment = short_segment_result["text"]
print("OpenAI Whisper (First 5 Secs):\n", whisper_text_segment)

Running Whisper on short segment...
OpenAI Whisper (First 5 Secs):
  many animals of even complex structure which live parasitically within others.


In [11]:
# Side-by-side transcripts of the short clip from both engines
benchmark_lines = (
    "\n🔍 ASR Model Benchmarking:",
    f"Google Speech Recognition Output:\n{segment_text_google}",
    f"OpenAI Whisper Output:\n{whisper_text_segment}",
)
print(*benchmark_lines, sep="\n")


🔍 ASR Model Benchmarking:
Google Speech Recognition Output:
many animals of even complex structure which live parasitically within other
OpenAI Whisper Output:
 many animals of even complex structure which live parasitically within others.


In [12]:
import re

# Identify Keywords in the Transcription
keywords = ["Animals", "complex", "parasitically"]  # Customize keywords

# Match whole words only, case-insensitively. A plain substring test would
# report false positives (e.g. the keyword "other" inside "others").
# Look-arounds are used instead of \b so keywords that start or end with a
# non-word character still match correctly.
found_keywords = [
    word
    for word in keywords
    if re.search(
        rf"(?<!\w){re.escape(word)}(?!\w)",
        whisper_text_segment,
        flags=re.IGNORECASE,
    )
]

if found_keywords:
    print("\nKeywords Found in Speech:")
    for word in found_keywords:
        print(f"   - {word}")
else:
    print("\nNo target keywords found in speech.")

print("ASRcomparison & keyword extraction completed!")


Keywords Found in Speech:
   - Animals
   - complex
   - parasitically
ASRcomparison & keyword extraction completed!


In [13]:
import speech_recognition as sr
import whisper

# Fresh recognizer, then read the whole short clip into memory
recognizer = sr.Recognizer()
with sr.AudioFile("short_audio.wav") as source:
    short_audio = recognizer.record(source)

# **Google Speech API** — "N/A" marks a failed transcription
try:
    google_text = recognizer.recognize_google(short_audio)
except sr.UnknownValueError:
    print(" Google Speech API could not understand the short segment.")
    google_text = "N/A"
except sr.RequestError:
    print(" Google Speech API request failed.")
    google_text = "N/A"
else:
    print(" Google Speech Recognition Output:\n", google_text)

# **CMU Sphinx (Offline ASR)**
# Sphinx runs locally, so a RequestError here points at the local PocketSphinx
# installation or its language data rather than at a failed network request.
# "N/A" marks a failed transcription.
try:
    sphinx_text = recognizer.recognize_sphinx(short_audio)
    print(" CMU Sphinx Speech Recognition Output:\n", sphinx_text)
except sr.UnknownValueError:
    sphinx_text = "N/A"
    print(" CMU Sphinx could not understand the short segment.")
except sr.RequestError as err:
    sphinx_text = "N/A"
    # Report the real cause instead of a misleading "API request" message.
    print(f" CMU Sphinx is unavailable (check the pocketsphinx installation): {err}")

# **OpenAI Whisper (Deep Learning ASR)** — "base" checkpoint on the short clip
print(" Running Whisper on short segment...")
whisper_model = whisper.load_model("base")

# Whisper reads `short_audio.wav` straight from disk; keep only the text field
short_clip_result = whisper_model.transcribe("short_audio.wav", fp16=False)
whisper_text = short_clip_result["text"]
print(" OpenAI Whisper Output:\n", whisper_text)


 Google Speech Recognition Output:
 many animals of even complex structure which live parasitically within other
 CMU Sphinx Speech Recognition Output:
 any animals a huge complex structure which would heart sick we begin on
 Running Whisper on short segment...
 OpenAI Whisper Output:
  many animals of even complex structure which live parasitically within others.


In [14]:
import os
# Set the flag before the Hugging Face libraries are imported in the next cell;
# judging by its name it silences the Hub's symlink warning.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [15]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio

# Load pre-trained Wav2Vec2 model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
model.eval()  # inference only: make sure dropout-style layers are disabled

# Load and resample audio to 16kHz (required for Wav2Vec2)
waveform, sample_rate = torchaudio.load("short_audio.wav")
waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(waveform)

# torchaudio returns (channels, time). Averaging over the channel axis yields
# a 1-D mono signal for any channel count; `squeeze()` only did so for mono
# files and would have passed a 2-D array for stereo input.
mono_waveform = waveform.mean(dim=0)

# Process and predict transcription. No gradients are needed for inference,
# so skip building the autograd graph to save memory and time.
with torch.no_grad():
    input_values = processor(
        mono_waveform.numpy(), return_tensors="pt", sampling_rate=16000
    ).input_values
    logits = model(input_values).logits

# Greedy CTC decoding: most likely token per frame, then collapse to text
predicted_ids = torch.argmax(logits, dim=-1)
wav2vec_text = processor.batch_decode(predicted_ids)[0]

print(" Wav2Vec2 Speech Recognition Output:\n", wav2vec_text)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


 Wav2Vec2 Speech Recognition Output:
 MANY ANIMALS OF EVEN COMPLEX STRUCTURE WHICH LIVE PARASITICALLY WITHIN OTHE
