# Speech Recognition & Speech to Text

In [1]:
!pip install SpeechRecognition



In [2]:
import librosa
import speech_recognition as sr

# Report which versions of the audio libraries are in use.
print("Librosa version:", librosa.__version__)
print("SpeechRecognition version:", sr.__version__)

Librosa version: 0.10.2.post1
SpeechRecognition version: 3.14.1


In [3]:
# Mount Google Drive at /content/drive so the notebook can read files stored
# under MyDrive (Colab-only helper).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# List the NLP folder on the mounted Drive to confirm sample.wav is there.
# The path is relative -- presumably the working directory is /content.
!ls drive/MyDrive/NLP

nlp-basics.ipynb  sample.wav  speech-recognition.ipynb


In [5]:
import pathlib

# Location of the sample clip on the mounted Drive (relative to the working
# directory).
audio_file = pathlib.Path("drive/MyDrive/NLP/sample.wav")

# File size in MiB (bytes / 1024 / 1024). The ~0.841 MiB shown below is
# roughly 861 KiB, not 841 KB.
audio_file.stat().st_size / 1024 / 1024

0.8411827087402344

## Loading an audio file into a numpy array (using Librosa)

In [6]:
# Decode the clip into a waveform array plus the sampling rate it was loaded at.
raw_data, sampling_rate = librosa.load(audio_file)

# Show the sampling rate, the container type and the array shape on one line.
print(f"{sampling_rate} {type(raw_data)} {raw_data.shape}")

22050 <class 'numpy.ndarray'> (110250,)


In [7]:
import IPython.display as ipd

# Inline player for the loaded waveform; the clip says "Merry Christmas".
ipd.Audio(data=raw_data, rate=sampling_rate)

In [8]:
# Recognizer instance used by the transcription cell below to capture the
# audio and to call the recognition engine.
audio_rec = sr.Recognizer()

### Using Google's speech recognition engine

> This API doesn't require a key, so it comes with no well-defined SLA; for production use, prefer the Google Cloud Speech-to-Text API.

In [9]:
# Transcribe the clip with Google's free Web Speech API.
with sr.AudioFile(str(audio_file.absolute())) as source:
  # record() reads the file through to its end. listen() is designed for live
  # sources and stops at the first pause it detects, which can cut a
  # recording short.
  speech = audio_rec.record(source)

# The audio is already in memory, so the network call runs after the file has
# been closed.
try:
  text = audio_rec.recognize_google(speech)
except sr.UnknownValueError as e:
  # The service answered but could not make out any words.
  print("Unable to understand the audio")
  print(e)
except sr.RequestError as e:
  # Network failure, or the API rejected the request.
  print("Could not reach the speech recognition service")
  print(e)
else:
  print("Transcript: ")
  print(text)


Transcript: 
Merry Christmas


## OpenAI's Whisper (a Transformer-based model)

> It's a sequence-to-sequence model: the audio is decoded with ffmpeg, and the text side uses a BPE (byte-pair encoding) tokenizer.

In [10]:
!pip install -U openai-whisper



In [11]:
import whisper

# Report the installed Whisper release.
print("Whisper version:", whisper.__version__)

Whisper version: 20240930


In [12]:
# Load Whisper's smallest checkpoint ("tiny", roughly 72 MB).
tiny_model = whisper.load_model("tiny")

  checkpoint = torch.load(fp, map_location=device)


In [13]:
# Transcribe the clip with the tiny checkpoint; inference takes about 3 s for
# this 5-second recording.
result = tiny_model.transcribe(
    str(audio_file.absolute())
)

# The full transcript is stored under the "text" key.
print(result["text"])



 Merry Christmas.


### Turbo model has high accuracy but needs more resources :(

In [14]:
# Load the larger "turbo" checkpoint (roughly 1.5 GB).
model = whisper.load_model("turbo")

  checkpoint = torch.load(fp, map_location=device)


In [16]:
# Transcribe the same clip with the turbo checkpoint; inference takes about a
# minute here. Note that this overwrites `result` from the tiny-model run.
result = model.transcribe(
    str(audio_file.absolute())
)
print(result["text"])



 Merry Christmas.
