Demo File for Speech Recognition and Speech Synthesis use Cases 

In [2]:
# Requirements
!pip -q install -U openai-whisper
!pip -q install wavio
!pip -q install scipy
!pip -q install sounddevice
!pip -q install pyaudio

Introducing Whisper for non-real time transcriptions

In [None]:
import whisper
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv

# Hi this is a sample recording of my voice and to see how well the speech recognition is working.

# Sampling frequency
freq = 16000
 
# Recording duration
duration = 5
 
# Start recorder with the given values 
# of duration and sample frequency
recording = sd.rec(int(duration * freq), 
                   samplerate=freq, channels=2)

print('Recording Audio...') 
# Record audio for the given number of seconds
sd.wait()
 
# This will convert the NumPy array to an audio
# file with the given sampling frequency
write("recording0.wav", freq, recording)

# Use internal transcription definition (Quick Way)
model = whisper.load_model("base")
result = model.transcribe("recording0.wav")
print(result["text"])


# load audio and pad/trim it to fit 30 seconds
audio = whisper.load_audio("recording0.wav")

audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# decode the audio
options = whisper.DecodingOptions(
    task = "transcribe",

    # language that the audio is in; uses detected language if None
    language= max(probs, key=probs.get),

    # sampling-related options
    temperature = 0.0,
    sample_len = None,  # maximum number of tokens to sample
    best_of = None,  # number of independent sample trajectories, if t > 0
    beam_size = None,  # number of beams in beam search, if t == 0
    patience = None,  # patience in beam search (arxiv:2204.05424)

    # text or tokens to feed as the prompt or the prefix; for more info:
    # https://github.com/openai/whisper/discussions/117#discussioncomment-3727051
    prompt = None,  # for the previous context
    prefix = None,  # to prefix the current context

    # list of tokens ids (or comma-separated token ids) to suppress
    # "-1" will suppress a set of symbols as defined in `tokenizer.non_speech_tokens()`
    suppress_tokens = "-1",
    suppress_blank = True,  # this will suppress blank outputs

    # timestamp sampling options
    without_timestamps = False,  # use <|notimestamps|> to sample text tokens only
    max_initial_timestamp = 1.0,

    # implementation details
    fp16 = False,  # use fp16 for most of the calculation
)
                                  
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)

Introducing Emformer for real-time transcriptions

In [None]:
#Requirements 
!pip -q install torch
!pip -q install torchaudio
!pip -q install SentencePiece

# Might need to conda install ffmpeg

In [2]:
import torch
import torchaudio 

class ContextCacher:
    """Cache the end of input data and prepend the next input data with it.

    Args:
        segment_length (int): The size of main segment.
            If the incoming segment is shorter, then the segment is padded.
        context_length (int): The size of the context, cached and appended.
    """

    def __init__(self, segment_length: int, context_length: int):
        self.segment_length = segment_length
        self.context_length = context_length
        self.context = torch.zeros([context_length])

    def __call__(self, chunk: torch.Tensor):
        if chunk.size(0) < self.segment_length:
            chunk = torch.nn.functional.pad(chunk, (0, self.segment_length - chunk.size(0)))
        chunk_with_context = torch.cat((self.context, chunk))
        self.context = chunk[-self.context_length :]
        return chunk_with_context
    

In [None]:
from torchaudio.io import StreamReader

src = "recording0.wav"

# Get pipeline
bundle = torchaudio.pipelines.EMFORMER_RNNT_BASE_LIBRISPEECH

feature_extractor = bundle.get_streaming_feature_extractor()
decoder = bundle.get_decoder()
token_processor = bundle.get_token_processor()

sample_rate = bundle.sample_rate
segment_length = bundle.segment_length * bundle.hop_length
context_length = bundle.right_context_length * bundle.hop_length

# Stream Audio File
streamer = StreamReader(src)
streamer.add_basic_audio_stream(frames_per_chunk=segment_length, sample_rate=bundle.sample_rate)

state, hypothesis = None, None    
cacher = ContextCacher(segment_length, context_length)

stream_iterator = streamer.stream()

# Run speech recognition
@torch.inference_mode()
def run_inference(num_iter=1000):
    global state, hypothesis
    chunks = []
    feats = []
    for i, (chunk,) in enumerate(stream_iterator, start=1):
        segment = cacher(chunk[:, 0])
        features, length = feature_extractor(segment)
        hypos, state = decoder.infer(features, length, 10, state=state, hypothesis=hypothesis)
        hypothesis = hypos
        transcript = token_processor(hypos[0][0], lstrip=False)
        print(transcript, end="\r", flush=True)

        chunks.append(chunk)
        feats.append(features)
        if i == num_iter:
            break

run_inference()

Speech Synthesis Tacotron2 and HifiGan 

More Links to check out:
https://google.github.io/tacotron/publications/tacotron2/index.html
https://github.com/suno-ai/bark
https://github.com/coqui-ai/TTS

Speech Brain
https://github.com/speechbrain/speechbrain/

In [None]:
!pip3 -q install deep_phonemizer

In [4]:
import torch
import torchaudio
import IPython
import matplotlib.pyplot as plt

In [5]:
bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

processor = bundle.get_text_processor()
tacotron2 = bundle.get_tacotron2().to(device)
vocoder = bundle.get_vocoder().to(device)

torch.random.manual_seed(0)
device = "cuda" if torch.cuda.is_available() else "cpu"

symbols = "_-!'(),.:;? abcdefghijklmnopqrstuvwxyz"
look_up = {s: i for i, s in enumerate(symbols)}
symbols = set(symbols)

def text_to_sequence(text):
    text = text.lower()
    return [look_up[s] for s in text if s in symbols]

text = "This is a response text and to test how well tacotron is working."
print(text_to_sequence(text))
print([processor.tokens[i] for i in processed[0, : lengths[0]]])

with torch.inference_mode():
    processed, lengths = processor(text)
    processed = processed.to(device)
    lengths = lengths.to(device)
    spec, spec_lengths, _ = tacotron2.infer(processed, lengths)
    waveforms, lengths = vocoder(spec, spec_lengths)

plot(waveforms, spec, 22050)



NameError: name 'device' is not defined