In [9]:
# Use a pipeline as a high-level helper
%load_ext autoreload
%autoreload 2
from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
from jiwer import wer
from tqdm import tqdm
import torchaudio
import torch
import csv
import time

# from datasets import load_dataset
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")
# Path to the metadata CSV file
metadata_path = "/home/richard/workspace/data/LJSpeech-1.1/validation.txt"

# Lists to store predictions and ground truths
predictions = []
ground_truths = []
# Timer to calculate processing time
start_time = time.time()
# Process each line in the metadata CSV
with open(metadata_path, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='|')
    for row in tqdm(reader, desc="Processing audio files"):
        wav_path = f"/home/richard/workspace/data/LJSpeech-1.1/wavs/{row[0]}.wav"
        gt_path = f"/home/richard/workspace/data/LJSpeech-1.1/wavs/{row[0]}.normalized.txt"
        with open(gt_path, 'r') as gt_file:
            ground_truth = gt_file.read().strip()

        # Load the audio file
        waveform, sample_rate = torchaudio.load(wav_path)

        # Preprocess the audio
        inputs = processor(audio=waveform.squeeze(), sampling_rate=sample_rate, return_tensors="pt", padding=True)

        # Perform inference
        with torch.no_grad():
            predicted_ids = model.generate(**inputs, max_length=100)
            # logits = model(**inputs).logits

        # Decode the predicted tokens
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        # print(f"Predicted: {transcription}")
        # print(f"Ground Truth: {ground_truth}")
        # Store predictions and ground truths
        predictions.append(transcription)
        ground_truths.append(ground_truth)

        # Report total processing time
end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")
# Calculate Word Error Rate
error_rate = wer(ground_truths, predictions)
print(f"Microsoft Word Error Rate (WER): {error_rate}")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Some weights of SpeechT5ForSpeechToText were not initialized from the model checkpoint at microsoft/speecht5_asr and are newly initialized: ['speecht5.encoder.prenet.pos_sinusoidal_embed.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Processing audio files: 150it [01:30,  1.65it/s]

Total processing time: 90.91 seconds
Microsoft Word Error Rate (WER): 0.6189697465249386





### Open AI Whisper

In [17]:
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
torch.set_float32_matmul_precision("high")

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3-turbo"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
).to(device)
# Enable static cache and compile the forward pass
model.generation_config.cache_implementation = "static"
model.generation_config.max_new_tokens = 256
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

processor = AutoProcessor.from_pretrained(model_id)
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

Device set to use cuda:0


In [15]:
predictions = []
ground_truths = []
# Timer to calculate processing time
start_time = time.time()
with open(metadata_path, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='|')
    for row in tqdm(reader, desc="Processing audio files"):
        wav_path = f"/home/richard/workspace/data/LJSpeech-1.1/wavs/{row[0]}.wav"
        gt_path = f"/home/richard/workspace/data/LJSpeech-1.1/wavs/{row[0]}.normalized.txt"
        with open(gt_path, 'r') as gt_file:
            ground_truth = gt_file.read().strip()

        # Load the audio file
        waveform, sample_rate = torchaudio.load(wav_path)
        # 2 warmup steps
        # for _ in tqdm(range(2), desc="Warm-up step"):
        #     with sdpa_kernel(SDPBackend.MATH):
        #         result = pipe(waveform.squeeze().numpy(), generate_kwargs={"min_new_tokens": 256, "max_new_tokens": 256})
        # break

        # fast run
        with sdpa_kernel(SDPBackend.MATH):
            result = pipe(waveform.squeeze().numpy())
            
        predictions.append(result["text"])
        ground_truths.append(ground_truth)

        # Report total processing time
end_time = time.time()
total_time = end_time - start_time
print(f"Total processing time: {total_time:.2f} seconds")
# Calculate Word Error Rate
error_rate = wer(ground_truths, predictions)
print(f"Open AI Whisper Large Turbo Word Error Rate (WER): {error_rate}")

Processing audio files: 0it [00:00, ?it/s]

Processing audio files: 150it [00:42,  3.51it/s]

Total processing time: 42.75 seconds
Open AI Whisper Large Turbo Word Error Rate (WER): 0.07808667211774326





### Google

In [None]:
from google.cloud import speech
import io

def transcribe_speech(audio_path):
    client = speech.SpeechClient()

    with io.open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,  # Adjust if your file is not LINEAR16
        sample_rate_hertz=16000,  # Adjust to match your audio file
        language_code="en-US",
    )

    response = client.recognize(config=config, audio=audio)

    for result in response.results:
        print("Transcript:", result.alternatives[0].transcript)

with open(metadata_path, 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter='|')
    for row in tqdm(reader, desc="Processing audio files"):
        wav_path = f"/home/richard/workspace/data/LJSpeech-1.1/wavs/{row[0]}.wav"
        gt_path = f"/home/richard/workspace/data/LJSpeech-1.1/wavs/{row[0]}.normalized.txt"
        with open(gt_path, 'r') as gt_file:
            ground_truth = gt_file.read().strip()

        # Load the audio file
        # waveform, sample_rate = torchaudio.load(wav_path)
        output = transcribe_speech(wav_path) 
        print(f"Predicted: {output}")

Processing audio files: 0it [00:11, ?it/s]


DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.