In [1]:
pip install datasets transformers torchaudio librosa

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_metric
from jiwer import wer

# Set paths to the datasets
dataset_dir = os.path.expanduser("~/Desktop/dataset/LibriSpeech")
test_clean_dir = os.path.join(dataset_dir, "test-clean")
test_other_dir = os.path.join(dataset_dir, "test-other")

# Function to preprocess audio files
def speech_file_to_array_fn(file_path):
    speech_array, sampling_rate = torchaudio.load(file_path)
    resampler = torchaudio.transforms.Resample(sampling_rate, 16000)
    speech = resampler(speech_array).squeeze().numpy()
    return speech

# Load pretrained Wav2Vec2 model and processor
model_name = "facebook/wav2vec2-large-960h-lv60-self"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.to("cuda" if torch.cuda.is_available() else "cpu")

# Function to transcribe audio using the model
def predict(speech):
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values.to(model.device)).logits
    pred_ids = torch.argmax(logits, dim=-1)
    predicted_text = processor.batch_decode(pred_ids)
    return predicted_text[0]

# Function to read transcripts from a file
def read_transcripts(transcript_file_path):
    with open(transcript_file_path, 'r') as f:
        references = [line.strip() for line in f]
    return references

# Function to process a directory with multiple audio files and a single transcript file
def process_audio_directory(audio_dir):
    audio_files = [os.path.join(audio_dir, file) for file in os.listdir(audio_dir) if file.endswith(".flac")]
    audio_files.sort()
    
    # Debugging: List all files in the directory
    print(f"Files in directory {audio_dir}: {os.listdir(audio_dir)}")
    
    transcript_file_path = next((os.path.join(audio_dir, file) for file in os.listdir(audio_dir) if file.endswith(".trans.txt")), None)
    if not transcript_file_path:
        raise ValueError(f"No transcript file found in directory: {audio_dir}")
    
    references = read_transcripts(transcript_file_path)
    if len(audio_files) != len(references):
        raise ValueError("The number of audio files and reference texts do not match.")

    predictions = []
    for file_path in audio_files:
        speech = speech_file_to_array_fn(file_path)
        predicted_text = predict(speech)
        predictions.append(predicted_text)

    # Debugging prints
    print(f"Number of files processed in {audio_dir}: {len(audio_files)}")
    print(f"Number of references: {len(references)}")
    print(f"Sample prediction: {predictions[0] if predictions else 'None'}")
    print(f"Sample reference: {references[0] if references else 'None'}")

    return predictions, references

# Function to process the entire dataset
def process_dataset(dataset_dir):
    predictions = []
    references = []
    for root, dirs, _ in os.walk(dataset_dir):
        for dir_name in dirs:
            dir_path = os.path.join(root, dir_name)
            try:
                pred, ref = process_audio_directory(dir_path)
                predictions.extend(pred)
                references.extend(ref)
            except ValueError as e:
                print(e)
                continue
    return predictions, references

# Process both datasets
predictions_clean, references_clean = process_dataset(test_clean_dir)
predictions_other, references_other = process_dataset(test_other_dir)

# Check if predictions and references are not empty
if not predictions_clean or not references_clean:
    raise ValueError("Predictions or references for test-clean are empty.")
if not predictions_other or not references_other:
    raise ValueError("Predictions or references for test-other are empty.")

# Load the CER metric
cer = load_metric("cer")

# Calculate CER for both datasets
results_clean = cer.compute(predictions=predictions_clean, references=references_clean)
results_other = cer.compute(predictions=predictions_other, references=references_other)

print(f"CER for test-clean: {results_clean:.4f}")
print(f"CER for test-other: {results_other:.4f}")

# Optionally calculate WER for better understanding
wer_clean = wer(references_clean, predictions_clean)
wer_other = wer(references_other, predictions_other)

print(f"WER for test-clean: {wer_clean:.4f}")
print(f"WER for test-other: {wer_other:.4f}")


Some weights of the model checkpoint at facebook/wav2vec2-large-960h-lv60-self were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.maske

Files in directory C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1089: ['134686', '134691']
No transcript file found in directory: C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1089
Files in directory C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1188: ['133604']
No transcript file found in directory: C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1188
Files in directory C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\121: ['121726', '123852', '123859', '127105']
No transcript file found in directory: C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\121
Files in directory C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1221: ['135766', '135767']
No transcript file found in directory: C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1221
Files in directory C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1284: ['1180', '1181', '134647']
No transcript file found in directory: C:\Users\ASUS/Desktop/dataset/LibriSpeech\test-clean\1284
Files 

  cer = load_metric("cer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


CER for test-clean: 0.1341
CER for test-other: 0.1610
WER for test-clean: 0.0650
WER for test-other: 0.0894
