In [None]:
!pip install librosa datasets pyctcdecode

### **Imports**

In [38]:
import os
import torch
import librosa
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# --- Settings a reader may want to tune ---
LANG_ID = "en"  # language tag; not referenced by the code visible in this cell
MODEL_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-english"  # checkpoint passed to from_pretrained
SAMPLES = 10  # upper bound on how many audio files are transcribed

# --- Pretrained components ---
# The CTC acoustic model and its matching processor are both instantiated
# from the same MODEL_ID so they stay in sync.
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

def speech_file_to_array_fn(file_path, sampling_rate=16_000):
    """Load an audio file as a waveform resampled to ``sampling_rate`` Hz.

    Args:
        file_path: Path of the audio file, handed to ``librosa.load`` as-is.
        sampling_rate: Target sample rate in Hz. Defaults to 16 000, the
            rate this notebook passes to the processor; keep the two in
            sync if you change it.

    Returns:
        The waveform returned by ``librosa.load`` (first element of its
        ``(audio, sample_rate)`` result).
    """
    # librosa.load returns (waveform, rate); the rate simply echoes the
    # requested `sr`, so it is discarded.
    speech_array, _ = librosa.load(file_path, sr=sampling_rate)
    return speech_array

# Audio files to transcribe. Paths are listed explicitly (nothing is discovered
# from disk); each one is decoded by speech_file_to_array_fn.
audio_files = ["./b1.mp3"]

# Transcribe at most SAMPLES files, one file per iteration.
for audio_file in audio_files[:SAMPLES]:
    # Decode the file into a waveform (the loader resamples to 16 kHz).
    speech = speech_file_to_array_fn(audio_file)

    # Convert the raw waveform into model-ready tensors. The rate given here
    # must match the rate the audio was loaded at.
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)

    # Inference only, so skip gradient tracking.
    # Unpack the processor output instead of reading `inputs.attention_mask`
    # directly: some checkpoints' processors do not return an attention mask,
    # and attribute access would raise for them if MODEL_ID is swapped.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Greedy decoding: take the highest-scoring token id at every frame and let
    # the processor turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_sentence = processor.batch_decode(predicted_ids)[0]  # batch holds a single file

    # Report the result for this file.
    print("-" * 100)
    print("Audio File:", audio_file)
    print("Prediction:", predicted_sentence)
