# In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Build the tokenizer and the CTC model from one Hugging Face checkpoint name
# so the two can never drift apart.
# NOTE(review): Wav2Vec2Tokenizer is believed to be deprecated in newer
# transformers releases in favour of Wav2Vec2Processor — confirm against the
# installed version before upgrading.
_CHECKPOINT = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)

# Load the audio file (resampling is handled separately in preprocess_audio)
def load_audio(file_path):
    """Read an audio file with ``torchaudio.load``.

    Args:
        file_path: Location of the audio file, passed straight through to
            ``torchaudio.load``.

    Returns:
        A ``(waveform, sample_rate)`` pair exactly as produced by
        ``torchaudio.load``.
    """
    audio, rate = torchaudio.load(file_path)
    return audio, rate

# Preprocess the waveform
def preprocess_audio(
    waveform: torch.Tensor,
    sample_rate: int,
    target_sample_rate: int = 16000,
) -> torch.Tensor:
    """Resample ``waveform`` to ``target_sample_rate`` when the rates differ.

    Args:
        waveform: Audio tensor to be fed to ``torchaudio.transforms.Resample``.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        target_sample_rate: Desired sampling rate in Hz. Defaults to 16000,
            the value this function previously hard-coded, so existing
            two-argument callers behave exactly as before.

    Returns:
        The very same ``waveform`` object when the rates already match
        (no copy is made); otherwise the resampled tensor.
    """
    # Nothing to do when the audio is already at the requested rate.
    if sample_rate == target_sample_rate:
        return waveform

    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate,
        new_freq=target_sample_rate,
    )
    return resampler(waveform)

# Convert audio to text using the Wav2Vec2.0 model
def speech_to_text(wav_file):
    """Transcribe an audio file with the module-level Wav2Vec2 model.

    The file is loaded with ``load_audio``, passed through
    ``preprocess_audio`` (which resamples to 16 kHz when needed), mixed down
    to a single channel, run through ``model`` without gradient tracking, and
    decoded greedily (arg-max over the last logits axis) with ``tokenizer``.

    Args:
        wav_file: Path of the audio file, forwarded to ``load_audio``.

    Returns:
        The result of ``tokenizer.decode`` for the first (only) sequence,
        i.e. the transcription text.
    """
    waveform, sample_rate = load_audio(wav_file)
    waveform = preprocess_audio(waveform, sample_rate)

    # Average over the leading axis instead of calling ``squeeze()``:
    # ``squeeze()`` only drops the channel axis for mono files, so a stereo
    # recording would reach the tokenizer as a 2-row array and only row 0
    # would be decoded below. For a single-channel tensor the mean is the
    # same data with the axis removed.
    # Assumes a (channels, frames) layout, torchaudio.load's default —
    # TODO confirm if load_audio is ever changed.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Tokenize and feed to model
    inputs = tokenizer(waveform.numpy(), return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy decoding: most likely token id per frame, then let the tokenizer
    # turn the id sequence into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    return tokenizer.decode(predicted_ids[0])

# Example run: replace the placeholder below with the path to a real WAV file
# before executing, then the transcription is printed to stdout.
wav_file = 'your_audio_file.wav'
transcription = speech_to_text(wav_file=wav_file)
print(f"Transcription: {transcription}")