In [1]:
import pyaudio
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import numpy as np

# Load Wav2Vec2 processor and model

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load the pretrained Wav2Vec2 processor and CTC model used for transcription.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# PyAudio setup for real-time audio capture
CHUNK = 1024  # Number of audio frames per buffer
RATE = 16000  # Sampling rate (Wav2Vec2 expects 16kHz audio)

# A single 1024-frame chunk is only 64 ms of audio, far too short for the
# model to emit meaningful text. Accumulate several chunks into a window and
# transcribe the window as a whole.
WINDOW_SECONDS = 2.0
CHUNKS_PER_WINDOW = max(1, int(RATE * WINDOW_SECONDS / CHUNK))

p = pyaudio.PyAudio()
stream = None  # Assigned inside the try so `finally` can tell whether it opened.

# Real-time transcription
try:
    # Open a stream for microphone input. This raises OSError when no input
    # device is available; the `finally` below still releases PortAudio.
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
    )

    print("Recording and transcribing...")

    while True:
        # Read one window of audio from the microphone. Inference below blocks
        # the loop, so the input buffer can overflow; drop those frames rather
        # than aborting the whole session.
        frames = [
            stream.read(CHUNK, exception_on_overflow=False)
            for _ in range(CHUNKS_PER_WINDOW)
        ]

        # Convert 16-bit PCM to a float32 waveform in [-1.0, 1.0), which is
        # the representation the processor is documented to take.
        audio_data = (
            np.frombuffer(b"".join(frames), dtype=np.int16).astype(np.float32)
            / 32768.0
        )

        # Process the audio window through the processor
        input_values = processor(
            audio_data, return_tensors="pt", sampling_rate=RATE
        ).input_values

        # Perform inference on the model (no gradients needed)
        with torch.no_grad():
            logits = model(input_values).logits

        # Decode the predicted token IDs to text
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)

        # Print the real-time transcription
        print(f"Transcription: {transcription[0]}")

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop; cleanup is in `finally`.
    pass
finally:
    # Always release audio resources, including when opening the stream or
    # running inference raised something other than KeyboardInterrupt.
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()

print("Real-time transcription stopped.")


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

OSError: [Errno -9996] Invalid input device (no default output device)