In [2]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import sounddevice as sd
import numpy as np
from queue import Queue
import threading

# Pretrained Wav2Vec2 CTC checkpoint; the processor prepares model inputs
# from raw audio and decodes predicted token ids back to text.
model_name = "ulrichING/speech_to_text_wave2vect2_english"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Capture settings: sample rate in Hz and frames per callback block
RATE = 16000
CHUNK = RATE // 10  # 1600 frames, i.e. 100 ms per block

# Hand-off buffer between the audio callback and the processing thread
q = Queue()

# Callback function to process audio data
def audio_callback(indata, frames, time, status):
    """Queue a copy of the incoming audio block for later processing.

    Used as the ``callback`` of the input stream. Any truthy ``status``
    is printed before the block is queued; ``frames`` and ``time`` are
    accepted to satisfy the callback signature but are not used.
    """
    if status:
        print(status)
    # Copy before queueing so the consumer owns its own array
    # (presumably the stream reuses ``indata`` between calls -- confirm
    # against the sounddevice docs).
    block = indata.copy()
    q.put(block)

# Function to perform transcription
def transcribe_audio_chunk(audio_chunk):
    """Return the text the model predicts for one chunk of audio.

    The chunk is converted to model inputs by the processor (at sampling
    rate ``RATE``), run through the model without gradient tracking, and
    the highest-scoring token id at each position is decoded to a string.

    Args:
        audio_chunk: Audio samples to transcribe, as accepted by the
            processor (the caller in this file passes a 1-D NumPy array).

    Returns:
        The decoded transcription of the first (only) batch entry.
    """
    features = processor(audio_chunk, sampling_rate=RATE, return_tensors="pt")

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        output = model(features.input_values)

    # Greedy decoding: pick the best-scoring token id per position
    token_ids = output.logits.argmax(dim=-1)
    decoded = processor.batch_decode(token_ids)
    return decoded[0]

# Function to continuously process audio data
def process_audio(window_seconds=1.0):
    """Transcribe queued audio in fixed-length windows, forever.

    Blocks on the shared queue until enough blocks for one window have
    arrived, joins them into a single 1-D array, passes it to
    ``transcribe_audio_chunk`` and prints any non-blank result. This
    function never returns; it is meant to run on a background thread.

    Args:
        window_seconds: Seconds of audio handed to the model per
            transcription. The default of 1.0 reproduces the original
            behaviour of 10 blocks of 100 ms each.
    """
    # Derive the block count from the stream settings instead of
    # hard-coding 10, so the window length stays correct if RATE or CHUNK
    # change. Always consume at least one block per window.
    blocks_per_window = max(1, round(window_seconds * RATE / CHUNK))

    while True:
        blocks = [q.get() for _ in range(blocks_per_window)]

        # Queued blocks are 2-D arrays; join them and collapse to 1-D
        samples = np.concatenate(blocks).flatten()

        transcription = transcribe_audio_chunk(samples)

        if transcription.strip():  # Only print non-empty transcriptions
            print("Transcription:", transcription)

# Run the transcription loop on a daemon thread: process_audio() never
# returns, so a non-daemon thread would keep the interpreter alive after
# Ctrl+C or after a failure to open the audio device.
processing_thread = threading.Thread(target=process_audio, daemon=True)

# Start the audio stream
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=RATE, blocksize=CHUNK):
        # Start the worker only once the stream has opened successfully, so
        # a device error does not leave a thread blocked on an empty queue.
        processing_thread.start()
        print("* Recording. Press Ctrl+C to stop.")
        # Block the main thread until interrupted; the worker never finishes
        processing_thread.join()
except KeyboardInterrupt:
    print("* Stopping")

preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/320 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at ulrichING/speech_to_text_wave2vect2_english were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at ulrichING/speech_to_text_wave2vect2_english and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
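You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.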

||PaMacCore (AUHAL)|| AUHAL component not found.

PortAudioError: Error querying host API -9979