In [2]:
import torch
import sounddevice as sd
import numpy as np
import torchaudio
import soundfile as sf
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load fine-tuned model and processor (Nepali example)
model_name = "prajin/wav2vec2-large-xlsr-300m-nepali"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)
model.eval()

# Constants
SAMPLE_RATE = 16000  # Required by Wav2Vec2
DURATION = 5  # Seconds of mic input per chunk

print("🎤 Ready to record. Speak into the mic...")

while True:
    print("\n🔴 Recording...")
    audio = sd.rec(int(DURATION * SAMPLE_RATE), samplerate=SAMPLE_RATE, channels=1, dtype='float32')
    sd.wait()
    print("✅ Recording complete.")

    # Convert audio to tensor
    audio_tensor = torch.tensor(audio.squeeze())

    # Preprocess
    inputs = processor(audio_tensor, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)

    # Inference
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    print("📝 Transcription:", transcription)

Some weights of the model checkpoint at prajin/wav2vec2-large-xlsr-300m-nepali were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at prajin/wav2vec2-large-xlsr-300m-nepali and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should prob

🎤 Ready to record. Speak into the mic...

🔴 Recording...
✅ Recording complete.
📝 Transcription: लमेरो नाम अभिषे

🔴 Recording...
✅ Recording complete.
📝 Transcription: मेरो ना

🔴 Recording...
✅ Recording complete.
📝 Transcription: मेरो नाम अभिषेक हो

🔴 Recording...
✅ Recording complete.
📝 Transcription: वा लास्टिक मा म्यान

🔴 Recording...


KeyboardInterrupt: 