In [2]:
import sounddevice as sd
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio

# Pick the compute device: CUDA when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the processor and the model from the same checkpoint, then move the
# model's weights onto the selected device.
MODEL_NAME = "openai/whisper-medium"
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(device)

# Clear the decoder prompt ids stored on the model config (presumably so the
# config does not pin language/task tokens during generation).
model.config.forced_decoder_ids = None

# Function to record audio from the microphone
def record_audio(duration: float, sampling_rate: int) -> np.ndarray:
    """Record mono audio from the default input device and return it as 1-D.

    Blocks until the recording has finished.

    Args:
        duration: Length of the recording in seconds.
        sampling_rate: Capture rate in samples per second.

    Returns:
        A one-dimensional float32 array holding
        ``int(duration * sampling_rate)`` samples.
    """
    frame_count = int(duration * sampling_rate)
    print("Recording started...")
    audio = sd.rec(frame_count, samplerate=sampling_rate, channels=1, dtype="float32")
    sd.wait()  # Wait until recording is finished
    print("Recording finished.")
    # The buffer is requested with one channel, so flatten the channel axis
    # away. reshape(-1) always yields a 1-D array, whereas squeeze() would
    # collapse a single-frame recording into a 0-d scalar array.
    return audio.reshape(-1)

# Capture settings. The same sampling rate is used for recording and for
# feature extraction so the processor sees the audio at the rate it was
# captured (Whisper checkpoints work on 16 kHz audio).
SAMPLING_RATE = 16000
RECORD_SECONDS = 5

# Record audio from the microphone
audio_data = record_audio(duration=RECORD_SECONDS, sampling_rate=SAMPLING_RATE)

# Preprocess the audio to extract features. No language is passed here: the
# processor only computes input features for audio and has no language option,
# so the language is selected at generation time below.
inputs = processor(audio_data, return_tensors="pt", sampling_rate=SAMPLING_RATE)
# Move every tensor produced by the processor to the model's device.
inputs = {key: value.to(device) for key, value in inputs.items()}

# Use the model's generate method to produce predictions. Gradients are not
# needed for inference; language/task pin the decoder to English transcription
# instead of relying on automatic language detection.
with torch.no_grad():
    predicted_ids = model.generate(
        inputs["input_features"],
        language="en",
        task="transcribe",
    )

# Decode predicted token ids to text (one string per batch item)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print("Transcription:", transcription)


Using device: cuda


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Recording started...
Recording finished.
Transcription: [' What are the courses offered in Janakpur Engineering College?']
