Live Transcription with audio block size using sounddevice

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig
import requests
import sounddevice as sd
import torch
import numpy as np
import time

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
config_response = requests.get(config_url)

if config_response.status_code == 200:
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr", config=config)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

    
def get_transcription(audio_chunk):
    try:
        # Start timing
        start_time = time.time()

        # Process audio using the Whisper processor
        input_features = processor(audio_chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        # Generate transcription using the Whisper model
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Calculate time taken
        time_taken = time.time() - start_time
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return [""]



def audio_callback(indata, frames, time, status):
    if status:
        print(f"Status: {status}", flush=True)
    # Convert audio chunk to numpy array and flatten to 1D
    audio_chunk = indata[:, 0].flatten()

    if not np.any(audio_chunk):
        print("No audio captured in this chunk.")
        return
    # Transcribe the audio chunk
    transcription = get_transcription(audio_chunk)
    # Print the transcription
    print(transcription[0], flush=True)
    
# Parameters for audio stream
fs = 16000  # Sample rate
block_duration = 5  # Block size in seconds

# Create an audio stream
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, blocksize=int(fs * block_duration)):
        print("Recording... Speak into the microphone.")
        while True:
            # Keep the script running to continuously process audio input
            time.sleep(0.1)
except Exception as e:
    print(f"Error with audio stream: {e}")

Live Transcription with audio block size using pyaudio

In [None]:
import pyaudio
import numpy as np
import torch
import time
import requests
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
config_response = requests.get(config_url)

if config_response.status_code == 200:
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr", config=config)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_transcription(audio_chunk):
    try:
        # Start timing
        start_time = time.time()

        # Process audio using the Whisper processor
        input_features = processor(audio_chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        # Generate transcription using the Whisper model
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Calculate time taken
        time_taken = time.time() - start_time
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return [""]

def audio_callback(in_data, frame_count, time_info, status):
    # Convert audio chunk to numpy array and flatten to 1D
    audio_chunk = np.frombuffer(in_data, dtype=np.int16).flatten()

    if not np.any(audio_chunk):
        print("No audio captured in this chunk.")
        return (None, pyaudio.paContinue)
    
    # Transcribe the audio chunk
    transcription = get_transcription(audio_chunk)
    # Print the transcription
    print(transcription[0], flush=True)
    
    return (None, pyaudio.paContinue)

# Parameters for audio stream
fs = 16000  # Sample rate
block_duration = 5  # Block size in seconds
chunk_size = int(fs * block_duration)

# Create a PyAudio instance
p = pyaudio.PyAudio()

# Open an audio stream
try:
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=fs,
                    input=True,
                    frames_per_buffer=chunk_size,
                    stream_callback=audio_callback)
    
    print("Recording... Speak into the microphone.")
    stream.start_stream()
    
    while stream.is_active():
        # Keep the script running to continuously process audio input
        time.sleep(0.1)

except Exception as e:
    print(f"Error with audio stream: {e}")

finally:
    # Stop and close the stream
    if 'stream' in locals():
        stream.stop_stream()
        stream.close()
    p.terminate()

Live Transcription with pause ditection using sounddevice

In [5]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig
import requests
import sounddevice as sd
import torch
import numpy as np
import time

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
config_response = requests.get(config_url)

if config_response.status_code == 200:
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr", config=config)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Parameters
fs = 16000  # Sample rate
block_duration = 2  # Block size in seconds
silence_threshold = 0.005  # Energy threshold for detecting silence
pause_duration = 1.5  # Minimum pause duration in seconds

# Buffer to store audio chunks
buffer = []
last_audio_time = time.time()


def get_transcription(audio_chunk):
    try:
        # Start timing
        start_time = time.time()

        # Process audio using the Whisper processor
        input_features = processor(audio_chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        # Generate transcription using the Whisper model
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Calculate time taken
        time_taken = time.time() - start_time
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return [""]

def audio_callback(indata, frames, time_info, status):
    global last_audio_time
    if status:
        print(f"Status: {status}", flush=True)
    
    # Convert audio chunk to numpy array and flatten to 1D
    audio_chunk = indata[:, 0].flatten()
    current_time = time.time()

    # Check for silence
    if np.mean(np.abs(audio_chunk)) < silence_threshold:
        if current_time - last_audio_time > pause_duration:
            # Accumulate audio data
            if buffer:
                # Process accumulated audio data
                combined_audio = np.concatenate(buffer)
                transcription = get_transcription(combined_audio)
                print(f"Transcription: {transcription[0]}", flush=True)
                buffer.clear()  # Clear the buffer after processing
    else:
        # Update last audio time and add to buffer
        last_audio_time = current_time
        buffer.append(audio_chunk)

# Create an audio stream
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, blocksize=int(fs * block_duration)):
        print("Recording... Speak into the microphone.")
        while True:
            # Keep the script running to continuously process audio input
            time.sleep(0.1)
except Exception as e:
    print(f"Error with audio stream: {e}")

Recording... Speak into the microphone.




Time taken for transcription: 0.73 seconds
Transcription: तू को नाही?
Time taken for transcription: 0.97 seconds
Transcription: तो ते नावक आहे.
Time taken for transcription: 0.82 seconds
Transcription: तुझा नावकाय.
Time taken for transcription: 0.70 seconds
Transcription: च्च


KeyboardInterrupt: 

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig
import requests
import sounddevice as sd
import torch
import numpy as np
import time

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr_v3")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
config_response = requests.get(config_url)

if config_response.status_code == 200:
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr_v3", config=config)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Parameters
fs = 16000  # Sample rate
block_duration = 2  # Block size in seconds
silence_threshold = 0.005  # Energy threshold for detecting silence
pause_duration = 1.5  # Minimum pause duration in seconds

# Buffer to store audio chunks
buffer = []
last_audio_time = time.time()


def get_transcription(audio_chunk):
    try:
        # Start timing
        start_time = time.time()

        # Process audio using the Whisper processor
        input_features = processor(audio_chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        # Generate transcription using the Whisper model
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Calculate time taken
        time_taken = time.time() - start_time
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return [""]

def audio_callback(indata, frames, time_info, status):
    global last_audio_time
    if status:
        print(f"Status: {status}", flush=True)
    
    # Convert audio chunk to numpy array and flatten to 1D
    audio_chunk = indata[:, 0].flatten()
    current_time = time.time()

    # Check for silence
    if np.mean(np.abs(audio_chunk)) < silence_threshold:
        if current_time - last_audio_time > pause_duration:
            # Accumulate audio data
            if buffer:
                # Process accumulated audio data
                combined_audio = np.concatenate(buffer)
                transcription = get_transcription(combined_audio)
                print(f"Transcription: {transcription[0]}", flush=True)
                buffer.clear()  # Clear the buffer after processing
    else:
        # Update last audio time and add to buffer
        last_audio_time = current_time
        buffer.append(audio_chunk)

# Create an audio stream
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, blocksize=int(fs * block_duration)):
        print("Recording... Speak into the microphone.")
        while True:
            # Keep the script running to continuously process audio input
            time.sleep(0.1)
except Exception as e:
    print(f"Error with audio stream: {e}")

Live Transcription with pause ditection using pyaudio

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig
import requests
import pyaudio
import torch
import numpy as np
import time

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
config_response = requests.get(config_url)

if config_response.status_code == 200:
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr", config=config)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Parameters
fs = 16000  # Sample rate
block_duration = 4  # Block size in seconds
silence_threshold = 50  # Energy threshold for detecting silence
pause_duration = 1.5  # Minimum pause duration in seconds

# Buffer to store audio chunks
buffer = []
last_audio_time = time.time()

def get_transcription(audio_chunk):
    try:
        start_time = time.time()

        input_features = processor(audio_chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        time_taken = time.time() - start_time
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return [""]

def callback(in_data, frame_count, time_info, status):
    global last_audio_time
    audio_chunk = np.frombuffer(in_data, dtype=np.int16)
    
    current_time = time.time()

    if np.mean(np.abs(audio_chunk)) < silence_threshold:
        if current_time - last_audio_time > pause_duration:
            if buffer:
                combined_audio = np.concatenate(buffer)
                transcription = get_transcription(combined_audio)
                print(f"Transcription: {transcription[0]}", flush=True)
                buffer.clear()
    else:
        last_audio_time = current_time
        buffer.append(audio_chunk)

    return (None, pyaudio.paContinue)

# Create an audio stream using PyAudio
p = pyaudio.PyAudio()
stream = p.open(format=pyaudio.paInt16,
                channels=1,
                rate=fs,
                input=True,
                frames_per_buffer=int(fs * block_duration),
                stream_callback=callback)

try:
    print("Recording... Speak into the microphone.")
    stream.start_stream()

    while stream.is_active():
        time.sleep(0.1)
except Exception as e:
    print(f"Error with audio stream: {e}")
finally:
    stream.stop_stream()
    stream.close()

    p.terminate()

In [13]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig
import requests
import sounddevice as sd
import torch
import numpy as np
import time

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr_v3")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
config_response = requests.get(config_url)

if config_response.status_code == 200:
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr_v3", config=config)
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Parameters
fs = 16000  # Sample rate
block_duration = 2  # Block size in seconds
silence_threshold = 0.005  # Energy threshold for detecting silence
pause_duration = 1.5  # Minimum pause duration in seconds

# Buffer to store audio chunks
buffer = []
last_audio_time = time.time()


def get_transcription(audio_chunk):
    try:
        # Start timing
        start_time = time.time()

        # Process audio using the Whisper processor
        input_features = processor(audio_chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)
        # Generate transcription using the Whisper model
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

        # Calculate time taken
        time_taken = time.time() - start_time
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
        return [""]

def audio_callback(indata, frames, time_info, status):
    global last_audio_time
    if status:
        print(f"Status: {status}", flush=True)
    
    # Convert audio chunk to numpy array and flatten to 1D
    audio_chunk = indata[:, 0].flatten()
    current_time = time.time()

    # Check for silence
    if np.mean(np.abs(audio_chunk)) < silence_threshold:
        if current_time - last_audio_time > pause_duration:
            # Accumulate audio data
            if buffer:
                # Process accumulated audio data
                combined_audio = np.concatenate(buffer)
                transcription = get_transcription(combined_audio)
                print(f"Transcription: {transcription[0]}", flush=True)
                buffer.clear()  # Clear the buffer after processing
    else:
        # Update last audio time and add to buffer
        last_audio_time = current_time
        buffer.append(audio_chunk)

# Create an audio stream
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, blocksize=int(fs * block_duration)):
        print("Recording... Speak into the microphone.")
        while True:
            # Keep the script running to continuously process audio input
            time.sleep(0.1)
except Exception as e:
    print(f"Error with audio stream: {e}")

Recording... Speak into the microphone.
Time taken for transcription: 0.90 seconds
Transcription: तेचे नाव काय आहे?
Time taken for transcription: 0.80 seconds
Transcription: तू काय कधतोस?
Time taken for transcription: 0.97 seconds
Transcription: तुला कोणी बनवलेले आहे.


KeyboardInterrupt: 