## Run Marathi speech-to-text with faster_whisper on the CPU

In [None]:
!pip install faster-whisper sounddevice ctranslate2 torch numpy==1.26.4 transformers

### Convert the model to CTranslate2 format for use with faster_whisper

In [3]:
!ct2-transformers-converter --model Viraj008/whisper-small-mr --output_dir Viraj008/whisper-small-mr-ct2
# !ct2-transformers-converter --model Viraj008/whisper-small-mr_v3 --output_dir Viraj008/whisper-small-mr-v3-ct2

In [None]:
import sounddevice as sd
import numpy as np
import time
from faster_whisper import WhisperModel


# Initialize the faster-whisper model.
# The path must match the --output_dir given to ct2-transformers-converter.
# A forward slash is used on purpose: in a normal string literal "\w" is an
# invalid escape sequence, and a backslash is not a path separator outside
# Windows, so the converted model directory would not be found.
model_size = "Viraj008/whisper-small-mr-ct2"
# model_size = "Viraj008/whisper-small-mr-v3-ct2"

model = WhisperModel(model_size, device="cpu", compute_type="int8")

# Parameters
fs = 16000  # Sample rate passed to the input stream
block_duration = 2  # Length of one callback block in seconds
silence_threshold = 0.005  # Mean absolute amplitude below which a block counts as silence
pause_duration = 1.5  # Seconds without a loud block before the buffer is transcribed

# Loud audio blocks collected since the last transcription
buffer = []
# Time at which the most recent loud block was seen
last_audio_time = time.time()

def get_transcription(audio_chunk):
    """Transcribe an audio array with the module-level faster-whisper model.

    Prints the detected language with its probability and the wall-clock
    time the transcription took.

    Args:
        audio_chunk: Audio data handed unchanged to ``model.transcribe``.

    Returns:
        The text of all returned segments joined by single spaces, or an
        empty string if any exception is raised while transcribing.
    """
    try:
        began = time.time()

        # Fixed decoding settings: beam search of width 5, no timestamps
        segments, info = model.transcribe(audio_chunk, beam_size=5, without_timestamps=True)

        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

        # Collect the text of every segment, then join with spaces
        pieces = [segment.text for segment in segments]
        transcription = " ".join(pieces)

        elapsed = time.time() - began
        print(f"Time taken for transcription: {elapsed:.2f} seconds")
    except Exception as e:
        # Best effort: report the problem and hand back an empty result
        print(f"Error during transcription: {e}")
        return ""
    else:
        return transcription

def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: buffer loud blocks, transcribe after a pause.

    A block whose mean absolute amplitude reaches ``silence_threshold`` is
    appended to the global ``buffer`` and refreshes ``last_audio_time``.
    A quieter block triggers a transcription of everything buffered, but
    only once more than ``pause_duration`` seconds have passed since the
    last loud block and the buffer is not empty.

    Args:
        indata: 2-D array of samples; only column 0 is used.
        frames: Unused.
        time_info: Unused.
        status: Printed when truthy.
    """
    global last_audio_time

    if status:
        print(f"Status: {status}", flush=True)

    # Take the first channel as an independent 1-D copy
    samples = indata[:, 0].flatten()
    now = time.time()

    is_silent = np.mean(np.abs(samples)) < silence_threshold
    if not is_silent:
        # Speech (or noise) present: remember when, and keep the samples
        last_audio_time = now
        buffer.append(samples)
        return

    pause_elapsed = now - last_audio_time > pause_duration
    if not (pause_elapsed and buffer):
        # Either still within the allowed pause or nothing to transcribe
        return

    # Pause detected: transcribe everything collected so far
    combined_audio = np.concatenate(buffer)
    text = get_transcription(combined_audio)
    print(f"Transcription: {text}", flush=True)
    buffer.clear()  # start fresh for the next utterance

# Create an audio stream and keep it open until the user interrupts.
# The stream is given audio_callback as its callback; the loop below does no
# audio work itself and only keeps the `with` block (and so the stream) open.
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, blocksize=int(fs * block_duration)):
        print("Recording... Speak into the microphone.")
        while True:
            # Sleep in short steps so an interrupt is noticed promptly
            time.sleep(0.1)
except KeyboardInterrupt:
    # Interrupting is the only way out of the loop above, so treat it as a
    # normal stop instead of letting it surface as a traceback.
    print("Recording stopped.")
except Exception as e:
    print(f"Error with audio stream: {e}")

## Using PyAudio instead of sounddevice

In [None]:
!pip install jupyter ipywidgets pyaudio faster-whisper ctranslate2 torch numpy==1.26.4 transformers

In [None]:
import pyaudio
import numpy as np
import time
from faster_whisper import WhisperModel

# Load the converted (CTranslate2) Whisper model for CPU inference
model_size = "Viraj008/whisper-small-mr-ct2"
# model_size = "Viraj008/whisper-small-mr-v3-ct2"

model = WhisperModel(
    model_size,
    device="cpu",
    compute_type="int8",
)

# Recording and silence-detection settings
fs = 16_000  # sample rate used when opening the stream
block_duration = 2  # seconds of audio per callback block
silence_threshold = 0.005  # mean absolute amplitude below this counts as silence
pause_duration = 1.5  # seconds without a loud block before transcribing

# Loud blocks gathered since the last transcription
buffer = []
# Time of the most recent loud block
last_audio_time = time.time()

def get_transcription(audio_chunk):
    """Run the module-level faster-whisper model on one audio array.

    Side effects: prints the detected language and its probability, and
    the time the transcription took.

    Args:
        audio_chunk: Audio data forwarded as-is to ``model.transcribe``.

    Returns:
        All segment texts joined with a single space. If anything raises,
        the error is printed and an empty string is returned instead.
    """
    try:
        t0 = time.time()

        result = model.transcribe(audio_chunk, beam_size=5, without_timestamps=True)
        segments, info = result

        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

        # Gather the text of each segment in order
        texts = []
        for segment in segments:
            texts.append(segment.text)
        transcription = " ".join(texts)

        # Report how long the whole step took
        time_taken = time.time() - t0
        print(f"Time taken for transcription: {time_taken:.2f} seconds")

        return transcription
    except Exception as e:
        print(f"Error during transcription: {e}")
    return ""

def audio_callback(in_data, frame_count, time_info, status):
    """PyAudio stream callback: buffer loud blocks, transcribe after a pause.

    The raw bytes are viewed as float32 samples. A block whose mean absolute
    amplitude reaches ``silence_threshold`` is appended to the global
    ``buffer`` and refreshes ``last_audio_time``. A quieter block triggers a
    transcription of the buffered audio once more than ``pause_duration``
    seconds have passed since the last loud block and the buffer is not empty.

    Args:
        in_data: Raw audio bytes for this block.
        frame_count: Unused.
        time_info: Unused.
        status: Printed when truthy.

    Returns:
        ``(in_data, pyaudio.paContinue)`` so the stream keeps running.
    """
    global last_audio_time

    if status:
        print(f"Status: {status}", flush=True)

    # Interpret the byte string as float32 samples
    samples = np.frombuffer(in_data, dtype=np.float32)
    now = time.time()

    level = np.mean(np.abs(samples))
    if not level < silence_threshold:
        # Loud block: note the time and keep the samples
        last_audio_time = now
        buffer.append(samples)
    elif now - last_audio_time > pause_duration and buffer:
        # Long enough pause with pending audio: transcribe it all at once
        combined_audio = np.concatenate(buffer)
        text = get_transcription(combined_audio)
        print(f"Transcription: {text}", flush=True)
        buffer.clear()  # start fresh for the next utterance

    return (in_data, pyaudio.paContinue)

# Initialize PyAudio
p = pyaudio.PyAudio()

# Bound before the try block so the cleanup in `finally` can tell whether
# p.open() succeeded. Without this, a failed open would raise NameError in
# `finally`, hiding the real error and skipping p.terminate().
stream = None

# Create an audio stream
try:
    stream = p.open(format=pyaudio.paFloat32,
                    channels=1,
                    rate=fs,
                    input=True,
                    frames_per_buffer=int(fs * block_duration),
                    stream_callback=audio_callback)

    stream.start_stream()

    print("Recording... Speak into the microphone.")

    # audio_callback is registered as the stream callback; this loop only
    # keeps the cell running while the stream reports itself active.
    while stream.is_active():
        time.sleep(0.1)

except KeyboardInterrupt:
    # Interrupting is the normal way to stop recording; avoid a traceback.
    print("Recording stopped.")

except Exception as e:
    print(f"Error with audio stream: {e}")

finally:
    # Release the stream only if it was actually opened
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()