In [None]:
import platform
import queue
import time

import numpy as np
import requests
import sounddevice as sd
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig

# Function to detect the operating system and decide on GPU support
def get_device():
    """Choose the torch device to run on, based on the host operating system.

    macOS uses MPS when it is available, Windows and Linux use CUDA when it is
    available, and every other case falls back to the CPU. The detected OS and
    the decision are printed.

    Returns:
        torch.device: the "mps", "cuda" or "cpu" device.
    """
    os_name = platform.system()
    print(f"Detected operating system: {os_name}")

    cpu = torch.device("cpu")

    if os_name == "Darwin":  # macOS
        if not torch.backends.mps.is_available():
            print("MPS is not available. Falling back to CPU.")
            return cpu
        print("Using Metal Performance Shaders (MPS) for GPU.")
        return torch.device("mps")

    if os_name in ("Windows", "Linux"):
        if not torch.cuda.is_available():
            print("CUDA is not available. Falling back to CPU.")
            return cpu
        print("Using CUDA for GPU.")
        return torch.device("cuda")

    print("Unsupported operating system. Falling back to CPU.")
    return cpu

# Get the appropriate device based on the OS
device = get_device()

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
# Always bound the request: without a timeout, requests.get() can wait forever
# on a stalled connection and the cell would never finish.
config_response = requests.get(config_url, timeout=30)

if config_response.status_code == 200:
    # Build the model configuration from the downloaded JSON
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr", config=config)
# Force Marathi transcription prompts during decoding.
# NOTE(review): newer transformers releases may prefer passing language/task to generate() - confirm against the installed version.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")
model.to(device)

# Parameters
fs = 16000  # Sample rate
block_duration = 2  # Block size in seconds
silence_threshold = 0.005  # Energy threshold for detecting silence
pause_duration = 1.5  # Minimum pause duration in seconds

# Buffer to store audio chunks of the utterance currently being spoken
buffer = []
# Wall-clock time of the most recent non-silent block
last_audio_time = time.time()

def get_transcription(audio_chunk):
    """Transcribe one audio chunk with the module-level Whisper processor and model.

    The elapsed time is printed on success. Any exception raised while
    processing is printed and swallowed so the caller keeps running.

    Args:
        audio_chunk: Audio samples handed to ``processor`` with a sampling
            rate of 16000.

    Returns:
        The list returned by ``processor.batch_decode``, or ``[""]`` if
        anything went wrong.
    """
    try:
        started_at = time.time()

        # Feature extraction, then move the features to the selected device
        encoded = processor(audio_chunk, sampling_rate=16000, return_tensors="pt")
        features_on_device = encoded.input_features.to(device)

        # Decode token ids into text
        token_ids = model.generate(features_on_device)
        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)

        elapsed = time.time() - started_at
        print(f"Time taken for transcription: {elapsed:.2f} seconds")
    except Exception as error:
        print(f"Error during transcription: {error}")
        return [""]
    return decoded

# Finished utterances waiting to be transcribed. The audio callback only
# enqueues; the main loop below runs the slow model inference, so the audio
# thread is never blocked (blocking it can make the stream drop input).
audio_queue = queue.Queue()

def audio_callback(indata, frames, time_info, status):
    """Collect non-silent audio blocks and hand finished utterances to the main loop.

    Passed as the callback of the input stream created below. Non-silent
    blocks are appended to ``buffer``. When a silent block arrives more than
    ``pause_duration`` seconds after the last non-silent one and the buffer is
    not empty, the buffered audio is concatenated, put on ``audio_queue`` and
    the buffer is cleared.

    Args:
        indata: Captured block; only channel 0 is used.
        frames: Unused.
        time_info: Unused.
        status: Stream status; printed when truthy.
    """
    global last_audio_time
    if status:
        print(f"Status: {status}", flush=True)

    # Take a 1D copy of channel 0 so the samples can be kept after this call
    # instead of holding a view into the stream's block.
    audio_chunk = indata[:, 0].copy()
    current_time = time.time()

    # Check for silence
    if np.mean(np.abs(audio_chunk)) < silence_threshold:
        if buffer and current_time - last_audio_time > pause_duration:
            # The speaker paused: queue the utterance for transcription
            audio_queue.put(np.concatenate(buffer))
            buffer.clear()
    else:
        # Update last audio time and add to buffer
        last_audio_time = current_time
        buffer.append(audio_chunk)

# Create an audio stream and transcribe queued utterances on the main thread
try:
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=fs, blocksize=int(fs * block_duration)):
        print("Recording... Speak into the microphone.")
        while True:
            try:
                # Short timeout keeps the loop responsive to a kernel interrupt
                utterance = audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            transcription = get_transcription(utterance)
            print(f"Transcription: {transcription[0]}", flush=True)
except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop recording
    print("Recording stopped.")
except Exception as e:
    print(f"Error with audio stream: {e}")

In [None]:
!pip install gradio scipy

In [21]:
import platform
import requests
import torch
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration, WhisperConfig
import gradio as gr
from scipy.signal import resample
import time

# Function to detect the operating system and decide on GPU support
def get_device():
    """Select the torch device for inference from the host operating system.

    On macOS the MPS backend is used when available; on Windows and Linux
    CUDA is used when available. Anything else, or a missing backend, results
    in the CPU device. The detected OS and the choice are printed.

    Returns:
        torch.device: one of "mps", "cuda" or "cpu".
    """
    system_name = platform.system()
    print(f"Detected operating system: {system_name}")

    if system_name not in ("Darwin", "Windows", "Linux"):
        print("Unsupported operating system. Falling back to CPU.")
        return torch.device("cpu")

    if system_name == "Darwin":  # macOS
        if torch.backends.mps.is_available():
            device_name = "mps"
            message = "Using Metal Performance Shaders (MPS) for GPU."
        else:
            device_name = "cpu"
            message = "MPS is not available. Falling back to CPU."
    else:  # Windows or Linux
        if torch.cuda.is_available():
            device_name = "cuda"
            message = "Using CUDA for GPU."
        else:
            device_name = "cpu"
            message = "CUDA is not available. Falling back to CPU."

    print(message)
    return torch.device(device_name)

# Get the appropriate device based on the OS
device = get_device()

# Initialize the processor and model
processor = WhisperProcessor.from_pretrained("Viraj008/whisper-small-mr")
config_url = "https://huggingface.co/Viraj008/whisper-small-mr/resolve/main/config.json"
# Always bound the request: without a timeout, requests.get() can wait forever
# on a stalled connection and the cell would never finish.
config_response = requests.get(config_url, timeout=30)

if config_response.status_code == 200:
    # Build the model configuration from the downloaded JSON
    config_dict = config_response.json()
    config = WhisperConfig.from_dict(config_dict)
else:
    raise ValueError("Failed to load configuration from the specified URL.")

model = WhisperForConditionalGeneration.from_pretrained("Viraj008/whisper-small-mr", config=config)
# Force Marathi transcription prompts during decoding.
# NOTE(review): newer transformers releases may prefer passing language/task to generate() - confirm against the installed version.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="mr", task="transcribe")
model.to(device)

def transcribe_audio(audio):
    """Transcribe an audio value coming from the Gradio audio component.

    The samples are converted to float32, down-mixed to one channel if they
    are multi-dimensional, resampled to 16000 Hz when needed, peak-normalized
    and passed through the module-level Whisper processor and model.

    Args:
        audio: A ``(sampling_rate, samples)`` pair, or ``None`` when there is
            no audio (for example after the component has been cleared).

    Returns:
        str: The transcription followed by the time taken on a second line,
        or an empty string when there is no audio to transcribe.
    """
    # The change event also fires with None; there is nothing to transcribe then.
    if audio is None:
        return ""

    # Start timing
    start_time = time.time()

    # Load audio as numpy array
    sampling_rate, audio_data = audio

    # Convert to float32 up front: np.abs() on int16 overflows for -32768,
    # which would corrupt the peak used for normalization below.
    audio_data = np.asarray(audio_data, dtype=np.float32)
    if audio_data.size == 0:
        return ""

    # Down-mix multi-channel input to mono.
    # NOTE(review): assumes a (samples, channels) layout - confirm against the Gradio version in use.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Check if the sampling rate is not 16000 Hz
    if sampling_rate != 16000:
        print(f"Resampling audio from {sampling_rate} Hz to 16000 Hz")
        number_of_samples = round(len(audio_data) * float(16000) / sampling_rate)
        audio_data = resample(audio_data, number_of_samples)

    # Normalize audio data; skip the division for all-zero input to avoid NaNs
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data = audio_data / peak
    audio_data = audio_data.astype(np.float32)

    # Process audio using the Whisper processor
    input_features = processor(audio_data, sampling_rate=16000, return_tensors="pt").input_features.to(device)

    # Generate transcription using the Whisper model
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    # Calculate transcription time
    time_taken = time.time() - start_time
    transcription_result = f"Transcription: {transcription[0]}\nTime taken: {time_taken:.2f} seconds"

    return transcription_result

# Audio examples offered by the example selector
audio_examples = ["marathi-audio.wav"]


def load_example(selected_example):
    """Return the selected example unchanged so it can be routed to the audio input."""
    return selected_example


# Build the Gradio interface: heading, audio input, transcription box, example selector
with gr.Blocks() as demo:
    gr.Markdown("## Whisper Transcription")

    audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")
    transcription_output = gr.Textbox(label="Transcription")

    # Radio buttons for choosing one of the audio examples
    audio_example_selector = gr.Radio(
        choices=audio_examples,
        type="value",
        label="Select Audio Example",
    )

    # Choosing an example sends it to the audio input
    audio_example_selector.change(
        load_example,
        inputs=audio_example_selector,
        outputs=audio_input,
    )

    # Any change of the audio input runs the transcription
    audio_input.change(
        transcribe_audio,
        inputs=audio_input,
        outputs=transcription_output,
    )

# Start serving the interface
demo.launch()

Detected operating system: Windows
Using CUDA for GPU.
Running on local URL:  http://127.0.0.1:7868

To create a public link, set `share=True` in `launch()`.




Resampling audio from 24000 Hz to 16000 Hz


