In [21]:
!pip install transformers==4.37.2
!pip install bitsandbytes==0.41.3 accelerate==0.25.0
!pip install git+https://github.com/openai/whisper.git
!pip install gradio
!pip install gTTS
!pip install huggingface_hub
!pip install bark

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-0eoz6srp
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-0eoz6srp
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [22]:
import torch
from transformers import BitsAndBytesConfig, pipeline, AutoTokenizer, AutoModel, AutoProcessor
import whisper
import gradio as gr
import numpy as np
import librosa
import soundfile as sf
from bark import generate_audio, SAMPLE_RATE
import warnings
import os
import wave
from io import BytesIO

In [23]:
warnings.filterwarnings("ignore")

In [24]:
# Select the compute device: CUDA when torch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using torch {torch.__version__} ({DEVICE})")

Using torch 2.4.0+cu121 (cuda)


In [25]:
# Speech-to-text: load the "medium" Whisper checkpoint onto the selected device
# and report its total parameter count.
model_whisper = whisper.load_model("medium", device=DEVICE)
_whisper_param_count = sum(np.prod(param.shape) for param in model_whisper.parameters())
print(f"Whisper model loaded with {_whisper_param_count:,} parameters.")


Whisper model loaded with 762,321,920 parameters.


In [26]:
# Text-generation model: Google FLAN-T5 (large) wrapped in a text2text pipeline,
# loaded with 4-bit quantization and bfloat16 as the compute dtype.
model_id = "google/flan-t5-large"
quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

pipe = pipeline(
    task="text2text-generation",
    model=model_id,
    model_kwargs=dict(quantization_config=quant_config),
)
print(f"Loaded Text Generation model: {model_id}")

Loaded Text Generation model: google/flan-t5-large


In [27]:
# Text-to-speech: load the processor first, then the model, both from the
# "suno/bark-small" checkpoint.
processor, model = (
    loader.from_pretrained("suno/bark-small")
    for loader in (AutoProcessor, AutoModel)
)

In [49]:
# Speaker labels offered in the UI dropdown, each mapped to a voice-preset identifier.
SPEAKERS = dict([
    ("english-male-1", "v2/en_speaker_1"),
    ("english-male-2", "v2/en_speaker_2"),
    ("english-female", "v2/en_speaker_9"),
    ("hindi-male-1", "v2/hi_speaker_2"),
    ("hindi-male-2", "v2/hi_speaker_5"),
    ("hindi-female-1", "v2/hi_speaker_0"),
    ("hindi-female-2", "v2/hi_speaker_4"),
])

In [50]:
def convert_audio_to_whisper_format(audio_file):
    """Re-encode an audio file as 16 kHz mono and save it as ``processed_audio.wav``.

    Args:
        audio_file: Audio source passed straight to ``librosa.load``.

    Returns:
        The path of the written file (always ``"processed_audio.wav"``, relative
        to the current working directory; it is overwritten on every call).
    """
    target_path = "processed_audio.wav"
    # librosa resamples to 16 kHz and downmixes to a single channel while loading.
    samples, rate = librosa.load(audio_file, mono=True, sr=16000)
    sf.write(target_path, samples, rate)
    return target_path

In [51]:
def transcribe(audio_path):
    """Transcribe an audio file to text with the loaded Whisper model.

    ``whisper.pad_or_trim`` fits the audio to Whisper's fixed input window
    (30 seconds by default), so anything beyond that window is not transcribed.

    Args:
        audio_path: Path to the audio file, or ``None`` when there is no input.

    Returns:
        The decoded text, or an empty string when ``audio_path`` is ``None``.
    """
    if audio_path is None:
        return ''  # Return empty string if no audio input

    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)

    # Take the mel-bin count from the loaded model instead of relying on the
    # library default, so the spectrogram always matches the checkpoint in use.
    mel = whisper.log_mel_spectrogram(audio, n_mels=model_whisper.dims.n_mels).to(DEVICE)

    # decode() defaults to fp16. Only request half precision on CUDA; on the CPU
    # fallback use fp32, mirroring what whisper's own transcribe() does.
    options = whisper.DecodingOptions(fp16=(DEVICE == 'cuda'))
    result = model_whisper.decode(mel, options)
    return result.text

In [57]:
def text_to_speech(text, speaker):
    """Synthesize speech for ``text`` in the voice selected by ``speaker``.

    Args:
        text: The text to speak.
        speaker: A key of the module-level ``SPEAKERS`` table. Unknown keys fall
            back to ``"v2/en_speaker_1"`` (the "english-male-1" preset).

    Returns:
        A ``(audio_array, SAMPLE_RATE)`` tuple: the generated audio as a NumPy
        array with singleton dimensions squeezed out, and the ``SAMPLE_RATE``
        constant imported from ``bark``.
    """
    # Resolve the preset from the shared SPEAKERS table (the same one that feeds
    # the dropdown) instead of a second local copy that could drift out of sync.
    voice_preset = SPEAKERS.get(speaker, "v2/en_speaker_1")

    # Tokenize the text and attach the selected voice preset.
    inputs = processor(text, voice_preset=voice_preset)

    # NOTE(review): pad_token_id=100 is kept exactly as it was; confirm this is
    # the intended padding id for the Bark checkpoint in use.
    audio_array = model.generate(**inputs, pad_token_id=100)
    # Move to host memory, convert to NumPy, and drop singleton dimensions.
    audio_array = audio_array.cpu().numpy().squeeze()

    return audio_array, SAMPLE_RATE

In [58]:
def process_audio(audio_file, selected_speaker):
    """Transcribe a recording and speak the transcription back in a chosen voice.

    Args:
        audio_file: A file path (``str``), an object with a ``read()`` method, or
            ``None`` when nothing was recorded.
        selected_speaker: Speaker key forwarded to ``text_to_speech``.

    Returns:
        ``(transcription, generated_speech_path)``. The path is
        ``"generated_speech.wav"`` on success. When there is no input audio, or
        the transcription is blank, the path is ``None`` and no speech is
        generated.

    Raises:
        ValueError: If ``audio_file`` is neither ``None``, a string, nor file-like.
    """
    # Submitting without a recording hands us None; treat that as "nothing to do"
    # rather than reporting it as an invalid file.
    if audio_file is None:
        return '', None

    # Check if the audio_file is a string (file path) or a file-like object
    if isinstance(audio_file, str):
        audio_path = audio_file
    elif hasattr(audio_file, 'read'):
        # Spool the file-like object to disk so the path-based helpers can read it.
        audio_path = 'temp_audio_file.wav'
        with open(audio_path, 'wb') as f:
            f.write(audio_file.read())
    else:
        raise ValueError("Invalid audio file provided.")

    # Convert and transcribe audio
    processed_audio_path = convert_audio_to_whisper_format(audio_path)
    transcription = transcribe(processed_audio_path)

    # Nothing intelligible was transcribed: skip synthesis instead of asking the
    # TTS model to speak an empty prompt.
    if not transcription.strip():
        return transcription, None

    # Generate speech from text
    audio_array, sample_rate = text_to_speech(transcription, selected_speaker)

    # Save the generated speech to a WAV file
    generated_speech_path = "generated_speech.wav"
    sf.write(generated_speech_path, audio_array, sample_rate)

    return transcription, generated_speech_path

In [59]:
def clear_inputs():
    """Return a 3-tuple of ``None`` values."""
    return (None,) * 3

In [60]:
# Gradio UI: a microphone recording and a speaker choice go in; the transcription
# text and the synthesized audio come out. process_audio does the work.
interface = gr.Interface(
    title="Voice-to-Voice with Customizable Voice",
    description="Record your voice and select a speaker to generate speech based on the transcribed text.",
    fn=process_audio,
    inputs=[
        # The recording is handed to process_audio as a file path.
        gr.Audio(label="Record Audio", type="filepath", sources=["microphone"]),
        gr.Dropdown(
            choices=[*SPEAKERS],  # speaker labels are the SPEAKERS keys
            value="english-male-1",  # default selection
            label="Select Speaker",
        ),
    ],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Audio(label="Generated Speech"),
    ],
)

In [None]:
# Start the Gradio app. With debug=True the cell keeps running so that errors
# and logs stay visible; set debug=False to return control to the notebook.
interface.launch(debug=True)

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://74f5a69b5b329a0853.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)
