In [None]:
# Install the notebook's dependencies with pip (run as shell commands from the cell).
# transformers, bitsandbytes and accelerate are pinned to exact versions; whisper is
# installed from its GitHub repository with no revision pinned; gradio, gTTS,
# huggingface_hub and bark are installed unpinned.
# NOTE(review): librosa and soundfile are imported further down but are not installed
# here - presumably they are preinstalled in the target runtime; confirm.
# NOTE(review): gTTS and bark are not imported anywhere in the visible cells; confirm
# whether these two installs are still needed.
!pip install transformers==4.37.2
!pip install bitsandbytes==0.41.3 accelerate==0.25.0
!pip install git+https://github.com/openai/whisper.git
!pip install gradio
!pip install gTTS
!pip install huggingface_hub
!pip install bark

In [None]:
import warnings

import gradio as gr
import librosa
import soundfile as sf
import torch
import whisper
from transformers import AutoProcessor, AutoModel
from transformers import BitsAndBytesConfig, pipeline

In [None]:
# Silence every warning category for the rest of the session so library
# deprecation/user warnings do not clutter the cell outputs.
warnings.simplefilter("ignore")

In [None]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using torch {torch.__version__} ({DEVICE})")

In [None]:
# Load the Whisper "medium" checkpoint for speech-to-text on the selected device.
model_whisper = whisper.load_model("medium", device=DEVICE)
# Expose the same model object under the name `whisper_model` as well, so code
# written against either spelling resolves to the loaded model.
whisper_model = model_whisper
# Parameter.numel() gives the element count directly, so no numpy is required
# (numpy is not imported by this notebook).
print(f"Whisper model loaded with {sum(p.numel() for p in model_whisper.parameters()):,} parameters.")


In [None]:
# Load the text-generation model (FLAN-T5 Large) behind a text2text pipeline.
model_id = "google/flan-t5-large"

# bitsandbytes 4-bit quantization needs a CUDA GPU. When the notebook is running
# on CPU, skip quantization and load the regular weights instead of failing.
if DEVICE == "cuda":
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
else:
    quant_config = None

pipe = pipeline(
    "text2text-generation",
    model=model_id,
    # Only forward a quantization config when one was actually built.
    model_kwargs={"quantization_config": quant_config} if quant_config is not None else {}
)
print(f"Loaded Text Generation model: {model_id}")

In [None]:
# Load the Bark (small) text-to-speech model and its processor.
processor = AutoProcessor.from_pretrained("suno/bark-small")
model = AutoModel.from_pretrained("suno/bark-small")
# Expose the same model object under the more specific name `bark_model` as well,
# so code written against either spelling resolves to the loaded model.
bark_model = model

In [None]:
# Speaker options: UI label -> Bark voice preset identifier.
# The labels are offered in the speaker dropdown; the preset string is what gets
# handed to the text-to-speech step.
SPEAKERS = dict([
    # English voices
    ("english-male-1", "v2/en_speaker_1"),
    ("english-male-2", "v2/en_speaker_2"),
    ("english-female", "v2/en_speaker_9"),
    # Hindi voices
    ("hindi-male-1", "v2/hi_speaker_2"),
    ("hindi-male-2", "v2/hi_speaker_5"),
    ("hindi-female-1", "v2/hi_speaker_0"),
    ("hindi-female-2", "v2/hi_speaker_4"),
])

In [None]:
# Re-encode an audio file as 16 kHz mono WAV using librosa + soundfile
def convert_audio_to_whisper_format(audio_path):
    """Resample an audio file to 16 kHz mono and save it as a WAV file.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        The path of the written file, always "processed_audio.wav" in the
        current working directory (an existing file there is overwritten).
    """
    target_rate = 16000
    output_path = "processed_audio.wav"
    # librosa resamples and down-mixes while loading; it also returns the
    # sampling rate it actually used, which is what gets written out.
    samples, sample_rate = librosa.load(audio_path, sr=target_rate, mono=True)
    sf.write(output_path, samples, sample_rate)
    return output_path


In [None]:
# Transcribe function using Whisper
def transcribe(audio):
    """Transcribe a recorded audio file to text with the loaded Whisper model.

    Uses the module-level `model_whisper` created by the Whisper loading cell.

    Args:
        audio: Path to an audio file, or None / '' when nothing was recorded.

    Returns:
        The decoded text, or '' when no audio was supplied.
    """
    if not audio:
        return ''  # Nothing recorded, so there is nothing to transcribe

    waveform = whisper.load_audio(audio)
    # Whisper decodes a fixed-length window, so the clip is padded or cut to fit;
    # audio beyond that window is not transcribed.
    waveform = whisper.pad_or_trim(waveform)
    mel = whisper.log_mel_spectrogram(waveform).to(model_whisper.device)

    # whisper.decode defaults to half precision, which is only dependable on a
    # GPU; fall back to fp32 when the model lives on the CPU.
    options = whisper.DecodingOptions(fp16=(model_whisper.device.type == "cuda"))
    result = whisper.decode(model_whisper, mel, options)
    return result.text

In [None]:
# Text-to-Speech conversion using Bark with speaker selection
def text_to_speech_bark(text, speaker_id, file_path="output.wav"):
    """Synthesize `text` with Bark and write the result to a WAV file.

    Uses the module-level Bark `processor` and `model` created by the Bark
    loading cell.

    Args:
        text: Text to be spoken.
        speaker_id: Bark voice preset name (for example a value from SPEAKERS).
        file_path: Destination WAV path; an existing file is overwritten.

    Returns:
        `file_path`, after the audio has been written.
    """
    # The voice preset is resolved by the processor (voice_preset=...), which
    # returns it as part of the model inputs.
    inputs = processor(text, voice_preset=speaker_id, return_tensors="pt")
    # Keep the inputs on whatever device the model is on.
    inputs = inputs.to(model.device)

    # generate() expects the encoded fields as keyword arguments.
    audio_array = model.generate(**inputs)

    # generate() returns a batch dimension; soundfile needs a 1-D mono signal,
    # otherwise the samples are interpreted as channels.
    waveform = audio_array.cpu().numpy().squeeze()

    # Prefer the sampling rate declared by the model; 24000 is the previous
    # hard-coded value and stays as the fallback.
    sample_rate = getattr(model.generation_config, "sample_rate", 24000)
    sf.write(file_path, waveform, sample_rate)
    return file_path

In [None]:
def process_audio(audio_path, speaker):
    """Run one recording through the speech -> text -> LLM -> speech pipeline.

    Args:
        audio_path: Path of the recorded audio file (may be None).
        speaker: Key into SPEAKERS selecting the voice for the spoken reply.

    Returns:
        A 3-tuple (transcript, LLM response, path of the synthesized audio).
        When the transcript is empty or whitespace-only, placeholder messages
        are returned and the audio path is None.
    """
    # 1) Speech to text
    transcript = transcribe(audio_path)
    if transcript.strip() == "":
        return "No speech detected.", "No response generated.", None

    # 2) Build the prompt: a fixed instruction followed by the user's words
    instruction = "You are an AI assistant. Answer the questions asked accurately and concisely."
    prompt = instruction + "\nUser: " + transcript

    # 3) Ask the language model; the pipeline returns a list of generations
    generations = pipe(prompt)
    answer = generations[0]['generated_text']

    # 4) Speak the answer with the selected voice
    voice_preset = SPEAKERS[speaker]
    spoken_answer_path = text_to_speech_bark(answer, voice_preset)

    return transcript, answer, spoken_answer_path

In [None]:
def clear_inputs():
    """Return the values that reset the UI when "Clear" is pressed.

    The Clear button is wired to four components (audio input, transcript,
    LLM response, response audio), so exactly four values are returned - one
    None per component. Returning fewer values than wired outputs makes the
    click handler fail.

    Returns:
        A 4-tuple of None.
    """
    return None, None, None, None

In [None]:
# Assemble the Gradio interface. Components are created in display order:
# recorder, speaker picker, the three result widgets, then the two buttons.
with gr.Blocks() as demo:
    # --- inputs ---
    audio_input = gr.Audio(type="filepath", label="Record your voice")
    speaker_dropdown = gr.Dropdown(
        choices=list(SPEAKERS),
        label="Select Speaker",
        value="english-male-1",
    )

    # --- results ---
    transcript_output = gr.Textbox(label="Speech to Text")
    llm_output = gr.Textbox(label="LLM Response")
    audio_output = gr.Audio(label="Response as Audio")

    # --- actions ---
    process_btn = gr.Button("Process Audio")
    clear_btn = gr.Button("Clear")

    # "Process Audio" runs the full pipeline on the recording and chosen voice.
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input, speaker_dropdown],
        outputs=[transcript_output, llm_output, audio_output],
    )
    # "Clear" resets the recorder and all three result widgets.
    clear_btn.click(
        fn=clear_inputs,
        outputs=[audio_input, transcript_output, llm_output, audio_output],
    )

In [None]:
# Start serving the Gradio app built in the `demo` Blocks context.
# NOTE(review): debug=True presumably keeps this cell running and surfaces handler
# errors in the cell output - confirm against the installed Gradio version.
demo.launch(debug=True)