🔁 PHASE: Unified Real-Time ASR (Whisper + IndicConformer Integration)


✅ Cell 1 – Install & Setup

Unified Imports

In [1]:
# ────────── CELL: Unified imports and model loading (ASR, translation, TTS) ──────────

# 🔧 Device Setup
import torch
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
# The models loaded further down in this cell are moved to this device.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🔥 Using device: {device.upper()}")

# 📦 Core Libraries
import whisper
import pyttsx3
import numpy as np
import sounddevice as sd
import soundfile as sf
import platform
import psutil
import datetime
import time
from difflib import get_close_matches
from transformers import pipeline, AutoTokenizer, AutoModel
from parler_tts import ParlerTTSForConditionalGeneration
import torchaudio
from playsound import playsound

# 🎤 Whisper ASR (transcription fallback + spoken-language identification)
print("🔁 Loading Whisper model...")
whisper_model = whisper.load_model("small").to(device)
whisper_model.eval()

# 🪷 IndicConformer (Indian ASR)
# NOTE(review): trust_remote_code=True runs Python code downloaded from the model
# repository. No revision is pinned here, so a new upstream commit changes what is
# executed — consider passing `revision=<commit sha>` once a vetted commit is chosen.
print("🔁 Loading IndicConformer model...")
indic_model = AutoModel.from_pretrained(
    "ai4bharat/indic-conformer-600m-multilingual",
    trust_remote_code=True
).to(device)
indic_model.eval()

# 🌐 NLLB Translation Pipeline
# The transformers pipeline takes a device index: 0 for the first GPU, -1 for CPU.
print("🔁 Loading NLLB-200 model...")
nllb_pipeline = pipeline(
    "translation",
    model="facebook/nllb-200-distilled-600M",
    device=0 if device == "cuda" else -1
)

# 🧠 Parler-TTS Model & Tokenizers
print("🔁 Loading Indic-Parler TTS model...")
# 1) Load the exact HF model:
model = ParlerTTSForConditionalGeneration.from_pretrained(
    "ai4bharat/indic-parler-tts"
).to(device)
model.eval()

print("🔁 Loading tokenizers...")
# `tokenizer` encodes the text to be spoken; `description_tokenizer` encodes the
# voice description and is loaded from the checkpoint named in the text-encoder config.
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(
    model.config.text_encoder._name_or_path
)

# 2) Ensure pad_token is set so calls that use padding=True do not fail:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if description_tokenizer.pad_token is None:
    description_tokenizer.pad_token = description_tokenizer.eos_token

# 🔊 pyttsx3 TTS (Fallback + Intl. Language Speech)
print("🔁 Initializing fallback TTS engine...")
engine = pyttsx3.init()
engine.setProperty("rate", 200)
engine.setProperty("volume", 1.0)

# ✅ Confirmation
# Nothing is warmed up in this cell; the warm-up passes live in later cells.
print("✅ All models loaded successfully (run the warm-up cells before timing the pipeline).")


🔥 Using device: CPU


Flash attention 2 is not installed


🔁 Loading Whisper model...
🔁 Loading IndicConformer model...


A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indic-conformer-600m-multilingual:
- model_onnx.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Fetching 403 files:   0%|          | 0/403 [00:00<?, ?it/s]

README.md: 0.00B [00:00, ?B/s]

🔁 Loading NLLB-200 model...
🔁 Loading Indic-Parler TTS model...


Config of the text_encoder: <class 'transformers.models.t5.modeling_t5.T5EncoderModel'> is overwritten by shared text_encoder config: T5Config {
  "_name_or_path": "google/flan-t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2816,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "transformers_version": "4.46.1",
  "use_cache": true,
  "vocab_size": 32128
}

Config of the audio_encoder: <class 'transformers.models.dac.modelin

🔁 Loading tokenizers...
🔁 Initializing fallback TTS engine...
✅ All models loaded successfully (and Indic-Parler is warmed up).


In [2]:
'''
Run once Use as many times as you want'''
#%pip install git+https://github.com/openai/whisper.git -q
#%pip install sounddevice scipy torchaudio transformers -q


'\nRun once Use as many times as you want'

✅ Cell 4 – Language Detection & Routing


In [3]:
# 📜 Language codes that get special (Indic) routing, mapped to display names
indian_languages = {
    "as": "Assamese",
    "bn": "Bengali",
    "brx": "Bodo",
    "doi": "Dogri",
    "gu": "Gujarati",
    "hi": "Hindi",
    "kn": "Kannada",
    "kok": "Konkani",
    "ks": "Kashmiri",
    "mai": "Maithili",
    "ml": "Malayalam",
    "mni": "Manipuri",
    "mr": "Marathi",
    "ne": "Nepali",
    "or": "Odia",
    "pa": "Punjabi",
    "sa": "Sanskrit",
    "sat": "Santali",
    "sd": "Sindhi",
    "ta": "Tamil",
    "te": "Telugu",
    "ur": "Urdu",
}

def detect_input_language_whisper(audio_path: str, model=None):
    """
    Identify the spoken language of an audio file with Whisper.

    The audio is loaded, passed through `whisper.pad_or_trim`, converted to a
    log-mel spectrogram on the model's device and handed to
    `model.detect_language`; the code with the highest probability is chosen.

    Args:
        audio_path: Path of the audio file to analyse.
        model: Loaded Whisper model (required, despite the `None` default).

    Returns:
        (code, name, is_indian) — `name` is taken from `indian_languages`, or
        is "International" when the code is not listed there.

    Raises:
        ValueError: If `model` is None.
    """
    if model is None:
        raise ValueError("Whisper model must be provided.")

    waveform = whisper.pad_or_trim(whisper.load_audio(audio_path))
    mel = whisper.log_mel_spectrogram(waveform).to(model.device)

    _, lang_probs = model.detect_language(mel)
    detected_code = max(lang_probs, key=lang_probs.get)

    is_indian = detected_code in indian_languages
    lang_name = indian_languages[detected_code] if is_indian else "International"

    print(f"🌐 Detected Language Code: {detected_code}")
    print(f"🌐 Interpreted as: {lang_name}")

    return detected_code, lang_name, is_indian




✅ Cell 5 – IndicConformer or Whisper Transcription


In [4]:
def transcribe_audio(audio_path: str, whisper_model, lang_info=None):
    """
    Transcribe an audio file, routing Indian languages to IndicConformer.

    Args:
        audio_path: Path of the audio file to transcribe.
        whisper_model: Loaded Whisper model; used for language detection and
            as the ASR engine for international languages and for fallback.
        lang_info: Optional `(code, name, is_indic)` tuple in the shape returned
            by `detect_input_language_whisper`. Pass it when the language was
            already detected to avoid a second detection pass over the same
            file. When omitted, detection runs here (original behaviour).

    Returns:
        The transcription produced by IndicConformer, or by Whisper when the
        language is not Indian or IndicConformer raised an exception.
    """
    # 🌐 Detect language only if the caller has not already done so
    if lang_info is None:
        lang_info = detect_input_language_whisper(audio_path, whisper_model)
    lang_code, lang_name, is_indic = lang_info

    if is_indic:
        print(f"🛤️ Routing to IndicConformer for {lang_name}...")
        try:
            # Load audio, resample to 16 kHz if needed, and down-mix to one channel
            audio_tensor, sr = torchaudio.load(audio_path)
            if sr != 16000:
                resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
                audio_tensor = resampler(audio_tensor)
            audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)

            # `indic_model` is the module-level IndicConformer loaded in the setup cell;
            # it is called as (waveform, language code, decoding mode).
            transcription = indic_model(audio_tensor, lang_code, "ctc")
            print(f"📝 IndicConformer Transcription: {transcription}")

        except Exception as e:
            # Deliberate best-effort: any IndicConformer failure falls back to Whisper
            print(f"⚠️ IndicConformer failed: {e}")
            print(f"🔁 Falling back to Whisper...")
            transcription = whisper_transcribe(audio_path, whisper_model)
            print(f"📝 Whisper Transcription: {transcription}")

    else:
        print(f"🛤️ International language detected. Using Whisper...")
        transcription = whisper_transcribe(audio_path, whisper_model)
        print(f"📝 Whisper Transcription: {transcription}")

    return transcription



def whisper_transcribe(audio_path: str, model):
    """
    Transcribe an audio file with Whisper's single-window `decode` API.

    Args:
        audio_path: Path of the audio file to transcribe.
        model: Loaded Whisper model.

    Returns:
        The decoded text. The audio goes through `whisper.pad_or_trim` first,
        so only one fixed-length window of the file is decoded.
    """
    audio = whisper.load_audio(audio_path)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)

    # DecodingOptions defaults to fp16=True. Half precision is only appropriate on
    # CUDA; on CPU it is slow and, on some PyTorch builds, not implemented at all.
    use_fp16 = str(model.device).startswith("cuda")
    options = whisper.DecodingOptions(fp16=use_fp16)
    result = whisper.decode(model, mel, options)
    return result.text

PHASE-2: Target Language Selection (spoken name first, typed fallback)

In [5]:
# 🔤 Language names accepted as translation targets, mapped to their codes
supported_languages = {
    "english": "en",
    "hindi": "hi",
    "telugu": "te",
    "tamil": "ta",
    "german": "de",
    "french": "fr",
    "bengali": "bn",
    "marathi": "mr",
    "kannada": "kn",
    "malayalam": "ml",
    "japanese": "ja",
    "spanish": "es",
    "gujarati": "gu",
    "punjabi": "pa",
}

def get_language_code(spoken_input: str):
    """
    Fuzzy-match a spoken or typed language name against `supported_languages`.

    The input is lower-cased and stripped, then compared with the known names
    using `get_close_matches` (best single candidate, cutoff 0.4).

    Returns:
        (code, language_name) for the closest match, otherwise (None, None).
    """
    query = spoken_input.lower().strip()
    candidates = get_close_matches(query, supported_languages.keys(), n=1, cutoff=0.4)
    if not candidates:
        return None, None

    matched_lang = candidates[0]
    print(f"✅ Interpreted as: {matched_lang.capitalize()}")
    return supported_languages[matched_lang], matched_lang


✅ Cell 7 – Recording Utility (Reusable)


In [None]:
import sounddevice as sd
import soundfile as sf
import numpy as np
from scipy.signal import butter, lfilter

def butter_bandpass(lowcut, highcut, fs, order=4, output="ba"):
    """
    Design a digital Butterworth band-pass filter.

    Args:
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz. A value at or above the Nyquist
            frequency is replaced by `nyq - 1` Hz so the design stays valid.
        fs: Sampling rate in Hz.
        order: Filter order handed to `scipy.signal.butter`.
        output: Representation handed to `scipy.signal.butter` ("ba", "zpk"
            or "sos"). The default "ba" keeps the original `(b, a)` return.

    Returns:
        The filter in the requested representation.
    """
    nyq = 0.5 * fs
    # 🛡 Ensure highcut doesn't reach Nyquist
    if highcut >= nyq:
        highcut = nyq - 1
    low = lowcut / nyq
    high = highcut / nyq
    return butter(order, [low, high], btype='band', output=output)

def bandpass_filter(data, lowcut=80.0, highcut=7900.0, fs=16000, order=4):
    """
    Apply a Butterworth band-pass filter to a 1-D signal.

    The filter is applied as cascaded second-order sections rather than as a
    single `(b, a)` transfer function: SciPy recommends the SOS form because
    the polynomial form is numerically sensitive already around order 4, and
    the band edges used here sit very close to 0 and to Nyquist.

    Args:
        data: Samples to filter.
        lowcut: Lower band edge in Hz.
        highcut: Upper band edge in Hz (clamped below Nyquist by `butter_bandpass`).
        fs: Sampling rate in Hz.
        order: Butterworth order.

    Returns:
        The filtered signal as returned by `scipy.signal.sosfilt`.
    """
    # Local import: the cell-level import only brings in `butter` and `lfilter`.
    from scipy.signal import sosfilt

    sos = butter_bandpass(lowcut, highcut, fs, order, output="sos")
    return sosfilt(sos, data)


def record_audio(duration=3, fs=16000, filename="user_input.wav", playback=False):
    """
    Record audio with sounddevice, band-pass filter it and save it to disk.

    Args:
        duration: Recording length in seconds.
        fs: Sampling rate in Hz used for recording, filtering and saving.
        filename: Path of the audio file to write.
        playback: When True, play the filtered audio back before saving.

    Returns:
        `filename`, so the call can be chained into the ASR functions.
    """
    print(f"🎤 Recording ({duration}s)…")
    data = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype="float32")
    sd.wait()
    raw = data.flatten()

    # 🔇 Band-pass from 80 Hz upward. The requested 8 kHz upper edge equals Nyquist
    # at the default fs=16000, so butter_bandpass pulls it just below Nyquist.
    filtered = bandpass_filter(raw, lowcut=80.0, highcut=8000.0, fs=fs)

    if playback:
        print("🔊 Playing back filtered audio…")
        sd.play(filtered, fs)
        sd.wait()

    # Save to file
    sf.write(filename, filtered, fs)
    print(f"✅ Filtered audio saved: {filename}")
    return filename


✅ Cell 8 – Voice-Based Target Language Detection Function


In [7]:
def detect_target_language_by_voice(attempts=2):
    """
    Ask the user to speak the target language name and match it.

    Each attempt records a short clip, transcribes it with the module-level
    `whisper_model`, and passes the text to `get_language_code`.

    Args:
        attempts: Maximum number of recordings to try.

    Returns:
        (language_code, language_name) on the first match, else (None, None).
    """
    # whisper.decode() defaults to fp16=True. Half precision is only appropriate on
    # CUDA; on CPU it is slow and, on some PyTorch builds, not implemented at all.
    options = whisper.DecodingOptions(fp16=str(whisper_model.device).startswith("cuda"))

    for attempt in range(attempts):
        print(f"🗣️ Attempt {attempt + 1}/{attempts}: Speak target language name (e.g., Tamil, Hindi, German)")
        file_path = record_audio(duration=1.7, filename=f"target_attempt_{attempt + 1}.wav")

        audio = whisper.load_audio(file_path)
        audio = whisper.pad_or_trim(audio)
        mel = whisper.log_mel_spectrogram(audio).to(whisper_model.device)

        result = whisper.decode(whisper_model, mel, options)
        spoken = result.text.strip()
        print(f"📝 You said: {spoken}")

        lang_code, lang_name = get_language_code(spoken)
        if lang_code:
            return lang_code, lang_name

        print("⚠️ Not confident. Try again...\n")

    return None, None


✅ Cell 9 – Manual Fallback for Language Detection


In [8]:
def detect_target_language_manually():
    """
    Ask the user to type the target language name and match it.

    Returns:
        (language_code, language_name) when `get_language_code` finds a match,
        else (None, None) after printing suggestions.
    """
    print("🔡 Please type the target language name (e.g., English, Hindi):")
    typed_input = input("Your input: ").strip().lower()
    lang_code, lang_name = get_language_code(typed_input)

    if lang_code:
        return lang_code, lang_name

    print("⚠️ Not recognized. Suggestions:")
    # get_language_code has just failed at cutoff 0.4, so get_close_matches' default
    # cutoff of 0.6 could never return anything here. Use a looser cutoff, and fall
    # back to listing every supported language when even that finds nothing.
    matches = get_close_matches(typed_input, supported_languages.keys(), n=3, cutoff=0.2)
    if matches:
        print("🔎 Close matches:", ", ".join(matches))
    else:
        print("🔎 Supported languages:", ", ".join(sorted(supported_languages)))
    return None, None


✅ Cell 10 – Combined Driver Function for Target Language Selection


In [9]:
def get_target_language():
    """
    Pick the target language: spoken name first, typed name as fallback.

    Returns:
        (code, name); both are None when neither route produced a match.
    """
    print("🎯 Target Language Selection Started")

    selection = detect_target_language_by_voice()
    if not selection[0]:
        print("🔁 Switching to manual input...")
        selection = detect_target_language_manually()

    code, name = selection
    # The success message is only built when a code exists, so `name` is a string there
    status = (
        f"✅ Final Target Language: {name.capitalize()} ({code})"
        if code
        else "❌ Language not supported. Cannot continue without valid target."
    )
    print(status)

    return code, name


✅ Phase-03: Unified Translation Pipeline (NLLB Only)



In [10]:
# Short language code → NLLB language code
NLLB_LANG_CODE_MAP = {
    'en': 'eng_Latn',
    'hi': 'hin_Deva',
    'te': 'tel_Telu',
    'ta': 'tam_Taml',
    'bn': 'ben_Beng',
    'ml': 'mal_Mlym',
    'kn': 'kan_Knda',
    'mr': 'mar_Deva',
    'gu': 'guj_Gujr',
    'pa': 'pan_Guru',
    'ur': 'urd_Arab',
    'ne': 'npi_Deva',
    'or': 'ory_Orya',
    'as': 'asm_Beng',
    'sd': 'snd_Arab',
    'si': 'sin_Sinh',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'es': 'spa_Latn',
    'zh': 'zho_Hans',
    'ja': 'jpn_Jpan',
    'ko': 'kor_Hang',
    'ru': 'rus_Cyrl',
    'ar': 'arb_Arab',
    'pt': 'por_Latn',
    'it': 'ita_Latn',
}

🔁 NLLB Translation Function



In [11]:
def translate_with_nllb(text, src_lang_code, tgt_lang_code):
    """
    Translate text with the NLLB pipeline.

    Args:
        text: Text to translate.
        src_lang_code: Short source language code (key of `NLLB_LANG_CODE_MAP`).
        tgt_lang_code: Short target language code (key of `NLLB_LANG_CODE_MAP`).

    Returns:
        The translated string, or None when there is nothing to translate or
        the pipeline raised an exception.

    Codes missing from `NLLB_LANG_CODE_MAP` still fall back to English
    ("eng_Latn"), but a warning is printed so the substitution is visible.
    """
    # Nothing to translate (e.g. ASR produced no text): skip the model call.
    if text is None or (isinstance(text, str) and not text.strip()):
        print("[ERROR] Translation skipped: no input text.")
        return None

    try:
        src_nllb = NLLB_LANG_CODE_MAP.get(src_lang_code)
        if src_nllb is None:
            print(f"[WARN] No NLLB code for source language '{src_lang_code}'; assuming eng_Latn.")
            src_nllb = "eng_Latn"

        tgt_nllb = NLLB_LANG_CODE_MAP.get(tgt_lang_code)
        if tgt_nllb is None:
            print(f"[WARN] No NLLB code for target language '{tgt_lang_code}'; defaulting to eng_Latn.")
            tgt_nllb = "eng_Latn"

        translated = nllb_pipeline(text, src_lang=src_nllb, tgt_lang=tgt_nllb, max_length=512)
        return translated[0]['translation_text']

    except Exception as e:
        # Best-effort by design: callers test the result for None.
        print(f"[ERROR] Translation failed: {e}")
        return None

🔄 Route Handler: Connect Phase-1 → Phase-2 → Phase-3



In [12]:
def route_translation_pipeline(transcribed_text, detected_lang_code, target_lang_code="en"):
    """
    Pass a transcription and its detected source language code on to
    `translate_with_nllb`, logging the route and the outcome.

    Returns:
        Whatever `translate_with_nllb` returned (a falsy value is reported as
        a failure).
    """
    print(f"\n[INFO] Source Language Detected: {detected_lang_code}")
    print(f"[INFO] Translating to Target Language: {target_lang_code}")

    translated_text = translate_with_nllb(transcribed_text, detected_lang_code, target_lang_code)

    outcome = (
        f"[SUCCESS] Translation Output:\n{translated_text}"
        if translated_text
        else "[FAILURE] No translated output."
    )
    print(outcome)
    return translated_text


🔧 Phase 4: Advanced Usage and Optimization of Indic Parler-TTS



1. Installation and Setup


In [13]:
'''
run once Use as many times as you want
''' 
#%pip install git+https://github.com/huggingface/parler-tts.git
#%pip install transformers soundfile

'\nrun once Use as many times as you want\n'

3. Generating Speech with Custom Descriptions


In [14]:


def generate_speech(prompt, description, output_file="output.wav"):
    """
    Synthesize speech with Indic-Parler, validate the waveform, save and play it.

    Args:
        prompt: Text to be spoken.
        description: Natural-language description of the desired voice.
        output_file: Path of the audio file to write.

    Returns:
        None. Problems (empty input, invalid waveform, exceptions during
        generation or playback) are reported with a printed message.
    """
    # Guard against None as well as blank strings; translation can return None
    # upstream, and calling .strip() on it would raise before any message is shown.
    if not (prompt and prompt.strip()) or not (description and description.strip()):
        print("🚫 Empty prompt or description. Skipping TTS.")
        return

    try:
        # Tokenize
        desc_inputs = description_tokenizer(description, return_tensors="pt", padding=True).to(device)
        prompt_inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)

        # Generate
        with torch.no_grad():
            generation = model.generate(
                input_ids=desc_inputs.input_ids,
                attention_mask=desc_inputs.attention_mask,
                prompt_input_ids=prompt_inputs.input_ids,
                prompt_attention_mask=prompt_inputs.attention_mask,
                max_length=512,
                do_sample=False
            )

        audio_arr = generation.cpu().numpy().squeeze()

        # Validate waveform: reject empty, all-zero or NaN-containing output
        if audio_arr.size == 0 or np.all(audio_arr == 0) or np.isnan(audio_arr).any():
            print("❌ Generated audio is invalid or silent. Skipping playback.")
            return

        # Save with the sampling rate declared in the model config
        sf.write(output_file, audio_arr, model.config.sampling_rate)
        print(f"✅ Audio saved: {output_file}")
        playsound(output_file)

    except Exception as e:
        # Best-effort by design: report and carry on rather than abort the notebook.
        print(f"🔥 Error in TTS generation: {e}")





✅ Notebook Cells (Phase 5: TTS Routing & Playback)


*🔧 Cell 1: Translate with NLLB and classify the target language (Indian vs. international)*

In [15]:
# Target-language codes that are routed to the Indic TTS path
# NOTE(review): 'si' is listed here but not in `indian_languages` — confirm the
# Indic TTS model is meant to handle it.
INDIAN_LANGS = {
    'hi', 'te', 'ta', 'bn', 'ml',
    'kn', 'mr', 'gu', 'pa', 'ur',
    'ne', 'or', 'as', 'sd', 'si',
}

def nllb_translate_and_classify(text, src_lang_code, tgt_lang_code):
    """
    Translate `text` with NLLB and label the target language for TTS routing.

    Returns:
        (translated_text, tgt_lang_code, lang_type) where lang_type is
        'indian' when the target code is in `INDIAN_LANGS`, otherwise
        'international'. Any exception yields (None, None, None).
    """
    try:
        nllb_src = NLLB_LANG_CODE_MAP.get(src_lang_code, 'eng_Latn')
        nllb_tgt = NLLB_LANG_CODE_MAP.get(tgt_lang_code, 'eng_Latn')

        outputs = nllb_pipeline(text, src_lang=nllb_src, tgt_lang=nllb_tgt, max_length=512)
        translated_text = outputs[0]['translation_text']

        if tgt_lang_code in INDIAN_LANGS:
            return translated_text, tgt_lang_code, 'indian'
        return translated_text, tgt_lang_code, 'international'

    except Exception as err:
        print(f"[ERROR] NLLB Translation Failed: {err}")
        return None, None, None


In [16]:

import os
# International language voices (macOS Voice IDs), keyed by language code.
# Every ID shares the same prefix, so only the locale/voice suffix is spelled out.
_APPLE_COMPACT_VOICE_PREFIX = "com.apple.voice.compact."
LANG_VOICE_MAP = {
    lang: _APPLE_COMPACT_VOICE_PREFIX + voice_suffix
    for lang, voice_suffix in (
        ("en", "en-US.Samantha"),
        ("es", "es-ES.Monica"),
        ("fr", "fr-FR.Thomas"),
        ("zh", "zh-CN.Tingting"),
        ("ar", "ar-001.Maged"),
        ("pt", "pt-BR.Luciana"),
        ("ru", "ru-RU.Milena"),
        ("ja", "ja-JP.Kyoko"),
        ("de", "de-DE.Anna"),
    )
}

# Unified TTS Playback
def play_tts_output(text, lang_type, tgt_lang_code):
    """
    Speak `text` through the engine that matches `lang_type`.

    'indian' uses Indic-Parler TTS with empty-waveform and file-size checks;
    'international' uses pyttsx3 with a voice looked up in `LANG_VOICE_MAP`
    (falling back to the English voice).

    Args:
        text: Text to speak; empty or None is skipped.
        lang_type: 'indian' or 'international'. Anything else is reported and skipped.
        tgt_lang_code: Target language code, used for voice lookup and messages.

    Returns:
        None.
    """
    # 1️⃣ Sanity check: skip if text is empty
    if not text or not text.strip():
        print("⚠️ play_tts_output called with empty text. Skipping TTS.")
        return

    if lang_type == "indian":
        # ⏳ Generate Indic-Parler audio
        description = "A calm neutral Indian voice with natural pace and studio quality."
        desc_inputs = description_tokenizer(description, return_tensors="pt").to(device)
        prompt_inputs = tokenizer(text, return_tensors="pt").to(device)

        with torch.no_grad():
            generation = model.generate(
                input_ids=desc_inputs.input_ids,
                attention_mask=desc_inputs.attention_mask,
                prompt_input_ids=prompt_inputs.input_ids,
                prompt_attention_mask=prompt_inputs.attention_mask
            )

        audio_arr = generation.cpu().numpy().squeeze()
        if audio_arr.size == 0 or np.all(audio_arr == 0):
            print("❌ Indic-Parler TTS failed—empty waveform. Skipping playback.")
            return

        output_file = "output_indic.wav"
        # Write at the rate declared in the model config, as the other writers in this
        # notebook do; a hard-coded rate that differs from it plays at the wrong speed.
        sf.write(output_file, audio_arr, samplerate=model.config.sampling_rate, subtype='PCM_16')

        file_size = os.path.getsize(output_file) if os.path.exists(output_file) else 0
        if file_size <= 1000:
            print(f"❌ File not valid or too small ({file_size} bytes). Skipping playback.")
            return

        print(f"🔊 Playing Indic-Parler output: {output_file} (size: {file_size} bytes)")
        playsound(output_file)
        # Only reached when playback actually happened
        print(f"✅ Played via IndicParler [{tgt_lang_code}]")

    elif lang_type == "international":
        # 🗣️ pyttsx3 fallback
        voice_id = LANG_VOICE_MAP.get(tgt_lang_code, LANG_VOICE_MAP["en"])
        engine = pyttsx3.init()
        engine.setProperty("voice", voice_id)
        engine.setProperty("rate", 180)
        engine.setProperty("volume", 1.0)

        print(f"🔊 Playing via pyttsx3 [{tgt_lang_code}] voice: {voice_id.split('.')[-1]}")
        engine.say(text)
        engine.runAndWait()
        engine.stop()
        print(f"✅ Played via pyttsx3 [{tgt_lang_code}]")

    else:
        # Previously an unknown lang_type did nothing without saying so
        print(f"⚠️ Unknown lang_type '{lang_type}'. Skipping TTS.")

Final Testing module

In [17]:
!huggingface-cli scan-cache


REPO ID                                     REPO TYPE SIZE ON DISK NB FILES LAST_ACCESSED     LAST_MODIFIED  REFS             LOCAL PATH                                                                                       
------------------------------------------- --------- ------------ -------- ----------------- -------------- ---------------- ------------------------------------------------------------------------------------------------ 
Helsinki-NLP/opus-mt-en-hi                  model             4.0M        5 4 weeks ago       4 weeks ago    main             /Users/srivighnateja/.cache/huggingface/hub/models--Helsinki-NLP--opus-mt-en-hi                  
ai4bharat/indic-conformer-600m-multilingual model             2.6G      404 24 seconds ago    39 seconds ago main             /Users/srivighnateja/.cache/huggingface/hub/models--ai4bharat--indic-conformer-600m-multilingual 
ai4bharat/indic-parler-tts                  model             3.8G        7 a few seconds ago 2 months a

In [18]:
# import shutil
# import os

# # Define the path of the cached model
# model_cache_path = '/Users/srivighnateja/.cache/huggingface/hub/models--google--flan-t5-large'

# # Check if the directory exists and delete it
# if os.path.exists(model_cache_path):
#     shutil.rmtree(model_cache_path)
#     print(f"Deleted model cache at {model_cache_path}")
# else:
#     print(f"Model cache not found at {model_cache_path}")


In [19]:
def play_beep(duration=0.07, freq=1000, fs=44100):
    """
    Play a sine-wave beep through sounddevice and wait for it to finish.

    Args:
        duration: Length of the beep in seconds.
        freq: Tone frequency in Hz.
        fs: Sampling rate in Hz.
    """
    n_samples = int(fs * duration)
    timeline = np.linspace(0, duration, n_samples, endpoint=False)
    tone = np.sin(freq * 2 * np.pi * timeline)
    sd.play(tone, fs)
    sd.wait()

In [20]:
import os

# 1️⃣ Silence the tokenizers fork warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 2️⃣ Use up to 8 CPU threads for PyTorch, but never more than the machine reports
torch.set_num_threads(min(8, os.cpu_count() or 1))

print("🚀 Warming up all models…")

# 3️⃣ Create 1s silent audio at 16kHz (if not already created)
if not os.path.exists("dummy.wav"):
    dummy_audio = np.zeros(16000, dtype="float32")
    sf.write("dummy.wav", dummy_audio, 16000)

# ✅ Ensure pad_token is set before any TTS warm-up
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if description_tokenizer.pad_token is None:
    description_tokenizer.pad_token = description_tokenizer.eos_token

# 4️⃣ Whisper Warm-up
try:
    # whisper.decode() defaults to fp16=True. Half precision is only appropriate on
    # CUDA; on CPU it is slow and, on some PyTorch builds, not implemented at all.
    warmup_options = whisper.DecodingOptions(fp16=str(whisper_model.device).startswith("cuda"))
    for i in range(2):
        t0 = time.time()
        a = whisper.load_audio("dummy.wav")
        a = whisper.pad_or_trim(a)
        m = whisper.log_mel_spectrogram(a).to(whisper_model.device)
        _ = whisper.decode(whisper_model, m, warmup_options)
        t1 = time.time()
        print(f"✅ Whisper warm-up {i+1}/2 in {t1-t0:.2f}s")
except Exception as e:
    # Warm-up is best-effort: report and continue with the next model
    print(f"⚠️ Whisper warm-up failed: {e}")

# 5️⃣ IndicConformer Warm-up
try:
    for i in range(2):
        t0 = time.time()
        at, sr = torchaudio.load("dummy.wav")
        if sr != 16000:
            from torchaudio.transforms import Resample
            at = Resample(sr, 16000)(at)
        # Down-mix to one channel and move to the model's device
        inp = at.mean(dim=0, keepdim=True).to(device)
        _ = indic_model(inp, "hi", "ctc")
        t1 = time.time()
        print(f"✅ IndicConformer warm-up {i+1}/2 in {t1-t0:.2f}s")
except Exception as e:
    print(f"⚠️ IndicConformer warm-up failed: {e}")

# 6️⃣ NLLB Translation Warm-up
try:
    for i in range(2):
        t0 = time.time()
        _ = nllb_pipeline("Hello world", src_lang="eng_Latn", tgt_lang="tel_Telu")
        t1 = time.time()
        print(f"✅ NLLB translation warm-up {i+1}/2 in {t1-t0:.2f}s")
except Exception as e:
    print(f"⚠️ NLLB warm-up failed: {e}")

🚀 Warming up all models…
✅ Whisper warm-up 1/2 in 8.83s
✅ Whisper warm-up 2/2 in 5.59s
✅ IndicConformer warm-up 1/2 in 6.63s
✅ IndicConformer warm-up 2/2 in 0.38s
✅ NLLB translation warm-up 1/2 in 19.16s
✅ NLLB translation warm-up 2/2 in 13.93s


In [21]:
# 7️⃣ IndicParlerTTS Warm-up
# Runs two full generate() passes on a short Hindi prompt and prints the wall-clock
# time of each. Failures are reported and swallowed so the notebook keeps running.
try:
    for i in range(2):
        t0 = time.time()

        # Dummy inputs (tokenized inside the loop, so tokenization is part of the timing)
        desc_input = description_tokenizer(
            "The voice is monotone and very fast in delivery, with clear audio and no background noise.",
            return_tensors="pt"
        ).to(device)

        prompt_input = tokenizer(
            "नमस्ते दुनिया",  # "Hello world" in Hindi
            return_tensors="pt"
        ).to(device)

        # Inference warm-up (no gradients needed)
        with torch.no_grad():
            audio_out = model.generate(
                input_ids=desc_input.input_ids,
                attention_mask=desc_input.attention_mask,
                prompt_input_ids=prompt_input.input_ids,
                prompt_attention_mask=prompt_input.attention_mask
            )

        # Write the first pass to disk at the model's configured sampling rate
        if i == 0:
            sf.write("warmup_output.wav", audio_out.cpu().numpy().squeeze(), model.config.sampling_rate)

        t1 = time.time()
        print(f"✅ IndicParlerTTS warm-up {i+1}/2 in {t1-t0:.2f}s")

except Exception as e:
    print(f"⚠️ IndicParlerTTS warm-up failed: {e}")


`prompt_attention_mask` is specified but `attention_mask` is not. A full `attention_mask` will be created. Make sure this is the intended behaviour.


✅ IndicParlerTTS warm-up 1/2 in 13.61s
✅ IndicParlerTTS warm-up 2/2 in 11.18s


In [22]:
from time import perf_counter

def optimized_voice_to_voice(duration=4, log_file="/Users/srivighnateja/Desktop/Speech-text/asr_evaluation_log.json"):
    """
    Run one voice-to-voice translation round trip and profile each phase.

    Phases: record → source language detection → ASR → target language
    selection → NLLB translation → TTS playback. Per-phase timings plus
    language, transcription and system details are appended to `log_file`
    as one JSON object per line and printed as a report.

    Args:
        duration: Length of the source recording in seconds.
        log_file: Path of the log file to append to.
            NOTE(review): the default is an absolute path on one developer's
            machine; pass an explicit path elsewhere. A failed write is
            reported but no longer discards the run.

    Returns:
        The log entry as a dict.
    """
    # Local import: `json` is not among the notebook's top-level imports.
    import json

    torch.set_num_threads(8)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    timings = {}
    profiling = {}
    start_all = perf_counter()

    # ─── PHASE 1: Start Beep + Record ───────────────────────
    play_beep()
    t0 = perf_counter()
    audio_path = record_audio(duration=duration, filename="input.wav", playback=False)
    t1 = perf_counter()
    timings['record'] = t1 - t0

    # ─── PHASE 2: Language Detection ────────────────────────
    t0 = perf_counter()
    src_code, src_lang, is_indic = detect_input_language_whisper(audio_path, whisper_model)
    t1 = perf_counter()
    timings['lang_detect'] = t1 - t0
    profiling['source_language'] = f"{src_lang} ({src_code})"
    profiling['model_routed'] = 'IndicConformer' if is_indic else 'Whisper'

    # ─── PHASE 3: Transcription (ASR) ───────────────────────
    # transcribe_audio(audio_path, whisper_model) runs its own language detection,
    # so the 'asr' timing includes a second detection pass.
    t0 = perf_counter()
    transcription = transcribe_audio(audio_path, whisper_model)
    t1 = perf_counter()
    timings['asr'] = t1 - t0
    profiling['transcription'] = transcription

    # ─── Fine-grained: Time from LangID → TTS Playback ──────
    langid_to_tts_start = perf_counter()

    # ─── PHASE 4: Target Language Detection ─────────────────
    play_beep()
    t0 = perf_counter()
    tgt_code, tgt_lang = detect_target_language_by_voice()
    if not tgt_code:
        tgt_code, tgt_lang = detect_target_language_manually()
    if not tgt_code:
        # Translation already fell back to English for an unknown target; make that
        # explicit so the log and report do not show "None (None)".
        print("⚠️ No target language recognised; defaulting to English.")
        tgt_code, tgt_lang = "en", "english"
    t1 = perf_counter()
    timings['target_lang_select'] = t1 - t0
    profiling['target_language'] = f"{tgt_lang} ({tgt_code})"

    # ─── PHASE 5: Translation ───────────────────────────────
    t0 = perf_counter()
    translated_text = translate_with_nllb(transcription, src_code, tgt_code)
    t1 = perf_counter()
    timings['translation'] = t1 - t0
    profiling['translated_text'] = translated_text

    # ─── PHASE 6: TTS Synthesis ─────────────────────────────
    # The 'tts' timing covers synthesis and playback in both branches.
    t0 = perf_counter()
    if not translated_text:
        # translate_with_nllb returns None on failure; there is nothing to speak.
        print("⚠️ No translated text available. Skipping TTS.")
    elif tgt_code in indian_languages:
        desc = description_tokenizer("The voice is monotone and very fast in delivery, with clear audio and no background noise.", return_tensors="pt").to(device)
        prompt = tokenizer(translated_text, return_tensors="pt").to(device)
        with torch.no_grad():
            audio_arr = model.generate(
                input_ids=desc.input_ids,
                attention_mask=desc.attention_mask,
                prompt_input_ids=prompt.input_ids,
                prompt_attention_mask=prompt.attention_mask
            ).cpu().numpy().squeeze()
        sf.write("output.wav", audio_arr, model.config.sampling_rate)
        playsound("output.wav")
    else:
        engine.say(translated_text)
        engine.runAndWait()
    t1 = perf_counter()
    timings['tts'] = t1 - t0

    # ─── Final Metrics ─────────────────────────────────────
    total_runtime = perf_counter() - start_all
    langid_to_tts_total = perf_counter() - langid_to_tts_start
    system = {
        "CPU": platform.processor(),
        "Cores": psutil.cpu_count(logical=False),
        "RAM_GB": round(psutil.virtual_memory().total / 1e9, 2),
        "Platform": platform.system() + " " + platform.release()
    }

    entry = {
        "timestamp": datetime.datetime.now().isoformat(),
        **timings,
        "pipeline_core_latency": round(langid_to_tts_total, 2),
        "total_runtime": round(total_runtime, 2),
        **profiling,
        **system
    }

    # Append as JSON Lines. The previous f"{entry}" wrote a Python dict repr, which is
    # not JSON despite the .json extension. ensure_ascii=False keeps non-Latin text readable.
    try:
        with open(log_file, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")
    except OSError as e:
        print(f"⚠️ Could not write log file {log_file}: {e}")

    # ─── REPORTING ─────────────────────────────────────────
    print("\n🔍 Phase-Wise Latency Report:")
    for phase, t in timings.items():
        print(f" • {phase:20s}: {t:.2f} sec")
    print(f" • {'Lang→TTS Pipeline':20s}: {langid_to_tts_total:.2f} sec")
    print(f" • {'Total Runtime':20s}: {total_runtime:.2f} sec")

    print("\n🧾 Language & Routing Info:")
    print(f" • Source Language        : {profiling['source_language']}")
    print(f" • Target Language        : {profiling['target_language']}")
    print(f" • Model Routed Through   : {profiling['model_routed']}")

    print("\n🧠 Transcription → Translation:")
    print(f" • ASR Output             : {transcription}")
    print(f" • Translation Output     : {translated_text}")

    print("\n🖥️ System Info:")
    for k, v in system.items():
        print(f" • {k:20s}: {v}")

    return entry

# ▶️ Run it:
optimized_voice_to_voice(duration=4)

🎤 Recording (4s)…
✅ Filtered audio saved: input.wav
🌐 Detected Language Code: te
🌐 Interpreted as: Telugu
🌐 Detected Language Code: te
🌐 Interpreted as: Telugu
🛤️ Routing to IndicConformer for Telugu...
📝 IndicConformer Transcription: ఈ రోజు చాలా మంచి రోజు నాకు చాలా సంతోషంగా ఉంది
🗣️ Attempt 1/2: Speak target language name (e.g., Tamil, Hindi, German)
🎤 Recording (1.7s)…
✅ Filtered audio saved: target_attempt_1.wav
📝 You said: English.
✅ Interpreted as: English

🔍 Phase-Wise Latency Report:
 • record              : 4.39 sec
 • lang_detect         : 3.23 sec
 • asr                 : 1.83 sec
 • target_lang_select  : 7.88 sec
 • translation         : 12.39 sec
 • tts                 : 3.44 sec
 • Lang→TTS Pipeline   : 23.95 sec
 • Total Runtime       : 35.87 sec

🧾 Language & Routing Info:
 • Source Language        : Telugu (te)
 • Target Language        : english (en)
 • Model Routed Through   : IndicConformer

🧠 Transcription → Translation:
 • ASR Output             : ఈ రోజు చాలా మంచి ర

{'timestamp': '2025-06-29T20:43:23.698318',
 'record': 4.3927346669952385,
 'lang_detect': 3.226943042012863,
 'asr': 1.8286797089967877,
 'target_lang_select': 7.879896083002677,
 'translation': 12.394115333008813,
 'tts': 3.4372319579997566,
 'pipeline_core_latency': 23.95,
 'total_runtime': 35.87,
 'source_language': 'Telugu (te)',
 'model_routed': 'IndicConformer',
 'transcription': 'ఈ రోజు చాలా మంచి రోజు నాకు చాలా సంతోషంగా ఉంది',
 'target_language': 'english (en)',
 'translated_text': 'Today is a very good day. I am very happy.',
 'CPU': 'arm',
 'Cores': 8,
 'RAM_GB': 8.59,
 'Platform': 'Darwin 24.5.0'}