In [None]:
!pip install anvil-uplink

In [None]:
import os
from getpass import getpass

import anvil.server

# The Anvil Uplink key is a credential and must not be stored in the notebook.
# It is read from the ANVIL_UPLINK_KEY environment variable, with an interactive
# prompt as a fallback. Any key that was previously committed here should be
# rotated in the Anvil dashboard.
ANVIL_UPLINK_KEY = os.environ.get("ANVIL_UPLINK_KEY") or getpass("Anvil Uplink key: ")
anvil.server.connect(ANVIL_UPLINK_KEY)

In [None]:
!pip install -q torch transformers sentencepiece indic-nlp-library


In [None]:
!pip install -q git+https://github.com/VarunGumma/IndicTransToolkit


In [None]:
!pip install -q flash-attn --no-build-isolation


In [None]:
# Install required components
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

In [None]:
import sys
# Put the cloned indic_nlp_resources checkout on the module search path.
sys.path.append('/content/indic_nlp_resources')

**IndicLID -> Getting the code of the input language**

In [None]:
!pip3 install fasttext
!pip3 install transformers

In [None]:
!git clone https://github.com/AI4Bharat/IndicLID.git

In [None]:
# Work from the IndicLID inference directory; the models folder is created beneath it.
%cd "/content/IndicLID/Inference"

In [None]:
%mkdir models
%cd "/content/IndicLID/Inference/models"

In [None]:
!wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-bert.zip
!wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-ftn.zip
!wget https://github.com/AI4Bharat/IndicLID/releases/download/v1.0/indiclid-ftr.zip

In [None]:
!unzip indiclid-bert.zip
!unzip indiclid-ftn.zip
!unzip indiclid-ftr.zip

In [None]:
# Go back up to the Inference directory (presumably so `ai4bharat.IndicLID` resolves from the working directory — TODO confirm).
%cd "/content/IndicLID/Inference"

**Language Conversion**

**Combined Code -> Regional text to English**

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit import IndicProcessor
from ai4bharat.IndicLID import IndicLID
import os

# Ask torch.load to fall back to full (pickle-based) checkpoint loading.
# NOTE(review): presumably required by the IndicLID checkpoints; this is only safe
# for checkpoints from a trusted source — confirm.
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'

# Run on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Text language-identification model.
lid_model = IndicLID()

# Indic -> English translation model and its tokenizer.
# Half precision is used on the GPU; full precision on the CPU.
model_name = "ai4bharat/indictrans2-indic-en-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32
).to(DEVICE)

# Pre/post-processor for the translation batches.
ip = IndicProcessor(inference=True)

# Language name -> IndicTrans2 language code.
# The values must be codes IndicTrans2 recognises: Odia is "ory_Orya" and Konkani is
# "gom_Deva" (the previous "ori_Orya" / "kok_Deva" are not IndicTrans2 codes).
lid_to_indictrans_mapping = {
    'Hindi': 'hin_Deva',
    'Bengali': 'ben_Beng',
    'Telugu': 'tel_Telu',
    'Marathi': 'mar_Deva',
    'Tamil': 'tam_Taml',
    'Urdu': 'urd_Arab',
    'Gujarati': 'guj_Gujr',
    'Kannada': 'kan_Knda',
    'Odia': 'ory_Orya',
    'Punjabi': 'pan_Guru',
    'Malayalam': 'mal_Mlym',
    'Assamese': 'asm_Beng',
    'Maithili': 'mai_Deva',
    'Santali': 'sat_Olck',
    'Kashmiri': 'kas_Arab',
    'Nepali': 'npi_Deva',
    'Sindhi': 'snd_Arab',
    'Dogri': 'doi_Deva',
    'Konkani': 'gom_Deva',
    'Manipuri': 'mni_Beng',
    'Bodo': 'brx_Deva',
    'Sanskrit': 'san_Deva'
}




In [None]:
def detect_and_translate(input_text):
    """
    Detect the language of input text and translate it to English.

    Args:
        input_text (str): Text in any Indian language.

    Returns:
        dict: Always contains the keys "original_text", "detected_language",
        "translated_text" and "confidence". When no language can be detected
        (including empty or whitespace-only input) "detected_language" is
        "Unknown" and "confidence" is 0.0. When translation raises,
        "translated_text" carries a "Translation failed: ..." message.
    """
    not_detected = {
        "original_text": input_text,
        "detected_language": "Unknown",
        "translated_text": "Could not translate - language not supported or detected",
        "confidence": 0.0
    }

    # Nothing to identify in empty / whitespace-only input: skip both models.
    if not input_text or not input_text.strip():
        return not_detected

    # Run language identification on a single-item batch.
    lid_output = lid_model.batch_predict([input_text], 1)
    if not lid_output:
        return not_detected

    # The prediction is read positionally: index 1 is the language label,
    # index 2 is the confidence score.
    detected_lang = lid_output[0][1]
    confidence = lid_output[0][2]

    if not detected_lang:
        return not_detected

    # The detected label is handed to IndicProcessor unchanged.
    # NOTE(review): this assumes the IndicLID label is already a valid IndicTrans2
    # code; lid_to_indictrans_mapping is not consulted here — confirm.
    src_lang = detected_lang

    try:
        # Preprocess, tokenize and move the batch to the model's device.
        batch = ip.preprocess_batch([input_text], src_lang=src_lang, tgt_lang="eng_Latn")
        inputs = tokenizer(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)

        # Beam-search generation without gradient tracking.
        with torch.no_grad():
            generated_tokens = model.generate(**inputs, max_length=256, num_beams=5)

        # Decode with the tokenizer switched to its target-side mode.
        with tokenizer.as_target_tokenizer():
            decoded_translation = tokenizer.batch_decode(
                generated_tokens.cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )[0]

        translated_text = ip.postprocess_batch([decoded_translation], lang="eng_Latn")[0]

        return {
            "original_text": input_text,
            "detected_language": detected_lang,
            "translated_text": translated_text,
            "confidence": confidence
        }

    except Exception as e:
        # Best effort: report the failure in the result instead of raising.
        print(f"Error processing sentence in {src_lang}: {e}")
        return {
            "original_text": input_text,
            "detected_language": detected_lang,
            "translated_text": f"Translation failed: {str(e)}",
            "confidence": confidence
        }


In [None]:
# Example usage: run detect_and_translate over one sample sentence per language
# and print the detected language, confidence and English translation.
if __name__ == "__main__":
    # Test with multiple inputs
    # NOTE(review): the Santali and Manipuri samples below are written in Latin script,
    # and the Dogri sample is nearly identical to the Hindi one — verify these samples.
    test_inputs = [
    "मैं गर्भवती हूँ। मेरी आय 15000 रुपये प्रति माह है। मैं किन योजनाओं का लाभ उठा सकती हूँ?",  # Hindi
    "আমি গর্ভবতী। আমার আয় প্রতি মাসে ১৫০০০ টাকা। আমি কোন কোন স্কিমের সুবিধা নিতে পারি?",  # Bengali
    "నేను గర్భిణి ని. నా ఆదాయం నెలకు 15000 రూపాయలు. నేను ఏ పథకాలను పొందగలను?",  # Telugu
    "मी गर्भवती आहे. माझे उत्पन्न प्रति महिना १५००० रुपये आहे. मी कोणत्या योजना मिळवू शकते?",  # Marathi
    "நான் கர்ப்பிணி. என் வருமானம் மாதம் 15000 ரூபாய். எந்த திட்டங்களை பெற முடியும்?",  # Tamil
    "میں حاملہ ہوں۔ میری آمدنی 15000 روپے ماہانہ ہے۔ میں کون سی اسکیمیں حاصل کر سکتی ہوں؟",  # Urdu
    "હું ગર્ભવતી છું. મારી આવક 15000 રૂપિયા પ્રતિ મહિને છે. હું કઈ યોજનાઓનો લાભ લઈ શકું?",  # Gujarati
    "ನಾನು ಗರ್ಭಿಣಿ. ನನ್ನ ಆದಾಯ ತಿಂಗಳಿಗೆ 15000 ರೂಪಾಯಿ. ನಾನು ಯಾವ ಯೋಜನೆಗಳನ್ನು ಪಡೆಯಬಹುದು?",  # Kannada
    "ମୁଁ ଗର୍ଭବତୀ । ମୋର ରୋଜଗାର ମାସକୁ ୧୫୦୦୦ ଟଙ୍କା । ମୁଁ କେଉଁ ଯୋଜନାର ଲାଭ ନେଇପାରିବି?",  # Odia
    "ਮੈਂ ਗਰਭਵਤੀ ਹਾਂ। ਮੇਰੀ ਆਮਦਨ 15000 ਰੁਪਏ ਪ੍ਰਤੀ ਮਹੀਨਾ ਹੈ। ਮੈਂ ਕਿਹੜੀਆਂ ਯੋਜਨਾਵਾਂ ਦਾ ਲਾਭ ਲੈ ਸਕਦੀ ਹਾਂ?",  # Punjabi
    "ഞാൻ ഗർഭിണിയാണ്. എന്റെ വരുമാനം പ്രതിമാസം 15000 രൂപയാണ്. ഞാൻ ഏത് പദ്ധതികൾ പ്രയോജനപ്പെടുത്താൻ കഴിയും?",  # Malayalam
    "মই গৰ্ভৱতী। মোৰ মাহেকীয়া আয় ১৫০০০ টকা। কোন স্কীমসমূহ মই লাভ কৰিব পাৰোঁ?",  # Assamese
    "हम गर्भवती छी। हमर आय 15000 टाका प्रति माह अछि। हम किन योजनाक लाभ उठा सकैत छी?",  # Maithili
    "Ang gayeraak'na. Anga aay 15000 rupiya mahina. Ang kon'ko scheme horom availing kana?",  # Santali
    "بے حامِلہ آہِم۔ میژھ آمدَن 15000 روپَیہ ماہانہ آہے۔ بے کوژ سکیمز حاصل کرِتھ سُنہ؟",  # Kashmiri
    "म गर्भवती छु। मेरो आम्दानी प्रति महिना १५००० रुपैयाँ छ। म कुन योजनाहरू प्राप्त गर्न सक्छु?",  # Nepali
    "آءٌ حامله آھيان. منھنجو آمدني 15000 رپيا مھينا آھي. ڪھڙيون اسڪيمون حاصل ڪري سگھان ٿي؟",  # Sindhi
    "मैं गर्भवती हूँ। मेरी आय 15000 रुपये प्रति महीना है। मैं किन योजनाओं का लाभ उठा सकती हूँ?",  # Dogri
    "हांव गर्भवती आसा. माझो उत्पन्न 15000 रुपये प्रति म्हयना. हांव कोणाच्या योजना लाबू शकता?",  # Konkani
    "èi chanu thoklaba nupi ni. eiga chatpa thoujannasi tháng 15000-gi. eiga karigumba scheme-sing eigi oiriba ngamgani?",  # Manipuri
    "आं गोर्बोआव दं। आंनि आयआ दानफ्रोमबो 15000 रां। आं बबे स्किमफोरनि मुलाम्फा लानो हागोन?",  # Bodo
    "अहम् गर्भिणी अस्मि। मम आयः मासे १५००० रूप्यकाणि अस्ति। अहम् कासु योजनानाम् लाभम् प्राप्नुयाम्?"  # Sanskrit
]

    # Print the raw result dict first, then its individual fields.
    for i, text in enumerate(test_inputs):
        print(f"\nTesting input {i+1}:")
        result = detect_and_translate(text)
        print(result)
        print(f"Original: {result['original_text']}")
        print(f"Detected Language: {result['detected_language']} (confidence: {result['confidence']})")
        print(f"Translation: {result['translated_text']}")

**AUDIO**

In [None]:
!pip install transformers torchaudio onnx onnxruntime onnxruntime-gpu


In [None]:
!pip install -U openai-whisper

**Breaking into pieces -> Regional Speech to English text conversion**

NOTE: Upload an audio file and point `audio_file` (in the last part of the next cell) at it to see the output.

In [None]:
import torch
import torchaudio
from transformers import AutoModel, AutoProcessor, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor, AutoConfig
from IndicTransToolkit import IndicProcessor
import numpy as np
import time
import gc
import whisper
from transformers import Wav2Vec2ForCTC

# Set device: "cuda" when torch reports an available GPU, otherwise "cpu".
# Every model and input tensor in this cell is moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Memory optimization function
def optimize_memory():
    """Free memory between pipeline stages.

    Empties the CUDA allocator cache when running on the GPU, then forces a
    Python garbage-collection pass. Returns nothing.
    """
    running_on_gpu = DEVICE == "cuda"
    if running_on_gpu:
        torch.cuda.empty_cache()
    gc.collect()

# Initialize the IndicProcessor used by translate_to_english for pre/post-processing.
ip = IndicProcessor(inference=True)

# === LANGUAGE DETECTION FUNCTIONS ===

def load_language_detection_model():
    """
    Load the MMS-LID audio classifier together with its feature extractor.

    Returns:
        tuple: (model, feature_extractor, languages) where the model has been
        moved to DEVICE and `languages` is the model config's id2label mapping.
    """
    print("Loading MMS-LID model...")
    checkpoint = "facebook/mms-lid-126"
    classifier = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint)
    classifier = classifier.to(DEVICE)
    extractor = AutoFeatureExtractor.from_pretrained(checkpoint)
    return classifier, extractor, classifier.config.id2label

def get_audio_info(audio_path):
    """
    Load an audio file and report its length.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        tuple: (waveform, sample_rate, total_duration), the duration being the
        size of the waveform's second dimension divided by the sample rate.
    """
    loaded = torchaudio.load(audio_path)
    waveform, sample_rate = loaded
    sample_count = waveform.shape[1]
    return waveform, sample_rate, sample_count / sample_rate

def determine_chunk_parameters(total_duration, waveform_length, sample_rate, chunk_duration, chunk_offset):
    """
    Determine the parameters for audio chunking.

    Args:
        total_duration (float): Length of the audio in seconds.
        waveform_length (int): Length of the audio in frames.
        sample_rate (int): Frames per second.
        chunk_duration (float): Requested chunk length in seconds.
        chunk_offset (float | None): Requested chunk start in seconds. None selects
            one third of the way into the file; negative values are clamped to 0.

    Returns:
        tuple: (frame_offset, num_frames, chunk_offset, processing_duration,
        use_entire_audio). When the audio is not longer than the requested chunk
        the whole file is selected and use_entire_audio is True.
    """
    # Audio no longer than the requested chunk: use all of it.
    if total_duration <= chunk_duration:
        print(f"Audio duration ({total_duration:.2f}s) is shorter than requested chunk duration ({chunk_duration:.2f}s)")
        print("Processing entire audio file")
        return 0, waveform_length, 0, total_duration, True

    if chunk_offset is None:
        # Default to 1/3 of the way through the file, which often contains clearer speech.
        chunk_offset = total_duration / 3

    # A negative start would turn into a negative frame offset; clamp it to the file start.
    chunk_offset = max(0, chunk_offset)

    # Pull the chunk back so that it ends no later than the end of the audio.
    if chunk_offset + chunk_duration > total_duration:
        chunk_offset = max(0, total_duration - chunk_duration)

    # Convert seconds to frames.
    frame_offset = int(chunk_offset * sample_rate)
    num_frames = int(chunk_duration * sample_rate)
    processing_duration = chunk_duration

    # Never request more frames than the waveform holds.
    if frame_offset + num_frames > waveform_length:
        num_frames = max(0, waveform_length - frame_offset)
        processing_duration = num_frames / sample_rate

    print(f"Processing {processing_duration:.2f}s of audio at position {chunk_offset:.2f}s")
    print(f"Original file duration: {total_duration:.2f}s")

    return frame_offset, num_frames, chunk_offset, processing_duration, False

def preprocess_audio(waveform, sample_rate, feature_extractor):
    """
    Turn a raw waveform into model inputs on DEVICE.

    The channels are averaged into one when there is more than one, the signal
    is resampled to the feature extractor's sampling rate when the rates differ,
    and the squeezed numpy array is passed through the feature extractor.
    """
    target_rate = feature_extractor.sampling_rate

    # Down-mix multi-channel audio by averaging the channels.
    mono = waveform if waveform.shape[0] <= 1 else torch.mean(waveform, dim=0, keepdim=True)

    # Resample only when the source rate differs from what the extractor expects.
    if sample_rate != target_rate:
        mono = torchaudio.transforms.Resample(sample_rate, target_rate)(mono)

    # The feature extractor is given a numpy array.
    samples = mono.squeeze().numpy()
    features = feature_extractor(
        samples,
        sampling_rate=target_rate,
        return_tensors="pt"
    )
    return features.to(DEVICE)

def get_language_predictions(model, inputs, languages, top_k=1):
    """
    Score the inputs with the model and return the most probable languages.

    Returns:
        list: up to `top_k` (language_code, probability) tuples, most probable first.
    """
    # Forward pass without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Softmax over the last dimension, squeezed and converted to numpy.
    probs = torch.nn.functional.softmax(logits, dim=-1).squeeze().cpu().numpy()

    # Indices sorted by descending probability; keep the first top_k.
    ranked = np.argsort(probs)[::-1]
    return [(languages[position], probs[position]) for position in ranked[:top_k]]

def detect_language(audio_path, chunk_duration=5.0, chunk_offset=None, top_k=1):
    """
    Detect the language of an audio file using the MMS-LID model.

    Args:
        audio_path (str): Path to the audio file.
        chunk_duration (float): Seconds of audio to classify.
        chunk_offset (float | None): Start of the chunk in seconds; None lets
            determine_chunk_parameters choose it.
        top_k (int): Number of candidates to score; values below 1 are treated as 1.

    Returns:
        tuple: (detected_language_code, probability, processing_time)
    """
    print("Detecting language...")
    start_time = time.time()

    # Step 1: Load the model and feature extractor
    model, feature_extractor, languages = load_language_detection_model()

    try:
        # Step 2: Decode the audio once
        waveform, sample_rate, total_duration = get_audio_info(audio_path)

        # Step 3: Decide which part of the audio to classify
        frame_offset, num_frames, _, _, use_entire_audio = determine_chunk_parameters(
            total_duration, waveform.shape[1], sample_rate, chunk_duration, chunk_offset
        )

        # Step 4: Slice the chunk out of the waveform that is already in memory
        # instead of reading and decoding the file a second time.
        if not use_entire_audio:
            waveform = waveform[:, frame_offset:frame_offset + num_frames]

        # Step 5: Preprocess the audio
        inputs = preprocess_audio(waveform, sample_rate, feature_extractor)

        # Step 6: Get language predictions (at least one, so results[0] exists)
        results = get_language_predictions(model, inputs, languages, max(1, top_k))

        processing_time = time.time() - start_time
    finally:
        # Release the classifier even when loading or inference fails.
        del model
        optimize_memory()

    # Report the top language
    top_lang, top_prob = results[0]
    print(f"Detected language: {top_lang} with {top_prob*100:.2f}% confidence")

    return top_lang, top_prob, processing_time

# === TRANSCRIPTION FUNCTIONS ===

def transcribe_english_audio(audio_path):
    """
    Transcribe English speech with Whisper and return the recognised text.

    The "tiny.en" English-only checkpoint is loaded for each call and released
    again before returning.
    """
    print("Transcribing English audio...")

    # English-only checkpoint; other options: base.en, small.en
    whisper_model = whisper.load_model("tiny.en")

    # Language pinned to English, half precision disabled.
    output = whisper_model.transcribe(audio_path, language="en", fp16=False)

    # Release the model before handing back the text.
    del whisper_model
    optimize_memory()

    return output["text"]

def process_audio_in_chunks(model, audio_path, language_code, decoding_method="rnnt", chunk_duration=30):
    """
    Process audio in chunks using IndicConformer with the detected language.

    Args:
        model: Callable invoked as model(chunk, language, decoding_method) that
            returns the transcription of one chunk.
        audio_path (str): Path to the audio file.
        language_code (str): Language label from language detection. Both bare
            codes ("tam") and script-tagged codes ("tam_Taml") are accepted.
        decoding_method (str): Decoding method forwarded to the model.
        chunk_duration (float): Chunk length in seconds.

    Returns:
        str: Transcriptions of the successfully processed chunks joined by spaces.
    """
    # Language code (without script suffix) -> IndicConformer language code.
    # The lookup is done on the bare code because the language detector may return
    # labels without a script tag; keys such as "tam_Taml" would then never match
    # and the language would silently fall back to Hindi.
    indic_lang_mapping = {
        "eng": "en",
        "asm": "as",     # Assamese
        "ben": "bn",     # Bengali
        "brx": "brx",    # Bodo
        "doi": "doi",    # Dogri
        "guj": "gu",     # Gujarati
        "hin": "hi",     # Hindi
        "kan": "kn",     # Kannada
        "kas": "ks",     # Kashmiri
        "gom": "kok",    # Konkani
        "kok": "kok",    # Konkani (alternative code)
        "mai": "mai",    # Maithili
        "mal": "ml",     # Malayalam
        "mni": "mni",    # Manipuri
        "mar": "mr",     # Marathi
        "npi": "ne",     # Nepali
        "nep": "ne",     # Nepali (alternative code)
        "ori": "or",     # Odia
        "ory": "or",     # Odia (alternative code)
        "pan": "pa",     # Punjabi
        "san": "sa",     # Sanskrit
        "sat": "sat",    # Santali
        "snd": "sd",     # Sindhi
        "tam": "ta",     # Tamil
        "tel": "te",     # Telugu
        "urd": "ur"      # Urdu
    }

    # Drop any script suffix ("pan_Guru" -> "pan") before the lookup.
    base_code = language_code.split("_")[0] if isinstance(language_code, str) else language_code
    indic_lang = indic_lang_mapping.get(base_code)
    if indic_lang is None:
        # Keep the historical default, but make the fallback visible.
        print(f"No IndicConformer mapping for language code '{language_code}'; defaulting to Hindi")
        indic_lang = "hi"
    print(f"Using language code for transcription: {indic_lang}")

    # Load audio file
    wav, sr = torchaudio.load(audio_path)

    # Resample to 16kHz if needed
    target_sample_rate = 16000
    if sr != target_sample_rate:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate)
        wav = resampler(wav)

    # Convert stereo to mono if necessary
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)

    # Chunk size in samples and the number of chunks (ceiling division)
    chunk_size = int(chunk_duration * target_sample_rate)
    total_samples = wav.shape[1]
    total_chunks = (total_samples + chunk_size - 1) // chunk_size

    transcriptions = []
    for chunk_number, start in enumerate(range(0, total_samples, chunk_size), start=1):
        chunk = wav[:, start:min(start + chunk_size, total_samples)]

        # A failing chunk is reported and skipped so the rest is still transcribed.
        try:
            with torch.no_grad():  # Disable gradient tracking to save memory
                transcription = model(chunk, indic_lang, decoding_method)
            transcriptions.append(transcription)

            # Free memory
            optimize_memory()

            print(f"Processed chunk {chunk_number}/{total_chunks}")
        except Exception as e:
            print(f"Error processing chunk {chunk_number}: {e}")

    # Combine all transcriptions
    return " ".join(transcriptions)

def transcribe_audio(audio_path, language_code):
    """
    Transcribe audio in the detected language.

    English ("eng") goes through Whisper; every other language code goes through
    IndicConformer with RNNT decoding. Returns the transcription, or None when
    the IndicConformer path raises.
    """
    # English is handled by Whisper.
    if language_code == "eng":
        return transcribe_english_audio(audio_path)

    # Everything else is handled by IndicConformer.
    try:
        print("Loading IndicConformer model...")
        # Half precision on the GPU, full precision on the CPU.
        weights_dtype = torch.float16 if DEVICE == "cuda" else torch.float32
        conformer = AutoModel.from_pretrained(
            "ai4bharat/indic-conformer-600m-multilingual",
            trust_remote_code=True,
            torch_dtype=weights_dtype
        ).to(DEVICE)
        conformer.eval()  # evaluation mode

        optimize_memory()

        print("Starting RNNT transcription...")
        rnnt_result = process_audio_in_chunks(conformer, audio_path, language_code, "rnnt")
        print(f"RNNT Transcription: {rnnt_result}")

        # Release the model before returning.
        del conformer
        optimize_memory()

        return rnnt_result

    except Exception as e:
        print(f"Error during transcription: {e}")
        return None

# === TRANSLATION FUNCTIONS ===

def load_translation_model():
    """
    Load the IndicTrans2 Indic-to-English model and its tokenizer.

    Returns:
        tuple: (model, tokenizer), with the model moved to DEVICE. Weights are
        loaded in float16 when DEVICE is "cuda" and in float32 otherwise.
    """
    print("Loading IndicTrans2 model...")
    checkpoint = "ai4bharat/indictrans2-indic-en-1B"
    weights_dtype = torch.float16 if DEVICE == "cuda" else torch.float32

    translation_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    translation_model = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        trust_remote_code=True,
        torch_dtype=weights_dtype
    )
    translation_model = translation_model.to(DEVICE)

    return translation_model, translation_tokenizer

def translate_to_english(text, src_lang):
    """
    Translate text from source language to English using IndicTrans2.

    Args:
        text (str): Text to translate.
        src_lang (str): Language of `text`: a bare language code ("tam"), or an
            IndicTrans2 code with script tag ("tam_Taml"), which is used unchanged.

    Returns:
        str: The English translation. English, empty or whitespace-only input is
        returned unchanged without loading the model. When translation raises, a
        "Translation failed: ..." message is returned instead.
    """
    # Language code (without script suffix) -> IndicTrans2 code.
    # The lookup uses the bare code because the language detector may return labels
    # without a script tag; keys such as "tam_Taml" would then never match and the
    # text would silently be translated as Hindi.
    indictrans_lang_mapping = {
        "asm": "asm_Beng",   # Assamese
        "ben": "ben_Beng",   # Bengali
        "brx": "brx_Deva",   # Bodo
        "doi": "doi_Deva",   # Dogri
        "guj": "guj_Gujr",   # Gujarati
        "hin": "hin_Deva",   # Hindi
        "kan": "kan_Knda",   # Kannada
        "kas": "kas_Deva",   # Kashmiri
        "gom": "gom_Deva",   # Konkani
        "kok": "gom_Deva",   # Konkani (alternative code)
        "mai": "mai_Deva",   # Maithili
        "mal": "mal_Mlym",   # Malayalam
        "mni": "mni_Mtei",   # Manipuri
        "mar": "mar_Deva",   # Marathi
        "npi": "npi_Deva",   # Nepali (IndicTrans2 uses npi_Deva, not nep_Deva)
        "nep": "npi_Deva",   # Nepali (alternative code)
        "ori": "ory_Orya",   # Odia
        "ory": "ory_Orya",   # Odia (alternative code)
        "pan": "pan_Guru",   # Punjabi
        "san": "san_Deva",   # Sanskrit
        "sat": "sat_Olck",   # Santali
        "snd": "snd_Arab",   # Sindhi
        "tam": "tam_Taml",   # Tamil
        "tel": "tel_Telu",   # Telugu
        "urd": "urd_Arab"    # Urdu
    }
    # Script-tagged codes that are passed through as given.
    known_indictrans_codes = set(indictrans_lang_mapping.values()) | {"kas_Arab", "mni_Beng", "snd_Deva"}

    base_code = src_lang.split("_")[0] if isinstance(src_lang, str) else src_lang

    # If already English, return the text
    if base_code == "eng":
        return text

    # Nothing to translate: avoid loading the model for empty input.
    if not text or not text.strip():
        return text

    if src_lang in known_indictrans_codes:
        src_lang_code = src_lang
    else:
        src_lang_code = indictrans_lang_mapping.get(base_code)
        if src_lang_code is None:
            # Keep the historical default, but make the fallback visible.
            print(f"No IndicTrans2 mapping for language code '{src_lang}'; defaulting to Hindi")
            src_lang_code = "hin_Deva"

    try:
        # Load translation model
        model, tokenizer = load_translation_model()

        # Preprocess input text for translation
        batch = ip.preprocess_batch([text], src_lang=src_lang_code, tgt_lang="eng_Latn")

        # Tokenize the preprocessed batch
        inputs = tokenizer(batch, truncation=True, padding="longest", return_tensors="pt").to(DEVICE)

        # Generate translation
        with torch.no_grad():
            generated_tokens = model.generate(**inputs, max_length=256, num_beams=5)

        # Decode with the tokenizer switched to its target-side mode
        with tokenizer.as_target_tokenizer():
            decoded_translation = tokenizer.batch_decode(
                generated_tokens.cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )[0]

        translated_text = ip.postprocess_batch([decoded_translation], lang="eng_Latn")[0]

        # Clean up
        del model, tokenizer
        optimize_memory()

        return translated_text

    except Exception as e:
        # Best effort: report the failure in the returned text instead of raising.
        print(f"Error during translation: {e}")
        return f"Translation failed: {str(e)}"

# === MAIN PROCESSING FUNCTION ===

def process_audio_with_auto_language(audio_path):
    """
    Complete end-to-end processing:
    1. Detect language
    2. Transcribe audio in detected language
    3. Translate transcription to English

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        dict: "detected_language", "confidence" (formatted as a percentage string
        such as "77.46%"), "transcription" and "translation". When transcription
        fails, "transcription" and "translation" are None and an "error" key
        describes the failure.
    """
    # Step 1: Detect language
    detected_lang, confidence, detect_time = detect_language(audio_path)
    print(f"Language detection completed in {detect_time:.2f}s")

    # Format the confidence once so both outcomes report it in the same form.
    confidence_text = f"{confidence*100:.2f}%"

    # Step 2: Transcribe audio in detected language
    transcription = transcribe_audio(audio_path, detected_lang)
    if not transcription:
        return {
            "detected_language": detected_lang,
            "confidence": confidence_text,
            "transcription": None,
            "translation": None,
            "error": "Transcription failed"
        }

    # Step 3: Translate to English (if not already English)
    if detected_lang == "eng":
        translation = transcription
    else:
        translation = translate_to_english(transcription, detected_lang)

    return {
        "detected_language": detected_lang,
        "confidence": confidence_text,
        "transcription": transcription,
        "translation": translation
    }

# Main function: run the full pipeline on one audio file and print the outcome.
if __name__ == "__main__":
    import os

    audio_file = "/content/New Recording 2.wav"  # Replace with your audio file path

    # Fail early, before any model is downloaded or loaded, when the file is missing.
    if not os.path.isfile(audio_file):
        raise FileNotFoundError(f"Audio file not found: {audio_file}")

    print("Starting audio processing pipeline...")
    results = process_audio_with_auto_language(audio_file)

    print("\n===== RESULTS =====")
    print(f"Detected Language: {results['detected_language']} (Confidence: {results['confidence']})")
    if results.get("error"):
        # The pipeline reports failures through the "error" key instead of raising.
        print(f"\nError: {results['error']}")
    else:
        print(f"\nOriginal Transcription: {results['transcription']}")
        print(f"\nEnglish Translation: {results['translation']}")

Using device: cuda
Starting audio processing pipeline...
Detecting language...
Loading MMS-LID model...
Processing 5.00s of audio at position 3.60s
Original file duration: 10.79s
Detected language: eng with 77.46% confidence
Language detection completed in 5.83s
Transcribing English audio...

===== RESULTS =====
Detected Language: eng (Confidence: 77.46%)

Original Transcription:  Hello ladies and gentlemen today I'm making this way file to see if my speech to text conversion app is working or not Thank you for giving me a sad time and I'll see you around. Thank you

English Translation:  Hello ladies and gentlemen today I'm making this way file to see if my speech to text conversion app is working or not Thank you for giving me a sad time and I'll see you around. Thank you


**To extract certain info from English Text**

In [None]:
import gc

import torch

gc.enable()


def optimize_memory():
    """Free memory: empty the CUDA cache when a GPU is available, then collect garbage.

    This cell repeats the helper defined in the audio cell. It checks for CUDA
    directly so it does not depend on the DEVICE constant (or the torch import)
    created in another cell.
    """
    # Clear CUDA cache if using GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Force garbage collection
    gc.collect()

