In [None]:
!pip install gradio transformers torch soundfile librosa gtts

Collecting gradio
  Downloading gradio-5.25.2-py3-none-any.whl.metadata (16 kB)
Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.6-py3-none-manylinux_2_17_x86_64.ma

In [None]:
import os
import tempfile

import gradio as gr
from transformers import pipeline, AutoProcessor, AutoModelForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, MT5ForConditionalGeneration
import torch
import soundfile as sf
import librosa
from gtts import gTTS

# Hugging Face Hub repositories backing each stage of the pipeline.
ASR_REPO_ID = "iamTangsang/Wav2Vec2_XLS-R-300m_Nepali_ASR"
PUNCTUATION_REPO_ID = "iamTangsang/nepali-punctuation-restoration-mt5"
TRANSLATION_REPO_ID = "iamTangsang/Final-Model-Ne-En"

# Stage 1 - speech recognition: feature extractor/decoder plus the CTC acoustic model.
processor = AutoProcessor.from_pretrained(ASR_REPO_ID)
asr_model = AutoModelForCTC.from_pretrained(ASR_REPO_ID)

# Stage 2 - punctuation restoration: mT5 tokenizer and conditional-generation model.
punctuation_tokenizer = AutoTokenizer.from_pretrained(PUNCTUATION_REPO_ID)
punctuation_model = MT5ForConditionalGeneration.from_pretrained(PUNCTUATION_REPO_ID)

# Stage 3 - translation: tokenizer and seq2seq model loaded from the Ne-En checkpoint.
tokenizer = AutoTokenizer.from_pretrained(TRANSLATION_REPO_ID)
translation_model = AutoModelForSeq2SeqLM.from_pretrained(TRANSLATION_REPO_ID)

def restore_punctuation(text, max_new_tokens=None):
    """Restore punctuation in an unpunctuated string using the punctuation model.

    Args:
        text: Unpunctuated text to process.
        max_new_tokens: Optional upper bound on the number of generated tokens.
            When omitted, a budget is derived from the tokenized input length so
            that the output is not cut short on longer inputs.

    Returns:
        The decoded model output with special tokens removed. Blank input
        (None, empty, or whitespace-only) is returned unchanged without
        invoking the model.
    """
    # Nothing to punctuate: skip tokenization and generation entirely.
    if not text or not text.strip():
        return text

    # Tokenize the unpunctuated text
    inputs = punctuation_tokenizer(text, return_tensors="pt", padding=True)

    if max_new_tokens is None:
        # The punctuated output repeats the input plus inserted marks, so it is
        # at least as long as the input. Without an explicit budget, generate()
        # falls back to the generation config's length limit (20 tokens by
        # default in transformers unless the checkpoint overrides it), which
        # would truncate anything beyond a short phrase.
        input_len = inputs["input_ids"].shape[1]
        max_new_tokens = 2 * input_len + 16

    # Generate punctuated text
    with torch.no_grad():
        outputs = punctuation_model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Decode the punctuated output
    return punctuation_tokenizer.decode(outputs[0], skip_special_tokens=True)

def translate_text(nepali_text):
    """Translate text with the translation model and return the decoded string.

    Args:
        nepali_text: Source text to translate.

    Returns:
        The decoded translation with special tokens removed, or an empty
        string when the input is blank (None, empty, or whitespace-only),
        in which case the model is not invoked.
    """
    # Blank input (e.g. the recognizer produced no text) has nothing to translate.
    if not nepali_text or not nepali_text.strip():
        return ""

    # Tokenize and generate translation
    inputs = tokenizer(nepali_text, return_tensors="pt", padding=True)

    # Inference only; mirrors the no_grad usage in restore_punctuation.
    with torch.no_grad():
        outputs = translation_model.generate(**inputs)

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

def text_to_speech(text):
    """Synthesize English speech for ``text`` and return the path of the MP3 file.

    Args:
        text: English text to speak.

    Returns:
        Path to a newly created ``.mp3`` file, or None when ``text`` is blank
        (None, empty, or whitespace-only) and there is nothing to synthesize.
    """
    # Skip synthesis when there is no text to speak.
    if not text or not text.strip():
        return None

    # Reserve a unique filename per call so overlapping requests do not
    # overwrite each other's audio. The handle is closed before saving so the
    # path can be reopened for writing on every platform. The file is kept
    # (delete=False) because the caller needs it after this function returns.
    with tempfile.NamedTemporaryFile(
        prefix="translated_speech_", suffix=".mp3", delete=False
    ) as tmp:
        output_file = tmp.name

    # Convert text to speech
    tts = gTTS(text=text, lang='en', slow=False)
    tts.save(output_file)

    return output_file

def _load_mono_16k(path, target_sr=16000):
    """Read an audio file and return a 1-D waveform sampled at ``target_sr``."""
    speech, sr = sf.read(path)

    # Multichannel files are read as (frames, channels); average the channels
    # so that resampling and the ASR processor both receive a 1-D signal.
    if speech.ndim > 1:
        speech = speech.mean(axis=1)

    if sr != target_sr:
        speech = librosa.resample(speech, orig_sr=sr, target_sr=target_sr)

    return speech


def process_speech(audio):
    """Run the full pipeline: ASR -> translation -> punctuation -> translation -> TTS.

    Args:
        audio: Path to the input audio file, or None when nothing was provided.

    Returns:
        A 5-tuple of
        (raw Nepali transcription,
         English translation of the raw transcription,
         punctuated Nepali transcription,
         English translation of the punctuated transcription,
         path to the synthesized speech file or None).
        When no audio is supplied, the audio is empty, or no text is
        recognized, the downstream fields are empty strings and the audio
        path is None.
    """
    # No recording/upload was submitted: return blank outputs instead of failing.
    if audio is None:
        return "", "", "", "", None

    # Read the audio file as mono 16 kHz, the rate passed to the processor below
    speech = _load_mono_16k(audio)
    if speech.size == 0:
        return "", "", "", "", None

    # Process audio for ASR
    inputs = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True)

    # Generate logits and decode using the processor
    with torch.no_grad():
        logits = asr_model(**inputs).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    nepali_transcription = processor.batch_decode(predicted_ids)[0]

    # Nothing was recognized: skip translation, punctuation, and TTS.
    if not nepali_transcription.strip():
        return nepali_transcription, "", "", "", None

    # Translate the raw Nepali transcription
    raw_english_translation = translate_text(nepali_transcription)

    # Restore punctuation in the Nepali transcription
    punctuated_nepali = restore_punctuation(nepali_transcription)

    # Translate the punctuated Nepali transcription
    punctuated_english_translation = translate_text(punctuated_nepali)

    # Generate TTS for the final translation, unless it came back blank
    # (there would be nothing to speak).
    if punctuated_english_translation and punctuated_english_translation.strip():
        tts_audio = text_to_speech(punctuated_english_translation)
    else:
        tts_audio = None

    # Return all outputs
    return (
        nepali_transcription,
        raw_english_translation,
        punctuated_nepali,
        punctuated_english_translation,
        tts_audio,
    )

# Output widgets, in the same order as the tuple returned by process_speech.
output_components = [
    gr.Textbox(label="Raw Nepali Transcription (without punctuation)"),
    gr.Textbox(label="English Translation of Raw Nepali Transcription"),
    gr.Textbox(label="Nepali Transcription (with punctuation)"),
    gr.Textbox(label="English Translation of Nepali Transcription with Punctuation"),
    gr.Audio(label="Text-to-Speech of Final Translation"),
]

# Build the Gradio interface; the audio input is handed to process_speech as a file path.
iface = gr.Interface(
    fn=process_speech,
    inputs=gr.Audio(type="filepath"),
    outputs=output_components,
    title="Nepali Speech Recognition and Translation with Punctuation and TTS",
    description="Upload or record Nepali speech to see raw transcription, its translation, transcription with restored punctuation, its translation, and hear the translated English text.",
)

# Start the app with debug output enabled.
iface.launch(debug=True)

Error while fetching `HF_TOKEN` secret value from your vault: 'TypeError: Failed to fetch'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


preprocessor_config.json:   0%|          | 0.00/214 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/30.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/19.3k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/707k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/791k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.55M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://1457e5874eb02b07ec.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 2136, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/gradio/blocks.py", line 1662, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
           ^^^^^