<a href="https://colab.research.google.com/github/ShreejayShakya28/ASR-LLM-Pipeline/blob/main/Gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mount Google Drive

In [2]:
# Mount Google Drive at /content/drive; later cells read the model
# checkpoints from /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Install ASR and UI Dependencies

In [1]:
!pip install nemo_toolkit['asr'] gradio soundfile librosa



# Initialize ASR Model

In [5]:
import torch
import nemo.collections.asr as nemo_asr

# Turn gradient tracking off globally; the cells below only run inference.
torch.set_grad_enabled(False)

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
# NOTE(review): `device` and `MODEL_PATH` are assigned again by the LLM cell
# further down; `MODEL_PATH` then points at a different file, so it is only
# meaningful for the ASR checkpoint inside this cell.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NeMo checkpoint stored on the mounted Google Drive.
MODEL_PATH = "/content/drive/MyDrive/fastconformer.nemo"

# Restore the CTC/BPE ASR model from the .nemo archive, mapping it to `device`.
asr_model = nemo_asr.models.EncDecCTCModelBPE.restore_from(
    restore_path=MODEL_PATH,
    map_location=device
)

# Put the model in evaluation mode before it is used for transcription.
asr_model.eval()

# Quick sanity report: model class, encoder target class and vocabulary size.
print("Model loaded:", type(asr_model))
print("Encoder:", asr_model.cfg.encoder._target_)
print("Tokenizer vocab size:", asr_model.tokenizer.vocab_size)

[NeMo W 2026-02-04 07:14:01 nemo_logging:405] Megatron num_microbatches_calculator not found, using Apex version.


[NeMo I 2026-02-04 07:14:07 nemo_logging:393] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2026-02-04 07:14:08 nemo_logging:405] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /data/NeMo_ASR_SET/English/v2.0/train/tarred_audio_manifest.json
    sample_rate: 16000
    batch_size: 64
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20.0
    min_duration: 0.1
    shuffle_n: 2048
    is_tarred: true
    tarred_audio_filepaths: /data/NeMo_ASR_SET/English/v2.0/train/audio__OP_0..4095_CL_.tar
    
[NeMo W 2026-02-04 07:14:08 nemo_logging:405] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath:
    - /data/ASR/LibriSpeech/librisp

[NeMo I 2026-02-04 07:14:08 nemo_logging:393] PADDING: 0
[NeMo I 2026-02-04 07:14:09 nemo_logging:393] Model EncDecCTCModelBPE was successfully restored from /content/drive/MyDrive/fastconformer.nemo.
Model loaded: <class 'nemo.collections.asr.models.ctc_bpe_models.EncDecCTCModelBPE'>
Encoder: nemo.collections.asr.modules.ConformerEncoder
Tokenizer vocab size: 1024


# LLM Sangam Repo

In [3]:
# Clone repo
!git clone https://github.com/ShreejayShakya28/ASR-LLM-Pipeline
%cd ASR-LLM-Pipeline/SLM

# Install dependencies
!pip install -r requirements.txt

# Copy model to local Colab storage
!cp "/content/drive/MyDrive/gpt2-medium355M-sft.pth" /content/model.pth

# Run inference
from inference import load_model, run_inference
import torch

MODEL_PATH = "/content/model.pth"
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Using device: {device}")

# Load model
model, config = load_model(MODEL_PATH, device=device)
print("Model loaded successfully!")



fatal: destination path 'ASR-LLM-Pipeline' already exists and is not an empty directory.
/content/ASR-LLM-Pipeline/SLM
Using device: cuda
Model loaded successfully!


In [4]:
# Example 1: smoke-test the loaded LLM with a single instruction.
response = run_inference(model=model, config=config, instruction="What is the tallest building in the world?", device=device)
print(f"\nResponse: {response}")


Response: The tallest building in the world is the Empire State Building in New York City.


# Check CUDA

In [6]:
# Report whether a CUDA device is visible and, if so, its name.
# torch.cuda.get_device_name(0) raises on a CPU-only runtime, so it is only
# queried when CUDA is available; other cells already fall back to "cpu".
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))

True
Tesla T4


# Audio Transcription Function

In [7]:
import numpy as np
import soundfile as sf
import tempfile
import os
import torch

torch.set_grad_enabled(False)

@torch.inference_mode()
def transcribe_audio(audio):
    """Transcribe one audio clip with the global ``asr_model``.

    Args:
        audio: ``(sample_rate, samples)`` tuple, or ``None`` when no audio was
            provided. ``samples`` is a NumPy array, either 1-D (mono) or 2-D
            with channels on axis 1 (the layout the down-mix below assumes).

    Returns:
        The recognised text, or ``""`` when there is nothing to transcribe or
        the model returns no hypothesis.
    """
    if audio is None:
        return ""

    sample_rate, audio_np = audio
    audio_np = np.asarray(audio_np)
    if audio_np.size == 0:
        return ""

    # Scale integer PCM to float32 in [-1, 1) *before* down-mixing. Averaging
    # int16 channels yields floats at +/-32768 scale, far outside the +/-1.0
    # full-scale range soundfile expects for floating-point data, so the
    # written WAV would be distorted.
    if np.issubdtype(audio_np.dtype, np.integer):
        full_scale = float(np.iinfo(audio_np.dtype).max) + 1.0
        audio_np = audio_np.astype(np.float32) / full_scale
    else:
        audio_np = audio_np.astype(np.float32)

    # mono
    if audio_np.ndim > 1:
        audio_np = audio_np.mean(axis=1)

    # The model is fed a file path, so stage the clip in a temporary WAV.
    # mkstemp + close avoids writing by name to a file that is still held
    # open, and the finally clause removes the file even if writing or
    # transcription raises.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, audio_np, sample_rate)

        # === ASR inference ===
        result = asr_model.transcribe([wav_path])
    finally:
        if os.path.exists(wav_path):
            os.remove(wav_path)

    if not result:
        return ""

    first = result[0]

    # Case 1: High-level API returns string
    if isinstance(first, str):
        return first

    # Case 2: Low-level API returns Hypothesis
    return first.text

# Gradio Interface

In [9]:
import gradio as gr
import numpy as np
import soundfile as sf
import tempfile
import os
import torch

torch.set_grad_enabled(False)

@torch.inference_mode()
def transcribe_audio(audio):
    """Transcribe one audio clip with the global ``asr_model``.

    NOTE(review): a function with this name is also defined in the earlier
    "Inference Audio Class" cell; this definition shadows it once this cell
    runs. Keep only one copy.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by the
            ``gr.Audio(type="numpy")`` input below, or ``None`` when no audio
            was provided. ``samples`` is a NumPy array, either 1-D (mono) or
            2-D with channels on axis 1 (the layout the down-mix assumes).

    Returns:
        The recognised text, or ``""`` when there is nothing to transcribe or
        the model returns no hypothesis.
    """
    if audio is None:
        return ""

    sample_rate, audio_np = audio
    audio_np = np.asarray(audio_np)
    if audio_np.size == 0:
        return ""

    # Scale integer PCM to float32 in [-1, 1) *before* down-mixing. Averaging
    # int16 channels yields floats at +/-32768 scale, far outside the +/-1.0
    # full-scale range soundfile expects for floating-point data, so the
    # written WAV would be distorted.
    if np.issubdtype(audio_np.dtype, np.integer):
        full_scale = float(np.iinfo(audio_np.dtype).max) + 1.0
        audio_np = audio_np.astype(np.float32) / full_scale
    else:
        audio_np = audio_np.astype(np.float32)

    # mono
    if audio_np.ndim > 1:
        audio_np = audio_np.mean(axis=1)

    # The model is fed a file path, so stage the clip in a temporary WAV.
    # mkstemp + close avoids writing by name to a file that is still held
    # open, and the finally clause removes the file even if writing or
    # transcription raises.
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        sf.write(wav_path, audio_np, sample_rate)

        # === ASR inference ===
        result = asr_model.transcribe([wav_path])
    finally:
        if os.path.exists(wav_path):
            os.remove(wav_path)

    if not result:
        return ""

    first = result[0]

    # Case 1: High-level API returns string
    if isinstance(first, str):
        return first

    # Case 2: Low-level API returns Hypothesis
    return first.text


def process_audio_pipeline(audio):
    """Process audio through ASR + LLM pipeline.

    Args:
        audio: value delivered by the Gradio audio input; ``None`` when the
            user submitted without recording or uploading anything.

    Returns:
        ``(transcription, llm_response)``. Both are ``""`` when there is no
        audio or the ASR step produced no text, in which case the LLM is not
        called at all.
    """
    if audio is None:
        return "", ""

    # Step 1: Get transcription from ASR
    transcription = transcribe_audio(audio)

    # Silence or unintelligible audio yields an empty transcript; sending an
    # empty instruction to the LLM would only produce an unrelated answer.
    if not transcription or not transcription.strip():
        return "", ""

    # Step 2: Feed transcription to LLM
    llm_response = run_inference(
        model=model,
        config=config,
        instruction=transcription,
        device=device,
        max_new_tokens=256
    )

    return transcription, llm_response


# Create Gradio interface
# Two-column layout: audio input and trigger button on the left, the ASR
# transcript and the LLM answer on the right.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """
        # 🎙️ ASR + LLM Pipeline
        Speak or upload audio → Get transcription → Get AI response
        """
    )

    with gr.Row():
        with gr.Column():
            # type="numpy" hands the callback a (sample_rate, samples) tuple,
            # which is what transcribe_audio unpacks.
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="numpy",
                label="🎤 Speak or Upload Audio"
            )
            submit_btn = gr.Button("Process", variant="primary", size="lg")

        with gr.Column():
            transcription_output = gr.Textbox(
                label="📝 Transcription (ASR Output)",
                lines=3,
                interactive=False
            )
            llm_output = gr.Textbox(
                label="🤖 AI Response (LLM Output)",
                lines=6,
                interactive=False
            )

    gr.Markdown(
        """
        ### Example Questions to Try:
        - "What is the tallest building in the world?"
        - "Write the synonyms of the word evil"
        - "What is the formula for speed?"
        """
    )

    # The two return values of process_audio_pipeline fill the two text boxes.
    submit_btn.click(
        fn=process_audio_pipeline,
        inputs=audio_input,
        outputs=[transcription_output, llm_output]
    )

# share=True publishes a temporary public URL for the demo.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cae04f12d801325e1b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


