# Load Voice Classifier

In [1]:
import torch
from transformers import pipeline

2025-12-14 00:04:11.587783: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Use the GPU only when CUDA is available AND the first GPU reports a compute
# capability major version of at least 7; otherwise fall back to the CPU.
if torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 7:
    device = "cuda"
else:
    device = "cpu"
device

    Found GPU0 NVIDIA GeForce MX250 which is of cuda capability 6.1.
    Minimum and Maximum cuda capability supported by this version of PyTorch is
    (7.0) - (12.0)
    
    Please install PyTorch with a following CUDA
    configurations:  12.6 following instructions at
    https://pytorch.org/get-started/locally/
    
NVIDIA GeForce MX250 with CUDA capability sm_61 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_70 sm_75 sm_80 sm_86 sm_90 sm_100 sm_120.
If you want to use the NVIDIA GeForce MX250 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/



'cpu'

In [3]:
# Build the audio-classification (keyword spotting) pipeline on the selected device.
classifier = pipeline(
    "audio-classification", model="MIT/ast-finetuned-speech-commands-v2", device=device
)

# Show which device the pipeline ended up on.
print(classifier.device)

Device set to use cpu


cpu


In [4]:
# List the class labels the classifier can predict; wake words must be one of these.
print(classifier.model.config.id2label)

{0: 'backward', 1: 'follow', 2: 'five', 3: 'bed', 4: 'zero', 5: 'on', 6: 'learn', 7: 'two', 8: 'house', 9: 'tree', 10: 'dog', 11: 'stop', 12: 'seven', 13: 'eight', 14: 'down', 15: 'six', 16: 'forward', 17: 'cat', 18: 'right', 19: 'visual', 20: 'four', 21: 'wow', 22: 'no', 23: 'nine', 24: 'off', 25: 'three', 26: 'left', 27: 'marvin', 28: 'yes', 29: 'up', 30: 'sheila', 31: 'happy', 32: 'bird', 33: 'go', 34: 'one'}


In [5]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live


def launch_fn(
    wake_word="up",
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
):
    """Listen on the microphone until the wake word is detected.

    Args:
        wake_word: Label to wait for; must be one of the classifier's labels.
        prob_threshold: Score the wake-word prediction must exceed.
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.
        debug: When true, print the first prediction of every classifier result.

    Returns:
        True once the first prediction of a result is ``wake_word`` with a
        score above ``prob_threshold``. If the stream ends first, the function
        falls through and returns None.

    Raises:
        ValueError: If ``wake_word`` is not a label known to the classifier.
    """
    known_labels = classifier.model.config.label2id.keys()
    if wake_word not in known_labels:
        raise ValueError(
            f"Wake word {wake_word} not in set of valid class labels, pick a wake word in the set {classifier.model.config.label2id.keys()}."
        )

    # Record at the rate the classifier's feature extractor expects.
    mic = ffmpeg_microphone_live(
        sampling_rate=classifier.feature_extractor.sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening for wake word...")
    for outputs in classifier(mic):
        # Only the first entry of each result is inspected
        # (presumably the highest-scoring label -- confirm against the pipeline docs).
        top = outputs[0]
        if debug:
            print(top)
        if top["label"] == wake_word and top["score"] > prob_threshold:
            return True

                
# Try the wake-word detector with 'stop' as the wake word, printing every prediction.
launch_fn(wake_word='stop',debug=True)

Listening for wake word...


  waveform = torch.from_numpy(waveform).unsqueeze(0)


{'score': 0.3233458399772644, 'label': 'five'}
{'score': 0.061256542801856995, 'label': 'two'}
{'score': 0.062310680747032166, 'label': 'three'}
{'score': 0.11555057018995285, 'label': 'forward'}
{'score': 0.19891703128814697, 'label': 'forward'}
{'score': 0.18264399468898773, 'label': 'forward'}
{'score': 0.18264399468898773, 'label': 'forward'}
{'score': 0.18264399468898773, 'label': 'forward'}
{'score': 0.604997992515564, 'label': 'stop'}


True

# Load Speech Recognition Pipeline

In [6]:
# Build the speech-to-text pipeline (Whisper base checkpoint) on the selected device.
transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base", device=device
)

Device set to use cpu


In [7]:
import sys

def speech_transcriber(chunk_length_s=5.0, stream_chunk_s=1.0):
    """Transcribe live microphone input and return the text.

    Partial transcriptions are echoed on a single, continuously overwritten
    console line while the stream is consumed.

    Args:
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.

    Returns:
        The text of the last item read from the pipeline: the first item whose
        ``partial`` flag is false, or the last partial one if the stream ends
        earlier. An empty string if the stream yields nothing at all.
    """
    sampling_rate = transcriber.feature_extractor.sampling_rate

    mic = ffmpeg_microphone_live(
        sampling_rate=sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    print("Listening...")
    # Initialised up front so an empty stream returns "" instead of raising
    # UnboundLocalError on the return statement.
    text = ""
    for item in transcriber(mic, generate_kwargs={"max_new_tokens": 128}):
        text = item["text"]
        # ANSI "erase line" + carriage return: redraw the transcription in place.
        sys.stdout.write("\033[K")
        print(text, end="\r")
        # Stop at the first item that is no longer flagged as partial.
        if not item["partial"][0]:
            break

    return text

# Try live transcription with the default chunk settings; the returned text is the cell output.
speech_transcriber()

Listening...


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


[K How are you?ë‹¤.

' How are you?'

In [8]:
from transformers import pipeline
import sys
import torch
import soundfile as sf



def transcribe(file_path: str, transcriber_pipeline=transcriber, chunk_length_s=5.0, stream_chunk_s=1.0):
    """Transcribe an audio file chunk by chunk and return the concatenated text.

    Args:
        file_path: Path of an audio file readable by ``soundfile``.
        transcriber_pipeline: Speech-recognition pipeline called on every chunk.
        chunk_length_s: Length of each chunk in seconds.
        stream_chunk_s: Unused; kept so existing calls keep working.

    Returns:
        The chunk transcriptions joined in order; an empty string for an
        empty file.
    """
    # float32 samples; multi-channel files come back as (frames, channels).
    audio, sampling_rate = sf.read(file_path, dtype="float32")

    # The pipeline is fed one 1-D waveform per call, so mix channels down to mono.
    if audio.ndim > 1:
        audio = audio.mean(axis=1)

    # At least one sample per chunk so the range() step below is never zero.
    samples_per_chunk = max(1, int(chunk_length_s * sampling_rate))

    transcription = ""
    # Stepping over the whole array also covers the trailing partial chunk,
    # which a truncating chunk count would silently drop.
    for start_idx in range(0, len(audio), samples_per_chunk):
        chunk = audio[start_idx:start_idx + samples_per_chunk]

        # Pass the file's sampling rate along with the samples so the pipeline
        # does not have to assume the audio is already at the model's rate.
        result = transcriber_pipeline({"raw": chunk, "sampling_rate": sampling_rate})
        transcription += result["text"]

    return transcription

# Transcribe a previously recorded WAV file and print the combined text.
sample_file = r'./temp/recorded_audio.wav'
transcription = transcribe(sample_file)
print("\nFull Transcription:\n", transcription)

`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.


[K
Full Transcription:
  10.5.7.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.20.


# Load Large Language Model

In [9]:
from llama_cpp import Llama

# Local GGUF model file loaded through llama-cpp-python.
model_path = r'./models/gemma-2-2b-it-Q4_K_M.gguf'

# n_ctx=4096 is below the model's training context (8192, per the load log),
# so llama.cpp reports that the full capacity will not be utilized.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=2,
    verbose=False
)

llama_context: n_ctx_per_seq (4096) < n_ctx_train (8192) -- the full capacity of the model will not be utilized
llama_kv_cache_unified_iswa: using full-size SWA cache (ref: https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)


In [10]:
def build_prompt(question):
    """Wrap ``question`` in ``<start_of_turn>``/``<end_of_turn>`` chat markup.

    The user turn asks for an answer of at most 5 lines; the prompt ends with
    an opened model turn for the LLM to complete.
    """
    prompt_lines = (
        "<start_of_turn>user",
        "Answer in max 5 lines.",
        "Question: {}".format(question),
        "<end_of_turn>",
        "<start_of_turn>model",
        "",  # yields the trailing newline after the model-turn marker
    )
    return "\n".join(prompt_lines)

def generate_response(question):
    """Ask the LLM ``question`` and return its answer as stripped text.

    Generation stops at ``<end_of_turn>`` or after 120 new tokens, so long
    answers can come back cut off mid-sentence.
    """
    completion = llm(
        build_prompt(question),
        max_tokens=120,
        temperature=0.4,
        stop=["<end_of_turn>"],
    )
    first_choice = completion["choices"][0]
    return first_choice["text"].strip()

In [11]:
# Smoke-test the LLM; `response` is reused by the text-to-speech cells below.
response =generate_response("What does Hugging Face do?")
response

'Hugging Face is a platform that provides tools and resources for the machine learning community. It offers a vast library of pre-trained models, datasets, and tools for building and deploying AI applications.  They also host a community forum and contribute to open-source projects.'

# Load Text To Speech Model

In [12]:
from gtts import gTTS
from pydub import AudioSegment
from pydub.playback import play

def speak(text):
    """Synthesize ``text`` as English speech with gTTS and play it.

    The audio is saved to ``temp/response.mp3`` (overwritten on every call)
    and played back through pydub.
    """
    mp3_path = "temp/response.mp3"
    gTTS(text=text, lang="en").save(mp3_path)
    play(AudioSegment.from_mp3(mp3_path))
# Play the same phrase three times to check synthesis and playback.
for i in range(3):
    speak('Om Namah Shivaye')

Input #0, wav, from '/tmp/tmpv09l1gtb.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:01.54, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   1.42 M-A: -0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   




Input #0, wav, from '/tmp/tmpes5w5rhd.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:01.54, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   1.41 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   




Input #0, wav, from '/tmp/tmpbzbyv09a.wav':   0KB sq=    0B f=0/0   
  Duration: 00:00:01.54, bitrate: 384 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, 1 channels, s16, 384 kb/s
   1.40 M-A:  0.000 fd=   0 aq=    0KB vq=    0KB sq=    0B f=0/0   




In [13]:
from gtts import gTTS
from IPython.display import Audio

def synthesizer(text, output_path="temp/response.mp3", play_ipython=False):
    """Synthesize ``text`` as English speech with gTTS and save it.

    Args:
        text: Text to speak.
        output_path: Where the audio is saved.
        play_ipython: When true, also show an autoplaying IPython audio widget.

    Returns:
        ``output_path``.
    """
    gTTS(text=text, lang="en").save(output_path)
    if not play_ipython:
        return output_path
    display(Audio(output_path, autoplay=True))
    return output_path

# Usage: synthesize the LLM answer held in `response` and play it inline.
audio_path = synthesizer(response, play_ipython=True)

# Building VoiceBots

In [14]:
# One pass through the whole pipeline:
# wake word -> speech-to-text -> LLM -> text-to-speech.
launch_fn()
transcription = speech_transcriber()
response = generate_response(transcription)
audio_path = synthesizer(response)

# Play the synthesized answer inline.
display(Audio(audio_path, autoplay=True))

Listening for wake word...
Listening...
[K How are you?

## Excite Function

In [15]:
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

def excite_fn(
    excite_word=("go", "stop"),
    prob_threshold=0.5,
    chunk_length_s=2.0,
    stream_chunk_s=0.25,
    debug=False,
    max_predictions=20,
):
    """Listen briefly on the microphone for an interrupt or stop keyword.

    Args:
        excite_word: Two labels: ``excite_word[0]`` is the interrupt keyword,
            ``excite_word[1]`` the stop keyword.
        prob_threshold: Score a keyword prediction must exceed to count.
        chunk_length_s: Forwarded to ``ffmpeg_microphone_live``.
        stream_chunk_s: Forwarded to ``ffmpeg_microphone_live``.
        debug: When true, print the first prediction of every classifier result.
        max_predictions: Give up after this many predictions without a match.

    Returns:
        ``'G0'`` when the interrupt keyword is detected. NOTE: that is a capital
        G followed by the digit zero; the voice-loop ``chatbot`` in this
        notebook compares against exactly this spelling, so both places must be
        changed together.
        ``'STOP'`` when the stop keyword is detected.
        ``None`` when neither was detected within ``max_predictions``
        predictions, or the stream ended.
    """
    interrupt_word, stop_word = excite_word[0], excite_word[1]

    mic = ffmpeg_microphone_live(
        sampling_rate=classifier.feature_extractor.sampling_rate,
        chunk_length_s=chunk_length_s,
        stream_chunk_s=stream_chunk_s,
    )

    # No prompt is printed here; the caller tells the user which keywords to say.
    for count, prediction in enumerate(classifier(mic), start=1):
        prediction = prediction[0]
        if debug:
            print(prediction)
        if prediction["score"] > prob_threshold:
            if prediction["label"] == interrupt_word:
                return 'G0'
            if prediction["label"] == stop_word:
                return 'STOP'
        # Bounded listening window so the caller regains control regularly.
        if count >= max_predictions:
            return None
    return None

In [16]:
# Try the keyword listener, printing every prediction it sees.
excite_fn(debug=True)

{'score': 0.36717551946640015, 'label': 'off'}
{'score': 0.39673149585723877, 'label': 'off'}
{'score': 0.271932989358902, 'label': 'off'}
{'score': 0.3515584468841553, 'label': 'off'}
{'score': 0.33420172333717346, 'label': 'off'}
{'score': 0.33139172196388245, 'label': 'off'}
{'score': 0.3313915729522705, 'label': 'off'}
{'score': 0.33139151334762573, 'label': 'off'}
{'score': 0.7201547622680664, 'label': 'happy'}
{'score': 0.6831589937210083, 'label': 'happy'}
{'score': 0.6954798698425293, 'label': 'happy'}
{'score': 0.6954799294471741, 'label': 'happy'}
{'score': 0.6954799294471741, 'label': 'happy'}
{'score': 0.6954798698425293, 'label': 'happy'}
{'score': 0.5090014338493347, 'label': 'seven'}
{'score': 0.4898102581501007, 'label': 'happy'}
{'score': 0.4898100197315216, 'label': 'happy'}
{'score': 0.4898102581501007, 'label': 'happy'}
{'score': 0.4898103177547455, 'label': 'happy'}
{'score': 0.2904677391052246, 'label': 'stop'}


## Voicebot

In [18]:
import pygame
import time
from IPython.display import Audio, display

# Initialise pygame's mixer once; chatbot() below plays its responses through it.
pygame.mixer.init()

def chatbot():
    """Run the voice assistant loop until the user says the stop keyword.

    Waits for the wake word, then repeats: transcribe a spoken query, generate
    an answer with the LLM, synthesize it and play it through pygame. While the
    answer is playing, keyword detection runs:

    * ``"G0"`` from ``excite_fn`` (capital G + digit zero, exactly as that
      function returns it) cuts playback short and moves on to the next query.
    * ``"STOP"`` cuts playback short and ends the loop.

    If playback finishes on its own, the loop simply continues with the next
    query.
    """
    print("SPEAK 'UP' TO START")
    launch_fn()  # blocks until the default wake word ("up") is detected
    while True:
        transcription = speech_transcriber()
        print("Query >>>", transcription)

        response = generate_response(transcription)
        print("Response >>>", response)

        audio_path = synthesizer(response, play_ipython=False)

        sound = pygame.mixer.Sound(audio_path)
        channel = sound.play()

        print("speak 'GO to intrupt or 'STOP' to stop")

        # excite_fn() returns None after its bounded listening window, which
        # brings control back here to re-check whether playback is still running.
        while channel.get_busy():
            action = excite_fn()

            if action == "G0":
                channel.stop()
                break

            if action == "STOP":
                channel.stop()
                print("Thanks")
                return

# Start the interactive voice loop; it runs until the stop keyword is detected.
chatbot()

SPEAK 'UP' TO START
Listening for wake word...
Listening...
Query >>>  How are you?
Response >>> I'm doing well, thank you!  How are you? ðŸ˜Š 
I'm ready to assist you with any questions or tasks you have. 
Let me know what I can help you with!
speak 'GO to intrupt or 'STOP' to stop
Listening...
Query >>>  Thank you.
Response >>> You're welcome!  Is there anything else I can help you with? ðŸ˜Š
speak 'GO to intrupt or 'STOP' to stop
Listening...
Query >>>  What is the bidet today?
Response >>> A bidet is a water-based toilet accessory that provides a hygienic and refreshing cleansing experience for the user after using the toilet. 

It is a popular choice for those seeking a more eco-friendly and hygienic alternative to toilet paper, and is increasingly common in modern bathrooms.
speak 'GO to intrupt or 'STOP' to stop
Listening...
Query >>>  Am mai ascumat-o vidÄƒr.
Response >>> The sentence "Am mai ascumat-o vidÄƒr" is in Romanian and translates to "I have eaten her." 

However, it's

## APP

In [None]:
import gradio as gr
import numpy as np
import tempfile
from scipy.io.wavfile import write

# ---------- Transcription ----------
def transcribe_fn(audio):
    """Transcribe a ``(sampling_rate, samples)`` audio tuple to text.

    Args:
        audio: Tuple of the sampling rate and a NumPy array of samples, as
            delivered by a ``gr.Audio(type="numpy")`` input.

    Returns:
        The transcription text produced by the speech-recognition pipeline.
    """
    sr, y = audio
    y = y.astype(np.float32)

    # Multi-channel recordings arrive as (samples, channels); the pipeline is
    # fed a single 1-D waveform, so average the channels down to mono.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalise; the epsilon avoids dividing by zero on pure silence.
    y = y / (np.max(np.abs(y)) + 1e-9)
    return transcriber({"sampling_rate": sr, "raw": y})["text"]

# ---------- Save audio to temp WAV ----------
def save_audio(audio_array, sampling_rate=16000):
    """Write ``audio_array`` to a new temporary ``.wav`` file and return its path.

    The file is created with ``delete=False``, so it is left on disk and the
    caller is responsible for removing it.
    """
    wav_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    with wav_file:
        wav_path = wav_file.name
        write(wav_path, sampling_rate, audio_array)
    return wav_path

# ---------- Chatbot ----------
def chatbot(audio_input):
    """Gradio handler: spoken query in, path of the spoken answer out.

    Args:
        audio_input: ``(sampling_rate, samples)`` tuple from the Gradio
            microphone input.

    Returns:
        Path of the audio file containing the synthesized answer.

    NOTE(review): this definition reuses the name ``chatbot`` of the
    microphone-loop function defined earlier in the notebook and shadows it
    once this cell has run.
    """
    transcription = transcribe_fn(audio_input)
    print("Query >>>", transcription)

    response = generate_response(transcription)
    print("Response >>>", response)

    # The notebook's text-to-speech helper is `synthesizer`; it saves the
    # speech to a file and returns that path, which is what the
    # gr.Audio(type="filepath") output expects.
    return synthesizer(response)

# ---------- Gradio UI ----------
# Wire the handler into a simple UI: microphone recording in, audio file out.
# NOTE(review): `sources=[...]` is the Gradio 4+ spelling of the former
# `source=...` keyword -- confirm against the installed Gradio version.
interface = gr.Interface(
    fn=chatbot,
    title="Voice Chatbot",
    description="Speak → LLM → Spoken response",
    inputs=gr.Audio(sources=["microphone"], type="numpy"),
    outputs=gr.Audio(type="filepath"),
)

interface.launch()
