#### Microphone 

In [7]:
import pyaudio
import soundfile as sf
import time
import wave
import speech_recognition as sr 
from typing import Optional


# NOTE: speech_recognition is no longer needed for the time-based recording method.
# It would only be needed if you wanted to add back a VAD-based recording method.

class Microphone:
    def __init__(self, mic_index: int = 4, sample_rate: int = 44100):
        """
        Initializes the Microphone utility class.
        
        Args:
            mic_index (int): The index of the microphone to use. Run list_all_devices() to see options.
            sample_rate (int): The sample rate to capture audio at.
        """
        self.format = pyaudio.paInt16      # 16-bit resolution
        self.channels = 1                  # Mono
        self.sample_rate = sample_rate            # Samples per second
        self.chunk = 1024                  # Samples per frame
        self.mic_index = mic_index
        self.audio_interface = pyaudio.PyAudio()

    def list_all_devices(self):
        """
        Lists all available audio input devices found by PyAudio.
        """
        print("\n--- Available Audio Input Devices ---")
        device_count = self.audio_interface.get_device_count()
        if device_count == 0:
            print("No audio devices found.")
            return

        for i in range(device_count):
            try:
                device_info = self.audio_interface.get_device_info_by_index(i)
                if device_info.get('maxInputChannels') > 0:
                    print(f"Device Index: {i}")
                    print(f"  Name: {device_info.get('name')}")
                    print(f"  Max Input Channels: {device_info.get('maxInputChannels')}")
                    print(f"  Default Sample Rate: {int(device_info.get('defaultSampleRate'))} Hz\n")
            except Exception as e:
                print(f"Could not get info for device index {i}: {e}")
        print("-----------------------------------")

    
    def inspect_audio(self, file_path: str):
        """
        Inspects an audio file and prints its key properties.

        Args:
            file_path (str): The path to the audio file.
        """
        try:
            info = sf.info(file_path)
            print("\n--- Audio File Information ---")
            print(f"File Path:    {file_path}")
            print(f"Sample Rate:  {info.samplerate} Hz")
            print(f"Channels:     {info.channels}")
            print(f"Duration:     {info.duration:.2f} seconds")
            print(f"Format:       {info.format_info}")
            print("----------------------------")
        except Exception as e:
            print(f"❌ Error inspecting file: {e}")
            print("Please ensure the file path is correct and it's a valid audio file.")


    def record(self, duration: int = 5) -> str:
        """
        Records audio from the microphone for a fixed duration.
        The filename is automatically generated based on the current timestamp.

        Args:
            duration (int): The number of seconds to record for.
        
        Returns:
            str: The filename of the saved audio.
        """
        print(f"\nPreparing to record for {duration} seconds...")
        
        stream = self.audio_interface.open(
            format=self.format,
            channels=self.channels,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk,
            input_device_index=self.mic_index
        )
        
        print("🔴 Recording started...")
        
        frames = []
        # Loop to record audio chunk by chunk for the specified duration
        for _ in range(0, int(self.sample_rate / self.chunk * duration)):
            # --- THE FIX: Add exception_on_overflow=False to stream.read() ---
            # This tells the stream to not crash if it overflows.
            data = stream.read(self.chunk, exception_on_overflow=False)
            frames.append(data)
            
        print("✅ Recording finished.")
        
        # Stop and close the audio stream
        stream.stop_stream()
        stream.close()
        
        # Generate filename with timestamp
        timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
        file_path = f"recording_at_{timestamp}.wav"

        # Save the recorded data as a WAV file
        with wave.open(file_path, 'wb') as wf:
            wf.setnchannels(self.channels)
            wf.setsampwidth(self.audio_interface.get_sample_size(self.format))
            wf.setframerate(self.sample_rate)
            wf.writeframes(b''.join(frames))
            
        print(f"Audio successfully saved to {file_path}")
        return file_path


    # Record with VAD by recognition lib 
    def record_with_vad(self, audio_path: str = 'output.wav', timeout: int = 10, phrase_time_limit = 7): 
        recognizer = sr.Recognizer() 
        recognizer.pause_threshold = 2.0 # 2 seconds 

        source = sr.Microphone(device_index= self.mic_index, sample_rate= self.sample_rate )

        # Calibrate for ambient noise         
        print('Calibrating gfor ambient noise, please wait...')
        with source as mic: # Wait, whaht ?? 
            recognizer.adjust_for_ambient_noise(mic)
            print('Calibration complete. Listening for speech....')

         
            try: 
                audio_data = recognizer.listen(mic, 
                                               timeout= timeout)

                print('Speech detected! Saving the recording...')

                with open(audio_path, 'wb') as file: 
                    file.write(audio_data.get_wav_data())
                
                print(f'Audio sucessfully save to: {audio_path}')
                return audio_path

            except sr.WaitTimeoutError as e: 
                print(f'No speech detected within the timeout period')
                return None 
            except Exception as e: 
                print(f'An error occurred: {e}')
                return None 
        

#### Silero VAD 

In [8]:
import numpy as np 
import torch 
import pyaudio
import time 
import wave 

class Silero_VAD: 
    def __init__(self, sample_rate = 16000, chunk_size=512): 
        self.sample_rate = sample_rate
        self.chunk_size = chunk_size
        
        print("Loading Silero VAD model...")
        self.model, _  = torch.hub.load(repo_or_dir='snakers4/silero-vad',
                                        model='silero_vad', 
                                        force_reload=False) # <-- FIX #2: Prevents re-downloading

        self.audio_interface = pyaudio.PyAudio() 
        print("VAD initialized successfully.")

    def listen(self, silence_chunks_needed=8): 
        # create a stream 
        stream = self.audio_interface.open(
            format=pyaudio.paInt16, 
            channels=1, 
            rate=self.sample_rate, 
            input=True, 
            frames_per_buffer=self.chunk_size
        )

        # listen loop 
        print('\n🎤 Listening for speech...')
        recorded_frames = [] 
        is_speaking = False 
        silence_counter = 0 
        
        while True: 
            audio_chunk = stream.read(self.chunk_size)
            audio_int16 = torch.from_numpy(np.frombuffer(audio_chunk, dtype=np.int16))
            audio_float32 = audio_int16.to(torch.float32) / 32768.0 

            speech_confidence = self.model(audio_float32, self.sample_rate).item() 
            
            if speech_confidence > 0.5: 
                if not is_speaking: 
                    print("   (Speech started...)")
                    is_speaking = True 
                silence_counter = 0 
                recorded_frames.append(audio_chunk)
            
            elif is_speaking:
                silence_counter += 1 
                recorded_frames.append(audio_chunk)
                if silence_counter > silence_chunks_needed: 
                    print("   (Speech ended due to pause.)")
                    break 
            
        stream.stop_stream() 
        stream.close() 

        timestamp = time.strftime("%Y-%m-%d_%H-%M-%S")
        file_path = f"silero_vad_{timestamp}.wav"
        
        with wave.open(file_path, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(self.audio_interface.get_sample_size(pyaudio.paInt16))
            wf.setframerate(self.sample_rate)
            wf.writeframes(b''.join(recorded_frames))
            
        print(f"✅ Recording saved to: {file_path}")
        return file_path

#### PhoASR

In [9]:
import torch
import librosa
import argparse
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from typing import Optional

class PhoASR:
    """
    A class to handle Vietnamese speech-to-text transcription using PhoWhisper models.
    """
    def __init__(self, model_name: str = "vinai/PhoWhisper-base"):
        """
        Initializes the PhoASR transcriber.

        This method loads the specified PhoWhisper model and processor from Hugging Face
        and prepares them for transcription, automatically selecting the best available device (GPU or CPU).

        Args:
            model_name (str): The name of the PhoWhisper model to use from Hugging Face.
                              Examples: "vinai/PhoWhisper-small", "vinai/PhoWhisper-base".
        """
        print(f"--- Initializing PhoASR with model: '{model_name}' ---")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model_name = model_name
        self.processor = None
        self.model = None

        try:
            self.processor = WhisperProcessor.from_pretrained(self.model_name)
            self.model = WhisperForConditionalGeneration.from_pretrained(self.model_name).to(self.device)
            print(f"✅ Model loaded successfully on device: '{self.device}'.")
        except Exception as e:
            print(f"❌ Error loading model: {e}")
            print("Please check the model name and your internet connection.")
            # Set model to None to prevent usage if initialization fails
            self.model = None

    def transcribe(self, audio_path: str) -> Optional[str]:
        """
        Transcribes an audio file into Vietnamese text.

        Args:
            audio_path (str): The path to the audio file (e.g., 'recording.wav', 'song.mp3').

        Returns:
            Optional[str]: The transcribed text as a string, or None if an error occurs.
        """
        if not self.model or not self.processor:
            print("❌ Model not initialized. Cannot transcribe.")
            return None

        print(f"\n--- Processing Audio File: {audio_path} ---")
        try:
            # Load the audio file. librosa automatically resamples it to 16,000 Hz,
            # which is required by the Whisper model.
            speech_array, sampling_rate = librosa.load(audio_path, sr=16000)
            print(f"Audio loaded and resampled to {sampling_rate} Hz.")
        except Exception as e:
            print(f"❌ Error loading audio file: {e}")
            print("Please check the file path and ensure it is a valid audio format.")
            return None
            
        print("Transcribing... (This may take a moment)")
        
        # Preprocess the audio waveform to create input features for the model
        input_features = self.processor(
            speech_array, 
            sampling_rate=16000, 
            return_tensors="pt"
        ).input_features.to(self.device)
        
        # Generate the sequence of token IDs from the input features
        # We explicitly set the task and language for better performance and to avoid warnings.
        predicted_ids = self.model.generate(
            input_features, 
            task="transcribe", 
            language="vi"
        )
        
        # Decode the token IDs back into a human-readable text string
        transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        
        print("✅ Transcription complete.")
        return transcription


### Main loop

In [10]:
vad = Silero_VAD() 
asr = PhoASR()

is_active = True
print("\n========================================")
print("🤖 AI Waiter is ready to take orders.")
print("========================================")

# --- The Conversation Loop ---
while is_active:
    try:
        # 1. Listen for customer speech using the VAD
        customer_audio_file = vad.listen()

        # 2. Transcribe the captured audio file using PhoWhisper
        customer_text = asr.transcribe(customer_audio_file)
        print(f"👤 CUSTOMER SAID: {customer_text}")

        # 3. (Future Step) Process text with NLU and get a response
        if "tạm biệt" in customer_text.lower():
            is_active = False
            response = "Cảm ơn quý khách. Hẹn gặp lại!"
        else:
            response = "Vâng ạ, tôi đã hiểu. Quý khách còn yêu cầu gì nữa không?"
        
        # 4. (Future Step) Speak the response using TTS
        print(f"🤖 AI WAITER SAYS: {response}")
        # speak(response)

        print("\n--------------------------------------")

    except KeyboardInterrupt:
        print("\nConversation ended by user. Shutting down.")
        is_active = False
    except Exception as e:
        print(f"An error occurred: {e}")
        is_active = False


Loading Silero VAD model...


Using cache found in /home/lequocthinh/.cache/torch/hub/snakers4_silero-vad_master


VAD initialized successfully.
--- Initializing PhoASR with model: 'vinai/PhoWhisper-base' ---
✅ Model loaded successfully on device: 'cuda'.

🤖 AI Waiter is ready to take orders.

🎤 Listening for speech...
   (Speech started...)
   (Speech ended due to pause.)
✅ Recording saved to: silero_vad_2025-10-04_23-01-11.wav

--- Processing Audio File: silero_vad_2025-10-04_23-01-11.wav ---


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Audio loaded and resampled to 16000 Hz.
Transcribing... (This may take a moment)
✅ Transcription complete.
👤 CUSTOMER SAID: cho tôi hai cái vi da.
🤖 AI WAITER SAYS: Vâng ạ, tôi đã hiểu. Quý khách còn yêu cầu gì nữa không?

--------------------------------------

🎤 Listening for speech...
   (Speech started...)
   (Speech ended due to pause.)
✅ Recording saved to: silero_vad_2025-10-04_23-01-19.wav

--- Processing Audio File: silero_vad_2025-10-04_23-01-19.wav ---
Audio loaded and resampled to 16000 Hz.
Transcribing... (This may take a moment)
✅ Transcription complete.
👤 CUSTOMER SAID: tôi muốn ăn một tư phở.
🤖 AI WAITER SAYS: Vâng ạ, tôi đã hiểu. Quý khách còn yêu cầu gì nữa không?

--------------------------------------

🎤 Listening for speech...
   (Speech started...)
   (Speech ended due to pause.)
✅ Recording saved to: silero_vad_2025-10-04_23-01-31.wav

--- Processing Audio File: silero_vad_2025-10-04_23-01-31.wav ---
Audio loaded and resampled to 16000 Hz.
Transcribing... (This m