## Wake word

In [15]:
!pip3 install pvporcupine
!pip install pyyaml
!pip install sounddevice
!pip install scipy
!pip install matplotlib
!pip install soundfile
!pip install librosa
# !apt install libportaudio2 libportaudiocpp0 portaudio19-dev

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.61.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)
Collecting scikit-learn>=1.1.0 (from librosa)
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting joblib>=1.0 (from librosa)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.5.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting lazy_loader>=0.1 (from librosa)
  Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Collecting msgpack>=1.0 (from librosa)
  Downloading msgpa

In [17]:
from pathlib import Path
import yaml

CONFIG_PATH = Path('/home/sunzid/Study/Course/EECS/Robotics/mobile_robot/config.yaml')


def load_config(file_path: Path = CONFIG_PATH):
    """Load the Porcupine access key from a YAML config file.

    Args:
        file_path: Location of the YAML file, as a ``Path`` or a path string.
            Defaults to ``CONFIG_PATH``.

    Returns:
        The value stored under the ``PORCUPINE_KEY`` entry (a single value,
        not a tuple).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the file is empty, is not a mapping, or has no
            ``PORCUPINE_KEY`` entry.
    """
    # Accept plain strings as well as Path objects.
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"Config file not found at {file_path}")
    with file_path.open('r', encoding='utf-8') as f:
        cfg = yaml.safe_load(f)
    # safe_load yields None for an empty file; report that (and a missing key)
    # as a clear KeyError instead of an opaque TypeError.
    if not isinstance(cfg, dict) or 'PORCUPINE_KEY' not in cfg:
        raise KeyError(f"'PORCUPINE_KEY' is missing from config file {file_path}")
    # Only the Porcupine key is returned for now; OPENAI_KEY and
    # OPENAI_CHAT_MODEL are not read yet.
    return cfg['PORCUPINE_KEY']


In [18]:
import pvporcupine

# Fetch the access key from the YAML config, then build a wake-word
# detector that listens for the built-in "jarvis" keyword.
porcupine_key = load_config()
porcupine = pvporcupine.create(access_key=porcupine_key, keywords=['jarvis'])


In [11]:
import sounddevice as sd
from scipy.io.wavfile import write

# Capture settings for the test clip.
fs = 44100   # sampling rate
seconds = 5  # clip length

print("Recording...")
# Start a two-channel capture, then block until it has finished.
myrecording = sd.rec(frames=int(seconds * fs), samplerate=fs, channels=2)
sd.wait()
print("Recording complete.")

# Persist the capture to disk as a WAV file.
write('output.wav', fs, myrecording)

Recording...
Recording complete.


In [10]:
import sounddevice as sd
# Print the audio devices reported by sounddevice, to help choose an input device.
print(sd.query_devices())


  0 HDA NVidia: HDMI 0 (hw:0,3), ALSA (0 in, 8 out)
  1 HDA NVidia: HDMI 1 (hw:0,7), ALSA (0 in, 8 out)
  2 HDA NVidia: HDMI 2 (hw:0,8), ALSA (0 in, 8 out)
  3 HDA NVidia: HDMI 3 (hw:0,9), ALSA (0 in, 8 out)
  4 HD-Audio Generic: EI322QUR (hw:1,3), ALSA (0 in, 2 out)
  5 HD-Audio Generic: ALC256 Analog (hw:2,0), ALSA (2 in, 2 out)
  6 hdmi, ALSA (0 in, 8 out)
  7 pipewire, ALSA (64 in, 64 out)
* 8 default, ALSA (64 in, 64 out)


In [19]:
import pvporcupine
import numpy as np
import soundfile as sf

# 1. Load your recorded audio
data, samplerate = sf.read('output.wav')

# Porcupine expects:
# - mono audio
# - 16-bit int PCM
# - the engine's own sample rate (porcupine.sample_rate, typically 16000 Hz)
# - frames of porcupine.frame_length samples (usually 512)

# Ensure mono
if data.ndim == 2:
    data = data[:, 0]  # take left channel

# 2. Resample to the rate the engine reports instead of a hard-coded value
target_sr = porcupine.sample_rate
if samplerate != target_sr:
    import librosa
    data = librosa.resample(data, orig_sr=samplerate, target_sr=target_sr)
    samplerate = target_sr

# Convert to int16 if needed. The samples are assumed to be floats in
# [-1.0, 1.0]; clip first because resampling can overshoot that range
# slightly and an unclipped cast to int16 would overflow.
if data.dtype != np.int16:
    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)


frame_length = porcupine.frame_length

# 3. Iterator to get audio frames
def audio_frame_generator(data, frame_length):
    """Yield consecutive, non-overlapping slices of ``data``.

    Each slice holds exactly ``frame_length`` items; a trailing remainder
    shorter than ``frame_length`` is dropped.
    """
    last_start_exclusive = len(data) - frame_length + 1
    frame_starts = range(0, last_start_exclusive, frame_length)
    yield from (data[start:start + frame_length] for start in frame_starts)

# 4. Detection loop
# Feed the clip to Porcupine one frame at a time and stop at the first hit.
try:
    for audio_frame in audio_frame_generator(data, frame_length):
        keyword_index = porcupine.process(audio_frame)
        # A non-negative index means one of the configured keywords matched.
        if keyword_index >= 0:
            print("✅ Keyword detected!")
            break
finally:
    # Always release the Porcupine instance, even if process() raises.
    porcupine.delete()


✅ Keyword detected!


## Full Loop

Create a Python virtual environment and install all of the dependencies inside it.

┌────────────────────────────┐
│  Audio Capture Node        │──► raw audio ──┐
└────────────────────────────┘               │
                                             ▼
┌────────────────────────────┐         ┌──────────────────┐
│  Wake‑Word Detector        │────────►│  Dialog Manager  │
│  (e.g. Porcupine, Snowboy) │         └──────────────────┘
└────────────────────────────┘                  │
       ▲             │                          │
       │             │ yes “Hey Robot!”          ▼
       │ no trigger  │                  ┌─────────────────┐
       └─────────────┘                  │  Qwen2.5 Omni   │
                                        │  (ASR+LLM+MM)   │
                                        └─────────────────┘
                                                 │
                                            text/audio
                                                 ▼
                                      ┌─────────────────────┐
                                      │  Intent Parser /    │
                                      │  End‑of‑Dialog DET  │
                                      └─────────────────────┘
                                       ┌─────────┴─────────┐
                                       │                   │
                                       ▼                   ▼
                             “continue conv.”       “move + conv.” 
                              loop back to           dispatch to
                              Dialog Manager         Motion Planner
                                                         │
                                                         ▼
                                               ┌────────────────────┐
                                               │  Motion Controller │
                                               │  (ROS2 action lib) │
                                               └────────────────────┘
                                                         │
                                                         ▼
                                                    Bluetooth
                                                    Speaker


In [None]:
# Pseudocode for top-level state machine
# Three states: IDLE (waiting for the wake word), CONVERSATION (talking with
# the user) and MOVE (executing a navigation request).
state = "IDLE"

while rclpy.ok():
    if state == "IDLE":
        if wake_word_detector.has_triggered():
            state = "CONVERSATION"
            dialog_manager.start_dialog()
    elif state == "CONVERSATION":
        response = dialog_manager.poll_response()
        intent = intent_parser.parse(response)
        if intent == "move":
            state = "MOVE"
        elif intent == "end_conversation":
            dialog_manager.end_dialog()
            state = "IDLE"
        # Otherwise we are still chatting: stay in CONVERSATION. Do not
        # `continue` here, or the spin_once() call below would be skipped.
    elif state == "MOVE":
        # extract target from intent (e.g. “go to kitchen” → pose)
        # `response` is the value stored during the preceding CONVERSATION pass.
        target_pose = intent_parser.extract_pose(response)
        motion_planner.go_to(target_pose)
        # Optionally chat en route or upon arrival
        state = "CONVERSATION"
    # NOTE(review): the real rclpy.spin_once takes the node to spin as its
    # first argument -- pass it in once this pseudocode becomes real code.
    rclpy.spin_once()


In [None]:
import rclpy
from rclpy.node import Node
from rclpy.qos import qos_profile_sensor_data

# Placeholders for external dependencies
# from wake_word import WakeWordDetector
# from qwen_client import QwenClient
# from tts import TTSClient
# from intent_parser import IntentParser
# from nav2_msgs.action import NavigateToPose
# from rclpy.action import ActionClient

class AssistantNode(Node):
    """Voice-assistant node driven by a three-state machine.

    States:
        IDLE          -- incoming audio is checked for the wake word.
        CONVERSATION  -- on each timer tick the buffered speech is transcribed,
                         answered, spoken and parsed for an intent.
        MOVE          -- the pending target pose is dispatched, after which the
                         node goes back to CONVERSATION.

    The wake-word detector, ASR, Qwen client, TTS and navigation are all
    stand-ins for now; see the ``mock_*`` methods.
    """

    def __init__(self):
        """Create the subscription, publisher, buffers and state-machine timer."""
        super().__init__('assistant_node')

        # State: IDLE, CONVERSATION, MOVE
        self.state = 'IDLE'

        # Pose selected by the last 'move' intent; None until one is parsed.
        self.target_pose = None

        # Audio subscription
        # NOTE: msg_type=None is a placeholder -- substitute the real audio
        # message type (e.g. AudioData) before running against ROS.
        self.audio_sub = self.create_subscription(
            msg_type=None,
            topic='/audio_raw',
            callback=self.audio_callback,
            qos_profile=qos_profile_sensor_data)

        # Publisher for TTS audio (same msg_type placeholder as above)
        self.tts_pub = self.create_publisher(
            msg_type=None,
            topic='/tts_audio',
            qos_profile=10)

        # Wake-word detector
        # self.wake_detector = WakeWordDetector()

        # Qwen client as ROS service
        # self.qwen_client = QwenClient(self)

        # TTS client
        # self.tts_client = TTSClient(self)

        # Intent parser
        # self.intent_parser = IntentParser()

        # Navigation action client
        # self.nav_client = ActionClient(self, NavigateToPose, 'navigate_to_pose')

        # Buffer for audio frames
        self.audio_buffer = []

        # Timer to process state machine (period argument is 0.1)
        self.create_timer(0.1, self.tick)

    def audio_callback(self, msg):
        """Buffer an incoming audio message and, while IDLE, check for the wake word.

        Args:
            msg: Incoming message; only its ``data`` attribute is read.
        """
        # Append audio to buffer
        self.audio_buffer.append(msg.data)

        # If in IDLE, feed to wake-word detector
        if self.state == 'IDLE':
            # if self.wake_detector.is_triggered(self.audio_buffer):
            if self.mock_wake_trigger():
                self.get_logger().info('Wake word detected!')
                self.state = 'CONVERSATION'
                # initialize dialog context
                # self.qwen_client.start_session()
                # Drop the audio that contained the wake word itself.
                self.audio_buffer.clear()

    def tick(self):
        """Advance the state machine by one step (timer callback).

        In CONVERSATION: transcribe the buffer, query the model, publish the
        spoken reply and switch state according to the parsed intent.
        In MOVE: dispatch navigation to ``self.target_pose`` and return to
        CONVERSATION. In IDLE nothing happens here.
        """
        if self.state == 'CONVERSATION':
            # Collect user speech until silence or end token
            user_text = self.mock_asr(self.audio_buffer)
            if not user_text:
                return
            self.audio_buffer.clear()

            # Send to Qwen
            # response = self.qwen_client.query(user_text)
            response = self.mock_qwen(user_text)
            self.get_logger().info(f'Qwen response: {response}')

            # Publish TTS
            # audio_resp = self.tts_client.text_to_speech(response)
            audio_resp = self.mock_tts(response)
            self.tts_pub.publish(audio_resp)

            # Parse intent
            # intent, params = self.intent_parser.parse(response)
            intent, params = self.mock_intent(response)

            if intent == 'move':
                target = params.get('location')
                self.target_pose = self.resolve_pose(target)
                self.state = 'MOVE'

            elif intent == 'end':
                self.get_logger().info('Conversation ended')
                self.state = 'IDLE'

            # else: continue conversation

        elif self.state == 'MOVE':
            # Dispatch navigation
            # goal = NavigateToPose.Goal()
            # goal.pose = self.target_pose
            # self.nav_client.wait_for_server()
            # self.nav_client.send_goal_async(goal)
            self.mock_navigate(self.target_pose)
            self.get_logger().info(f'Moving to {self.target_pose}')
            # After move, return to conversation
            self.state = 'CONVERSATION'

    # ----- Mock implementations -----
    def mock_wake_trigger(self):
        """Stand-in wake-word check; always reports a trigger."""
        # Replace with real detection
        return True

    def mock_asr(self, buffer):
        """Stand-in ASR; ignores ``buffer`` and returns a fixed utterance."""
        # Replace with real ASR
        return 'Move to kitchen'

    def mock_qwen(self, text):
        """Stand-in model reply: a navigation confirmation if ``text`` mentions
        "move" (in any letter case), otherwise a generic greeting.
        """
        # Replace with real Qwen query
        # Compare case-insensitively: mock_asr produces 'Move to kitchen', and
        # mock_intent lowercases its input in the same way.
        if 'move' in text.lower():
            return 'Sure, navigating to the kitchen now.'
        return 'Hello! How can I assist you?'

    def mock_tts(self, text):
        """Wrap the UTF-8 bytes of ``text`` in a dummy object with a ``data`` attribute."""
        # Create a dummy audio message
        class AudioMsg:
            def __init__(self, data): self.data = data
        return AudioMsg(data=text.encode())

    def mock_intent(self, response_text):
        """Classify a reply by keyword.

        Returns:
            ``('move', {'location': 'kitchen'})`` if the text contains
            "navigating", ``('end', {})`` if it contains "bye", otherwise
            ``('chat', {})``. Matching is case-insensitive.
        """
        lowered = response_text.lower()
        if 'navigating' in lowered:
            return 'move', {'location': 'kitchen'}
        if 'bye' in lowered:
            return 'end', {}
        return 'chat', {}

    def resolve_pose(self, location_str):
        """Map a location name to a 3-tuple; unknown names map to (0.0, 0.0, 0.0)."""
        # Map location string to coordinates
        poses = {'kitchen': (1.0, 2.0, 0.0)}
        return poses.get(location_str, (0.0, 0.0, 0.0))

    def mock_navigate(self, pose):
        """Stand-in navigation; does nothing."""
        # Simulate navigation
        pass


def main(args=None):
    """Initialise rclpy, spin an ``AssistantNode`` until interrupted, then clean up.

    Args:
        args: Forwarded to ``rclpy.init``.
    """
    rclpy.init(args=args)
    node = None
    try:
        node = AssistantNode()
        rclpy.spin(node)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the node; not an error.
        pass
    finally:
        # Run cleanup on every exit path, not only after a clean spin or
        # Ctrl-C, so the node and the rclpy context are never leaked.
        if node is not None:
            node.destroy_node()
        rclpy.shutdown()

if __name__ == '__main__':
    main()
