## Wake-word detection & Qwen2-Audio-7B demo
**Sections:**
1. Wake-word detection with Porcupine
2. Multimodal audio understanding with Qwen2-Audio-7B-Instruct

In [None]:
# Install/update required packages
# `--quiet` trims pip's output; `--upgrade` asks pip to move every listed
# package to its newest available release.
# NOTE(review): later cells also import numpy and torch and load the model
# with device_map="auto" (which presumably needs the `accelerate` package);
# none of these is listed here -- confirm they are already installed or pulled
# in as dependencies before running the notebook on a fresh environment.
%pip install --quiet pvporcupine sounddevice soundfile librosa pyyaml matplotlib scipy transformers --upgrade


In [None]:
from pathlib import Path
import yaml

# Default location of the YAML file holding the Porcupine access key,
# resolved relative to the current user's home directory.
CONFIG_PATH = Path.home().joinpath(
    "Study", "Course", "EECS", "Robotics", "mobile_robot", "config.yaml"
)

def load_config(file_path: Path = CONFIG_PATH) -> str:
    """Return the Porcupine access key stored in a YAML config file.

    Args:
        file_path: Location of the YAML file. Defaults to ``CONFIG_PATH``.
            A plain string path is accepted as well.

    Returns:
        The value stored under the top-level ``PORCUPINE_KEY`` entry.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the file has no top-level ``PORCUPINE_KEY`` entry,
            including when the file is empty or is not a YAML mapping.
    """
    file_path = Path(file_path)
    if not file_path.exists():
        raise FileNotFoundError(f"Config file not found at {file_path}")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    cfg = yaml.safe_load(file_path.read_text(encoding="utf-8"))
    # safe_load yields None for an empty document and a list/scalar for
    # non-mapping documents; treat those like a missing key instead of
    # letting the membership test or lookup fail with a TypeError.
    if not isinstance(cfg, dict) or "PORCUPINE_KEY" not in cfg:
        raise KeyError("Missing 'PORCUPINE_KEY' in config")
    return cfg["PORCUPINE_KEY"]


In [None]:
import pvporcupine
# Create the wake-word engine for the keyword "jarvis", authenticating with the
# access key read from the YAML config by load_config().
porcupine = pvporcupine.create(access_key=load_config(), keywords=["jarvis"])
# The detection cell below feeds audio to the engine in chunks of exactly
# porcupine.frame_length samples, so print it for reference.
print("Porcupine frame_length:", porcupine.frame_length)


In [None]:
import sounddevice as sd
# Ask sounddevice for the audio devices it can see. This is the last expression
# in the cell, so the notebook displays the returned listing; use it to check
# that an input device is available before recording.
sd.query_devices()


In [None]:
import sounddevice as sd
from scipy.io.wavfile import write

# Capture a short mono clip as 16-bit samples and store it as a WAV file so the
# detection cell can read it back from disk.
fs = 16000     # sample rate passed to the recorder and the WAV writer
duration = 5   # clip length in seconds
print("Recording...")
rec = sd.rec(frames=int(duration * fs), samplerate=fs, channels=1, dtype="int16")
sd.wait()  # wait for the recording to finish before using the buffer
write(filename="output.wav", rate=fs, data=rec)
print("Saved output.wav")


In [None]:
import numpy as np, soundfile as sf, librosa

# Read the recording back, bring it to the format fed to the engine in this
# notebook (mono, 16 kHz, int16), then scan it frame by frame for the keyword.
data, sr = sf.read("output.wav")
if data.ndim > 1:
    data = data[:, 0]  # multi-channel file: keep the first channel only
if sr != 16000:
    # The sample rates are passed by keyword: recent librosa releases no longer
    # accept them positionally, and older releases accept the keyword form too.
    data = librosa.resample(data, orig_sr=sr, target_sr=16000)
    sr = 16000
if data.dtype != np.int16:
    # Floating-point samples are scaled to the int16 range. Clip first so that
    # values slightly outside [-1, 1] (e.g. resampling overshoot) saturate
    # instead of wrapping around when cast.
    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

try:
    # Step through the signal in non-overlapping chunks of frame_length
    # samples; a trailing partial chunk is skipped.
    for i in range(0, len(data) - porcupine.frame_length + 1, porcupine.frame_length):
        # process() returns a non-negative index when a keyword is detected.
        if porcupine.process(data[i:i + porcupine.frame_length]) >= 0:
            print("Detected at sample", i)
            break
    else:
        # The loop finished without a hit; say so rather than printing nothing.
        print("No wake word detected")
finally:
    # Always release the engine, even if reading or processing a frame fails.
    porcupine.delete()


## Qwen2-Audio-7B-Instruct: Multimodal Audio Understanding

In [None]:
from io import BytesIO
from urllib.request import urlopen
import shutil, os, warnings, librosa, torch
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

# Check free disk space
cache_dir = os.getenv("HF_HOME", os.path.expanduser("~/.cache/huggingface/hub"))
# The cache directory may not exist before the first download, and
# shutil.disk_usage() raises FileNotFoundError for a missing path. Walk up to
# the nearest existing ancestor and measure free space there instead.
_probe_dir = os.path.abspath(cache_dir)
while not os.path.exists(_probe_dir):
    _parent_dir = os.path.dirname(_probe_dir)
    if _parent_dir == _probe_dir:  # reached the filesystem root
        break
    _probe_dir = _parent_dir
free_mb = shutil.disk_usage(_probe_dir).free // (1024*1024)
if free_mb < 8000:
    # Only a warning: the download below is still attempted.
    warnings.warn(f"Low disk space: {free_mb}MB free, needs ~8000MB")

# Load the processor and the model from the same checkpoint so that the chat
# template, feature extractor and weights match.
qwen_checkpoint = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(qwen_checkpoint)
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    qwen_checkpoint,
    torch_dtype="auto",
    device_map="auto",
)

# Sample conversations fed to the model. User turns pair one audio clip with a
# text question; the assistant turn carries a plain string as its content.
def _user_turn(audio_url, text):
    """Build a user message holding one audio clip followed by a text prompt."""
    return {
        "role": "user",
        "content": [
            {"type": "audio", "audio_url": audio_url},
            {"type": "text", "text": text},
        ],
    }


# Two user turns with an assistant reply in between.
convo1 = [
    _user_turn(
        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3",
        "What's that sound?",
    ),
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    _user_turn(
        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav",
        "What can you hear?",
    ),
]
# A single user turn.
convo2 = [
    _user_turn(
        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac",
        "What does the person say?",
    ),
]
conversations = [convo1, convo2]

# Build prompts and load audio
# One prompt string per conversation, rendered with the processor's chat
# template and ending with the generation prompt for the assistant's reply.
texts = [
    processor.apply_chat_template(c, add_generation_prompt=True, tokenize=False)
    for c in conversations
]

# Collect every referenced audio clip in the order it appears across all
# conversations, decoded at the feature extractor's sampling rate.
audios = []
for c in conversations:
    for msg in c:
        content = msg.get("content")
        # Assistant turns store their content as a plain string. Iterating a
        # string yields characters, and indexing a character with "type"
        # raises TypeError, so only list-style content is scanned.
        if not isinstance(content, list):
            continue
        for elem in content:
            if elem.get("type") != "audio":
                continue
            # Close the HTTP response as soon as the bytes have been read.
            with urlopen(elem["audio_url"]) as response:
                audio_bytes = response.read()
            y, _ = librosa.load(
                BytesIO(audio_bytes),
                sr=processor.feature_extractor.sampling_rate,
            )
            audios.append(y)

# Turn the prompts and waveforms into one padded batch of PyTorch tensors, then
# place every tensor on the device the model reports (non-tensor entries are
# passed through untouched).
# NOTE(review): some newer transformers releases appear to rename the `audios`
# argument to `audio` -- confirm against the installed version.
inputs = processor(
    text=texts,
    audios=audios,
    padding=True,
    return_tensors="pt",
)
inputs = {
    name: value.to(model.device) if torch.is_tensor(value) else value
    for name, value in inputs.items()
}

# Generate and print responses
# max_new_tokens limits only the generated continuation. max_length would also
# count the prompt tokens, so a long prompt could use up the whole budget
# before any reply is produced.
gen = model.generate(**inputs, max_new_tokens=256)
# Slice off the leading prompt columns so only the newly generated tokens are
# decoded (the batch is padded, so all rows share one prompt length).
gen = gen[:, inputs["input_ids"].size(1):]
out = processor.batch_decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=False)
# Number the replies from 1, in the same order as `conversations`.
for i, resp in enumerate(out, 1):
    print(f"Conversation {i}: {resp}")
