## Environment
* Install Anaconda (run the installer → `nano ~/.bashrc` → `~/anaconda3/bin/conda init` → `source ~/.bashrc`)
* Create a Conda environment: `conda create --name <your_environment_name> python=3.11`
* Install CUDA toolkit
* Install PyTorch
* Install Hugging Face `transformers`

In [2]:
# Check CUDA version: nvidia-smi reports the driver version and the CUDA version it supports
!nvidia-smi

# Check if CUDA is available to PyTorch
import torch
print("CUDA available:", torch.cuda.is_available())

Mon Jun  9 21:46:24 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.144.03             Driver Version: 550.144.03     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2060        Off |   00000000:01:00.0 Off |                  N/A |
| N/A   45C    P8              1W /   90W |       9MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# Install/update required packages
%pip install --quiet pvporcupine sounddevice soundfile librosa pyyaml matplotlib scipy transformers --upgrade


## Wake word detection using Porcupine

In [None]:
from pathlib import Path
import yaml

CONFIG_PATH = Path.home() / "Study" / "Course" / "EECS" / "Robotics" / "mobile_robot" / "config.yaml"

def load_config(file_path: Path = CONFIG_PATH) -> str:
    """Return the Porcupine access key stored in a YAML config file.

    Args:
        file_path: Location of the YAML file; defaults to ``CONFIG_PATH``.

    Returns:
        The value stored under the ``PORCUPINE_KEY`` entry.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        KeyError: If the file is empty, is not a YAML mapping, or has no
            ``PORCUPINE_KEY`` entry.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"Config file not found at {file_path}")
    cfg = yaml.safe_load(file_path.read_text())
    # safe_load returns None for an empty file and a list/scalar for
    # non-mapping YAML; report those as a missing key instead of letting the
    # membership test or the lookup fail with an unrelated TypeError.
    if not isinstance(cfg, dict) or "PORCUPINE_KEY" not in cfg:
        raise KeyError("Missing 'PORCUPINE_KEY' in config")
    return cfg["PORCUPINE_KEY"]


In [None]:
import pvporcupine
# Create the wake-word engine: the access key is read via load_config(),
# and keywords=["jarvis"] names the wake word to detect.
porcupine = pvporcupine.create(access_key=load_config(), keywords=["jarvis"])
# frame_length is the chunk size later passed to porcupine.process().
print("Porcupine frame_length:", porcupine.frame_length)


In [None]:
import sounddevice as sd
# Query the audio devices known to sounddevice; as the cell's last expression
# the result is displayed as the cell output.
sd.query_devices()


In [None]:
import sounddevice as sd
from scipy.io.wavfile import write

# Capture settings: sample rate in Hz and clip length in seconds.
fs = 16000
duration = 5

print("Recording...")
rec = sd.rec(
    int(fs * duration),
    samplerate=fs,
    channels=1,
    dtype='int16',
)
# sd.rec() returns immediately; wait until the recording has finished.
sd.wait()

# Persist the mono int16 recording as a WAV file.
write("output.wav", fs, rec)
print("Saved output.wav")


In [None]:
import numpy as np, soundfile as sf, librosa

# Feed the recorded file to Porcupine in chunks of `frame_length` samples.
# The audio is first reduced to mono, resampled to the engine's sample rate
# and converted to 16-bit integers.
target_sr = porcupine.sample_rate

data, sr = sf.read("output.wav")
if data.ndim > 1:
    data = data[:, 0]  # keep only the first channel
if sr != target_sr:
    # librosa >= 0.10 accepts the sample rates as keyword arguments only;
    # the positional form raises a TypeError there.
    data = librosa.resample(data, orig_sr=sr, target_sr=target_sr)
    sr = target_sr
if data.dtype != np.int16:
    # Clip before scaling so samples beyond full scale (e.g. resampling
    # overshoot) cannot wrap around when cast to int16.
    data = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)

frame_length = porcupine.frame_length
try:
    # Only whole frames are processed; a trailing partial frame is dropped.
    for i in range(0, len(data) - frame_length + 1, frame_length):
        if porcupine.process(data[i:i + frame_length]) >= 0:
            print("Detected at sample", i)
            break
    else:
        print("No wake word detected")
finally:
    # Release the engine even if processing fails. The `porcupine` object must
    # be re-created before this cell can be run again.
    porcupine.delete()


## Speech-to-text: Whisper

## Llama

In [None]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Build a text-generation pipeline for the model above, loading the weights
# in bfloat16 and letting device_map="auto" choose the device placement.
pipe = pipeline("text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto")

# Chat-style prompt: a system instruction followed by a single user turn.
system_turn = {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"}
user_turn = {"role": "user", "content": "Who are you?"}
messages = [system_turn, user_turn]

outputs = pipe(messages, max_new_tokens=256)

# Print the last entry of the first result's "generated_text"
# (presumably the newly generated assistant turn).
last_message = outputs[0]["generated_text"][-1]
print(last_message)


## Text-to-speech: eSpeak NG

# Trash

## Qwen2-Audio-7B: Multimodal Audio Understanding

In [None]:
# librosa is used below to load the audio clips at the processor's sampling rate.
%pip install librosa
# accelerate is required when a model is loaded with `device_map="auto"`;
# without it the load fails with the ValueError shown in the next cell's output.
%pip install accelerate

In [3]:
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

# Load the processor and the model from the same checkpoint;
# device_map="auto" requires the `accelerate` package.
qwen_checkpoint = "Qwen/Qwen2-Audio-7B-Instruct"
processor = AutoProcessor.from_pretrained(qwen_checkpoint)
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    qwen_checkpoint,
    device_map="auto",
)


ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [None]:

# Two example conversations; every user turn pairs an audio clip (by URL)
# with a text question about it.
conversation1 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
        {"type": "text", "text": "What's that sound?"},
    ]},
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
        {"type": "text", "text": "What can you hear?"},
    ]}
]

conversation2 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        {"type": "text", "text": "What does the person say?"},
    ]},
]

conversations = [conversation1, conversation2]

# Render each conversation to a prompt string (not yet tokenized).
text = [
    processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    for conversation in conversations
]


def _load_audio(url, sampling_rate):
    """Download the clip at `url` and decode it to a waveform at `sampling_rate`."""
    # The context manager closes the HTTP response once the bytes are read.
    with urlopen(url) as http_response:
        payload = http_response.read()
    return librosa.load(BytesIO(payload), sr=sampling_rate)[0]


# Collect every audio clip in order of appearance across all conversations;
# plain-string contents (the assistant turn) carry no audio and are skipped.
audios = [
    _load_audio(ele["audio_url"], processor.feature_extractor.sampling_rate)
    for conversation in conversations
    for message in conversation
    if isinstance(message["content"], list)
    for ele in message["content"]
    if ele["type"] == "audio"
]

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
# Move every tensor in the batch (not just input_ids) to the model's device.
inputs = inputs.to(model.device)

# max_new_tokens bounds only the generated part; max_length would also count
# the prompt tokens and could leave little or no room for the answer.
generate_ids = model.generate(**inputs, max_new_tokens=256)
# Drop the prompt tokens so that only the newly generated tokens are decoded.
generate_ids = generate_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print(response)