## Environment
* Install Anaconda (install > nano ~/.bashrc > ~/anaconda3/bin/conda init > source ~/.bashrc)
* Create Conda environment `conda create --name <your_environment_name> python=3.11`
* Install CUDA toolkit
* Install PyTorch (about 7GBs)
* Install HuggingFace

To free up space:
```bash
conda env remove --name <environment_name>
conda clean --all
rm -rf ~/.cache/huggingface/hub
rm -rf ~/.cache/huggingface/transformers
rm -rf ~/.cache/huggingface/datasets
```

In [1]:
# Check CUDA version
!nvidia-smi

# Check if CUDA is available
import torch
print("CUDA available:", torch.cuda.is_available())

Tue Jun 24 13:45:32 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.07             Driver Version: 570.133.07     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 2060        Off |   00000000:01:00.0  On |                  N/A |
| 43%   48C    P3             30W /  184W |     601MiB /  12288MiB |     27%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
# Install/update required packages
%pip install --quiet pvporcupine sounddevice soundfile librosa pyyaml matplotlib scipy transformers --upgrade


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
coqui-tts 0.26.2 requires transformers<4.52,>=4.47.0, but you have transformers 4.52.4 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


## Wake word detection

### Porcupine

In [2]:
from pathlib import Path
import yaml

CONFIG_PATH = Path.home() / "Study" / "Course" / "EECS" / "Robotics" / "mobile_robot" / "config.yaml"

def load_config(file_path: Path = CONFIG_PATH) -> str:
    if not file_path.exists():
        raise FileNotFoundError(f"Config file not found at {file_path}")
    cfg = yaml.safe_load(file_path.read_text())
    if "PORCUPINE_KEY" not in cfg:
        raise KeyError("Missing 'PORCUPINE_KEY' in config")
    return cfg["PORCUPINE_KEY"]


In [3]:
import pvporcupine
porcupine = pvporcupine.create(access_key=load_config(), keywords=["jarvis"])
print("Porcupine frame_length:", porcupine.frame_length)


Porcupine frame_length: 512


In [4]:
import sounddevice as sd
sd.query_devices()


  0 HDA NVidia: HDMI 0 (hw:0,3), ALSA (0 in, 8 out)
  1 HDA NVidia: HDMI 1 (hw:0,7), ALSA (0 in, 8 out)
  2 HDA NVidia: HDMI 2 (hw:0,8), ALSA (0 in, 8 out)
  3 HDA NVidia: HDMI 3 (hw:0,9), ALSA (0 in, 8 out)
  4 HD-Audio Generic: HDMI 0 (hw:1,3), ALSA (0 in, 8 out)
  5 HD-Audio Generic: ALC256 Analog (hw:2,0), ALSA (2 in, 2 out)
  6 hdmi, ALSA (0 in, 8 out)
  7 pipewire, ALSA (64 in, 64 out)
* 8 default, ALSA (64 in, 64 out)

In [6]:
import sounddevice as sd
from scipy.io.wavfile import write

fs, duration = 16000, 5
print("Recording...")
rec = sd.rec(int(fs*duration), samplerate=fs, channels=1, dtype='int16')
sd.wait()
write("output.wav", fs, rec)
print("Saved output.wav")


Recording...
Saved output.wav


In [4]:
import numpy as np, soundfile as sf, librosa

data, sr = sf.read("output.wav")
if data.ndim > 1:
    data = data[:,0]
if sr != 16000:
    data = librosa.resample(data, sr, 16000)
    sr = 16000
if data.dtype != np.int16:
    data = (data * 32767).astype(np.int16)

for i in range(0, len(data) - porcupine.frame_length + 1, porcupine.frame_length):
    if porcupine.process(data[i:i+porcupine.frame_length]) >= 0:
        print("Detected 'Jarvis' at sample", i)
        break
porcupine.delete()


Detected 'Jarvis' at sample 32256


## Image Object Detection
### YOLO


In [None]:
!pip install ultralytics

In [None]:
import time
from ultralytics import YOLO

model8s = YOLO("models/YOLO/yolov8s.pt")

In [None]:
for results in model8s.track(source=0, classes=[0], conf=0.25, device="cuda:0"):
    human_near = (results[0].boxes.cls.cpu() == 0).any().item()
    print("Human detected:", human_near)
    break


## Speech-to-text

### Whisper
- By openai
- High accuracy
- Resoruce intensive


In [None]:
!pip install -U openai-whisper
!sudo apt update && sudo apt install ffmpeg
!pip install setuptools-rust

In [None]:
import whisper

stt_model = whisper.load_model("turbo")


In [None]:
result = stt_model.transcribe("Camilla.wav")
print(result["text"])

### Vosk
- Less accurate than whisper
- Less resource intensive

In [None]:
!pip3 install vosk

In [None]:
!wget https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
!unzip vosk-model-small-en-us-0.15.zip



In [11]:
import sounddevice as sd
from scipy.io.wavfile import write

fs, duration = 16000, 5
print("Recording...")
rec = sd.rec(int(fs*duration), samplerate=fs, channels=1, dtype='int16')
sd.wait()
write("output.wav", fs, rec)
print("Saved output.wav")


Recording...
Saved output.wav


In [12]:
import os
import wave
import json
from vosk import Model, KaldiRecognizer, SetLogLevel

SetLogLevel(0)  # Suppress verbose logging

model = Model("stt/vosk-model-small-en-us-0.15")
wf = wave.open("output.wav", "rb")
assert wf.getnchannels() == 1 and wf.getsampwidth() == 2 and wf.getframerate() in (8000,16000,44100), \
       "Use mono WAV with 16-bit samples"

rec = KaldiRecognizer(model, wf.getframerate())
rec.SetWords(True)

results = []

while True:
    data = wf.readframes(4000)
    if len(data) == 0:
        break
    if rec.AcceptWaveform(data):
        result = json.loads(rec.Result())
        results.append(result.get("text", ""))
    else:
        # Optional: collect partial results if desired
        partial = json.loads(rec.PartialResult()).get("partial", "")
        if partial:
            print("Partial:", partial)

# Don't forget the final result!
final_result = json.loads(rec.FinalResult())
results.append(final_result.get("text", ""))

# Join non-empty results
transcript = " ".join([r for r in results if r])
print("Transcript:", transcript)


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=10 max-active=3000 lattice-beam=2
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:6:7:8:9:10
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from stt/vosk-model-small-en-us-0.15/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:282) Loading HCL and G from stt/vosk-model-small-en-us-0.15/graph/HCLr.fst stt/vosk-model-small-en-us-0.15/graph/Gr.fst
LOG (VoskAPI:ReadDataFiles():model.cc:308) Loading winfo stt/vosk-model-small-en-us-0.15/graph/phones/word_boundary.int


Partial: what
Partial: what
Partial: what are
Partial: what are
Partial: what are your
Partial: what are your capabilities
Partial: what are your capabilities
Partial: what are your capabilities
Partial: what are your capabilities
Partial: what are your capabilities
Transcript: what are your capabilities


In [13]:
results

['what are your capabilities', '']

## LLM

### Text-to-text

#### deepseek-r1

In [None]:
!pip install accelerate
!pip install bitsandbytes

In [21]:
!pip install bitsandbytes -U "huggingface_hub[cli]"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting bitsandbytes
  Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting huggingface_hub[cli]
  Downloading huggingface_hub-0.33.1-py3-none-any.whl.metadata (14 kB)
Collecting InquirerPy==0.3.4 (from huggingface_hub[cli])
  Downloading InquirerPy-0.3.4-py3-none-any.whl.metadata (8.1 kB)
Collecting pfzy<0.4.0,>=0.3.1 (from InquirerPy==0.3.4->huggingface_hub[cli])
  Downloading pfzy-0.3.4-py3-none-any.whl.metadata (4.9 kB)
Downloading bitsandbytes-0.46.0-py3-none-manylinux_2_24_x86_64.whl (67.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.0/67.0 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.33.1-py3-none-any.whl (515 kB)
Downloading InquirerPy-0.3.4-py3-none-any.whl (67 kB)
Downloading pfzy-0.3.4-py3-none-any.whl (8.5 kB)
Installing collected packages: pfzy, InquirerPy, huggingface_hub, bitsandbytes
[2K  Attempting uninstall: huggingface_hub
[2K    Found existi

In [None]:
!huggingface-cli scan-cache

In [None]:
!huggingface-cli delete-cache

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

dsr1_model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

dsr1_tokenizer = AutoTokenizer.from_pretrained(dsr1_model_name)

dsr1_model = AutoModelForCausalLM.from_pretrained(
    dsr1_model_name,
    quantization_config=bnb,
    device_map="auto",
)


In [13]:
print(f"{dsr1_model.get_memory_footprint() / (1 << 30):.2f} GiB")


1.48 GiB


In [20]:
def generate(prompt):
    inputs = dsr1_tokenizer(prompt, return_tensors="pt").to(dsr1_model.device)
    outputs = dsr1_model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,            # Enables sampling
        temperature=0.6,           # Set desired creativity level
        top_p=0.9,                 # Optional: nucleus sampling
        repetition_penalty=1.1     # Optional: reduce repeats
    )
    return dsr1_tokenizer.decode(outputs[0], skip_special_tokens=True)

In [21]:
context = """Here is the relevant info:
- Alice: senior engineer with 10 years at WidgetCorp.
- Project Orion: deadline May 2026, focused on AI safety."""

user_prompt = "Explain how we should conduct a risk assessment meeting next week."

prompt = f"{context}\n\n{user_prompt}"
print(generate(prompt))


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


Here is the relevant info:
- Alice: senior engineer with 10 years at WidgetCorp.
- Project Orion: deadline May 2026, focused on AI safety.

Explain how we should conduct a risk assessment meeting next week. The meeting should include:

a) How to determine whether there are any risks associated with the project.

b) If so, what steps would be taken to address those risks.

c) What if there are no risks?

d) In addition, you


#### Llama

In [1]:
%pip install accelerate --quiet
%pip install PyYAML --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
from pathlib import Path
import yaml
import os
import torch
from transformers import pipeline

ROOT_PATH = Path.home() / "Study" / "Course" / "EECS" / "Robotics" / "mobile_robot" / "LLM"

with open(ROOT_PATH / "config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

LLAMA_CFG = cfg['llama']

os.environ["HF_TOKEN"] = cfg['huggingface']['api_token']

device = "cuda" if torch.cuda.is_available() else "cpu"


In [2]:

llama_pipe = pipeline(
    "text-generation",
    model=LLAMA_CFG['light_model_id'],
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_new_tokens=LLAMA_CFG['max_new_tokens'],
    do_sample=LLAMA_CFG['do_sample']
)


with open(ROOT_PATH / "knowledge_base.yaml", "r") as f:
    knowledge_base = yaml.safe_load(f)

formatted_kb = yaml.dump(knowledge_base, default_flow_style=False)



Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


#### Function Test

In [9]:
class ChatSession:
    def __init__(self,
                 llm_task: str,
                 formatted_kb: str,
                 text_pipe,               # your HF/text‐generation pipeline fn
                 output_instructions: str,
                 delimiter: str = "###"):
        """
        llm_task:      short description of the LLM’s task
        formatted_kb:  pre‐formatted knowledge base text
        text_pipe:     a function like transformers.pipeline(…) that returns
                       [{'generated_text': str}, …]
        output_instructions: any extra “assistant should do X” text
        delimiter:     how you split raw generations to find the assistant’s reply
        """
        self.llm_task = llm_task
        self.formatted_kb = formatted_kb
        self.text_pipe = text_pipe
        self.output_instructions = output_instructions
        self.delimiter = delimiter

        # initialize conversation state
        self.conversation_history = ""
        self.previous_response = ""

    def conversation(self, question: str):
        """Add the user’s question, generate an answer, and print it."""
        # 1) build or append history
        if not self.conversation_history:
            self.conversation_history = (
                f"Task: {self.llm_task}\n"
                f"Knowledge_base:\n{self.formatted_kb}\n"
                f"User: {question}"
            )
        else:
            self.conversation_history += (
                f"\nAssistant: {self.previous_response}\n"
                f"User: {question}"
            )

        # 2) generate & log
        self.previous_response = self.llm_generate_response()
        print("LLM Response:", self.previous_response)
        return self.previous_response

    def llm_generate_response(self) -> str:
        """Run the pipeline on the full prompt and return the last segment."""
        prompt = f"{self.conversation_history}\n{self.output_instructions}"
        out = self.text_pipe(
            prompt,
            max_new_tokens=1000,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            repetition_penalty=1.2
        )
        raw = out[0]['generated_text']
        parts = raw.split(self.delimiter)
        # return just the assistant’s final segment
        return parts[-1].strip() if len(parts) > 1 else raw.strip()

    def clear_conversation(self, force: bool = False):
        """
        Reset history if force=True, otherwise do nothing.
        (You can add whatever condition you like.)
        """
        if force:
            self.conversation_history = ""
            self.previous_response = ""
            print("Conversation history cleared.")
        else:
            print("Continuing conversation. Pass force=True to clear.")

In [None]:
delimiter = "#####"

llm_task = """
You are a friendly retail customer service assistant robot.
You will be provided with a customer's question or request related to products, product locations, store policies.
You will also be provided with a store's knowledge base containing information about products, promotions, and store policies.

You must select one of the following two response types based on the customer's question or request:

Response Type 1 - Product information:
  - If the customer's question is about details of a specific product or promotion, briefly provide the relevant information from the store's knowledge base.

Response Type 2 - Product location:
  - If the customer asks for guidance to find or navigate directly to a specific product or product category, you must provide ONLY the precise coordinates (x, y) for navigation from the store's knowledge base.
  - If exact coordinates aren't available for the product, provide aisle information instead.

Response Type 3 - End interaction:
  - If the customer indicates they need no further assistance (e.g., “That's all, thanks” or “No, I'm good”), respond with type 3 and a polite closing message such as “Happy to help—have a great day!”

If neither the answer nor the coordinates or aisle are found in the store's knowledge base, politely respond with: "I will get someone who can help you with your query."
"""

outputInstructions = f"""
You must reply **exactly** (with no extra text or reasoning) in this format:

Response type: {delimiter} <1, 2, or 3>  
Response to user: {delimiter} <your reply>

- Do NOT output any internal reasoning, thought process, or steps.
- Do NOT use any other words or punctuation beyond what’s above.
"""


##### Llama

In [None]:
# Example usage in a notebook:
from transformers import pipeline

sess = ChatSession(
    llm_task=llm_task,
    formatted_kb=formatted_kb,
    text_pipe=llama_pipe,
    output_instructions=outputInstructions
)

sess.conversation("Recommend an healthy milk")
sess.conversation("Can you take me to it?")
sess.clear_conversation(force=True)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


LLM Response: ## (12.4, 5.8)
LLM Response: ## (12.4, 5.8)
Conversation history cleared.


##### R1

In [14]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from functools import partial


# 1) set up 4-bit quantization
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# 2) load tokenizer & model
DS_R1_CFG = cfg['ds_r1']['model_id']
tokenizer = AutoTokenizer.from_pretrained(DS_R1_CFG)
model     = AutoModelForCausalLM.from_pretrained(
    DS_R1_CFG,
    quantization_config=bnb,
    device_map="auto",
)

# 3) build a text-generation pipeline around it

dsr1_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    do_sample=False,
    temperature=0.0,
    max_new_tokens=30,
    eos_token_id=tokenizer.convert_tokens_to_ids(delimiter),
)

Device set to use cpu
The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
# 4) plug it into ChatSession exactly as before
sess = ChatSession(
    llm_task=llm_task,
    formatted_kb=formatted_kb,
    text_pipe=dsr1_pipe,
    output_instructions=outputInstructions
)

sess.conversation("Recommend a healthy milk")
sess.conversation("Can you take me to it?")
sess.clear_conversation(force=True)


LLM Response: ## <your reply>

- If you pick **1**, give a brief product/promotion info from the KB.  
- If you pick **2**, give **ONLY** coordinates `(x, y)` or, if not in the KB, aisle info.  
- If you pick **3**, give a polite closing (e.g. “Happy to help—have a great day!”).  
- Do NOT include any markdown formatting, do NOT use any special characters except those already specified above.

Alright, let me try to figure out how to approach this task step by step. The problem involves creating an appropriate response using the given instructions, so my goal here is to understand what exactly needs to be done and then apply it correctly.

Firstly, the task outline gives clear steps that dictate how the AI should handle different scenarios. Let's break them down into manageable parts to avoid confusion.

The first part is determining which response type to choose. There are three possible options, each serving a distinct purpose. So, depending on whether the user asked about product in

### Audio/text to audio/text
#### Qwen 2 Audio-7B Multimodal Audio Understanding

In [None]:
%pip install librosa
%pip install accelerate

In [None]:
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")


ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [None]:

conversation1 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
        {"type": "text", "text": "What's that sound?"},
    ]},
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
        {"type": "text", "text": "What can you hear?"},
    ]}
]

conversation2 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        {"type": "text", "text": "What does the person say?"},
    ]},
]

conversations = [conversation1, conversation2]

text = [processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) for conversation in conversations]

audios = []
for conversation in conversations:
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(
                        librosa.load(
                            BytesIO(urlopen(ele['audio_url']).read()), 
                            sr=processor.feature_extractor.sampling_rate)[0]
                    )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs['input_ids'] = inputs['input_ids'].to("cuda")
inputs.input_ids = inputs.input_ids.to("cuda")

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

## Text-to-speech

### eSpeak NG
- Fully opensource
- Mechanical sounding

In [None]:
# !sudo apt-get install espeak-ng
# !sudo apt install mbrola mbrola-us1

In [None]:
import subprocess

def tts(text: str, voice: str = "en-us+f3", speed: int = 120, pitch: int = 50):
    # voice example: "mb-us1" or "en-us+f3"
    subprocess.call([
        "espeak-ng",
        "-v", voice,
        "-s", str(speed),
        "-p", str(pitch),
        text
    ])

if __name__ == "__main__":
    tts("Hello from Ubuntu Python, sounding nicer!", voice="mb-us1", speed=120, pitch=80)


### Coqui-tts
- Free for non-commercial use
- Natural sounding
- Many voice options

In [None]:
!pip install coqui-tts


In [None]:
from TTS.api import TTS
import torch
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# tts = TTS(model_name="tts_models/en/vctk/vits").to(device)
# tts.tts_to_file(text="Natural voice is here!", file_path="out.wav")


# Initialize TTS
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


 > You must confirm the following:
 | > "I have purchased a commercial license from Coqui: licensing@coqui.ai"
 | > "Otherwise, I agree to the terms of the non-commercial CPML: https://coqui.ai/cpml" - [y/n]


100%|██████████| 1.87G/1.87G [02:45<00:00, 11.3MiB/s]
100%|██████████| 4.37k/4.37k [00:00<00:00, 38.7kiB/s]
100%|██████████| 361k/361k [00:00<00:00, 1.98MiB/s]
100%|██████████| 32.0/32.0 [00:00<00:00, 142iB/s]
100%|██████████| 7.75M/7.75M [00:14<00:00, 4.58MiB/s]

In [None]:
# List speakers
print(tts_model.speakers)


['Claribel Dervla', 'Daisy Studious', 'Gracie Wise', 'Tammie Ema', 'Alison Dietlinde', 'Ana Florence', 'Annmarie Nele', 'Asya Anara', 'Brenda Stern', 'Gitta Nikolina', 'Henriette Usha', 'Sofia Hellen', 'Tammy Grit', 'Tanja Adelina', 'Vjollca Johnnie', 'Andrew Chipper', 'Badr Odhiambo', 'Dionisio Schuyler', 'Royston Min', 'Viktor Eka', 'Abrahan Mack', 'Adde Michal', 'Baldur Sanjin', 'Craig Gutsy', 'Damien Black', 'Gilberto Mathias', 'Ilkin Urbano', 'Kazuhiko Atallah', 'Ludvig Milivoj', 'Suad Qasim', 'Torcull Diarmuid', 'Viktor Menelaos', 'Zacharie Aimilios', 'Nova Hogarth', 'Maja Ruoho', 'Uta Obando', 'Lidiya Szekeres', 'Chandra MacFarland', 'Szofi Granger', 'Camilla Holmström', 'Lilya Stainthorpe', 'Zofija Kendrick', 'Narelle Moon', 'Barbora MacLean', 'Alexandra Hisakawa', 'Alma María', 'Rosemary Okafor', 'Ige Behringer', 'Filip Traverse', 'Damjan Chapman', 'Wulf Carlevaro', 'Aaron Dreschner', 'Kumar Dahl', 'Eugenio Mataracı', 'Ferran Simen', 'Xavier Hayasaka', 'Luis Moray', 'Marcos Ru

In [None]:
# TTS to a file, use a preset speaker
tts_model.tts_to_file(
  text="The quick brown fox jumps over the lazy dog!",
  speaker="Camilla Holmström",
  language="en",
  file_path="Camilla.wav"
)

# tts_model.tts_to_file(
#   text="The quick brown fox jumps over the lazy dog!",
#   speaker="Suad Qasim",
#   language="en",
#   file_path="Suad.wav"
# )

'Annmarie.wav'

In [None]:
# Generate text using the model
tts_model.tts_to_file(
  text=generated_text,
  speaker="Camilla Holmström",
  language="en",
  file_path="Camilla.wav"
)


'Camilla.wav'

## Commercial Assistant Loop

In [None]:
import os
import wave
import json
import yaml
import torch
import pvporcupine
import sounddevice as sd
import numpy as np
import soundfile as sf
from scipy.io.wavfile import write
from vosk import Model, KaldiRecognizer, SetLogLevel
from transformers import pipeline as hf_pipeline
from TTS.api import TTS

# ─── LOAD CONFIG ─────────────────────────────────────────────────────────────
with open("config.yaml", "r") as f:
    cfg = yaml.safe_load(f)

PORCUPINE_KEYWORDS = cfg['porcupine']['keywords']
PORCUPINE_ACCESS_KEY = cfg['porcupine']['access_key']
AUDIO_SR         = cfg['audio']['sample_rate']
QUESTION_DUR     = cfg['audio']['question_duration']
VOSK_MODEL_PATH  = cfg['vosk']['model_path']
LLAMA_CFG        = cfg['llama']
TTS_CFG          = cfg['tts']
HF_TOKEN = cfg['huggingface']['token']

os.environ["HF_TOKEN"] = HF_TOKEN
# ─── INIT MODELS ─────────────────────────────────────────────────────────────
porcupine = pvporcupine.create(access_key=PORCUPINE_ACCESS_KEY, keywords=PORCUPINE_KEYWORDS)

SetLogLevel(0)
vosk_model = Model(VOSK_MODEL_PATH)

text_pipe = hf_pipeline(
    "text-generation",
    model=LLAMA_CFG['light_model_id'],
    torch_dtype=torch.bfloat16,
    device_map="auto",
    max_new_tokens=LLAMA_CFG['max_new_tokens'],
    do_sample=LLAMA_CFG['do_sample']
)

device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = TTS(TTS_CFG['eng_model_name']).to(device)


def listen_for_wakeword():
    print("🔊 Listening for wake word...")
    with sd.InputStream(
        samplerate=AUDIO_SR,
        blocksize=porcupine.frame_length,
        dtype="int16",
        channels=1
    ) as stream:
        while True:
            pcm, _ = stream.read(porcupine.frame_length)
            pcm = pcm.flatten().tolist()
            if porcupine.process(pcm) >= 0:
                print("✨ Wake word 'Jarvis' detected!")
                return


def record_question(duration=QUESTION_DUR):
    print(f"🎤 Recording question for {duration}s…")
    audio = sd.rec(int(duration * AUDIO_SR), samplerate=AUDIO_SR, channels=1, dtype="int16")
    sd.wait()
    return audio.flatten()


def transcribe_audio(audio: np.ndarray):
    tmp = "tmp_question.wav"
    write(tmp, AUDIO_SR, audio)
    wf = wave.open(tmp, "rb")

    rec = KaldiRecognizer(vosk_model, wf.getframerate())
    rec.SetWords(True)

    segments = []
    while True:
        data = wf.readframes(4000)
        if not data:
            break
        if rec.AcceptWaveform(data):
            segments.append(json.loads(rec.Result()).get("text", ""))
    segments.append(json.loads(rec.FinalResult()).get("text", ""))
    os.remove(tmp)

    transcript = " ".join([s for s in segments if s])
    print("📝 Transcribed question:", transcript)
    return transcript


def generate_response(prompt: str):
    print("🤖 Generating response…")
    out = text_pipe(prompt)
    raw = out[0]["generated_text"]
    answer = raw[len(prompt):].strip()
    print("💬 Answer:", answer)
    return answer


def speak_and_play(text: str, fname="response.wav"):
    print("🔊 Synthesizing speech…")
    tts_model.tts_to_file(
        text=text,
        speaker=TTS_CFG['eng_speaker'],
        # language=TTS_CFG['language'], #if english only model is used
        file_path=fname
    )
    data, sr = sf.read(fname)
    sd.play(data, sr)
    sd.wait()
    os.remove(fname)


if __name__ == "__main__":
    # print("Available speakers:", tts_model.speakers)
    # print("Supported langs:",   tts_model.languages)
    try:
        while True:
            listen_for_wakeword()
            q_audio = record_question()
            q_text  = transcribe_audio(q_audio)
            if not q_text.strip():
                print("⚠️ No speech detected—back to listening.")
                continue

            ans = generate_response(q_text)
            if not ans.strip():
                print("⚠️ No answer generated—back to listening.")
                continue
            # Speak the answer and play it
            speak_and_play(ans)

    except KeyboardInterrupt:
        print("\n👋 Exiting.")
    finally:
        porcupine.delete()


# Trash