# Package Installation

In [3]:
!pip install ffmpeg-python



In [4]:
!pip install speechbrain



In [20]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

# Hugging Face Login

In [2]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget. query() further down sends a token read
# via HfFolder().get_token() -- presumably the one saved by this login step;
# confirm against the huggingface_hub documentation.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Wake Word Detection

## 1. Recording Audio

In [5]:
# JavaScript code to record audio from Google Colab
from IPython.display import HTML, Audio
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
from scipy.io.wavfile import read as wav_read
import io
import ffmpeg

# HTML/JavaScript recorder widget rendered by get_audio().
#
# The script adds a button, asks for microphone access and starts a
# MediaRecorder. Clicking the button stops the recording. The global `data`
# promise (read from Python with eval_js("data")) resolves with the recording
# as a base64 data URL as soon as the FileReader has finished reading it --
# not after a fixed delay -- so a slow read can no longer hand Python an
# empty placeholder. If the microphone cannot be opened the promise resolves
# with an empty string so the Python side does not block forever.
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");

my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

// `data` is awaited from Python. Keep its resolver so the FileReader callback
// can settle it at the moment the recording is actually available.
var resolveData;
var data = new Promise(resolve => { resolveData = resolve; });

var handleSuccess = function(stream) {
  gumStream = stream;
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.onloadend = function() {
      // reader.result is null when the read failed; send "" in that case.
      base64data = reader.result || "";
      resolveData(base64data.toString());
    };
    reader.readAsDataURL(e.data);
  };
  recorder.start();
  // Only announce the recording once it has really started.
  recordButton.innerText = "Recording... press to stop";
};

navigator.mediaDevices.getUserMedia({audio: true})
  .then(handleSuccess)
  .catch(function(err) {
    recordButton.innerText = "Could not access the microphone: " + err;
    resolveData("");
  });


function toggleRecording() {
  if (recorder && recorder.state == "recording") {
      recorder.stop();
      gumStream.getAudioTracks()[0].stop();
      recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// Clicks before the recorder is running are ignored by toggleRecording().
recordButton.onclick = toggleRecording;

</script>
"""

def get_audio():
  """Record audio in the browser and return it as decoded WAV samples.

  Renders the ``AUDIO_HTML`` recorder widget, waits for its JavaScript
  ``data`` value (a base64 data URL), pipes the decoded bytes through ffmpeg
  to obtain WAV data, and parses that with ``wav_read``.

  Returns:
    tuple: ``(audio, sr)`` -- the sample array and the sample rate, i.e. the
    two values returned by ``wav_read`` in swapped order.

  Raises:
    ValueError: if the browser did not return a data URL with a payload.
    RuntimeError: if ffmpeg failed or produced no usable output.
  """
  display(HTML(AUDIO_HTML))
  data = eval_js("data")

  # The payload is everything after the first comma of the data URL. A value
  # without a comma means no audio was captured.
  _, comma, payload = str(data).partition(',')
  if not comma or not payload:
    raise ValueError("No audio was received from the browser recorder.")
  binary = b64decode(payload)

  process = (ffmpeg
    .input('pipe:0')
    .output('pipe:1', format='wav')
    .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
  )
  output, err = process.communicate(input=binary)
  if process.returncode != 0 or len(output) < 8:
    details = (err or b"").decode(errors="replace")
    raise RuntimeError("ffmpeg could not convert the recording to WAV: " + details)

  # Replace bytes 4:8 of the WAV header with the actual RIFF chunk size
  # (total length minus the 8-byte RIFF preamble), little-endian. Presumably
  # needed because ffmpeg cannot go back and fill it in when writing to a pipe.
  riff_chunk_size = (len(output) - 8) & 0xFFFFFFFF
  riff = output[:4] + riff_chunk_size.to_bytes(4, 'little') + output[8:]

  sr, audio = wav_read(io.BytesIO(riff))

  return audio, sr

In [10]:
# get_audio() returns the recorded samples first and their sample rate second.
audio, sr = get_audio() # Recording audio

In [11]:
# Import the submodule explicitly so this cell does not rely on another cell
# having loaded scipy.io.wavfile as a side effect.
import scipy.io.wavfile
scipy.io.wavfile.write('recording.wav', sr, audio)  # Saving audio file

## 2. Wake Word Detection Model

In [6]:
import torchaudio
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import soundfile as sf
import librosa

def detect_wake_word(filename="recording.wav", wake_word="marvin", prob_threshold=0.2):
    """Transcribe an audio file with Wav2Vec2 and look for the wake word.

    The file is read with soundfile, reduced to a single channel, resampled
    to 16 kHz if needed, standardised to zero mean / unit variance and
    greedily decoded with ``facebook/wav2vec2-base-960h``. The lower-cased
    transcription is printed and searched for the wake word.

    Args:
        filename: path of the audio file to analyse.
        wake_word: word (or phrase) to look for; matched case-insensitively
            as a substring of the transcription.
        prob_threshold: minimum score required. The score is currently a
            placeholder (1.0 on a match, 0.0 otherwise), so any threshold
            below 1.0 accepts a match.

    Returns:
        bool: True if the wake word was found and the score exceeds
        ``prob_threshold``, False otherwise (including for an empty file).
    """
    target_sr = 16000

    # Load the pre-trained Wav2Vec2 model and processor from transformers
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

    # Load the audio file
    waveform, sample_rate = sf.read(filename)

    # soundfile returns a (frames, channels) array for multi-channel files.
    # Average the channels so that resampling and the model both see a 1-D
    # signal instead of treating the channel axis as time.
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Nothing recorded: there is nothing to transcribe.
    if waveform.size == 0:
        print("Wake word not detected or probability too low.")
        return False

    # Resample if necessary
    if sample_rate != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sr)

    # Normalize the audio data; skip the division for a constant (e.g. silent)
    # signal, whose standard deviation is zero.
    spread = waveform.std()
    waveform = waveform - waveform.mean()
    if spread > 0:
        waveform = waveform / spread

    # Process the audio data
    inputs = processor(waveform, sampling_rate=target_sr, return_tensors="pt", padding=True)

    # Run the audio through the model
    with torch.no_grad():
        logits = model(inputs.input_values).logits

    # Greedy decoding: most likely token per frame, then collapse to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0].lower()

    print(f"Transcription: {transcription}")

    # The transcription is lower-cased, so lower-case the wake word as well;
    # otherwise a caller passing "Marvin" could never match.
    wake_word_detected = wake_word.lower() in transcription
    prob = 1.0 if wake_word_detected else 0.0  # Placeholder, real implementation would use a better scoring

    if wake_word_detected and prob > prob_threshold:
        print("Wake word detected with high probability!")
        return True
    else:
        print("Wake word not detected or probability too low.")
        return False

# Detect wake word in the recorded audio file
detect_wake_word()


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Transcription: hey marvin
Wake word detected with high probability!


True

# Transcription (Automatic Speech Recognition)

In [8]:
# Bring `pipeline` and `device` into scope here so this cell also works on a
# fresh kernel (Restart & Run All) instead of depending on names defined by
# cells that run later or no longer exist.
import torch
from transformers import pipeline

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

transcriber = pipeline(
    "automatic-speech-recognition", model="openai/whisper-base.en", device=device
)

In [9]:
import sys
import soundfile as sf
from transformers import pipeline

def transcribe(filename="recording.wav", chunk_length_s=10.0, stream_chunk_s=1.0):
    """Transcribe an audio file with the ``openai/whisper-base.en`` pipeline.

    The file is read with soundfile, reduced to a single channel, resampled
    to the feature extractor's sampling rate if needed, standardised to zero
    mean / unit variance and passed to the ASR pipeline. The recognised text
    is printed and returned.

    Args:
        filename: path of the audio file to transcribe.
        chunk_length_s: accepted for interface compatibility; not used.
        stream_chunk_s: accepted for interface compatibility; not used.

    Returns:
        str: the transcribed text ("" for a file without samples).
    """
    # Load the Whisper model (relies on the notebook-level `device`)
    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en", device=device)

    # Read the audio file
    data, samplerate = sf.read(filename)

    # soundfile returns a (frames, channels) array for multi-channel files;
    # the pipeline is given a raw 1-D array below, so average the channels.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Nothing recorded: there is nothing to transcribe.
    if data.size == 0:
        print("Transcribing audio...")
        print("")
        return ""

    # Resample if necessary
    target_sr = transcriber.feature_extractor.sampling_rate
    if samplerate != target_sr:
        import librosa
        data = librosa.resample(data, orig_sr=samplerate, target_sr=target_sr)

    # Normalize the audio data; skip the division for a constant (e.g. silent)
    # signal, whose standard deviation is zero.
    spread = data.std()
    data = data - data.mean()
    if spread > 0:
        data = data / spread

    # Pass the raw array straight to the pipeline: it is already at the
    # feature extractor's sampling rate, so no sampling_rate argument is given.
    print("Transcribing audio...")
    transcription = transcriber(data, generate_kwargs={"max_new_tokens": 128})

    print(transcription["text"])
    return transcription["text"]

# Transcribe the recorded audio file
transcribed_text = transcribe()
print("Transcription:", transcribed_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Transcribing audio...
 Hey Marvin
Transcription:  Hey Marvin


# Querying LLM

In [10]:
from huggingface_hub import HfFolder
import requests


def query(text, model_id="mistralai/Mixtral-8x7B-Instruct-v0.1", timeout=120):
    """Send ``text`` to a hosted Hugging Face model and return its continuation.

    Posts ``{"inputs": text}`` to the Inference API endpoint of ``model_id``,
    authenticated with the token returned by ``HfFolder().get_token()``.

    Args:
        text: prompt to send to the model.
        model_id: Hub id of the model to query.
        timeout: seconds to wait for the HTTP request before giving up.

    Returns:
        str: the generated text. When the response echoes the prompt at the
        start, the prompt and the single character following it are removed;
        otherwise the generated text is returned unchanged.

    Raises:
        RuntimeError: if the API answers with an error status or with a
            payload that does not contain ``generated_text``.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HfFolder().get_token()}"}
    payload = {"inputs": text}

    print(f"Querying...: {text}")
    response = requests.post(api_url, headers=headers, json=payload, timeout=timeout)

    # Surface API failures with the server's own message instead of failing
    # later with an opaque KeyError on the parsed body.
    if not response.ok:
        raise RuntimeError(
            f"Request to {model_id} failed with status {response.status_code}: {response.text}"
        )

    result = response.json()
    if not (
        isinstance(result, list)
        and result
        and isinstance(result[0], dict)
        and "generated_text" in result[0]
    ):
        raise RuntimeError(f"Unexpected response from {model_id}: {result!r}")

    generated = result[0]["generated_text"]

    # Only strip the prompt when it really is a prefix of the output; slicing
    # blindly would cut into the answer if the prompt is not echoed.
    if generated.startswith(text):
        return generated[len(text) + 1 :]
    return generated

In [11]:
# Try the LLM endpoint with a sample prompt; query() returns the generated text.
query("What does Hugging Face do?")

Querying...: What does Hugging Face do?


'\nHugging Face is an NLP startup launched in 2016 with headquarters in New York. It has become a popular face in the NLP world owing to its expertise in state-of-the-art NLP model development, education, and applications.\n\nThe team at Hugging Face continues to innovate in an ongoing quest to develop increasingly sophisticated models capable of reading, generating, understanding, and translating human language. Leaderboards like GLUE, S'

# Text-to-Speech

In [12]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# SpeechT5 processor: synthesise() uses it to turn text into input ids.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

# Text-to-speech model and HiFi-GAN vocoder, both moved to `device` (which must
# already be defined when this cell runs). synthesise() hands the vocoder to
# model.generate_speech().
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

In [13]:
from datasets import load_dataset

# Load the validation split of the x-vector dataset from the Hub.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Take the "xvector" entry at index 7306 and add a leading dimension with
# unsqueeze(0); synthesise() passes it to model.generate_speech().
# NOTE(review): which speaker index 7306 corresponds to is not stated here -- confirm.
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

Downloading data:   0%|          | 0.00/21.3M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/7931 [00:00<?, ? examples/s]

In [14]:
def synthesise(text):
    """Generate speech for ``text`` and return the result on the CPU.

    Uses the notebook-level ``processor``, ``model``, ``vocoder``,
    ``speaker_embeddings`` and ``device``: the text is converted to input ids,
    ids and speaker embedding are moved to ``device``, and
    ``model.generate_speech`` is called with the vocoder.

    Args:
        text: the text to speak.

    Returns:
        The output of ``model.generate_speech`` after ``.cpu()``.
    """
    encoded = processor(text=text, return_tensors="pt")
    token_ids = encoded["input_ids"].to(device)
    voice = speaker_embeddings.to(device)
    generated = model.generate_speech(token_ids, voice, vocoder=vocoder)
    return generated.cpu()

In [15]:
from IPython.display import Audio

# Synthesise a sample sentence to check the text-to-speech setup.
audio = synthesise(
    "Hugging Face is a company that provides natural language processing and machine learning tools for developers."
)

# Render an audio player as the cell output. 16000 is presumably the sample
# rate (Hz) of the generated speech -- TODO confirm against the model card.
Audio(audio, rate=16000)

# Voice Assistant - MARVIN

In [22]:
# Voice Assistant
# Full pipeline: record -> wake word check -> transcription -> LLM -> speech.
import scipy.io.wavfile
from IPython.display import Audio, display

# Record from the microphone and store the clip where detect_wake_word() and
# transcribe() look for it by default.
audio, sr = get_audio()
scipy.io.wavfile.write('recording.wav', sr, audio)

if detect_wake_word():
  print("Wake word detected!")
  transcribed_text = transcribe()
  print("Transcription:", transcribed_text)
  response=query(transcribed_text)
  print(response)
  audio = synthesise(response)
  # Only the last top-level expression of a cell is rendered automatically; a
  # bare Audio(...) inside this branch would be discarded, so display it.
  display(Audio(audio, rate=16000))
else:
  print("Wake word not detected.")

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

Transcription: hello marvin
Wake word detected with high probability!
Wake word detected!
Transcribing audio...




 Hello Marvin
Transcription:  Hello Marvin
Querying...:  Hello Marvin


this is a general question. I experienced an issue in the LPC0147 evaluation board and realized, that the problem was an incorrect configuration in LPC0147_get\_Analog\_Values() in the  lpc\_fish147.c. There a pin of the sensor is configured as an input pin by LPC0147_get\_Analog\_Values() which was  already defined as an input pin in correct_
