In [1]:
# Installs microWakeWord. Be sure to restart the session after this is finished.
import platform

if platform.system() == "Darwin":
    # `pymicro-features` is installed from a fork to support building on macOS
    !pip install 'git+https://github.com/puddly/pymicro-features@puddly/minimum-cpp-version'

# `audio-metadata` is installed from a fork to unpin `attrs` from a version that breaks Jupyter
!pip install 'git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f'

!git clone https://github.com/kahrendt/microWakeWord
!pip install -e ./microWakeWord

Collecting git+https://github.com/whatsnowplaying/audio-metadata@d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f
  Cloning https://github.com/whatsnowplaying/audio-metadata (to revision d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f) to /tmp/pip-req-build-2nxmdk0b
  Running command git clone --filter=blob:none --quiet https://github.com/whatsnowplaying/audio-metadata /tmp/pip-req-build-2nxmdk0b
  Running command git rev-parse -q --verify 'sha^d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f'
  Running command git fetch -q https://github.com/whatsnowplaying/audio-metadata d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f
  Running command git checkout -q d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f
  Resolved https://github.com/whatsnowplaying/audio-metadata to commit d4ebb238e6a401bb1a5aaaac60c9e2b3cb30929f
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
fatal: destination path 'microWakeWord' already ex

In [3]:
# Generates 1 sample of the target word for manual verification.

target_word = 'jain'  # Phonetic spellings may produce better samples

import os
import sys
import platform

from IPython.display import Audio

if not os.path.exists("./piper-sample-generator"):
    if platform.system() == "Darwin":
        !git clone -b mps-support https://github.com/kahrendt/piper-sample-generator
    else:
        !git clone https://github.com/rhasspy/piper-sample-generator

    !wget -O piper-sample-generator/models/en_US-libritts_r-medium.pt 'https://github.com/rhasspy/piper-sample-generator/releases/download/v2.0.0/en_US-libritts_r-medium.pt'

    # Install system dependencies
    !pip install torch torchaudio piper-phonemize-cross==1.2.1

    if "piper-sample-generator/" not in sys.path:
        sys.path.append("piper-sample-generator/")

!python3 piper-sample-generator/generate_samples.py "{target_word}" \
--max-samples 1 \
--batch-size 1 \
--output-dir generated_samples

Audio("generated_samples/0.wav", autoplay=True)

DEBUG:__main__:Loading //piper-sample-generator/models/en_US-libritts_r-medium.pt
INFO:__main__:Successfully loaded the model
DEBUG:__main__:Batch 1/1 complete
INFO:__main__:Done


In [4]:
# Generates a larger amount of wake word samples.
# Start here when trying to improve your model.
# See https://github.com/rhasspy/piper-sample-generator for the full set of
# parameters. In particular, experiment with noise-scales and noise-scale-ws,
# generating negative samples similar to the wake word, and generating many more
# wake word samples, possibly with different phonetic pronunciations.

!python3 piper-sample-generator/generate_samples.py "{target_word}" \
--max-samples 1000 \
--batch-size 100 \
--output-dir generated_samples

DEBUG:__main__:Loading //piper-sample-generator/models/en_US-libritts_r-medium.pt
INFO:__main__:Successfully loaded the model
DEBUG:__main__:Batch 1/10 complete
DEBUG:__main__:Batch 2/10 complete
DEBUG:__main__:Batch 3/10 complete
DEBUG:__main__:Batch 4/10 complete
DEBUG:__main__:Batch 5/10 complete
DEBUG:__main__:Batch 6/10 complete
DEBUG:__main__:Batch 7/10 complete
DEBUG:__main__:Batch 8/10 complete
DEBUG:__main__:Batch 9/10 complete
DEBUG:__main__:Batch 10/10 complete
INFO:__main__:Done


In [None]:
# # Downloads audio data for augmentation. This can be slow!
# # Borrowed from openWakeWord's automatic_model_training.ipynb, accessed March 4, 2024
# #
# # **Important note!** The data downloaded here has a mixture of difference
# # licenses and usage restrictions. As such, any custom models trained with this
# # data should be considered as appropriate for **non-commercial** personal use only.


# import datasets
# import scipy
# import os

# import numpy as np

# from pathlib import Path
# from tqdm import tqdm

# ## Download MIR RIR data

# output_dir = "./mit_rirs"
# if not os.path.exists(output_dir):
#     os.mkdir(output_dir)
#     rir_dataset = datasets.load_dataset("davidscripka/MIT_environmental_impulse_responses", split="train", streaming=True)
#     # Save clips to 16-bit PCM wav files
#     for row in tqdm(rir_dataset):
#         name = row['audio']['path'].split('/')[-1]
#         scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# ## Download noise and background audio

# # Audioset Dataset (https://research.google.com/audioset/dataset/index.html)
# # Download one part of the audioset .tar files, extract, and convert to 16khz
# # For full-scale training, it's recommended to download the entire dataset from
# # https://huggingface.co/datasets/agkphysics/AudioSet, and
# # even potentially combine it with other background noise datasets (e.g., FSD50k, Freesound, etc.)

# if not os.path.exists("audioset"):
#     os.mkdir("audioset")

#     fname = "bal_train09.tar"
#     out_dir = f"audioset/{fname}"
#     link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/" + fname
#     !wget -O {out_dir} {link}
#     !cd audioset && tar -xf bal_train09.tar

#     output_dir = "./audioset_16k"
#     if not os.path.exists(output_dir):
#         os.mkdir(output_dir)

#     # Save clips to 16-bit PCM wav files
#     audioset_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("audioset/audio").glob("**/*.flac")]})
#     audioset_dataset = audioset_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
#     for row in tqdm(audioset_dataset):
#         name = row['audio']['path'].split('/')[-1].replace(".flac", ".wav")
#         scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

# # Free Music Archive dataset
# # https://github.com/mdeff/fma
# # (Third-party mchl914 extra small set)

# output_dir = "./fma"
# if not os.path.exists(output_dir):
#     os.mkdir(output_dir)
#     fname = "fma_xs.zip"
#     link = "https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/" + fname
#     out_dir = f"fma/{fname}"
#     !wget -O {out_dir} {link}
#     !cd {output_dir} && unzip -q {fname}

#     output_dir = "./fma_16k"
#     if not os.path.exists(output_dir):
#         os.mkdir(output_dir)

#     # Save clips to 16-bit PCM wav files
#     fma_dataset = datasets.Dataset.from_dict({"audio": [str(i) for i in Path("fma/fma_small").glob("**/*.mp3")]})
#     fma_dataset = fma_dataset.cast_column("audio", datasets.Audio(sampling_rate=16000))
#     for row in tqdm(fma_dataset):
#         name = row['audio']['path'].split('/')[-1].replace(".mp3", ".wav")
#         scipy.io.wavfile.write(os.path.join(output_dir, name), 16000, (row['audio']['array']*32767).astype(np.int16))

In [5]:
#!git clone https://huggingface.co/datasets/davidscripka/MIT_environmental_impulse_responses

Cloning into 'MIT_environmental_impulse_responses'...
remote: Enumerating objects: 280, done.[K
remote: Total 280 (delta 0), reused 0 (delta 0), pack-reused 280 (from 1)[K
Receiving objects: 100% (280/280), 41.11 KiB | 13.70 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Filtering content: 100% (270/270), 7.99 MiB | 3.16 MiB/s, done.


In [6]:
import os
import numpy as np
from pathlib import Path
from tqdm import tqdm
import torchaudio
import torch

print("🔄 Обработка MIT RIR (вручную) — исправлено: 1D → 2D и сохранение через torch...")

output_dir = "./mit_rirs"
os.makedirs(output_dir, exist_ok=True)

base_path = "MIT_environmental_impulse_responses/16khz"

if not Path(base_path).exists():
    raise FileNotFoundError(f"❌ Папка с аудио не найдена: {base_path}")

wav_files = list(Path(base_path).glob("*.wav"))
print(f"🔍 Найдено {len(wav_files)} WAV-файлов.")

for wav_path in tqdm(wav_files, desc="Конвертация RIR (MIT)"):
    filename = wav_path.name
    output_path = os.path.join(output_dir, filename)

    if os.path.exists(output_path):
        continue

    try:
        # Загружаем аудио
        waveform, sample_rate = torchaudio.load(str(wav_path))

        # Проверка частоты дискретизации
        if sample_rate != 16000:
            print(f"⚠️ Пропускаем {filename}: {sample_rate} Гц ≠ 16000 Гц")
            continue

        # Обработка: если 1D — превратить в 2D
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)  # [1, T]
        elif waveform.dim() != 2:
            print(f"⚠️ Пропускаем {filename}: неожиданная форма {waveform.shape}")
            continue

        # Нормализация: в [-1, 1] → float32
        # Важно: делаем это **на тензоре**, не на numpy!
        waveform = torch.clamp(waveform, -1.0, 1.0)

        # Конвертация в int16 (но оставляем как тензор)
        # torchaudio.save сам умеет работать с float32 → int16
        # Поэтому не нужно .numpy() → .astype(np.int16)

        # Сохраняем напрямую
        torchaudio.save(output_path, waveform, 16000)

    except Exception as e:
        print(f"❌ Ошибка при обработке {filename}: {e}")
        continue

print("✅ Все RIR-файлы успешно сохранены в ./mit_rirs")

🔄 Обработка MIT RIR (вручную) — исправлено: 1D → 2D и сохранение через torch...
🔍 Найдено 270 WAV-файлов.


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
  s = torchaudio.io.StreamWriter(uri, format=muxer, buffer_size=buffer_size)
Конвертация RIR (MIT): 100%|███████████████████████████████████████████████████| 270/270 [00:00<00:00, 3193.25it/s]

✅ Все RIR-файлы успешно сохранены в ./mit_rirs





In [7]:
# 2. Download AudioSet background audio (partially)
# ===================================================================

print("\n🔄 Загрузка части AudioSet...")

audioset_dir = "audioset"
os.makedirs(audioset_dir, exist_ok=True)

fname = "bal_train09.tar"
out_path = os.path.join(audioset_dir, fname)
link = "https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/" + fname

# Скачивание (если не загружено)
if not os.path.exists(out_path):
    print(f"📥 Скачиваем {fname}...")
    # Используем !wget, если в среде Jupyter
    os.system(f"wget -O {out_path} {link}")
else:
    print(f"📄 Файл {fname} уже существует.")

# Распаковка
extract_dir = os.path.join(audioset_dir, "audio")
os.makedirs(extract_dir, exist_ok=True)

if not os.listdir(extract_dir):
    print("📦 Распаковка bal_train09.tar...")
    os.system(f"cd {audioset_dir} && tar -xf {fname}")
else:
    print("📄 Аудиофайлы уже распакованы.")

# Конвертация в 16 кГц и сохранение
output_dir = "./audioset_16k"
os.makedirs(output_dir, exist_ok=True)

# Собираем все FLAC-файлы
flac_files = list(Path("audioset/audio").glob("**/*.flac"))
print(f"🔍 Найдено {len(flac_files)} FLAC-файлов.")

for flac_path in tqdm(flac_files, desc="Конвертация AudioSet"):
    filename = flac_path.name.replace(".flac", ".wav")
    output_path = os.path.join(output_dir, filename)

    if os.path.exists(output_path):
        continue

    try:
        # Загружаем через torchaudio
        waveform, sample_rate = torchaudio.load(str(flac_path))

        # Проверка частоты дискретизации
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        # Убедимся, что это 2D: [C, T]
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)  # Моно → [1, T]
        elif waveform.dim() != 2:
            print(f"⚠️ Пропускаем {flac_path.name}: неожиданная форма {waveform.shape}")
            continue

        # Нормализация: в [-1, 1]
        waveform = torch.clamp(waveform, -1.0, 1.0)

        # Сохраняем напрямую — без numpy!
        torchaudio.save(output_path, waveform, 16000)

    except Exception as e:
        print(f"❌ Ошибка при обработке {flac_path.name}: {e}")
        continue

print("✅ Все аудиофайлы AudioSet успешно сохранены в ./audioset_16k")


🔄 Загрузка части AudioSet...
📥 Скачиваем bal_train09.tar...


--2025-08-25 19:20:45--  https://huggingface.co/datasets/agkphysics/AudioSet/resolve/main/data/bal_train09.tar
Resolving huggingface.co (huggingface.co)... 18.165.122.30, 18.165.122.101, 18.165.122.120, ...
Connecting to huggingface.co (huggingface.co)|18.165.122.30|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/64897793837ad032c6c25d5b/2da2b65f06f00bed3429be9aa923ef69e211152b1fb32ba30d24630ef3095c32?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20250825%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250825T192045Z&X-Amz-Expires=3600&X-Amz-Signature=1dc09368cde8a08e2a37932d154b13a2c307c45cb8571259d0ee3df1ae7ea56a&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27bal_train09.tar%3B+filename%3D%22bal_train09.tar%22%3B&response-content-type=application%2Fx-tar&x-id=GetObject&Expires=1756153245&Policy=eyJTdGF0Z

📦 Распаковка bal_train09.tar...
🔍 Найдено 685 FLAC-файлов.


Конвертация AudioSet: 100%|██████████████████████████████████████████████████████| 685/685 [00:09<00:00, 73.49it/s]

✅ Все аудиофайлы AudioSet успешно сохранены в ./audioset_16k





In [8]:
# 3. Download Free Music Archive (FMA) dataset
# ===================================================================

print("\n🔄 Загрузка FMA (small subset)...")

fma_dir = "fma"
os.makedirs(fma_dir, exist_ok=True)

fname = "fma_xs.zip"
out_path = os.path.join(fma_dir, fname)
link = "https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/" + fname

if not os.path.exists(out_path):
    print(f"📥 Скачиваем {fname}...")
    os.system(f"wget -O {out_path} {link}")
else:
    print(f"📄 Файл {fname} уже существует.")

# Распаковка
extract_dir = os.path.join(fma_dir, "fma_small")
os.makedirs(extract_dir, exist_ok=True)

if not os.listdir(extract_dir):
    print("📦 Распаковка fma_xs.zip...")
    os.system(f"cd {fma_dir} && unzip -q {fname}")
else:
    print("📄 Файлы уже распакованы.")

# Конвертация MP3 → WAV (16 кГц)
output_dir = "./fma_16k"
os.makedirs(output_dir, exist_ok=True)

mp3_files = list(Path("fma/fma_small").glob("**/*.mp3"))
print(f"🔍 Найдено {len(mp3_files)} MP3-файлов.")

for mp3_path in tqdm(mp3_files, desc="Конвертация FMA"):
    filename = mp3_path.name.replace(".mp3", ".wav")
    output_path = os.path.join(output_dir, filename)

    if os.path.exists(output_path):
        continue

    try:
        # Загружаем через torchaudio
        waveform, sample_rate = torchaudio.load(str(mp3_path))

        # Проверка частоты дискретизации
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)

        # Убедимся, что это 2D: [C, T]
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)  # Моно → [1, T]
        elif waveform.dim() != 2:
            print(f"⚠️ Пропускаем {mp3_path.name}: неожиданная форма {waveform.shape}")
            continue

        # Нормализация: в [-1, 1]
        waveform = torch.clamp(waveform, -1.0, 1.0)

        # Сохраняем напрямую — без numpy!
        torchaudio.save(output_path, waveform, 16000)

    except Exception as e:
        print(f"❌ Ошибка при обработке {mp3_path.name}: {e}")
        continue

print("✅ Все аудиофайлы FMA успешно сохранены в ./fma_16k")


🔄 Загрузка FMA (small subset)...
📥 Скачиваем fma_xs.zip...


--2025-08-25 19:21:28--  https://huggingface.co/datasets/mchl914/fma_xsmall/resolve/main/fma_xs.zip
Resolving huggingface.co (huggingface.co)... 18.165.122.101, 18.165.122.120, 18.165.122.11, ...
Connecting to huggingface.co (huggingface.co)|18.165.122.101|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/b6/94/b6949420efe7987c075219703fbd5649e8e30f4aa4783eb019be14dfc9e7f52e/e5876c13cfb0f7ef668327c75d1c40bc4a2ed9d5b8e62ce383d093319c9ff663?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27fma_xs.zip%3B+filename%3D%22fma_xs.zip%22%3B&response-content-type=application%2Fzip&Expires=1756153288&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1NjE1MzI4OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2I2Lzk0L2I2OTQ5NDIwZWZlNzk4N2MwNzUyMTk3MDNmYmQ1NjQ5ZThlMzBmNGFhNDc4M2ViMDE5YmUxNGRmYzllN2Y1MmUvZTU4NzZjMTNjZmIwZjdlZjY2ODMyN2M3NWQxYzQwYmM0YTJlZDlkNWI4ZTYyY2UzODNkMD

📦 Распаковка fma_xs.zip...
🔍 Найдено 210 MP3-файлов.


Конвертация FMA: 100%|███████████████████████████████████████████████████████████| 210/210 [00:07<00:00, 28.17it/s]

✅ Все аудиофайлы FMA успешно сохранены в ./fma_16k





In [9]:
# Sets up the augmentations.
# To improve your model, experiment with these settings and use more sources of
# background clips.

from microwakeword.audio.augmentation import Augmentation
from microwakeword.audio.spectrograms import SpectrogramGeneration

# The clips object should already be initialized by the PatchedClips cell below
# If you get errors, run the PatchedClips cell first to avoid torchcodec issues

# Fix for "RuntimeError: The frame has 0 channels, expected 1" error
# This creates a patched version of Clips that uses torchaudio directly instead of torchcodec

import os
import random
import numpy as np
import torchaudio
import torch
from pathlib import Path
from microwakeword.audio.clips import Clips

class MockDatasetItem:
    """Mock dataset item that mimics the behavior of datasets library items."""
    def __init__(self, audio_data, sample_rate=16000):
        self.audio_data = audio_data
        self.sample_rate = sample_rate
    
    def __getitem__(self, key):
        if key == "audio":
            return {
                "array": self.audio_data,
                "sampling_rate": self.sample_rate
            }
        return None

class MockDataset:
    """Mock dataset that mimics the behavior of datasets.Dataset."""
    def __init__(self, clips_list):
        self.clips_list = clips_list
    
    def __iter__(self):
        for clip in self.clips_list:
            yield MockDatasetItem(clip['audio_data'], clip['sample_rate'])
    
    def __getitem__(self, idx):
        clip = self.clips_list[idx]
        return MockDatasetItem(clip['audio_data'], clip['sample_rate'])
    
    def __len__(self):
        return len(self.clips_list)

class PatchedClips(Clips):
    """
    Patched version of Clips that uses torchaudio directly to avoid torchcodec compatibility issues.
    """
    
    def __init__(self, input_directory, file_pattern='*.wav', **kwargs):
        # Don't call super().__init__ to avoid the datasets setup
        self.input_directory = input_directory
        self.file_pattern = file_pattern
        self.max_clip_duration_s = kwargs.get('max_clip_duration_s', None)
        self.remove_silence = kwargs.get('remove_silence', False)
        self.random_split_seed = kwargs.get('random_split_seed', 10)
        self.split_count = kwargs.get('split_count', 0.1)
        
        # Initialize attributes needed by parent class methods
        self.min_clip_duration_s = 0.0
        self.repeat_clip_min_duration_s = 0.0
        self.trimmed_clip_duration_s = kwargs.get('trimmed_clip_duration_s', None)
        self.trim_zeros = kwargs.get('trim_zeros', False)
        
        if self.remove_silence:
            from microwakeword.audio.audio_utils import remove_silence_webrtc
            self.remove_silence_function = remove_silence_webrtc
        
        self._load_clips_custom()
        self._setup_splits()
    
    def _load_clips_custom(self):
        """Load clips using torchaudio directly."""
        input_path = Path(self.input_directory)
        audio_files = list(input_path.glob(self.file_pattern))
        
        print(f"Loading {len(audio_files)} audio files with torchaudio...")
        
        self.clips_data = []
        for i, audio_file in enumerate(sorted(audio_files)):
            try:
                waveform, sample_rate = torchaudio.load(str(audio_file))
                if waveform.shape[0] > 1:  # Convert to mono
                    waveform = torch.mean(waveform, dim=0, keepdim=True)
                
                audio_array = waveform.squeeze().numpy()
                
                clip_entry = {
                    'path': str(audio_file),
                    'audio_data': audio_array,
                    'sample_rate': sample_rate,
                    'duration': len(audio_array) / sample_rate
                }
                
                if self.max_clip_duration_s is None or clip_entry['duration'] <= self.max_clip_duration_s:
                    self.clips_data.append(clip_entry)
                    
            except Exception as e:
                print(f"Error loading {audio_file}: {e}")
                continue
        
        print(f"Successfully loaded {len(self.clips_data)} valid audio clips")
        
        # Create the clips dataset (needed for get_random_clip compatibility)
        self.clips = MockDataset(self.clips_data)
    
    def _setup_splits(self):
        """Set up train/validation splits."""
        if self.split_count > 0 and self.random_split_seed is not None:
            random.seed(self.random_split_seed)
            
            # Shuffle and split
            clips_copy = self.clips_data.copy()
            random.shuffle(clips_copy)
            
            split_idx = int(len(clips_copy) * self.split_count)
            self.validation_clips_data = clips_copy[:split_idx]
            self.train_clips_data = clips_copy[split_idx:]
            
            # Create mock dataset dict for compatibility with SpectrogramGeneration
            self.split_clips = {
                'train': MockDataset(self.train_clips_data),
                'validation': MockDataset(self.validation_clips_data),
                'test': MockDataset(self.validation_clips_data)  # Use validation as test for now
            }
        else:
            self.train_clips_data = self.clips_data
            self.validation_clips_data = []
            
            # Create mock dataset dict
            self.split_clips = {
                'train': MockDataset(self.train_clips_data),
                'validation': MockDataset([]),
                'test': MockDataset([])
            }
    
    def get_random_clip(self, max_retries=5):
        """Get a random audio clip."""
        return random.choice(self.clips_data)['audio_data']
    
    def get_clip(self, split='train'):
        """Get clips for a specific split."""
        if split == 'train':
            return random.choice(self.train_clips_data)['audio_data']
        elif split == 'validation':
            return random.choice(self.validation_clips_data)['audio_data'] if self.validation_clips_data else self.get_random_clip()
        else:
            return self.get_random_clip()

# Initialize the patched clips class
clips = PatchedClips(
    input_directory='generated_samples',
    file_pattern='*.wav',
    max_clip_duration_s=None,
    remove_silence=False,
    random_split_seed=10,
    split_count=0.1,
)

print(f"✅ PatchedClips initialized: {len(clips.clips_data)} total clips")
if hasattr(clips, 'train_clips_data'):
    print(f"   Train clips: {len(clips.train_clips_data)}, Validation clips: {len(clips.validation_clips_data)}")

# Now initialize the augmenter
augmenter = Augmentation(augmentation_duration_s=3.2,
                         augmentation_probabilities = {
                                "SevenBandParametricEQ": 0.1,
                                "TanhDistortion": 0.1,
                                "PitchShift": 0.1,
                                "BandStopFilter": 0.1,
                                "AddColorNoise": 0.1,
                                "AddBackgroundNoise": 0.75,
                                "Gain": 1.0,
                                "RIR": 0.5,
                            },
                         impulse_paths = ['mit_rirs'],
                         background_paths = ['fma_16k', 'audioset_16k'],
                         background_min_snr_db = -5,
                         background_max_snr_db = 10,
                         min_jitter_s = 0.195,
                         max_jitter_s = 0.205,
                         )

# Test the augmentation pipeline
from IPython.display import Audio
from microwakeword.audio.audio_utils import save_clip

print("Testing augmentation pipeline...")
random_clip = clips.get_random_clip()
augmented_clip = augmenter.augment_clip(random_clip)
save_clip(augmented_clip, 'augmented_clip.wav')
print("✅ Augmentation test successful!")

Audio("augmented_clip.wav", autoplay=True)

Loading 1000 audio files with torchaudio...


    In 2.9, this function's implementation will be changed to use torchaudio.load_with_torchcodec` under the hood. Some parameters like ``normalize``, ``format``, ``buffer_size``, and ``backend`` will be ignored. We recommend that you port your code to rely directly on TorchCodec's decoder instead: https://docs.pytorch.org/torchcodec/stable/generated/torchcodec.decoders.AudioDecoder.html#torchcodec.decoders.AudioDecoder.
    torio.io._streaming_media_decoder.StreamingMediaDecoder has been deprecated. This deprecation is part of a large refactoring effort to transition TorchAudio into a maintenance phase. The decoding and encoding capabilities of PyTorch for both audio and video are being consolidated into TorchCodec. Please see https://github.com/pytorch/audio/issues/3902 for more information. It will be removed from the 2.9 release. 


Successfully loaded 1000 valid audio clips
✅ PatchedClips initialized: 1000 total clips
   Train clips: 900, Validation clips: 100
Testing augmentation pipeline...
✅ Augmentation test successful!


In [None]:
# Augment a random clip and play it back to verify it works well

from IPython.display import Audio
from microwakeword.audio.audio_utils import save_clip

random_clip = clips.get_random_clip()
augmented_clip = augmenter.augment_clip(random_clip)
save_clip(augmented_clip, 'augmented_clip.wav')

Audio("augmented_clip.wav", autoplay=True)

In [10]:
# Augment samples and save the training, validation, and testing sets.
# Validating and testing samples generated the same way can make the model
# benchmark better than it performs in real-word use. Use real samples or TTS
# samples generated with a different TTS engine to potentially get more accurate
# benchmarks.

import os
from mmap_ninja.ragged import RaggedMmap

output_dir = 'generated_augmented_features'

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

splits = ["training", "validation", "testing"]
for split in splits:
  out_dir = os.path.join(output_dir, split)
  if not os.path.exists(out_dir):
      os.mkdir(out_dir)


  split_name = "train"
  repetition = 2

  spectrograms = SpectrogramGeneration(clips=clips,
                                     augmenter=augmenter,
                                     slide_frames=10,    # Uses the same spectrogram repeatedly, just shifted over by one frame. This simulates the streaming inferences while training/validating in nonstreaming mode.
                                     step_ms=10,
                                     )
  if split == "validation":
    split_name = "validation"
    repetition = 1
  elif split == "testing":
    split_name = "test"
    repetition = 1
    spectrograms = SpectrogramGeneration(clips=clips,
                                     augmenter=augmenter,
                                     slide_frames=1,    # The testing set uses the streaming version of the model, so no artificial repetition is necessary
                                     step_ms=10,
                                     )

  RaggedMmap.from_generator(
      out_dir=os.path.join(out_dir, 'wakeword_mmap'),
      sample_generator=spectrograms.spectrogram_generator(split=split_name, repeat=repetition),
      batch_size=100,
      verbose=True,
  )

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [3]:
# Downloads pre-generated spectrogram features (made for microWakeWord in
# particular) for various negative datasets. This can be slow!
import os
output_dir = './negative_datasets'
#if not os.path.exists(output_dir):
os.mkdir(output_dir)
link_root = "https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main/"
filenames = ['dinner_party.zip', 'dinner_party_eval.zip', 'no_speech.zip', 'speech.zip']
for fname in filenames:
    link = link_root + fname

    zip_path = f"negative_datasets/{fname}"
    !wget -O {zip_path} {link}
    !unzip -q {zip_path} -d {output_dir}

--2025-08-25 19:36:37--  https://huggingface.co/datasets/kahrendt/microwakeword/resolve/main/dinner_party.zip
Resolving huggingface.co (huggingface.co)... 18.165.140.122, 18.165.140.79, 18.165.140.20, ...
Connecting to huggingface.co (huggingface.co)|18.165.140.122|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/1e/47/1e471239a0f43987de7dbb41f4104451fd310e7e21a669f6bd46eabbcf977aff/18a0885d595ced7faa73736a8680206f5ba6e80113ca3e6ce43130e510aac18f?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27dinner_party.zip%3B+filename%3D%22dinner_party.zip%22%3B&response-content-type=application%2Fzip&Expires=1756154198&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc1NjE1NDE5OH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zLzFlLzQ3LzFlNDcxMjM5YTBmNDM5ODdkZTdkYmI0MWY0MTA0NDUxZmQzMTBlN2UyMWE2NjlmNmJkNDZlYWJiY2Y5NzdhZmYvMThhMDg4NWQ1OTVjZWQ3ZmFhNzM3MzZhODY4MDIwNmY1YmE2Z

In [4]:
# Save a yaml config that controls the training process
# These hyperparamters can make a huge different in model quality.
# Experiment with sampling and penalty weights and increasing the number of
# training steps.

import yaml
import os

config = {}

config["window_step_ms"] = 10

config["train_dir"] = (
    "trained_models/wakeword"
)


# Each feature_dir should have at least one of the following folders with this structure:
#  training/
#    ragged_mmap_folders_ending_in_mmap
#  testing/
#    ragged_mmap_folders_ending_in_mmap
#  testing_ambient/
#    ragged_mmap_folders_ending_in_mmap
#  validation/
#    ragged_mmap_folders_ending_in_mmap
#  validation_ambient/
#    ragged_mmap_folders_ending_in_mmap
#
#  sampling_weight: Weight for choosing a spectrogram from this set in the batch
#  penalty_weight: Penalizing weight for incorrect predictions from this set
#  truth: Boolean whether this set has positive samples or negative samples
#  truncation_strategy = If spectrograms in the set are longer than necessary for training, how are they truncated
#       - random: choose a random portion of the entire spectrogram - useful for long negative samples
#       - truncate_start: remove the start of the spectrogram
#       - truncate_end: remove the end of the spectrogram
#       - split: Split the longer spectrogram into separate spectrograms offset by 100 ms. Only for ambient sets

config["features"] = [
    {
        "features_dir": "generated_augmented_features",
        "sampling_weight": 2.0,
        "penalty_weight": 1.0,
        "truth": True,
        "truncation_strategy": "truncate_start",
        "type": "mmap",
    },
    {
        "features_dir": "negative_datasets/speech",
        "sampling_weight": 10.0,
        "penalty_weight": 1.0,
        "truth": False,
        "truncation_strategy": "random",
        "type": "mmap",
    },
    {
        "features_dir": "negative_datasets/dinner_party",
        "sampling_weight": 10.0,
        "penalty_weight": 1.0,
        "truth": False,
        "truncation_strategy": "random",
        "type": "mmap",
    },
    {
        "features_dir": "negative_datasets/no_speech",
        "sampling_weight": 5.0,
        "penalty_weight": 1.0,
        "truth": False,
        "truncation_strategy": "random",
        "type": "mmap",
    },
    { # Only used for validation and testing
        "features_dir": "negative_datasets/dinner_party_eval",
        "sampling_weight": 0.0,
        "penalty_weight": 1.0,
        "truth": False,
        "truncation_strategy": "split",
        "type": "mmap",
    },
]

# Number of training steps in each iteration - various other settings are configured as lists that corresponds to different steps
config["training_steps"] = [10000]

# Penalizing weight for incorrect class predictions - lists that correspond to training steps
config["positive_class_weight"] = [1]
config["negative_class_weight"] = [20]

config["learning_rates"] = [
    0.001,
]  # Learning rates for Adam optimizer - list that corresponds to training steps
config["batch_size"] = 128

config["time_mask_max_size"] = [
    0
]  # SpecAugment - list that corresponds to training steps
config["time_mask_count"] = [0]  # SpecAugment - list that corresponds to training steps
config["freq_mask_max_size"] = [
    0
]  # SpecAugment - list that corresponds to training steps
config["freq_mask_count"] = [0]  # SpecAugment - list that corresponds to training steps

config["eval_step_interval"] = (
    500  # Test the validation sets after every this many steps
)
config["clip_duration_ms"] = (
    1500  # Maximum length of wake word that the streaming model will accept
)

# The best model weights are chosen first by minimizing the specified minimization metric below the specified target_minimization
# Once the target has been met, it chooses the maximum of the maximization metric. Set 'minimization_metric' to None to only maximize
# Available metrics:
#   - "loss" - cross entropy error on validation set
#   - "accuracy" - accuracy of validation set
#   - "recall" - recall of validation set
#   - "precision" - precision of validation set
#   - "false_positive_rate" - false positive rate of validation set
#   - "false_negative_rate" - false negative rate of validation set
#   - "ambient_false_positives" - count of false positives from the split validation_ambient set
#   - "ambient_false_positives_per_hour" - estimated number of false positives per hour on the split validation_ambient set
config["target_minimization"] = 0.9
config["minimization_metric"] = None  # Set to None to disable

config["maximization_metric"] = "average_viable_recall"

with open(os.path.join("training_parameters.yaml"), "w") as file:
    documents = yaml.dump(config, file)

In [1]:
# Trains a model. When finished, it will quantize and convert the model to a
# streaming version suitable for on-device detection.
# It will resume if stopped, but it will start over at the configured training
# steps in the yaml file.
# Change --train 0 to only convert and test the best-weighted model.
# On Google colab, it doesn't print the mini-batch results, so it may appear
# stuck for several minutes! Additionally, it is very slow compared to training
# on a local GPU.

!python -m microwakeword.model_train_eval \
--training_config='training_parameters.yaml' \
--train 1 \
--restore_checkpoint 1 \
--test_tf_nonstreaming 0 \
--test_tflite_nonstreaming 0 \
--test_tflite_nonstreaming_quantized 0 \
--test_tflite_streaming 0 \
--test_tflite_streaming_quantized 1 \
--use_weights "best_weights" \
mixednet \
--pointwise_filters "64,64,64,64" \
--repeat_in_block  "1, 1, 1, 1" \
--mixconv_kernel_sizes '[5], [7,11], [9,15], [23]' \
--residual_connection "0,0,0,0" \
--first_conv_filters 32 \
--first_conv_kernel_size 5 \
--stride 3

INFO:absl:Loading and analyzing data sets.
[1mModel: "functional"[0m
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1mLayer (type)       [0m[1m [0m┃[1m [0m[1mOutput Shape     [0m[1m [0m┃[1m [0m[1m   Param #[0m[1m [0m┃[1m [0m[1mConnected to     [0m[1m [0m┃
┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ input_layer         │ ([32m128[0m, [32m204[0m, [32m40[0m)    │          [32m0[0m │ -                 │
│ ([94mInputLayer[0m)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ expand_dims         │ ([32m128[0m, [32m204[0m, [32m1[0m, [32m40[0m) │          [32m0[0m │ input_layer[[32m0[0m][[32m0[0m] │
│ ([94mExpandDims[0m)        │                   │            │                   │
├─────────────────────┼───────────────────┼────────────┼───────────────────┤
│ stream ([94mStream[

In [6]:
# Downloads the tflite model file. To use on the device, you need to write a
# Model JSON file. See https://esphome.io/components/micro_wake_word for the
# documentation and
# https://github.com/esphome/micro-wake-word-models/tree/main/models/v2 for
# examples. Adjust the probability threshold based on the test results obtained
# after training is finished. You may also need to increase the Tensor arena
# model size if the model fails to load.

from google.colab import files

files.download(f"trained_models/wakeword/tflite_stream_state_internal_quant/stream_state_internal_quant.tflite")

ModuleNotFoundError: No module named 'google.colab'