# Training custom wake word "Тимошка" for openWakeWord

Full pipeline: Russian Piper TTS → Voice Conversion (FreeVC24) → openWakeWord Training → TFLite

**Requirements:** Google Colab with GPU (A100 recommended). Google Drive with ~20 GB free space.

**Key feature:** All intermediate results are saved to Google Drive. If the runtime disconnects,
just reconnect and re-run — everything resumes from where it left off. Maximum data loss: ~1 target voice (~5 min of work).

**TEST_MODE:** Set `TEST_MODE = True` for a ~5 minute end-to-end pipeline validation
(1 voice, 5 samples, 2 neg phrases, 1 target, 100 training steps).

## Stages
1. Mount Google Drive & check space
2. Install dependencies
3. Generate TTS samples (Piper, Russian voices)
4. Prepare target voices from Common Voice
5. Voice conversion (FreeVC24) — with Google Drive checkpointing
6. Download training data (ACAV100M negative features, validation set, RIRs, background noise)
7. Train openWakeWord
8. Convert to TFLite
9. Test the model

## Stage 0: Configuration

In [None]:
#@title Configuration { display-mode: "form" }

#@markdown ### Test mode (quick end-to-end validation, ~5 min)
TEST_MODE = False  #@param {type:"boolean"}

#@markdown ### Number of target voices for voice conversion
#@markdown (ignored in TEST_MODE, which uses 1)
N_TARGET_VOICES = 100  #@param {type:"integer"}

#@markdown ### Training steps (ignored in TEST_MODE, which uses 100)
TRAINING_STEPS = 50000  #@param {type:"integer"}

# Dataset sizes depend on the mode. The four values are, in order:
# Piper voices, positive samples per voice, negative phrases,
# negative samples generated per phrase and voice.
if not TEST_MODE:
    N_VOICES, SAMPLES_PER_VOICE, NEG_PHRASES_COUNT, NEG_SAMPLES_PER = 4, 250, 19, 50
    print(f"FULL MODE: {N_TARGET_VOICES} target voices, {TRAINING_STEPS} training steps")
    print("Expected runtime: 4-8 hours on A100")
else:
    # Test mode overrides the form values above with a minimal workload.
    N_TARGET_VOICES, TRAINING_STEPS = 1, 100
    N_VOICES, SAMPLES_PER_VOICE, NEG_PHRASES_COUNT, NEG_SAMPLES_PER = 1, 5, 2, 5
    _rule = "=" * 60
    print(_rule)
    print(f"TEST MODE: {N_TARGET_VOICES} target, {TRAINING_STEPS} steps")
    print(f"  {N_VOICES} voice, {SAMPLES_PER_VOICE} pos samples, {NEG_PHRASES_COUNT} neg phrases")
    print("Expected runtime: ~5 minutes")
    print(_rule)

## Stage 1: Mount Google Drive & check space

In [None]:
import shutil
import os

# Mount Google Drive so that every later stage can persist its results there.
from google.colab import drive
drive.mount('/content/drive')

# Measure the mounted volume and convert the byte counts to GiB for display.
_GIB = 1024**3
usage = shutil.disk_usage('/content/drive')
total_gb = usage.total / _GIB
used_gb = usage.used / _GIB
free_gb = usage.free / _GIB

print(f"Google Drive: {used_gb:.1f} GB used / {total_gb:.1f} GB total / {free_gb:.1f} GB free")

# The quick validation run needs far less room than the full pipeline.
REQUIRED_GB = 2 if TEST_MODE else 20

if free_gb >= REQUIRED_GB:
    print(f"\nSpace OK: {free_gb:.1f} GB free >= {REQUIRED_GB} GB required")
else:
    # Explain the options before aborting the notebook run.
    _warning_lines = [
        "\n" + "!" * 60,
        f"NOT ENOUGH SPACE! Need {REQUIRED_GB} GB free, have {free_gb:.1f} GB.",
        "Options:",
        "  1. Free up space in Google Drive",
        "  2. Buy Google One: https://one.google.com/about/plans",
        "     - 100 GB: $1.99/month",
        "     - 200 GB: $2.99/month",
        "  3. If you have Colab Pro, you may already have extra storage",
        "!" * 60,
    ]
    for _warning in _warning_lines:
        print(_warning)
    raise RuntimeError(f"Not enough Google Drive space: {free_gb:.1f} GB free, need {REQUIRED_GB} GB")

# Persistent layout on Google Drive: one sub-folder per pipeline artefact.
DRIVE_BASE = "/content/drive/MyDrive/timoshka_data"
(
    DRIVE_TTS_POS,
    DRIVE_TTS_NEG,
    DRIVE_VOICES,
    DRIVE_TARGETS,
    DRIVE_VC_POS,
    DRIVE_VC_NEG,
    DRIVE_OUTPUT,
) = (
    os.path.join(DRIVE_BASE, _sub)
    for _sub in (
        "tts_positive",
        "tts_negative",
        "piper_voices",
        "voice_targets",
        "vc_positive",
        "vc_negative",
        "output",
    )
)

for _drive_dir in (DRIVE_BASE, DRIVE_TTS_POS, DRIVE_TTS_NEG, DRIVE_VOICES,
                   DRIVE_TARGETS, DRIVE_VC_POS, DRIVE_VC_NEG, DRIVE_OUTPUT):
    os.makedirs(_drive_dir, exist_ok=True)

# Scratch space on the Colab VM (ephemeral, fast I/O).
LOCAL_BASE = "/content/timoshka"
LOCAL_VC_POS, LOCAL_VC_NEG = (
    os.path.join(LOCAL_BASE, _sub) for _sub in ("vc_positive", "vc_negative")
)

for _local_dir in (LOCAL_BASE, LOCAL_VC_POS, LOCAL_VC_NEG):
    os.makedirs(_local_dir, exist_ok=True)

print(f"\nDrive base: {DRIVE_BASE}")
print(f"Local base: {LOCAL_BASE}")

# Tell the user which stages already have WAV output on Drive (and can resume).
import glob
_resumable_dirs = (
    ("TTS positive", DRIVE_TTS_POS),
    ("TTS negative", DRIVE_TTS_NEG),
    ("Voice targets", DRIVE_TARGETS),
    ("VC positive", DRIVE_VC_POS),
    ("VC negative", DRIVE_VC_NEG),
)
for label, path in _resumable_dirs:
    # Recursive count of *.wav files below this directory.
    count = sum(
        1
        for _root, _dirs, _names in os.walk(path)
        for _name in _names
        if _name.endswith('.wav')
    )
    if count:
        print(f"  {label}: {count} WAV files (resumable!)")

## Stage 2: Install dependencies

In [None]:
# Check GPU
!nvidia-smi

import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
%%bash
# Fetch the two upstream repositories unless a checkout already exists.
cd /content

[ -d "openWakeWord" ] || git clone https://github.com/dscripka/openWakeWord.git

[ -d "piper-sample-generator" ] || git clone https://github.com/rhasspy/piper-sample-generator.git

echo "Done cloning repos"

In [None]:
%%bash
# Install dependencies
pip install -q piper-phonemize -f https://github.com/rhasspy/piper-phonemize/releases/latest 2>/dev/null || true
pip install -q webrtcvad
pip install -q -e /content/openWakeWord
pip install -q -e /content/piper-sample-generator
pip install -q "numpy<2"
pip install -q coqui-tts
pip install -q mutagen torchinfo torchmetrics speechbrain
pip install -q audiomentations torch-audiomentations acoustics
pip install -q pronouncing deep-phonemizer
pip install -q onnx onnx2tf tf2onnx flatbuffers

# Fix speechbrain/torchaudio compatibility (torchaudio 2.5+ removed list_audio_backends)
# Applied unconditionally after all pip installs to ensure it's never overwritten.
# The installed package is located through importlib so the patch does not depend
# on a specific Python minor version; the hard-coded paths remain as a fallback.
python3 - <<'PY'
import importlib.util
import pathlib

relative = pathlib.Path('utils') / 'torch_audio_backend.py'
candidates = []
try:
    spec = importlib.util.find_spec('speechbrain')
except (ImportError, ValueError):
    spec = None
if spec is not None and spec.submodule_search_locations:
    candidates.extend(pathlib.Path(loc) / relative for loc in spec.submodule_search_locations)
candidates.extend(
    pathlib.Path(f'/usr/local/lib/python{ver}/dist-packages/speechbrain') / relative
    for ver in ('3.12', '3.11')
)

f = next((c for c in candidates if c.exists()), None)
if f is not None:
    code = f.read_text()
    old = 'available_backends = torchaudio.list_audio_backends()'
    new = 'available_backends = torchaudio.list_audio_backends() if hasattr(torchaudio, "list_audio_backends") else []'
    if old in code:
        code = code.replace(old, new)
        f.write_text(code)
        print('Patched speechbrain torchaudio compat')
    else:
        print('speechbrain already patched or compatible')
else:
    print('WARNING: speechbrain torch_audio_backend.py not found')
PY

# bash's builtin echo does not expand "\n", so emit the blank line separately.
echo
echo "All dependencies installed"
echo "IMPORTANT: If this is a fresh runtime, restart it now (Runtime -> Restart session)"
echo "Then re-run all cells from the top."

In [None]:
# Auto-restart runtime if numpy 2.x is still loaded in memory
# (pip installed numpy<2 on disk, but Colab pre-loaded numpy 2.x)
import numpy as np
import os

# Marker file on the VM's local disk; its presence means the restart was already
# requested once for this VM.
_RESTART_MARKER = '/content/.deps_installed'

_numpy_major = int(np.__version__.split('.')[0])

if _numpy_major >= 2:
    if not os.path.exists(_RESTART_MARKER):
        # First time: mark that deps are installed, then restart.
        # The marker is written through a context manager so the handle is
        # flushed and closed before the kernel goes down.
        with open(_RESTART_MARKER, 'w') as _marker:
            _marker.write('1')
        print(f"numpy {np.__version__} in memory, need <2. Restarting runtime...")
        print("After restart, click Runtime -> Run All (Ctrl+F9) to continue.")
        import IPython
        IPython.Application.instance().kernel.do_shutdown(True)
    else:
        # A restart already happened and numpy 2.x is still being imported.
        raise RuntimeError(
            f"numpy {np.__version__} still loaded after restart! "
            "Try: Runtime -> Restart session, then Run All."
        )
else:
    print(f"numpy {np.__version__} OK")
    # Mark deps as installed (for resume after disconnect)
    with open(_RESTART_MARKER, 'w') as _marker:
        _marker.write('1')

## Stage 3: Generate TTS samples (Piper)

TTS samples are saved to Google Drive. If they already exist, this step is skipped.

In [None]:
%%bash
# Download Russian Piper voices to Google Drive
cd /content/drive/MyDrive/timoshka_data/piper_voices || exit 1

VOICES=(
    "ru/ru_RU/irina/medium/ru_RU-irina-medium"
    "ru/ru_RU/ruslan/medium/ru_RU-ruslan-medium"
    "ru/ru_RU/denis/medium/ru_RU-denis-medium"
    "ru/ru_RU/dmitri/medium/ru_RU-dmitri-medium"
)

BASE_URL="https://huggingface.co/rhasspy/piper-voices/resolve/main"

# fetch DEST URL
# Download URL to DEST via a temporary ".part" file, so that a failed or
# interrupted transfer never leaves an empty/truncated DEST behind that a
# later run would mistake for a finished download.
fetch() {
    local dest="$1" url="$2"
    if wget -q -O "${dest}.part" "${url}"; then
        mv "${dest}.part" "${dest}"
    else
        rm -f "${dest}.part"
        echo "ERROR: failed to download ${dest}" >&2
        return 1
    fi
}

# All listed voices are downloaded here; the following Python cell uses only
# the first N_VOICES of them.
for voice in "${VOICES[@]}"; do
    name=$(basename "${voice}")
    # -s: present AND non-empty. The model and its JSON config are checked
    # separately so a missing config is fetched even when the model exists.
    if [ -s "${name}.onnx" ] && [ -s "${name}.onnx.json" ]; then
        echo "${name} already on Drive"
        continue
    fi
    echo "Downloading ${name}..."
    [ -s "${name}.onnx" ] || fetch "${name}.onnx" "${BASE_URL}/${voice}.onnx?download=true"
    [ -s "${name}.onnx.json" ] || fetch "${name}.onnx.json" "${BASE_URL}/${voice}.onnx.json?download=true"
done

# bash's builtin echo does not expand "\n", so emit the blank line separately.
echo
echo "All voices ready:"
ls -la *.onnx

In [None]:
import subprocess
import glob
import os

# Use the first N_VOICES Piper models (alphabetical order) found on Drive.
voices = sorted(glob.glob(os.path.join(DRIVE_VOICES, "*.onnx")))[:N_VOICES]
print(f"Using {len(voices)} Piper voice(s)")

POSITIVE_PHRASE = "\u0442\u0438\u043c\u043e\u0448\u043a\u0430"  # тимошка

for voice_path in voices:
    voice_name = os.path.basename(voice_path).replace(".onnx", "")
    out_dir = os.path.join(DRIVE_TTS_POS, voice_name)
    os.makedirs(out_dir, exist_ok=True)

    wav_pattern = os.path.join(out_dir, "*.wav")
    existing = len(glob.glob(wav_pattern))
    if existing < SAMPLES_PER_VOICE:
        # Not enough cached samples for this voice: run the Piper generator.
        print(f"  Generating {SAMPLES_PER_VOICE} positive samples with {voice_name}...")
        generator_cmd = [
            "python3", "/content/piper-sample-generator/generate_samples.py",
            POSITIVE_PHRASE,
            "--model", voice_path,
            "--max-samples", str(SAMPLES_PER_VOICE),
            "--output-dir", out_dir,
        ]
        subprocess.run(generator_cmd, check=True)
        generated = len(glob.glob(wav_pattern))
        print(f"    Generated: {generated}")
    else:
        print(f"  {voice_name}: {existing} samples on Drive, skipping")

# Count WAVs in every per-voice folder, including folders from earlier runs.
total = 0
for d in os.listdir(DRIVE_TTS_POS):
    voice_dir = os.path.join(DRIVE_TTS_POS, d)
    if os.path.isdir(voice_dir):
        total += len(glob.glob(os.path.join(voice_dir, "*.wav")))
print(f"\nTotal positive TTS samples (on Drive): {total}")

In [None]:
# Phonetically similar Russian words used as adversarial negatives.
ALL_NEGATIVE_PHRASES = [
    "\u0442\u0438\u043c\u043e\u0444\u0435\u0439",      # тимофей
    "\u043a\u043e\u0448\u043a\u0430",                    # кошка
    "\u043c\u043e\u0448\u043a\u0430",                    # мошка
    "\u0440\u043e\u043c\u0430\u0448\u043a\u0430",      # ромашка
    "\u043c\u0430\u0442\u0440\u0451\u0448\u043a\u0430", # матрёшка
    "\u0433\u0430\u0440\u043c\u043e\u0448\u043a\u0430", # гармошка
    "\u043a\u0430\u0440\u0442\u043e\u0448\u043a\u0430", # картошка
    "\u043e\u043a\u0440\u043e\u0448\u043a\u0430",      # окрошка
    "\u043c\u0438\u0448\u043a\u0430",                    # мишка
    "\u043c\u044b\u0448\u043a\u0430",                    # мышка
    "\u0442\u0438\u0448\u043a\u0430",                    # тишка
    "\u0442\u0438\u043c\u043e\u0448\u0430",            # тимоша
    "\u0442\u0438\u043c\u043e\u0448\u0435\u043d\u043a\u043e", # тимошенко
    "\u043c\u043e\u0440\u043e\u0448\u043a\u0430",      # морошка
    "\u043a\u0440\u043e\u0448\u043a\u0430",            # крошка
    "\u0434\u043e\u0440\u043e\u0436\u043a\u0430",      # дорожка
    "\u043b\u043e\u0436\u043a\u0430",                    # ложка
    "\u0442\u0438\u0448\u0438\u043d\u0430",            # тишина
    "\u0442\u0451\u043c\u0443\u0448\u043a\u0430",      # тёмушка
]

NEGATIVE_PHRASES = ALL_NEGATIVE_PHRASES[:NEG_PHRASES_COUNT]
print(f"Using {len(NEGATIVE_PHRASES)} negative phrases, {NEG_SAMPLES_PER} samples each, {len(voices)} voice(s)")

for phrase in NEGATIVE_PHRASES:
    for voice_path in voices:
        voice_name = os.path.basename(voice_path).replace(".onnx", "")
        # The folder name uses "е" in place of "ё"; the spoken text keeps "ё".
        safe_phrase = phrase.replace("\u0451", "\u0435")
        out_dir = os.path.join(DRIVE_TTS_NEG, f"{safe_phrase}_{voice_name}")
        os.makedirs(out_dir, exist_ok=True)

        existing = len(glob.glob(os.path.join(out_dir, "*.wav")))
        if existing < NEG_SAMPLES_PER:
            # This phrase/voice pair is not fully cached on Drive yet.
            negative_cmd = [
                "python3", "/content/piper-sample-generator/generate_samples.py",
                phrase,
                "--model", voice_path,
                "--max-samples", str(NEG_SAMPLES_PER),
                "--output-dir", out_dir,
            ]
            subprocess.run(negative_cmd, check=True)

    print(f"  Done: {phrase}")

# Recursive WAV count over everything stored under the negative TTS folder.
total_neg = sum(
    1
    for _root, _dirs, _names in os.walk(DRIVE_TTS_NEG)
    for _name in _names
    if _name.endswith(".wav")
)

print(f"\nTotal negative TTS samples (on Drive): {total_neg}")

In [None]:
# Play back up to three positive TTS clips as a quick sanity check.
import IPython.display as ipd

sample_files = glob.glob(os.path.join(DRIVE_TTS_POS, "*/*.wav"))[:3]
for f in sample_files:
    voice_folder = os.path.basename(os.path.dirname(f))
    clip_name = os.path.basename(f)
    print(f"Playing: {voice_folder}/{clip_name}")
    ipd.display(ipd.Audio(f))

## Stage 4: Prepare target voices (Common Voice)

Downloads Russian speech samples from Common Voice for voice conversion diversity.

In [None]:
import csv, io, json, random, subprocess, tarfile, requests

# Maps each saved target WAV (file name) to the Common Voice clip it came from.
# It lets a resumed run skip clips that were already converted instead of
# saving the same speaker again under a new file name. Target folders created
# before this manifest existed have no record, so their clips cannot be
# recognised on the first resumed run.
_TARGET_MANIFEST = os.path.join(DRIVE_TARGETS, "_source_clips.json")

existing_targets = len(glob.glob(os.path.join(DRIVE_TARGETS, "*.wav")))
if existing_targets >= N_TARGET_VOICES:
    print(f"Already have {existing_targets} target voices on Drive, skipping download")
else:
    HF_REPO = "https://huggingface.co/datasets/fsicoli/common_voice_17_0/resolve/main"

    print("Downloading dev.tsv metadata...")
    tsv_url = f"{HF_REPO}/transcript/ru/dev.tsv"
    tsv_resp = requests.get(tsv_url, timeout=60)
    tsv_resp.raise_for_status()
    print(f"  Downloaded {len(tsv_resp.content) / 1024:.0f} KB")

    reader = csv.DictReader(io.StringIO(tsv_resp.text), delimiter="\t")
    rows = list(reader)
    print(f"  {len(rows)} clips in dev split")

    # Keep the first listed clip of every speaker (client_id).
    speaker_clips = {}
    for row in rows:
        speaker_clips.setdefault(row["client_id"], row["path"])
    print(f"  {len(speaker_clips)} unique speakers")

    tar_url = f"{HF_REPO}/audio/ru/dev/ru_dev_0.tar"
    tar_path = "/content/timoshka/ru_dev_0.tar"

    if not os.path.exists(tar_path):
        print(f"Downloading dev audio tar (~375 MB)...")
        # Stream into a ".part" file and rename only when finished, so an
        # interrupted download is never mistaken for a complete archive.
        part_path = tar_path + ".part"
        with requests.get(tar_url, stream=True, timeout=60) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0))
            downloaded = 0
            with open(part_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        print(f"\r  {downloaded / 1024**2:.0f} / {total / 1024**2:.0f} MB", end="", flush=True)
        os.replace(part_path, tar_path)
        print(f"\n  Download complete")
    else:
        print(f"Dev tar already downloaded")

    speakers = list(speaker_clips.items())
    random.seed(42)  # deterministic for reproducibility
    random.shuffle(speakers)
    # Every speaker stays a candidate (in shuffled order): clips rejected by
    # the duration filter are then replaced by the next speaker instead of
    # leaving the target count short.
    candidate_clips = [clip_path for _cid, clip_path in speakers]

    used_clips = {}
    if os.path.exists(_TARGET_MANIFEST):
        try:
            with open(_TARGET_MANIFEST) as mf:
                loaded_manifest = json.load(mf)
            if isinstance(loaded_manifest, dict):
                used_clips = loaded_manifest
        except (OSError, ValueError):
            used_clips = {}
    already_used = set(used_clips.values())

    print(f"Extracting up to {N_TARGET_VOICES - existing_targets} clips from tar...")
    saved = existing_targets
    next_index = existing_targets
    with tarfile.open(tar_path, "r") as tar:
        # Index the archive once so clips can be visited in shuffled order.
        members = {os.path.basename(m.name): m for m in tar.getmembers() if m.isfile()}
        for clip_name in candidate_clips:
            if saved >= N_TARGET_VOICES:
                break
            if clip_name in already_used:
                continue
            member = members.get(clip_name)
            if member is None:
                continue

            # Pick the next unused output name so existing targets are never overwritten.
            out_path = os.path.join(DRIVE_TARGETS, f"speaker_{next_index:04d}.wav")
            while os.path.exists(out_path):
                next_index += 1
                out_path = os.path.join(DRIVE_TARGETS, f"speaker_{next_index:04d}.wav")

            tmp_mp3 = f"/tmp/cv_clip_{next_index}.mp3"
            accepted = False
            try:
                f = tar.extractfile(member)
                if f is None:
                    continue
                with open(tmp_mp3, "wb") as tmp:
                    tmp.write(f.read())

                # Convert to 16 kHz mono WAV.
                result = subprocess.run([
                    "ffmpeg", "-y", "-i", tmp_mp3,
                    "-ar", "16000", "-ac", "1", "-f", "wav", out_path
                ], capture_output=True, timeout=30)
                if result.returncode != 0:
                    continue

                probe = subprocess.run([
                    "ffprobe", "-v", "error",
                    "-show_entries", "format=duration",
                    "-of", "csv=p=0", out_path
                ], capture_output=True, text=True, timeout=10)
                duration = float(probe.stdout.strip())
                # Only clips between 3 and 15 seconds are kept as target voices.
                if duration < 3.0 or duration > 15.0:
                    continue

                accepted = True
                used_clips[os.path.basename(out_path)] = clip_name
                already_used.add(clip_name)
                saved += 1
                next_index += 1
                # Persist after every clip so a disconnect loses no bookkeeping.
                with open(_TARGET_MANIFEST, "w") as mf:
                    json.dump(used_clips, mf)
            except Exception as e:
                print(f"  Error: {clip_name}: {e}")
            finally:
                # Always drop the temporary MP3, and never leave a rejected or
                # half-written WAV among the targets.
                if os.path.exists(tmp_mp3):
                    os.remove(tmp_mp3)
                if not accepted and os.path.exists(out_path):
                    os.remove(out_path)

    print(f"\nSaved {saved} target voice files to Drive")

target_files = sorted(glob.glob(os.path.join(DRIVE_TARGETS, "*.wav")))
print(f"\nTarget voice files on Drive: {len(target_files)}")

## Stage 5: Voice Conversion (FreeVC24)

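**Resilient to disconnects:** VC results are organized by target voice on Google Drive.
Each converted file is copied to Drive as soon as it is produced, and a target voice is recorded as complete once all of its source clips have been processed.
On resume, completed targets are skipped automatically, and files already on Drive for an unfinished target are not converted again.

Maximum data loss on disconnect: the conversion in progress; at worst, the unfinished target voice has to be re-scanned (~3-5 minutes of work).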

In [None]:
import time, json, shutil
from pathlib import Path
from TTS.api import TTS
import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading FreeVC24 model on {device}...")
_freevc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc_model = _freevc.to(device)
print("Model loaded!")

In [None]:
import time, json, os, glob
from pathlib import Path

def collect_source_files(source_dir):
    """Return the sorted paths of every ``.wav`` file found below *source_dir*.

    The directory tree is searched recursively; files with any other
    extension are ignored.
    """
    wav_paths = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(source_dir)
        for name in names
        if name.endswith(".wav")
    ]
    return sorted(wav_paths)

def make_output_name(src_path, source_dir, tgt_name):
    """Build the converted file's name: ``[<parent>_]<stem>_vc<tgt_name>.wav``.

    The source file's parent folder name is prepended unless the file sits
    directly in *source_dir* (compared by folder name).
    """
    source = Path(src_path)
    name_parts = [source.stem]
    if source.parent.name != Path(source_dir).name:
        name_parts.insert(0, source.parent.name)
    return "_".join(name_parts) + f"_vc{tgt_name}.wav"

def run_vc_by_target(source_dir, drive_output_dir, target_dir, label="", max_sources=None):
    """Run voice conversion organized by target voice.

    For each target voice:
    1. Skip it if the status file on Drive records it as complete for at
       least the current number of sources.
    2. Convert every source that is not yet on Drive (on local disk for speed)
       and copy each result to Drive immediately.
    3. Record the target as complete, but only if none of its conversions
       failed, so failed files are retried on the next run.

    Args:
        source_dir: Directory searched recursively for source WAV files.
        drive_output_dir: Output directory on Drive; one sub-folder per target
            plus the ``_completion_status.json`` bookkeeping file.
        target_dir: Directory whose ``*.wav`` files are the target voices.
        label: Prefix for log lines. A label containing "pos" (any case)
            selects the positive local scratch folder, anything else the
            negative one.
        max_sources: If set, limit the number of source files to process.
                     Useful in TEST_MODE to avoid processing cached full-mode data.

    Returns:
        Number of converted files accounted for on Drive: outputs of targets
        already complete, files found on Drive, and files converted in this
        call. Failed conversions are not counted.
    """
    source_files = collect_source_files(source_dir)
    if max_sources:
        source_files = source_files[:max_sources]
    target_files = sorted(glob.glob(os.path.join(target_dir, "*.wav")))

    n_sources = len(source_files)
    total_targets = len(target_files)
    total_conversions = n_sources * total_targets

    # Status file on Drive tracks completed targets
    status_path = os.path.join(drive_output_dir, "_completion_status.json")
    completion = {"completed_targets": [], "total_files": 0}
    if os.path.exists(status_path):
        try:
            with open(status_path) as f:
                loaded = json.load(f)
            if isinstance(loaded, dict):
                completion.update(loaded)
        except ValueError:
            # A truncated status file (e.g. disconnect during the write) must
            # not block the resume; files already on Drive are still detected
            # per target below.
            print(f"{label}WARNING: unreadable status file, re-checking every target")

    completed_set = set(completion.get("completed_targets", []))
    # Number of sources each target was completed with. Older status files do
    # not have this key; their entries are trusted as they are.
    source_counts = dict(completion.get("source_counts", {}))

    def is_complete(name):
        """True if *name* was completed with at least the current source count."""
        if name not in completed_set:
            return False
        recorded = source_counts.get(name)
        return recorded is None or recorded >= n_sources

    target_names = [Path(p).stem for p in target_files]
    already_done = sum(1 for name in target_names if is_complete(name))

    print(f"{label}Sources: {n_sources}, Targets: {total_targets} -> {total_conversions} conversions")
    print(f"{label}Already completed targets: {already_done}/{total_targets}")

    if already_done == total_targets:
        total_files = completion.get("total_files", 0)
        print(f"{label}ALL TARGETS COMPLETE! {total_files} files on Drive.")
        return total_files

    # Fixed baseline for the rate/ETA computation: work finished before this call.
    done_at_start = already_done * n_sources
    done_total = done_at_start
    t0 = time.time()
    errors = 0

    # Local scratch folder for the converter output before it is copied to Drive.
    scratch_dir = LOCAL_VC_POS if "pos" in label.lower() else LOCAL_VC_NEG
    os.makedirs(scratch_dir, exist_ok=True)

    for tgt_idx, tgt_path in enumerate(target_files):
        tgt_name = Path(tgt_path).stem

        if is_complete(tgt_name):
            continue

        # Create target subdirectory on Drive
        tgt_drive_dir = os.path.join(drive_output_dir, tgt_name)
        os.makedirs(tgt_drive_dir, exist_ok=True)

        # Files already on Drive for this target (partial earlier run)
        existing_on_drive = set(os.listdir(tgt_drive_dir))

        tgt_t0 = time.time()
        converted_this_target = 0
        skipped_this_target = 0
        failed_this_target = 0

        for src_idx, src_path in enumerate(source_files):
            out_name = make_output_name(src_path, source_dir, tgt_name)

            if out_name in existing_on_drive:
                skipped_this_target += 1
            else:
                # Convert to local disk first (fast), then copy to Drive
                local_path = os.path.join(scratch_dir, out_name)
                drive_path = os.path.join(tgt_drive_dir, out_name)
                try:
                    vc_model.voice_conversion_to_file(
                        source_wav=src_path, target_wav=tgt_path, file_path=local_path,
                    )
                    shutil.copy2(local_path, drive_path)
                    converted_this_target += 1
                except Exception as e:
                    errors += 1
                    failed_this_target += 1
                    if errors <= 10:
                        print(f"{label}Error: {Path(src_path).stem} + {tgt_name}: {e}")
                    # Do not leave a partial copy on Drive: it would be taken
                    # for a finished conversion on the next run.
                    if os.path.exists(drive_path):
                        try:
                            os.remove(drive_path)
                        except OSError:
                            pass
                finally:
                    # Free local space, whether or not the conversion succeeded
                    if os.path.exists(local_path):
                        os.remove(local_path)
            done_total += 1

            # Progress every 100 sources of the current target
            if (src_idx + 1) % 100 == 0:
                elapsed = time.time() - t0
                rate = (done_total - done_at_start) / max(elapsed, 1)
                eta = (total_conversions - done_total) / max(rate, 0.01)
                pct = 100 * done_total / total_conversions
                bar_len = int(pct / 2)
                bar = "#" * bar_len + "." * (50 - bar_len)
                print(
                    f"{label}[{bar}] {done_total}/{total_conversions} ({pct:.1f}%) "
                    f"| {rate:.1f}/s | ETA {eta/60:.0f}min | target {tgt_idx+1}/{total_targets} "
                    f"| err={errors}"
                )

        # Record the outcome. A target with failed conversions stays open so
        # that only its missing files are retried on the next run.
        tgt_elapsed = time.time() - tgt_t0
        if failed_this_target == 0:
            completed_set.add(tgt_name)
            source_counts[tgt_name] = n_sources
            outcome = "DONE"
        else:
            completed_set.discard(tgt_name)
            source_counts.pop(tgt_name, None)
            outcome = f"INCOMPLETE ({failed_this_target} failed, will retry on next run)"
        completion["completed_targets"] = sorted(completed_set)
        completion["source_counts"] = source_counts
        completion["total_files"] = done_total - errors
        with open(status_path, "w") as f:
            json.dump(completion, f)

        targets_complete = sum(1 for name in target_names if is_complete(name))
        print(
            f"{label}Target {tgt_name} {outcome}: {converted_this_target} converted, "
            f"{skipped_this_target} skipped in {tgt_elapsed:.0f}s "
            f"[{targets_complete}/{total_targets} targets complete]"
        )

    elapsed = time.time() - t0
    print(f"\n{label}ALL DONE: {done_total} conversions, {errors} errors in {elapsed/60:.1f}min")
    return done_total - errors

print("VC functions defined")

In [None]:
# Voice-convert the positive ("тимошка") TTS clips against every target voice.
_pos_banner = "=" * 60
print(_pos_banner)
print("POSITIVE SAMPLES: Voice Conversion")
print(_pos_banner)

# In TEST_MODE only the first SAMPLES_PER_VOICE sources are converted.
_pos_source_limit = SAMPLES_PER_VOICE if TEST_MODE else None
n_pos_vc = run_vc_by_target(
    DRIVE_TTS_POS,
    DRIVE_VC_POS,
    DRIVE_TARGETS,
    label="[POS] ",
    max_sources=_pos_source_limit,
)
print(f"\nTotal positive VC samples: {n_pos_vc}")

In [None]:
# Voice-convert the negative (similar-sounding) TTS clips against every target voice.
_neg_banner = "=" * 60
print(_neg_banner)
print("NEGATIVE SAMPLES: Voice Conversion")
print(_neg_banner)

# In TEST_MODE only the first NEG_SAMPLES_PER sources are converted.
_neg_source_limit = NEG_SAMPLES_PER if TEST_MODE else None
n_neg_vc = run_vc_by_target(
    DRIVE_TTS_NEG,
    DRIVE_VC_NEG,
    DRIVE_TARGETS,
    label="[NEG] ",
    max_sources=_neg_source_limit,
)
print(f"\nTotal negative VC samples: {n_neg_vc}")

In [None]:
# Resample check (16kHz mono)
import torchaudio

def ensure_16k_mono(directory):
    """Spot-check up to 100 randomly chosen WAVs below *directory*.

    A sampled file that has more than one channel or a sample rate other than
    16000 is down-mixed and/or resampled and then overwritten in place.
    Files outside the random sample are not touched.

    Returns:
        ``(fixed, checked)``: number of files rewritten and number sampled.
    """
    import random

    wav_paths = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith('.wav')
    ]
    picked = random.sample(wav_paths, min(100, len(wav_paths)))

    repaired = 0
    for wav_path in picked:
        try:
            waveform, sr = torchaudio.load(wav_path)
            is_multichannel = waveform.shape[0] > 1
            wrong_rate = sr != 16000
            if is_multichannel:
                # Average the channels into a single one.
                waveform = waveform.mean(dim=0, keepdim=True)
            if wrong_rate:
                waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
            if is_multichannel or wrong_rate:
                torchaudio.save(wav_path, waveform, 16000)
                repaired += 1
        except Exception as e:
            print(f"  Error: {wav_path}: {e}")
    return repaired, len(picked)

# Spot-check the converted clips on Drive, positives first and then negatives.
print("Spot-checking positive samples...")
_pos_check = ensure_16k_mono(DRIVE_VC_POS)
fixed_pos, checked_pos = _pos_check
print(f"  Checked {checked_pos}, fixed {fixed_pos}")

print("Spot-checking negative samples...")
_neg_check = ensure_16k_mono(DRIVE_VC_NEG)
fixed_neg, checked_neg = _neg_check
print(f"  Checked {checked_neg}, fixed {fixed_neg}")

In [None]:
# Listen to a few voice-converted samples
import IPython.display as ipd
import random

# Pick one target-voice folder at random and play up to three of its clips.
vc_dirs = []
for _entry in os.listdir(DRIVE_VC_POS):
    if os.path.isdir(os.path.join(DRIVE_VC_POS, _entry)):
        vc_dirs.append(_entry)

if vc_dirs:
    sample_dir = os.path.join(DRIVE_VC_POS, random.choice(vc_dirs))
    samples = glob.glob(os.path.join(sample_dir, "*.wav"))[:3]
    for f in samples:
        clip_name = os.path.basename(f)
        print(f"Playing: {clip_name}")
        ipd.display(ipd.Audio(f))

## Stage 6: Download training data

In [None]:
%%bash
# Download the precomputed openWakeWord feature files used for training.
cd /content/timoshka || exit 1

FEATURES_URL="https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main"

# fetch_features NAME
# Download NAME into a staging folder and move it into place only after wget
# succeeded. An interrupted transfer therefore never looks like a finished
# file, and "wget -c" resumes the staged partial file on the next run.
fetch_features() {
    local name="$1"
    mkdir -p .partial
    if ( cd .partial && wget -q --show-progress -c "${FEATURES_URL}/${name}" ); then
        mv ".partial/${name}" "${name}"
    else
        echo "ERROR: download of ${name} failed; re-run this cell to resume" >&2
        return 1
    fi
}

if [ ! -f "openwakeword_features_ACAV100M_2000_hrs_16bit.npy" ]; then
    echo "Downloading ACAV100M features (~6 GB)..."
    fetch_features "openwakeword_features_ACAV100M_2000_hrs_16bit.npy" || exit 1
else
    echo "ACAV100M features already downloaded"
fi

if [ ! -f "validation_set_features.npy" ]; then
    echo "Downloading validation set..."
    fetch_features "validation_set_features.npy" || exit 1
else
    echo "Validation set already downloaded"
fi

# bash's builtin echo does not expand "\n", so emit the blank line separately.
echo
echo "Data files:"
ls -lh *.npy

In [None]:
%%bash
# Download room impulse responses and background noise used for augmentation.
cd /content/timoshka || exit 1

# A data folder counts as "present" only when it contains something. The
# folders are created before downloading, so testing for mere existence would
# treat a failed download as finished on every later run.
if [ -z "$(ls -A mit_rirs 2>/dev/null)" ]; then
    echo "Downloading MIT RIRs..."
    mkdir -p mit_rirs
    if wget -q --show-progress -O mit_rirs.zip \
        https://mcdermottlab.mit.edu/Reverb/IRMAudio/Audio.zip; then
        unzip -q mit_rirs.zip -d mit_rirs/ 2>/dev/null \
            || echo "WARNING: could not fully extract MIT RIRs"
    else
        echo "WARNING: MIT RIR download failed; re-run this cell to retry"
    fi
    rm -f mit_rirs.zip
else
    echo "MIT RIRs already present"
fi

if [ -z "$(ls -A audioset_16k 2>/dev/null)" ]; then
    echo "Downloading AudioSet background noise subset..."
    mkdir -p audioset_16k
    wget -q --show-progress -O audioset_16k.tar.gz \
        https://huggingface.co/datasets/davidscripka/openwakeword_features/resolve/main/audioset_16k_sample.tar.gz \
        2>/dev/null || echo "Note: AudioSet subset not found."
    if [ -f audioset_16k.tar.gz ]; then
        tar -xzf audioset_16k.tar.gz -C audioset_16k/ 2>/dev/null || true
        rm -f audioset_16k.tar.gz
    fi
fi

# The training config lists this folder as a background path, so it must exist
# even though nothing is downloaded into it here.
mkdir -p fma
echo "Background data ready"

## Stage 7: Train openWakeWord

In [None]:
import yaml, os, glob

# Count the WAV files available on Drive for each sample category.
def _count_wavs(top):
    """Return the number of ``.wav`` files anywhere below *top*."""
    return sum(
        1
        for _root, _dirs, _names in os.walk(top)
        for _name in _names
        if _name.endswith('.wav')
    )

n_pos_vc = _count_wavs(DRIVE_VC_POS)
n_neg_vc = _count_wavs(DRIVE_VC_NEG)
n_pos_tts = _count_wavs(DRIVE_TTS_POS)
n_neg_tts = _count_wavs(DRIVE_TTS_NEG)

_summary_rows = (
    f"VC positive:  {n_pos_vc}",
    f"VC negative:  {n_neg_vc}",
    f"TTS positive: {n_pos_tts}",
    f"TTS negative: {n_neg_tts}",
    f"Total positive: {n_pos_vc + n_pos_tts}",
    f"Total negative: {n_neg_vc + n_neg_tts}",
)
for _row in _summary_rows:
    print(_row)

In [None]:
import shutil

# Flat local folders that collect every positive / negative clip for training.
# They live on the VM's local disk for speed.
ALL_POSITIVE_DIR, ALL_NEGATIVE_DIR = (
    os.path.join(LOCAL_BASE, _flat_name)
    for _flat_name in ("all_positive", "all_negative")
)
for _flat_dir in (ALL_POSITIVE_DIR, ALL_NEGATIVE_DIR):
    os.makedirs(_flat_dir, exist_ok=True)

def link_or_copy(src, dst):
    """Make *dst* refer to *src*: symlink if possible, otherwise copy.

    An existing *dst* is left untouched, except for a dangling symlink, which
    is replaced. Without that check the dangling link would be followed by the
    copy fallback and the data written to the link's missing target.

    Args:
        src: Path of the existing source file.
        dst: Path to create.
    """
    if os.path.islink(dst) and not os.path.exists(dst):
        # Dangling link left over from an earlier run: drop it and re-create.
        os.remove(dst)
    if os.path.lexists(dst):
        return
    try:
        os.symlink(src, dst)
    except OSError:
        # Symlinks are not available here; fall back to a real copy.
        shutil.copy2(src, dst)

def _link_tree(top, dest_dir, prefix):
    """Link every WAV below *top* into *dest_dir* and return how many were seen.

    The flat name is ``<prefix><containing folder>_<file name>``, so clips with
    the same file name in different folders do not collide.
    """
    seen = 0
    for root, _dirs, names in os.walk(top):
        folder = os.path.basename(root)
        for name in names:
            if not name.endswith('.wav'):
                continue
            link_or_copy(os.path.join(root, name), os.path.join(dest_dir, f"{prefix}{folder}_{name}"))
            seen += 1
    return seen

# Positive: VC from Drive + TTS from Drive
print("Linking positive samples...")
count = _link_tree(DRIVE_VC_POS, ALL_POSITIVE_DIR, "") + _link_tree(DRIVE_TTS_POS, ALL_POSITIVE_DIR, "tts_")
print(f"  Linked {count} positive samples")

# Negative: VC from Drive + TTS from Drive
print("Linking negative samples...")
count = _link_tree(DRIVE_VC_NEG, ALL_NEGATIVE_DIR, "") + _link_tree(DRIVE_TTS_NEG, ALL_NEGATIVE_DIR, "tts_")
print(f"  Linked {count} negative samples")

# Report what actually ended up in the flat folders.
total_pos = len(os.listdir(ALL_POSITIVE_DIR))
total_neg = len(os.listdir(ALL_NEGATIVE_DIR))
print(f"\nAll positive: {total_pos}")
print(f"All negative: {total_neg}")

In [None]:
import yaml

# Working directory for train.py output.
TRAINING_DIR = os.path.join(LOCAL_BASE, "training")
os.makedirs(TRAINING_DIR, exist_ok=True)

# Training configuration passed to openWakeWord's train.py as a YAML file.
config = {
    "model_name": "timoshka",
    "target_phrase": ["\u0442\u0438\u043c\u043e\u0448\u043a\u0430"],
    "custom_negative_phrases": [],
    # Sample counts are set to zero here; the clips come from the earlier
    # TTS and voice-conversion stages of this notebook.
    "n_samples": 0,
    "n_samples_val": 0,
    "augmentation_rounds": 1,
    "augmentation_batch_size": 16,
    "piper_sample_generator_path": "/content/piper-sample-generator",
    "output_dir": TRAINING_DIR,
    "rir_paths": [os.path.join(LOCAL_BASE, "mit_rirs")],
    "background_paths": [
        os.path.join(LOCAL_BASE, "audioset_16k"),
        os.path.join(LOCAL_BASE, "fma"),
    ],
    "background_paths_duplication_rate": [1],
    "feature_data_files": {
        "ACAV100M_sample": os.path.join(
            LOCAL_BASE, "openwakeword_features_ACAV100M_2000_hrs_16bit.npy"
        ),
    },
    "false_positive_validation_data_path": os.path.join(
        LOCAL_BASE, "validation_set_features.npy"
    ),
    "batch_n_per_class": {
        "ACAV100M_sample": 1024,
        "adversarial_negative": 50,
        "positive": 50,
    },
    "model_type": "dnn",
    "layer_size": 32,
    "steps": TRAINING_STEPS,
    "max_negative_weight": 1500,
    "target_false_positives_per_hour": 0.2,
    "target_accuracy": 0.7,
    "target_recall": 0.5,
}

config_path = os.path.join(LOCAL_BASE, "timoshka_config.yaml")
with open(config_path, "w") as f:
    # allow_unicode keeps the Cyrillic target phrase readable in the YAML.
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True)

print(f"Config saved ({TRAINING_STEPS} steps)")
# Echo the file as written; the context manager closes the handle right away.
with open(config_path) as f:
    print(f.read())

In [None]:
# Prepare directory structure expected by openWakeWord train.py
# NOTE(review): confirm the layout against the installed train.py. Upstream
# appears to read clips from <output_dir>/<model_name>/positive_train (and
# similar), not from <output_dir>/<phrase>/positive.
phrase_dir = os.path.join(TRAINING_DIR, "\u0442\u0438\u043c\u043e\u0448\u043a\u0430")
pos_link = os.path.join(phrase_dir, "positive")
neg_link = os.path.join(phrase_dir, "negative")

os.makedirs(phrase_dir, exist_ok=True)

# lexists() also reports dangling symlinks, which exists() misses; a stale
# link would otherwise make os.symlink() fail with FileExistsError.
if os.path.lexists(pos_link):
    os.remove(pos_link)
os.symlink(ALL_POSITIVE_DIR, pos_link)

if os.path.lexists(neg_link):
    os.remove(neg_link)
os.symlink(ALL_NEGATIVE_DIR, neg_link)

print(f"Positive -> {ALL_POSITIVE_DIR} ({len(os.listdir(pos_link))} files)")
print(f"Negative -> {ALL_NEGATIVE_DIR} ({len(os.listdir(neg_link))} files)")

In [None]:
import subprocess, sys

# Apply speechbrain patch before running train.py (in case pip overwrote it)
def patch_speechbrain(candidate_files=None):
    """Fix speechbrain/torchaudio compatibility inline.

    Rewrites the ``available_backends = torchaudio.list_audio_backends()``
    line of speechbrain's ``torch_audio_backend.py`` so that the call is
    guarded by ``hasattr`` and falls back to an empty list. Only the first
    candidate file that exists is processed. Calling the function again on an
    already-patched file leaves it untouched.

    Args:
        candidate_files: Optional iterable of file paths to try, in order.
            Defaults to the ``dist-packages`` locations under
            ``/usr/local/lib`` for Python 3.12, 3.11 and 3.10.

    Returns:
        None. The outcome is reported via ``print``.
    """
    import pathlib

    if candidate_files is None:
        candidate_files = [
            f'/usr/local/lib/python{pyver}/dist-packages/speechbrain/utils/torch_audio_backend.py'
            for pyver in ["3.12", "3.11", "3.10"]
        ]

    old = 'available_backends = torchaudio.list_audio_backends()'
    new = 'available_backends = torchaudio.list_audio_backends() if hasattr(torchaudio, "list_audio_backends") else []'

    for candidate in candidate_files:
        f = pathlib.Path(candidate)
        if not f.exists():
            continue
        code = f.read_text(encoding="utf-8")
        # `new` starts with `old`, so checking only `old in code` would also
        # match an already-patched file and append the guard again on every
        # call; the extra `new not in code` test keeps the patch idempotent.
        if old in code and new not in code:
            f.write_text(code.replace(old, new), encoding="utf-8")
            print(f"Patched {f}")
        # Stop at the first existing file, whether or not it needed patching.
        return
    print("WARNING: speechbrain torch_audio_backend.py not found")

# Re-apply the speechbrain patch right before launching train.py.
patch_speechbrain()

print("Starting augmentation...")
# Run train.py in a child process with the --augment_clips and --overwrite
# flags; stdout/stderr are captured as text rather than streamed.
augment_cmd = [
    sys.executable,
    "openWakeWord/openwakeword/train.py",
    "--training_config",
    "/content/timoshka/timoshka_config.yaml",
    "--augment_clips",
    "--overwrite",
]
result = subprocess.run(augment_cmd, cwd="/content", capture_output=True, text=True)

# A [-3000:] slice already yields the whole string when it is shorter than
# 3000 characters, so only the tail of long output is shown.
print(result.stdout[-3000:])
if result.returncode == 0:
    print("Augmentation complete!")
else:
    print(f"\nSTDERR:\n{result.stderr[-3000:]}")
    print(f"\nAugmentation failed with exit code {result.returncode}")

In [None]:
import subprocess, sys

# Make sure the speechbrain patch is in place before launching train.py.
patch_speechbrain()

print("Starting training...")
# Run train.py in a child process with the --train_model flag; stdout/stderr
# are captured as text rather than streamed.
train_cmd = [
    sys.executable,
    "openWakeWord/openwakeword/train.py",
    "--training_config",
    "/content/timoshka/timoshka_config.yaml",
    "--train_model",
]
result = subprocess.run(train_cmd, cwd="/content", capture_output=True, text=True)

# A [-3000:] slice already yields the whole string when it is shorter than
# 3000 characters, so only the tail of long output is shown.
print(result.stdout[-3000:])
if result.returncode == 0:
    print("Training complete!")
else:
    print(f"\nSTDERR:\n{result.stderr[-3000:]}")
    print(f"\nTraining failed with exit code {result.returncode}")

## Stage 8: Convert to TFLite

In [None]:
import subprocess, sys

# Make sure the speechbrain patch is in place before launching train.py.
patch_speechbrain()

print("Converting to TFLite...")
# Run train.py in a child process with the --convert_to_tflite flag;
# stdout/stderr are captured as text rather than streamed.
convert_cmd = [
    sys.executable,
    "openWakeWord/openwakeword/train.py",
    "--training_config",
    "/content/timoshka/timoshka_config.yaml",
    "--convert_to_tflite",
]
result = subprocess.run(convert_cmd, cwd="/content", capture_output=True, text=True)

# A [-3000:] slice already yields the whole string when it is shorter than
# 3000 characters, so only the tail of long output is shown.
print(result.stdout[-3000:])
if result.returncode == 0:
    print("Conversion complete!")
else:
    print(f"\nSTDERR:\n{result.stderr[-3000:]}")
    print(f"\nConversion failed with exit code {result.returncode}")

In [None]:
import shutil

# Training output directory under LOCAL_BASE.
TRAINING_DIR = os.path.join(LOCAL_BASE, "training")

# Recursively collect model files of both formats from the training directory.
tflite_pattern = os.path.join(TRAINING_DIR, "**/*.tflite")
onnx_pattern = os.path.join(TRAINING_DIR, "**/*.onnx")
tflite_files = glob.glob(tflite_pattern, recursive=True)
onnx_files = glob.glob(onnx_pattern, recursive=True)

print("Generated model files:")
for model_file in [*tflite_files, *onnx_files]:
    size_kb = os.path.getsize(model_file) / 1024
    print(f"  {model_file} ({size_kb:.1f} KB)")

if not tflite_files:
    print("\nERROR: No .tflite file found. Check training logs above.")
else:
    # Copy the first match to Drive so it outlives the Colab runtime.
    drive_model = os.path.join(DRIVE_OUTPUT, "timoshka.tflite")
    shutil.copy2(tflite_files[0], drive_model)
    print(f"\nModel saved to Drive: {drive_model}")
    saved_kb = os.path.getsize(drive_model) / 1024
    print(f"Size: {saved_kb:.1f} KB")

## Stage 9: Test the model

In [None]:
import torchaudio
import numpy as np
from openwakeword.model import Model

# Load the .tflite model that was copied to Drive.
model_path = os.path.join(DRIVE_OUTPUT, "timoshka.tflite")
oww_model = Model(wakeword_models=[model_path])

# The first key of the loaded-models mapping is used as the model name.
loaded_names = list(oww_model.models.keys())
model_name = loaded_names[0]
print(f"Loaded model: {model_name}")

def test_wav(wav_path, model, name):
    """Return the highest score ``model`` reports for ``name`` over one file.

    The file is loaded with torchaudio, resampled to 16 kHz when its sample
    rate differs, averaged down to a single channel, converted to int16 and
    fed to ``model.predict`` in consecutive 1280-sample chunks.

    Args:
        wav_path: Path of the audio file to score.
        model: Object providing ``reset()`` and ``predict(chunk)``;
            ``predict`` must return a mapping indexable by ``name``.
        name: Key of the score to read from each prediction.

    Returns:
        The maximum score over all full chunks, or ``0.0`` when the clip is
        shorter than one chunk. A trailing partial chunk is not scored.
    """
    waveform, sr = torchaudio.load(wav_path)
    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # squeeze(0) removes only the channel axis, so a one-sample clip stays 1-D
    # and len() below keeps working.
    samples = waveform.squeeze(0).numpy()
    # Clip before scaling: samples outside [-1, 1] would otherwise wrap around
    # when cast to int16.
    audio = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)
    model.reset()
    chunk_size = 1280
    max_score = 0.0
    # "+ 1" makes the bound include the last full chunk; without it a clip
    # whose length is an exact multiple of chunk_size loses its final chunk.
    for i in range(0, len(audio) - chunk_size + 1, chunk_size):
        chunk = audio[i:i+chunk_size]
        prediction = model.predict(chunk)
        score = prediction[name]
        max_score = max(max_score, score)
    return max_score

In [None]:
import random

print("=" * 50)
print("POSITIVE SAMPLES (should trigger)")
print("=" * 50)

# Collect every .wav file below DRIVE_VC_POS. A comprehension keeps the walk
# variables local, so the notebook-level name `files` is not rebound.
pos_files = [
    os.path.join(root, fname)
    for root, _dirs, fnames in os.walk(DRIVE_VC_POS)
    for fname in fnames
    if fname.endswith('.wav')
]
# Score a random subset of at most 50 clips.
test_pos = random.sample(pos_files, min(50, len(pos_files)))

pos_scores = [test_wav(path, oww_model, model_name) for path in test_pos]

triggered = sum(1 for s in pos_scores if s >= 0.5)
print(f"Tested: {len(test_pos)}")
if test_pos:
    print(f"Triggered (>0.5): {triggered}/{len(test_pos)} ({100*triggered/len(test_pos):.0f}%)")
    print(f"Mean: {np.mean(pos_scores):.3f}, Min: {np.min(pos_scores):.3f}, Max: {np.max(pos_scores):.3f}")
else:
    # With no clips the percentage would divide by zero and np.min/np.max
    # raise on an empty sequence, so report the situation instead.
    print(f"WARNING: no .wav files found under {DRIVE_VC_POS}; nothing to evaluate.")

In [None]:
print("=" * 50)
print("NEGATIVE SAMPLES (should NOT trigger)")
print("=" * 50)

# Collect every .wav file below DRIVE_VC_NEG. A comprehension keeps the walk
# variables local, so the notebook-level name `files` is not rebound.
neg_files = [
    os.path.join(root, fname)
    for root, _dirs, fnames in os.walk(DRIVE_VC_NEG)
    for fname in fnames
    if fname.endswith('.wav')
]
# Score a random subset of at most 100 clips.
test_neg = random.sample(neg_files, min(100, len(neg_files)))

neg_scores = [test_wav(path, oww_model, model_name) for path in test_neg]
# Clips scoring at or above the 0.5 threshold count as false positives.
false_positives = [
    (os.path.basename(path), score)
    for path, score in zip(test_neg, neg_scores)
    if score >= 0.5
]

print(f"Tested: {len(test_neg)}")
if test_neg:
    print(f"False positives (>0.5): {len(false_positives)}/{len(test_neg)} ({100*len(false_positives)/len(test_neg):.1f}%)")
    print(f"Mean: {np.mean(neg_scores):.3f}, Min: {np.min(neg_scores):.3f}, Max: {np.max(neg_scores):.3f}")
else:
    # With no clips the percentage would divide by zero and np.min/np.max
    # raise on an empty sequence, so report the situation instead.
    print(f"WARNING: no .wav files found under {DRIVE_VC_NEG}; nothing to evaluate.")

if false_positives:
    print("\nWorst false positives:")
    # Highest scores first, limited to the ten worst offenders.
    for fname, score in sorted(false_positives, key=lambda x: -x[1])[:10]:
        print(f"  {fname}: {score:.3f}")

In [None]:
# Summary: turn the positive/negative scores into rates and a coarse verdict.
print("\n" + "=" * 50)
print("SUMMARY")
print("=" * 50)

if not pos_scores or not test_neg:
    # Either rate would divide by zero when its sample set is empty.
    print("Cannot compute rates: no positive and/or negative samples were tested.")
else:
    # Both rates use the same 0.5 score threshold as the evaluation cells.
    tp_rate = sum(1 for s in pos_scores if s >= 0.5) / len(pos_scores) * 100
    fp_rate = len(false_positives) / len(test_neg) * 100

    print(f"True positive rate:  {tp_rate:.0f}%")
    print(f"False positive rate: {fp_rate:.1f}%")
    print()

    if TEST_MODE:
        print("TEST MODE results (not production quality).")
        print("Set TEST_MODE = False and re-run for full training.")
    elif tp_rate >= 70 and fp_rate < 5:
        print("Model looks good! Ready for deployment.")
    # NOTE(review): this tier looks only at tp_rate, so a model with a high
    # false positive rate is still reported as acceptable; confirm intent.
    elif tp_rate >= 50:
        print("Model is acceptable. Consider more training or data.")
    else:
        print("Model needs improvement.")

## Download model

In [None]:
from google.colab import files

# Offer the model stored on Drive as a browser download, if it exists.
model_path = os.path.join(DRIVE_OUTPUT, "timoshka.tflite")
if not os.path.exists(model_path):
    print("Model file not found. Check training logs above.")
else:
    files.download(model_path)
    size_kb = os.path.getsize(model_path) / 1024
    print(f"Downloaded: timoshka.tflite ({size_kb:.1f} KB)")

## Deploy to Home Assistant

After downloading `timoshka.tflite`:

```bash
# 1. Copy model to server
scp timoshka.tflite v@plex.local:/home/v/home-assistant/openwakeword-data/timoshka.tflite

# 2. Restart the openwakeword container
ssh v@plex.local 'cd /home/v/home-assistant && docker compose restart openwakeword'

# 3. In Home Assistant:
#    Settings -> Voice assistants -> your assistant -> Wake word -> select "timoshka"
#
# No changes needed to ESPHome/Atom Echo — wake word is processed server-side.
```