# Voice Clone (Voice Conversion) — Folder Batch

Applies voice conversion to every `.wav` file in a folder (searched recursively) using one or more **reference voice** samples.
Produces one output subfolder per reference voice, mirroring the input folder's structure.

### Supported engines

| Engine | Backend | Notes |
|--------|---------|-------|
| **chatterbox** | PyTorch | Best quality; GPU strongly recommended |
| **chatterbox_onnx** | ONNX Runtime | CPU-friendly, slightly lower quality |

---

> A GPU is strongly recommended for `chatterbox`; a CPU is sufficient for `chatterbox_onnx`.

In [None]:
import os

# ════════════════════════════════════════════
# ⚙️  CONFIGURATION
# ════════════════════════════════════════════
# Every setting can be overridden by exporting the environment variable of the
# same name before this cell runs. `os.environ.setdefault` stores the fallback
# only when the variable is unset and returns whichever value ends up in the
# environment, so each constant is resolved in a single step.

# Folder that is searched for the source WAVs.
INPUT_DIR = os.environ.setdefault("INPUT_DIR", "/data/audio_in")
# Output root; one subdirectory is created per reference voice.
OUTPUT_DIR = os.environ.setdefault("OUTPUT_DIR", "/data/audio_cloned")
# Folder holding the reference voice WAVs.
REF_VOICES_DIR = os.environ.setdefault("REF_VOICES_DIR", "/data/ref_voices")
# Conversion engine: chatterbox | chatterbox_onnx
VC_ENGINE = os.environ.setdefault("VC_ENGINE", "chatterbox")
# Compute device: cuda | cpu
VC_DEVICE = os.environ.setdefault("VC_DEVICE", "cuda")

# Echo the effective configuration so it is recorded in the notebook output.
print(
    f"Input:       {INPUT_DIR}",
    f"Output:      {OUTPUT_DIR}",
    f"Ref voices:  {REF_VOICES_DIR}",
    f"Engine:      {VC_ENGINE}",
    f"Device:      {VC_DEVICE}",
    sep="\n",
)

In [None]:
vc_engine = os.environ.get("VC_ENGINE", "chatterbox")
if vc_engine == "chatterbox_onnx":
    !pip install --quiet --break-system-packages chatterbox-onnx tqdm
else:
    !pip install --quiet --break-system-packages chatterbox torch torchaudio tqdm

In [None]:
import os
from pathlib import Path
from tqdm import tqdm

# Conversion pipeline: discover the inputs, load the model once, then convert
# every source file against every reference voice. Outputs that already exist
# are skipped, so an interrupted run can be resumed by re-running this cell.

# Fail fast on a misspelled engine name instead of silently falling through
# to the PyTorch engine.
SUPPORTED_ENGINES = ("chatterbox", "chatterbox_onnx")
if VC_ENGINE not in SUPPORTED_ENGINES:
    raise ValueError(
        f"Unsupported VC_ENGINE {VC_ENGINE!r}; expected one of {SUPPORTED_ENGINES}"
    )

# Discover reference voices
ref_root = Path(REF_VOICES_DIR)
ref_files = sorted(ref_root.rglob("*.wav"))
if not ref_files:
    raise FileNotFoundError(f"No .wav files found in {REF_VOICES_DIR}")
print(f"Found {len(ref_files)} reference voice(s)")

# Each voice gets an output folder named after the reference file's stem.
# Discovery is recursive, so two references in different subfolders can share
# a stem; they would then write into the same folder and the second voice
# would report every file as "skipped". Only colliding stems are disambiguated
# (with their path relative to REF_VOICES_DIR), so flat layouts are unchanged.
stem_counts = {}
for ref_path in ref_files:
    stem_counts[ref_path.stem] = stem_counts.get(ref_path.stem, 0) + 1
voice_names = {
    ref_path: (
        ref_path.stem
        if stem_counts[ref_path.stem] == 1
        else "__".join(ref_path.relative_to(ref_root).with_suffix("").parts)
    )
    for ref_path in ref_files
}

# Discover source audio
source_files = sorted(Path(INPUT_DIR).rglob("*.wav"))
print(f"Found {len(source_files)} source audio files")

# Load VC model once
print(f"Loading {VC_ENGINE} model on {VC_DEVICE}...")
if VC_ENGINE == "chatterbox_onnx":
    from chatterbox_onnx import ChatterboxOnnx
    vc_model = ChatterboxOnnx(device=VC_DEVICE)
else:
    # torchaudio is only used to save the PyTorch engine's output. Importing
    # it here (once) makes a missing install fail immediately rather than
    # being reported as one failure per file inside the loop.
    import torchaudio as ta
    from chatterbox.vc import ChatterboxVC
    vc_model = ChatterboxVC.from_pretrained(VC_DEVICE)
print("  Model loaded.")

# Process each reference voice
for ref_path in ref_files:
    voice_name = voice_names[ref_path]
    voice_out = Path(OUTPUT_DIR) / voice_name
    voice_out.mkdir(parents=True, exist_ok=True)

    done, skipped, failed = 0, 0, 0
    for src in tqdm(source_files, desc=f"VC → {voice_name}", unit="file"):
        # Mirror the source's location under INPUT_DIR inside the voice folder.
        rel = src.relative_to(INPUT_DIR)
        dst = voice_out / rel

        # Resume support: an existing output is treated as already converted.
        if dst.exists():
            skipped += 1
            continue

        dst.parent.mkdir(parents=True, exist_ok=True)

        try:
            if VC_ENGINE == "chatterbox_onnx":
                vc_model.voice_convert(
                    source_audio_path=str(src),
                    target_voice_path=str(ref_path),
                    output_file_name=str(dst),
                )
            else:
                wav = vc_model.generate(
                    audio=str(src),
                    target_voice_path=str(ref_path),
                )
                ta.save(str(dst), wav, vc_model.sr)
            done += 1
        except Exception as e:
            # dst did not exist before this attempt, so anything there now is
            # a partial write. Remove it so a later run retries the file
            # instead of skipping it as already done.
            try:
                dst.unlink(missing_ok=True)
            except OSError:
                pass
            tqdm.write(f"  FAIL {src.name}: {e}")
            failed += 1

    print(f"  {voice_name}: {done} converted, {skipped} skipped, {failed} failed")

# Cleanup
del vc_model
try:
    # Best effort only: torch is not installed by the dependency cell when the
    # ONNX engine is selected, so any failure here is deliberately ignored.
    import torch
    torch.cuda.empty_cache()
except Exception:
    pass

print("\nAll voices done.")