# Piper TTS Preprocessing Notebook

This notebook preprocesses audio data for Piper TTS training.

## What This Does
1. Sets up the environment with Python 3.10 and **espeak-ng**
2. Unzips your **wavs.zip** and copies **metadata.csv** from Google Drive
3. Preprocesses data **locally** on Colab (very fast)
4. **Zips the output** and saves it back to Google Drive

## Requirements (in Google Drive Root)
- `wavs.zip`: A zip file containing your wav files
- `metadata.csv`: Transcript file with one `wavs/filename.wav|Text` line per audio file

## Important
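Re-run Cell 1 after every kernel restart until it prints **Setup complete!** The first run installs condacolab and restarts the kernel; a later run may restart it once more to switch to Python 3.10.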

---

In [None]:
#@markdown # **1. Environment Setup**
#@markdown ---
#@markdown Run this cell **twice**. First run installs condacolab and restarts the kernel.

import sys
import os

# Check if condacolab is set up
try:
    import condacolab
    condacolab.check()
    print(f"Condacolab ready! Python: {sys.version}")
except:
    print("Installing condacolab...")
    !pip install -q condacolab
    import condacolab
    condacolab.install()  # Restarts kernel

# Install Python 3.10 if needed
if sys.version_info >= (3, 11):
    print("Installing Python 3.10...")
    !rm -f /usr/local/conda-meta/pinned
    !conda install -y python=3.10 --override-channels -c conda-forge -q
    print("Restarting...")
    os.kill(os.getpid(), 9)

print(f"Python version: {sys.version}")

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Clone Piper
print("\nCloning Piper...")
!rm -rf /content/piper
!git clone -q https://github.com/rhasspy/piper.git /content/piper

# Install dependencies for preprocessing
print("\nInstalling dependencies...")
!sudo apt-get install -y espeak-ng
!pip install -q cython piper-phonemize==1.1.0 librosa numpy==1.26

# Build monotonic_align
%cd /content/piper/src/python
!bash build_monotonic_align.sh 2>/dev/null

print("\n" + "="*50)
print("Setup complete!")
print("="*50)

In [None]:
#@markdown # **2. Preprocess Dataset**
#@markdown ---
#@markdown Configure paths and run preprocessing. 
#@markdown **NOTE:** This script will unzip your `wavs.zip` from Drive to the local Colab VM for maximum speed.

import os
import shutil

#@markdown ### Input Files (in Google Drive):
drive_wavs_zip = "/content/drive/MyDrive/wavs.zip" #@param {type:"string"}
drive_metadata = "/content/drive/MyDrive/metadata.csv" #@param {type:"string"}

#@markdown ### Output folder (ZIP will be saved here):
drive_output_dir = "/content/drive/MyDrive/colab/piper/Steve" #@param {type:"string"}

#@markdown ### Model name:
model_name = "Steve" #@param {type:"string"}

#@markdown ### Language:
language = "en-us" #@param ["en-us", "en", "de", "fr", "es", "it", "pt-br", "nl", "pl", "ru", "zh"]

#@markdown ### Sample rate:
sample_rate = "22050" #@param ["16000", "22050"]

#@markdown ### Single speaker dataset?
single_speaker = True #@param {type:"boolean"}

# ----------------------------------------------------------
# Setup Local Directories
# ----------------------------------------------------------
local_input_dir = "/content/dataset_input"
local_output_dir = "/content/piper_preprocessed"
cache_dir = "/content/audio_cache"

# Clean up previous runs if needed
for d in [local_input_dir, local_output_dir, cache_dir]:
    if os.path.exists(d):
        shutil.rmtree(d)
    os.makedirs(d, exist_ok=True)

print("="*50)
print("Preparing Input Data (Local Copy)")
print("="*50)

# 1. Unzip wavs
print(f"[1/3] Unzipping wavs from {drive_wavs_zip}...")
if not os.path.exists(drive_wavs_zip):
    raise Exception(f"wavs file not found at: {drive_wavs_zip}")

!unzip -q "{drive_wavs_zip}" -d "{local_input_dir}"

# Handle case where zip contains a folder (e.g. wavs/file.wav) or just files
# We need the structure local_input_dir/wavs/file.wav
extracted_contents = os.listdir(local_input_dir)
if "wavs" in extracted_contents and os.path.isdir(os.path.join(local_input_dir, "wavs")):
    print("      dataset structure looks correct (found wavs folder).")
else:
    print("      'wavs' folder not found directly in zip. Creating it...")
    # If the user just zipped a bunch of wav files without a parent 'wavs' folder
    wavs_path = os.path.join(local_input_dir, "wavs")
    os.makedirs(wavs_path, exist_ok=True)
    for f in extracted_contents:
        src = os.path.join(local_input_dir, f)
        if os.path.isfile(src) and f.lower().endswith(".wav"):
            shutil.move(src, os.path.join(wavs_path, f))

# 2. Copy metadata
print(f"[2/3] Copying metadata from {drive_metadata}...")
if not os.path.exists(drive_metadata):
    raise Exception(f"Metadata file not found at: {drive_metadata}")

shutil.copy2(drive_metadata, os.path.join(local_input_dir, "metadata.csv"))

# 3. Run Preprocessing
print(f"[3/3] Running Piper processing...")
speaker_flag = "--single-speaker" if single_speaker else ""

%cd /content/piper/src/python

!python -m piper_train.preprocess \
    --language {language} \
    --input-dir "{local_input_dir}" \
    --output-dir "{local_output_dir}" \
    --dataset-name "{model_name}" \
    --dataset-format ljspeech \
    --sample-rate {sample_rate} \
    --cache-dir "{cache_dir}" \
    {speaker_flag}

print("\n" + "="*50)
print("Preprocessing complete! Packaging results...")
print("="*50)

# Zip results
zip_name = f"{model_name}_preprocessed.zip"
zip_path = os.path.join("/content", zip_name)
print(f"Zipping to {zip_path}...")
!cd "{local_output_dir}" && zip -r -q "{zip_path}" .

# Copy to Drive
os.makedirs(drive_output_dir, exist_ok=True)
drive_dest = os.path.join(drive_output_dir, zip_name)
print(f"Copying to Google Drive: {drive_dest}")
shutil.copy2(zip_path, drive_dest)

print("\nDONE! Download the zip file from your Drive to train locally.")

In [None]:
#@markdown # **3. Verify Preprocessing (from Local Temp)**
#@markdown ---
#@markdown Check that all required files were created in the local temp folder.

import os
import json

# We check the local temp dir where data was just generated.
# local_output_dir, cache_dir and drive_dest come from the preprocessing cell,
# so that cell must have been run in this session.
check_dir = local_output_dir

print(f"Checking output in: {check_dir}\n")

# Check config.json
config_path = os.path.join(check_dir, "config.json")
if os.path.exists(config_path):
    print("[OK] config.json exists")
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)
    print(f"     Sample rate: {config.get('audio', {}).get('sample_rate', 'N/A')}")
else:
    print("[ERROR] config.json missing!")

# Check dataset.jsonl
dataset_path = os.path.join(check_dir, "dataset.jsonl")
if os.path.exists(dataset_path):
    with open(dataset_path, encoding="utf-8") as f:
        # Blank lines are not entries and would break json.loads below
        lines = [line for line in f if line.strip()]
    print(f"[OK] dataset.jsonl exists ({len(lines)} entries)")

    # Check first entry for required fields
    if len(lines) > 0:
        first = json.loads(lines[0])
        has_audio_norm = "audio_norm_path" in first
        has_phoneme_ids = "phoneme_ids" in first
        print(f"     Has audio_norm_path: {has_audio_norm}")
        print(f"     Has phoneme_ids: {has_phoneme_ids}")

        if not has_audio_norm:
            print("\n[WARNING] audio_norm_path missing - audio cache may not have been created")
        else:
            # The field is only useful if the file it names is really on disk
            norm_path = first["audio_norm_path"]
            if isinstance(norm_path, str) and not os.path.exists(norm_path):
                print(f"\n[WARNING] audio_norm_path points to a missing file: {norm_path}")
else:
    print("[ERROR] dataset.jsonl missing!")

# Check audio cache. The preprocessing cell passes cache_dir to Piper via
# --cache-dir, so look there first and fall back to audio/ under the output.
audio_dir = cache_dir if os.path.isdir(cache_dir) else os.path.join(check_dir, "audio")
if os.path.isdir(audio_dir):
    pt_files = [f for f in os.listdir(audio_dir) if f.endswith('.pt')]
    if pt_files:
        print(f"[OK] audio cache exists ({len(pt_files)} .pt files in {audio_dir})")
    else:
        print(f"[WARNING] audio cache at {audio_dir} contains no .pt files! (Did preprocessing finish?)")

    # Only the contents of check_dir are zipped, so a cache elsewhere is not in the ZIP
    output_root = os.path.abspath(check_dir)
    if os.path.commonpath([output_root, os.path.abspath(audio_dir)]) != output_root:
        print(f"[WARNING] audio cache is outside {check_dir} and is therefore not part of the ZIP")
else:
    print("[WARNING] audio/ folder missing! (Did preprocessing finish?)")

print("\n" + "="*50)
print("Verification complete!")
print("="*50)
print(f"\nThe ZIP file should be in your Drive at: {drive_dest}")