# Piper TTS Preprocessing Notebook

This notebook preprocesses audio data for Piper TTS training.

## What This Does
1. Sets up the environment with Python 3.10
2. Preprocesses your audio dataset (creates phoneme mappings and audio tensors)
3. Saves preprocessed data to Google Drive

## Requirements
- Audio files: WAV format (16-bit, mono; 22050 Hz, or 16000 Hz if you select that sample rate in Cell 2) in a `wavs/` folder
- Transcript file: `metadata.csv` with format `wavs/filename.wav|Text spoken in audio`

## Important
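Run Cell 1 again after every kernel restart (at least **twice**): the first run installs condacolab and restarts the kernel, and a further restart follows if Python 3.10 has to be installed.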

---

In [None]:
#@markdown # **1. Environment Setup**
#@markdown ---
#@markdown Run this cell **twice**. First run installs condacolab and restarts the kernel.

import sys
import os

# Check if condacolab is set up
try:
    import condacolab
    condacolab.check()
    print(f"Condacolab ready! Python: {sys.version}")
except:
    print("Installing condacolab...")
    !pip install -q condacolab
    import condacolab
    condacolab.install()  # Restarts kernel

# Install Python 3.10 if needed
if sys.version_info >= (3, 11):
    print("Installing Python 3.10...")
    !rm -f /usr/local/conda-meta/pinned
    !conda install -y python=3.10 --override-channels -c conda-forge -q
    print("Restarting...")
    os.kill(os.getpid(), 9)

print(f"Python version: {sys.version}")

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Clone Piper
print("\nCloning Piper...")
!rm -rf /content/piper
!git clone -q https://github.com/rhasspy/piper.git /content/piper

# Install dependencies for preprocessing
print("\nInstalling dependencies...")
!pip install -q cython piper-phonemize==1.1.0 librosa numpy==1.26

# Build monotonic_align
%cd /content/piper/src/python
!bash build_monotonic_align.sh 2>/dev/null

print("\n" + "="*50)
print("Setup complete!")
print("="*50)

In [None]:
#@markdown # **2. Preprocess Dataset**
#@markdown ---
#@markdown Configure paths and run preprocessing.
#@markdown
#@markdown **Your Google Drive should have:**
#@markdown ```
#@markdown /MyDrive/colab/piper/
#@markdown   wavs/
#@markdown     0000000001.wav
#@markdown     0000000002.wav
#@markdown     ...
#@markdown   metadata.csv
#@markdown ```

import os

#@markdown ### Input folder (contains wavs/ and metadata.csv):
input_dir = "/content/drive/MyDrive/colab/piper" #@param {type:"string"}

#@markdown ### Output folder (preprocessed data saved here):
output_dir = "/content/drive/MyDrive/colab/piper/Steve" #@param {type:"string"}

#@markdown ### Model name:
model_name = "Steve" #@param {type:"string"}

#@markdown ### Language:
language = "en-us" #@param ["en-us", "en", "de", "fr", "es", "it", "pt-br", "nl", "pl", "ru", "zh"]

#@markdown ### Sample rate:
sample_rate = "22050" #@param ["16000", "22050"]

#@markdown ### Single speaker dataset?
single_speaker = True #@param {type:"boolean"}

# Verify input exists
wavs_dir = os.path.join(input_dir, "wavs")
metadata_file = os.path.join(input_dir, "metadata.csv")

if not os.path.exists(wavs_dir):
    raise Exception(f"wavs folder not found: {wavs_dir}")
if not os.path.exists(metadata_file):
    raise Exception(f"metadata.csv not found: {metadata_file}")

wav_count = len([f for f in os.listdir(wavs_dir) if f.endswith('.wav')])
print(f"Found {wav_count} WAV files")
print(f"Output will be saved to: {output_dir}")

# Create output directory
os.makedirs(output_dir, exist_ok=True)

# Create audio cache directory
cache_dir = "/content/audio_cache"
os.makedirs(cache_dir, exist_ok=True)

# Build command
speaker_flag = "--single-speaker" if single_speaker else ""

print("\n" + "="*50)
print("Running preprocessing...")
print("="*50 + "\n")

%cd /content/piper/src/python

!python -m piper_train.preprocess \
    --language {language} \
    --input-dir "{input_dir}" \
    --output-dir "{output_dir}" \
    --dataset-name "{model_name}" \
    --dataset-format ljspeech \
    --sample-rate {sample_rate} \
    --cache-dir "{cache_dir}" \
    {speaker_flag}

print("\n" + "="*50)
print("Preprocessing complete!")
print("="*50)
print(f"\nOutput saved to: {output_dir}")
print("\nFiles created:")
!ls -la "{output_dir}"

In [None]:
#@markdown # **3. Verify Preprocessing**
#@markdown ---
#@markdown Check that all required files were created.

import os
import json

print("Checking preprocessed output...\n")

# Check config.json
config_path = os.path.join(output_dir, "config.json")
if os.path.exists(config_path):
    print("[OK] config.json exists")
    with open(config_path) as f:
        config = json.load(f)
        print(f"     Sample rate: {config.get('audio', {}).get('sample_rate', 'N/A')}")
else:
    print("[ERROR] config.json missing!")

# Check dataset.jsonl
dataset_path = os.path.join(output_dir, "dataset.jsonl")
if os.path.exists(dataset_path):
    with open(dataset_path) as f:
        lines = f.readlines()
    print(f"[OK] dataset.jsonl exists ({len(lines)} entries)")
    
    # Check first entry for required fields
    first = json.loads(lines[0])
    has_audio_norm = "audio_norm_path" in first
    has_phoneme_ids = "phoneme_ids" in first
    print(f"     Has audio_norm_path: {has_audio_norm}")
    print(f"     Has phoneme_ids: {has_phoneme_ids}")
    
    if not has_audio_norm:
        print("\n[WARNING] audio_norm_path missing - audio cache may not have been created")
else:
    print("[ERROR] dataset.jsonl missing!")

# Check audio cache
audio_dir = os.path.join(output_dir, "audio")
if os.path.exists(audio_dir):
    pt_files = [f for f in os.listdir(audio_dir) if f.endswith('.pt')]
    print(f"[OK] audio/ folder exists ({len(pt_files)} .pt files)")
else:
    # Check cache dir
    if os.path.exists(cache_dir):
        pt_files = [f for f in os.listdir(cache_dir) if f.endswith('.pt')]
        if pt_files:
            print(f"[INFO] Audio cache in {cache_dir} ({len(pt_files)} .pt files)")
            print("       Copying to output directory...")
            !cp -r "{cache_dir}" "{output_dir}/audio"
            print("       Done!")
        else:
            print("[WARNING] No .pt files found in cache")
    else:
        print("[WARNING] audio/ folder missing!")

print("\n" + "="*50)
print("Verification complete!")
print("="*50)
print(f"\nYour preprocessed data is at: {output_dir}")
print("Sync this folder to your Mac for local training.")