# TTS v5 - F5-TTS-MLX (Apple Silicon Optimized)
- Adapted for running locally with F5-TTS-MLX
- Optimized for Apple Silicon (M1/M2/M3/M4) with MLX framework
- Includes sentence tracking and timeline manifest generation
- CPU fallback support for compatibility

## 0) Environment Setup (Optional)

**This step helps you manage Python packages and avoid conflicts with your system installation.**

- If you have **conda** installed, you can create a fresh environment for this notebook
- Or use an existing environment by providing its name
- At the end of the notebook, you can easily clean up and delete the environment to free storage

In [None]:
import subprocess
import sys
import os

# --- Optional conda environment management (interactive) ---
# The two names below are module-level on purpose: the cleanup cell at the end
# of the notebook reads them to decide whether there is anything to delete.
environment_created_by_notebook = False  # becomes True only after `conda create` succeeds
environment_name = None  # name of the environment created or selected in this cell

# Detect conda by asking for its version. A missing executable raises
# FileNotFoundError; a non-zero exit status raises CalledProcessError.
try:
    result = subprocess.run(['conda', '--version'], capture_output=True, text=True, check=True)
    conda_available = True
    print(f"✓ Conda detected: {result.stdout.strip()}")
except (subprocess.CalledProcessError, FileNotFoundError):
    conda_available = False
    print("✗ Conda not found - skipping environment management")
    print("Packages will be installed in your current Python environment")

if conda_available:
    print("\n" + "="*60)
    print("ENVIRONMENT SETUP OPTIONS")
    print("="*60)

    choice = input("\nDo you want to:\n  [1] Create a NEW conda environment (recommended)\n  [2] Use an EXISTING environment\n  [3] Skip and use current environment\n\nEnter choice (1/2/3): ").strip()

    if choice == "1":
        # Create a new environment; an empty answer selects the default name.
        env_name = input("\nEnter name for new environment (default: f5_tts_mlx): ").strip() or "f5_tts_mlx"

        print(f"\n→ Creating conda environment: {env_name}")
        print("  This may take a few minutes...")

        try:
            # Create environment with Python 3.10. Output is captured (as text)
            # so the cell stays quiet on success.
            subprocess.run(['conda', 'create', '-n', env_name, 'python=3.10', '-y'],
                           check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to create environment: {e}")
            # conda's diagnostics were captured rather than displayed; show them,
            # otherwise the user only sees an exit status.
            if e.stderr:
                print(e.stderr.strip())
            print("Continuing with current environment...")
        else:
            # Record the result for the cleanup cell only after success.
            environment_created_by_notebook = True
            environment_name = env_name

            print(f"✓ Environment '{env_name}' created successfully!")
            print(f"\n{'='*60}")
            print("IMPORTANT: Restart your Jupyter kernel and select the new environment:")
            print(f"  Kernel → Change Kernel → {env_name}")
            print(f"{'='*60}\n")

    elif choice == "2":
        # Use an existing environment: nothing is created, so
        # environment_created_by_notebook stays False and cleanup will not delete it.
        env_name = input("\nEnter name of existing environment: ").strip()
        if env_name:
            environment_name = env_name
            print(f"\n✓ Using existing environment: {env_name}")
            print(f"\n{'='*60}")
            print("IMPORTANT: Make sure your kernel is using this environment:")
            print(f"  Kernel → Change Kernel → {env_name}")
            print(f"{'='*60}\n")
        else:
            print("✗ No environment name provided - using current environment")

    else:
        # "3" and any unrecognised answer both mean: keep the current environment.
        print("\n✓ Using current environment")

print("\nYou can now proceed with the rest of the notebook.")

## 1) Install Dependencies

In [None]:
# Core TTS + I/O deps
!pip install f5-tts-mlx soundfile pypdf ebooklib pydub

# Advanced PDF extraction
!pip install "unstructured[local-inference]"
!pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"

# Note: ffmpeg should be installed on your system for MP3 encoding
# macOS: brew install ffmpeg
# Linux: sudo apt-get install ffmpeg
# Windows: Download from https://ffmpeg.org/

# Silence overly chatty logs
import logging
# Raise the thresholds of the library loggers: "unstructured" only reports
# ERROR and above, "pypdf" only CRITICAL.
logging.getLogger("unstructured").setLevel(logging.ERROR)
logging.getLogger("pypdf").setLevel(logging.CRITICAL)

## 2) Configuration and Setup

In [None]:
import os
from pathlib import Path
import platform

# --- Where generated audio and manifests are written ---
# "." is resolved against the current working directory at use time.
OUTPUT_DIR = Path(".")
print(f"Output directory: {OUTPUT_DIR.resolve()}")

# --- Platform probe ---
# DEVICE is "mlx" only for macOS on arm64 (Apple Silicon); every other
# OS/architecture combination is labelled "cpu".
SYSTEM, MACHINE = platform.system(), platform.machine()

if (SYSTEM, MACHINE) == ("Darwin", "arm64"):
    DEVICE = "mlx"
    print(f"Detected Apple Silicon ({MACHINE})")
    print("Using MLX framework for optimal performance")
else:
    DEVICE = "cpu"
    print(f"Detected {SYSTEM} on {MACHINE}")
    print("MLX is optimized for Apple Silicon. Performance may be limited on this platform.")
    print("Using CPU fallback mode")

print(f"\nDevice mode: {DEVICE}")

## 3) Helper Functions (PDF/EPUB extraction & TTS synthesis)

In [None]:
import numpy as np
import soundfile as sf
import re, io, zipfile, tempfile, os
from pathlib import Path
from typing import List, Tuple, Dict, Union, Optional
from functools import lru_cache

from pypdf import PdfReader
from ebooklib import epub
from pydub import AudioSegment

# Imports for advanced PDF extraction
from unstructured.partition.auto import partition

# Optional dependency: F5-TTS-MLX. F5_AVAILABLE records whether the import
# worked so synthesis can fail with a clear message later instead of here.
try:
    from f5_tts_mlx.generate import generate
except ImportError as e:
    F5_AVAILABLE = False
    print(f"Warning: F5-TTS-MLX not available: {e}")
    print("Please install with: pip install f5-tts-mlx")
else:
    F5_AVAILABLE = True

# Sentence-ish boundaries: terminal punctuation followed by whitespace, or a
# blank line. The *_CAP variant wraps the same expression in one capturing
# group so re.split() also returns the separators.
SPLIT_PATTERN = r"[.?!]\s+|[\n]{2,}"
SPLIT_PATTERN_CAP = f"({SPLIT_PATTERN})"


# --- PDF Extraction using unstructured.io ---
def extract_text_from_pdf(file_like: io.BytesIO, pages: Optional[List[int]] = None) -> List[Dict]:
    """Turn a PDF into a flat list of text elements tagged with their location.

    Parsing is attempted with unstructured's ``hi_res`` strategy first; if that
    raises, the stream is rewound and the ``fast`` strategy is tried. If both
    fail, a single placeholder element carrying an error message is returned
    instead of raising.

    Args:
        file_like: Seekable binary stream containing the PDF.
        pages: 1-indexed page numbers to keep. ``None`` (or an empty list)
            keeps every page.

    Returns:
        A list of ``{"text": str, "metadata": {"page_number": ..., "points": ...}}``
        dicts. ``points`` is whatever the element's ``coordinates`` metadata
        holds under ``"points"``, or ``None`` when no coordinates are present.
        When nothing usable is found, a single placeholder warning element is
        returned.
    """
    print("Parsing PDF with layout analysis (strategy='hi_res')...")
    try:
        raw_elements = partition(
            file=file_like,
            strategy="hi_res",
            content_type="application/pdf",
            include_page_breaks=True,
        )
        print(f"Unstructured 'hi_res' returned {len(raw_elements)} raw elements.")
    except Exception as hi_res_error:
        print(f"Unstructured 'hi_res' strategy failed: {hi_res_error}. Falling back to 'fast'.")
        try:
            # The first attempt may have consumed the stream; start over.
            file_like.seek(0)
            raw_elements = partition(
                file=file_like,
                strategy="fast",
                content_type="application/pdf",
                include_page_breaks=True,
            )
            print(f"Unstructured 'fast' returned {len(raw_elements)} raw elements.")
        except Exception as fast_error:
            print(f"Unstructured 'fast' strategy also failed: {fast_error}.")
            return [{"text": "Error: Unstructured parsing failed.", "metadata": {"page_number": 1, "points": None}}]

    # A set gives O(1) membership checks while filtering.
    wanted_pages = set(pages) if pages else None

    collected = []
    page_cursor = 1  # last page number seen; reused by elements that report none
    print("\n--- Processing elements (checking for points) ---")

    for raw in raw_elements:
        meta = raw.metadata.to_dict()

        reported_page = meta.get("page_number")
        if reported_page is not None:
            page_cursor = reported_page

        # Drop elements on pages the caller did not ask for.
        if wanted_pages and page_cursor not in wanted_pages:
            continue

        coords = meta.get("coordinates")
        points = coords.get("points") if coords else None

        text = str(raw).strip()
        if not text:
            continue

        collected.append({
            "text": text,
            "metadata": {"page_number": page_cursor, "points": points},
        })

    print("--- Finished processing elements ---")
    scope = f"pages {sorted(wanted_pages)}" if wanted_pages else "all pages"
    print(f"Unstructured: Found {len(collected)} text elements from {scope}.")

    if collected:
        return collected
    return [{"text": "Warning: Unstructured found no text elements.", "metadata": {"page_number": 1, "points": None}}]


# --- EPUB Extraction ---
def _html_to_text(markup: str) -> str:
    """Reduce an (X)HTML document to plain text, keeping paragraph breaks."""
    from html import unescape

    # Drop <script>/<style> elements together with their content.
    text = re.sub(r"<(script|style).*?>.*?</\1>", " ", markup, flags=re.S|re.I)
    # Line breaks and block-level closers become newlines so that sentence
    # splitting can still see paragraph boundaries.
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.I)
    text = re.sub(r"</p>|</div>|</h\d>", "\n\n", text, flags=re.I)
    # Every remaining tag becomes a space.
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities (&amp;, &#8217;, ...) only after tags are gone so that an
    # escaped "&lt;b&gt;" stays literal text; fold non-breaking spaces into
    # ordinary ones so the whitespace collapse below sees them.
    text = unescape(text).replace("\xa0", " ")
    text = re.sub(r"[ \t]+", " ", text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def extract_chapters_from_epub(file_like: io.BytesIO):
    """Read an EPUB and return its readable documents as ``(title, text)`` pairs.

    Each document item that still contains text after HTML stripping becomes
    one chapter. The title defaults to the item's file stem; when the first
    line of the text starts with "chapter", "part" or "book" (case-insensitive),
    the first 60 characters of that line are used instead.

    Args:
        file_like: The EPUB, in whatever form ``epub.read_epub`` accepts
            (callers in this notebook pass a BytesIO).

    Returns:
        List of ``(title, text)`` tuples in the order the book yields its
        documents. Empty when the book contains no text at all.
    """
    # NOTE(review): the ITEM_* type constants are defined on the top-level
    # ``ebooklib`` package; the previous lookup on ``ebooklib.epub`` does not
    # appear to be exported there - confirm against the installed version.
    import ebooklib

    bk = epub.read_epub(file_like)

    # Navigation documents are a table of contents, not content to be read aloud.
    documents = [
        item for item in bk.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        if not (getattr(item, "is_nav", False) or isinstance(item, epub.EpubNav))
    ]

    chapters = []
    for item in documents:
        text = _html_to_text(item.get_content().decode("utf-8", errors="ignore"))
        if not text:
            continue
        title = Path(item.file_name).stem
        first = text.splitlines()[0]
        if re.match(r"(?i)\s*(chapter|part|book)\b[^\n]{0,80}", first):
            title = first[:60]
        chapters.append((title, text))

    if not chapters:
        # Last resort kept from the original behaviour: treat everything as
        # one chapter built from all documents joined together.
        merged = " ".join(
            item.get_content().decode("utf-8", errors="ignore") for item in documents
        )
        text = _html_to_text(merged)
        if text:
            chapters = [("Chapter 1", text)]
    return chapters

def safe_name(s: str) -> str:
    """Return *s* reduced to word characters and hyphens, for use in file names.

    Every run of other characters collapses to a single underscore, and
    leading/trailing underscores are trimmed. If nothing is left, the
    placeholder ``"chapter"`` is returned.
    """
    cleaned = re.sub(r"[^\w\-]+", "_", s)
    cleaned = cleaned.strip("_")
    if cleaned:
        return cleaned
    return "chapter"

def _synthesize_sentence_f5(
    sentence: str,
    ref_audio: Optional[str] = None,
    ref_text: Optional[str] = None,
    sample_rate: int = 24000
) -> np.ndarray:
    """
    Render one sentence to audio with F5-TTS-MLX.

    The library writes its result to a file, so a temporary WAV is created,
    read back with soundfile and removed again.

    Args:
        sentence: Text to synthesize
        ref_audio: Optional path to reference audio for voice cloning
        ref_text: Optional transcription of reference audio
        sample_rate: Used only to size the silence returned on failure
            (one tenth of a second). The rate stored in the generated file
            is read but not checked against it.

    Returns:
        1-D float32 numpy array of samples (first channel only when the file
        has several). On any synthesis error, an array of zeros instead.

    Raises:
        RuntimeError: If F5-TTS-MLX could not be imported.
    """
    if not F5_AVAILABLE:
        raise RuntimeError("F5-TTS-MLX is not available. Please install with: pip install f5-tts-mlx")

    try:
        # Only the name is needed; the handle is closed immediately so the
        # generator can write to the path itself.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as handle:
            wav_path = handle.name

        try:
            generate(
                generation_text=sentence,
                ref_audio_path=ref_audio,
                ref_audio_text=ref_text,
                output_path=wav_path,
                estimate_duration=True  # Auto-estimate duration for natural speech
            )

            samples, _file_rate = sf.read(wav_path)

            # Collapse multi-channel output to its first channel.
            if samples.ndim > 1:
                samples = samples[:, 0]
            return samples.astype(np.float32)

        finally:
            # Remove the temp file whether or not synthesis succeeded.
            if os.path.exists(wav_path):
                os.unlink(wav_path)

    except Exception as e:
        # Best effort: report the failure and substitute a short silence so a
        # single bad sentence does not abort a long document.
        print(f"Error in F5-TTS-MLX synthesis: {e}")
        print(f"Sentence that failed: {sentence[:50]}...")
        return np.zeros((sample_rate // 10,), dtype=np.float32)

def split_sentences_keep_delim(text: str, pattern: Optional[str] = None) -> List[str]:
    """Split *text* into sentence-sized chunks, keeping terminal punctuation.

    The text is split on *pattern*; each chunk then gets the non-whitespace
    part of the separator that followed it (e.g. the "." of ". ") appended
    directly, so "Hello world. Bye" yields ["Hello world.", "Bye"]. Earlier
    versions inserted a space before the re-attached punctuation
    ("Hello world ."), which was inconsistent with an unsplit final sentence.

    Args:
        text: Text to split.
        pattern: Regular expression with exactly ONE capturing group wrapping
            the whole separator. Defaults to the module-level
            ``SPLIT_PATTERN_CAP``.

    Returns:
        Non-empty, stripped chunks in their original order.
    """
    if pattern is None:
        pattern = SPLIT_PATTERN_CAP

    # With a single capturing group, re.split() alternates
    # [chunk, sep, chunk, sep, ..., chunk]; the last chunk has no separator.
    parts = re.split(pattern, text)
    chunks = parts[0::2]
    separators = parts[1::2] + [""]

    sentences = []
    for chunk, sep in zip(chunks, separators):
        chunk = (chunk or "").strip()
        if not chunk:
            continue
        # Whitespace-only separators (blank lines) strip to "" and add nothing.
        sentences.append(chunk + (sep or "").strip())
    return sentences

# --- Synthesizer ---
def synth_text_to_wav_and_manifest(
    text_or_elements: Union[str, List[Dict]],
    ref_audio: Optional[str] = None,
    ref_text: Optional[str] = None,
    device: str = DEVICE) -> Tuple[bytes, Dict]:
    """
    Render text sentence by sentence and describe the result in a timeline.

    Every element is split into sentences; each sentence is synthesized on its
    own and the pieces are concatenated. The manifest records, per sentence,
    its running index, start/end time in seconds (rounded to milliseconds),
    its text and the metadata of the element it came from.

    Args:
        text_or_elements: A plain string, or a list of dicts with "text" and
            "metadata" keys. A string is wrapped as one element on page 1.
        ref_audio: Optional path to reference audio for voice cloning
        ref_text: Optional transcription of reference audio
        device: Accepted for interface compatibility; not used by this function.

    Returns:
        Tuple of (wav_bytes, manifest_dict). The manifest's "audioUrl" is left
        empty for the caller to fill in. With no sentences at all, the audio is
        one tenth of a second of silence.
    """
    sr = 24000  # F5-TTS-MLX uses 24kHz

    if isinstance(text_or_elements, str):
        elements = [{"text": text_or_elements, "metadata": {"page_number": 1, "points": None}}]
    else:
        elements = text_or_elements

    chunks = []
    timeline = []
    clock = 0.0  # running position in the output audio, in seconds
    print(f"Synthesizing {len(elements)} text elements...")

    for element in elements:
        body = element.get("text", "")
        location = element.get("metadata", {})

        for sent in split_sentences_keep_delim(body):
            if not sent:
                continue

            pcm = _synthesize_sentence_f5(
                sent,
                ref_audio=ref_audio,
                ref_text=ref_text,
                sample_rate=sr
            )

            dur = pcm.shape[0] / sr
            timeline.append({
                "i": len(timeline),  # equals the number of sentences emitted so far
                "start": round(clock, 3),
                "end": round(clock + dur, 3),
                "text": sent.strip(),
                "location": location
            })
            chunks.append(pcm)
            clock += dur

    if chunks:
        pcm_cat = np.concatenate(chunks, axis=0)
    else:
        pcm_cat = np.zeros((sr//10,), dtype=np.float32)

    buf = io.BytesIO()
    sf.write(buf, pcm_cat, sr, format='WAV')
    return buf.getvalue(), {"audioUrl": "", "sentences": timeline}

def wav_to_mp3_bytes(wav_bytes: bytes, bitrate="128k") -> bytes:
    """Transcode in-memory WAV data to MP3 with pydub and return the MP3 bytes.

    Args:
        wav_bytes: Complete WAV file contents.
        bitrate: Bitrate string handed to pydub's exporter.
    """
    source = io.BytesIO(wav_bytes)
    segment = AudioSegment.from_file(source, format="wav")
    sink = io.BytesIO()
    segment.export(sink, format="mp3", bitrate=bitrate)
    return sink.getvalue()


## 4) High-Level Synthesis Wrappers

In [None]:
def synth_string(text: str,
                 ref_audio: Optional[str] = None,
                 ref_text: Optional[str] = None,
                 out_format="wav",
                 device=None,
                 basename="f5_text",
                 output_dir=None):
    """
    Synthesize a piece of text and write the audio plus its JSON manifest.

    Files are written as ``<output_dir>/<basename>.<ext>`` and
    ``<output_dir>/<basename>_manifest.json``.

    Args:
        text: Text to synthesize
        ref_audio: Optional path to reference audio for voice cloning (mono, 24kHz, 5-10s WAV)
        ref_text: Optional transcription of reference audio
        out_format: "mp3" (case-insensitive) writes MP3; anything else writes WAV
        device: Device to use (None = the module-level DEVICE)
        basename: Base name for output files
        output_dir: Output directory (None = use OUTPUT_DIR)

    Returns:
        Tuple of (audio_path, manifest_path)
    """
    import json

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else OUTPUT_DIR

    # The whole string is treated as one element on a nominal page 1.
    wav_bytes, manifest = synth_text_to_wav_and_manifest(
        [{
            "text": text,
            "metadata": {"page_number": 1, "source": "string", "coordinates": None}
        }],
        ref_audio=ref_audio,
        ref_text=ref_text,
        device=device
    )

    out_base = output_dir / basename

    as_mp3 = out_format.lower() == "mp3"
    payload = wav_to_mp3_bytes(wav_bytes) if as_mp3 else wav_bytes
    audio_path = str(out_base) + (".mp3" if as_mp3 else ".wav")
    with open(audio_path, "wb") as audio_file:
        audio_file.write(payload)

    # The manifest refers to the audio by bare file name, not by full path.
    manifest["audioUrl"] = Path(audio_path).name
    manifest_path = str(out_base) + "_manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as manifest_file:
        json.dump(manifest, manifest_file, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

def synth_pdf(file_path_or_bytes,
              ref_audio: Optional[str] = None,
              ref_text: Optional[str] = None,
              out_format="wav",
              device=None,
              basename=None,
              output_dir=None,
              pages=None):
    """
    Read a PDF aloud: extract its text, synthesize it, write audio + manifest.

    Files are written as ``<output_dir>/<name>_tts.<ext>`` and
    ``<output_dir>/<name>_tts_manifest.json``, where ``<name>`` is *basename*
    if given, else the PDF's file stem (or "document" for in-memory input).

    Args:
        file_path_or_bytes: Path to PDF file (str or Path) or a BytesIO object
        ref_audio: Optional path to reference audio for voice cloning (mono, 24kHz, 5-10s WAV)
        ref_text: Optional transcription of reference audio
        out_format: "mp3" (case-insensitive) writes MP3; anything else writes WAV
        device: Device to use (None = the module-level DEVICE)
        basename: Base name for output files (None = use PDF filename)
        output_dir: Output directory (None = use OUTPUT_DIR)
        pages: Optional list of page numbers to synthesize (1-indexed). None = all pages.
               Examples: [1, 2, 3] or [5] or None

    Returns:
        Tuple of (audio_path, manifest_path)
    """
    import json

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else OUTPUT_DIR

    if isinstance(file_path_or_bytes, (str, Path)):
        # Load the file fully so the extractor always gets an in-memory stream.
        source_path = Path(file_path_or_bytes)
        pdf_bytes = io.BytesIO(source_path.read_bytes())
        stem = source_path.stem
    else:
        pdf_bytes = file_path_or_bytes
        stem = basename or "document"

    wav_bytes, manifest = synth_text_to_wav_and_manifest(
        extract_text_from_pdf(pdf_bytes, pages=pages),
        ref_audio=ref_audio,
        ref_text=ref_text,
        device=device
    )

    out_base = output_dir / f"{(basename or stem)}_tts"

    as_mp3 = out_format.lower() == "mp3"
    payload = wav_to_mp3_bytes(wav_bytes) if as_mp3 else wav_bytes
    audio_path = str(out_base) + (".mp3" if as_mp3 else ".wav")
    with open(audio_path, "wb") as audio_file:
        audio_file.write(payload)

    # The manifest refers to the audio by bare file name, not by full path.
    manifest["audioUrl"] = Path(audio_path).name
    manifest_path = str(out_base) + "_manifest.json"
    with open(manifest_path, "w", encoding="utf-8") as manifest_file:
        json.dump(manifest, manifest_file, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

def synth_epub(file_path_or_bytes,
               ref_audio: Optional[str] = None,
               ref_text: Optional[str] = None,
               per_chapter_format="wav",
               device=None,
               zip_name=None,
               output_dir=None):
    """
    Synthesize an EPUB book to audio, creating one audio file per chapter in a ZIP.

    Each chapter contributes two archive members: ``NN_<title>.<ext>`` and
    ``NN_<title>_manifest.json`` (NN is the 1-based chapter number, the title
    is sanitised and cut to 40 characters).

    Args:
        file_path_or_bytes: Path to EPUB file (str or Path) or a BytesIO object
        ref_audio: Optional path to reference audio for voice cloning (mono, 24kHz, 5-10s WAV)
        ref_text: Optional transcription of reference audio
        per_chapter_format: "mp3" (case-insensitive) stores MP3; anything else stores WAV
        device: Device to use (None = the module-level DEVICE)
        zip_name: Name for output ZIP file without extension
            (None = use EPUB filename + '_chapters', or 'book_chapters' for in-memory input)
        output_dir: Output directory (None = use OUTPUT_DIR)

    Returns:
        Path to ZIP file containing chapter audio and manifest files

    Raises:
        ValueError: If no chapter text could be extracted from the EPUB.
            (This used to be an ``assert``, which is skipped under ``python -O``.)
    """
    import json

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else OUTPUT_DIR

    if isinstance(file_path_or_bytes, (str, Path)):
        with open(file_path_or_bytes, "rb") as fh:
            epub_bytes = io.BytesIO(fh.read())
        stem = Path(file_path_or_bytes).stem
    else:
        epub_bytes = file_path_or_bytes
        stem = "book"

    chapters = extract_chapters_from_epub(epub_bytes)
    if not chapters:
        raise ValueError("No chapters detected in EPUB.")

    # The archive is assembled in memory and written to disk in one go, so a
    # failure half-way through does not leave a truncated ZIP behind.
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for idx, (title, body) in enumerate(chapters, 1):
            name = f"{idx:02d}_{safe_name(title)[:40]}"

            # One element per chapter; EPUBs carry no page layout, so the page
            # number is a nominal 1.
            chapter_elements = [{
                "text": body,
                "metadata": {
                    "chapter_index": idx,
                    "chapter_title": title,
                    "page_number": 1,
                    "coordinates": None
                }
            }]

            wav_bytes, manifest = synth_text_to_wav_and_manifest(
                chapter_elements,
                ref_audio=ref_audio,
                ref_text=ref_text,
                device=device
            )

            if per_chapter_format.lower() == "mp3":
                audio_name = f"{name}.mp3"
                zf.writestr(audio_name, wav_to_mp3_bytes(wav_bytes))
            else:
                audio_name = f"{name}.wav"
                zf.writestr(audio_name, wav_bytes)

            manifest["audioUrl"] = audio_name
            zf.writestr(f"{name}_manifest.json", json.dumps(manifest, ensure_ascii=False, indent=2))

    zpath = str(output_dir / f"{zip_name or (stem + '_chapters')}.zip")
    with open(zpath, "wb") as f:
        f.write(zip_buf.getvalue())
    return zpath


## Usage Examples

Below are examples for synthesizing text, PDFs, and EPUBs locally with F5-TTS-MLX.

**Voice Cloning (Optional)**:
- You can provide a reference audio file (`ref_audio`) and its transcription (`ref_text`) to clone a voice
- Reference audio should be: mono, 24kHz sample rate, 5-10 seconds duration, WAV format
- If you don't provide reference audio, F5-TTS will use its default voice

### A) String → Audio

In [None]:
# Configuration
FORMAT = "mp3"  # "wav" or "mp3"
BASENAME = "f5_text"  # Output stem: <BASENAME>.<FORMAT> and <BASENAME>_manifest.json

# Optional: Voice cloning (leave as None to use default voice)
REF_AUDIO = None  # Path to reference audio file (e.g., "reference.wav")
REF_TEXT = None   # Transcription of reference audio

# Text to synthesize
TEXT = """Paste or type your text here.
It can be multiple paragraphs. Chapters aren't needed for this path.
"""

# Run synthesis (files are written to OUTPUT_DIR because no output_dir is passed)
audio_path, manifest_path = synth_string(
    TEXT, 
    ref_audio=REF_AUDIO,
    ref_text=REF_TEXT,
    out_format=FORMAT,
    basename=BASENAME
)

print(f"Audio saved to: {audio_path}")
print(f"Manifest saved to: {manifest_path}")

### B) PDF → Audio (with manifest)

In [None]:
# Configuration
FORMAT = "mp3"  # "wav" or "mp3"

# Optional: Voice cloning (leave as None to use default voice)
REF_AUDIO = None  # Path to reference audio file (e.g., "reference.wav")
REF_TEXT = None   # Transcription of reference audio

# Specify the path to your PDF file (relative to notebook location)
PDF_PATH = "document.pdf"  # Change this to your PDF filename

# Page selection (optional)
# None = all pages (default); an empty list is treated the same way
# [1, 2, 3] = only pages 1, 2, and 3
# [5] = only page 5
# [1, 3, 5, 7] = only odd pages 1, 3, 5, 7
PAGES = None  # Change to a list like [1, 2, 3] to select specific pages

# Run synthesis
# Output names derive from the PDF's file stem: <stem>_tts.<FORMAT> and
# <stem>_tts_manifest.json, written to OUTPUT_DIR.
audio_path, manifest_path = synth_pdf(
    PDF_PATH,
    ref_audio=REF_AUDIO,
    ref_text=REF_TEXT,
    out_format=FORMAT,
    pages=PAGES
)

print(f"Audio saved to: {audio_path}")
print(f"Manifest saved to: {manifest_path}")


### C) EPUB → ZIP (Per-Chapter Audio + Manifests)

In [None]:
# Configuration
CHAPTER_FORMAT = "wav"  # "wav" or "mp3"
ZIP_NAME = ""  # Optional: custom name for the output ZIP file (without ".zip")

# Optional: Voice cloning (leave as None to use default voice)
REF_AUDIO = None  # Path to reference audio file (e.g., "reference.wav")
REF_TEXT = None   # Transcription of reference audio

# Specify the path to your EPUB file (relative to notebook location)
EPUB_PATH = "book.epub"  # Change this to your EPUB filename

# Run synthesis
# An empty ZIP_NAME is passed as None, which makes synth_epub fall back to
# "<epub stem>_chapters.zip".
zip_path = synth_epub(
    EPUB_PATH,
    ref_audio=REF_AUDIO,
    ref_text=REF_TEXT,
    per_chapter_format=CHAPTER_FORMAT,
    zip_name=(ZIP_NAME or None)
)

print(f"ZIP archive saved to: {zip_path}")

## Notes

- **Output Directory**: By default, all outputs are saved to the same directory as the notebook. You can change this by modifying `OUTPUT_DIR` in the Configuration cell.
- **Input Files**: Place your PDF/EPUB files in the same directory as the notebook, or provide relative/absolute paths.
- **Device Selection**: The notebook will automatically detect Apple Silicon (M1/M2/M3/M4) and use MLX for optimal performance. On other platforms, it will fall back to CPU mode.
- **Voice Cloning**: F5-TTS-MLX supports zero-shot voice cloning. Provide a reference audio file (mono, 24kHz, 5-10s WAV) and its transcription to clone a voice.
  - To convert audio to the required format: `ffmpeg -i input.wav -ac 1 -ar 24000 -sample_fmt s16 -t 10 output.wav`
- **PDF Extraction**: The notebook uses `unstructured.io` for advanced PDF extraction with layout analysis. This may take longer but provides better results.
- **Manifest Files**: Each audio output includes a JSON manifest file with sentence-level timing information and metadata.
- **Performance**: On Apple Silicon (M4 MacBook Air), generation typically takes ~4 seconds per sentence. Performance scales with the number of CPU cores.
- **Error Handling**: If synthesis fails on a particular sentence, the notebook prints the error, inserts a short (0.1 s) silence in place of that sentence, and continues with the next one.

## Cleanup: Delete Environment (Optional)

**If you created a new environment at the beginning of this notebook**, you can delete it here to free up storage space.

⚠️ **Warning**: This will permanently delete the environment and all installed packages!

In [None]:
import subprocess

# --- Delete the conda environment created by the setup cell (interactive) ---
# Relies on two globals set by the environment setup cell:
#   environment_created_by_notebook - True only if that cell ran `conda create`
#   environment_name                - the name it used
# Both are lost on a kernel restart, in which case this cell does nothing.
if 'environment_created_by_notebook' not in globals():
    print("✗ No environment tracking found")
    print("This cell only works if you ran the environment setup cell at the beginning")
elif not environment_created_by_notebook:
    # Pre-existing environments are never deleted from here.
    print("✗ No environment was created by this notebook")
    print("You can only delete environments that were created in this session")
else:
    print(f"Environment '{environment_name}' was created by this notebook")
    print(f"\n{'='*60}")
    print("DELETE ENVIRONMENT")
    print(f"{'='*60}")

    # Deletion is irreversible, so require the full word "yes".
    confirm = input(f"\nAre you sure you want to DELETE '{environment_name}'?\nType 'yes' to confirm: ").strip().lower()

    if confirm == 'yes':
        print(f"\n→ Deleting environment '{environment_name}'...")
        print("  This may take a moment...")

        try:
            subprocess.run(['conda', 'env', 'remove', '-n', environment_name, '-y'],
                           check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to delete environment: {e}")
            # conda's diagnostics were captured rather than displayed; show them.
            if e.stderr:
                print(e.stderr.strip())
            # f-string: the hint must contain the real environment name
            # (it used to print the literal text "{environment_name}").
            print(f"You may need to delete it manually with: conda env remove -n {environment_name}")
        else:
            print(f"✓ Environment '{environment_name}' deleted successfully!")
            print("  Storage space has been freed.")

            # Reset the tracking state so re-running this cell is a no-op.
            environment_created_by_notebook = False
            environment_name = None
    else:
        print("\n✗ Deletion cancelled - environment preserved")

## Manage Model Caches (Optional)

**View and delete cached TTS models to free up storage space.**

TTS models are cached in several locations:
- HuggingFace models: `~/.cache/huggingface/`
- PyTorch models: `~/.cache/torch/`
- Pip package cache
- MLX models (for F5-TTS-MLX)

⚠️ **Warning**: Deleting caches will require re-downloading models next time you run the notebook!

In [None]:
import subprocess
import os
from pathlib import Path

def get_dir_size(path):
    """Return the combined size in bytes of all regular files below *path*.

    Sub-directories are walked recursively. Symbolic links are neither
    followed nor counted. A directory that is missing or unreadable
    contributes whatever was counted before the error occurred (0 if nothing).
    """
    size = 0
    try:
        for item in os.scandir(path):
            if item.is_dir(follow_symlinks=False):
                size += get_dir_size(item.path)
            elif item.is_file(follow_symlinks=False):
                size += item.stat().st_size
    except (PermissionError, FileNotFoundError):
        # Best effort: keep what has been counted so far.
        pass
    return size

def format_size(bytes_size):
    """Render a byte count with two decimals and a B/KB/MB/GB/TB suffix.

    The value is divided by 1024 until it drops below 1024 or the unit list
    is exhausted; anything of a terabyte or more is reported in TB.
    """
    units = ('B', 'KB', 'MB', 'GB')
    position = 0
    while position < len(units):
        if bytes_size < 1024.0:
            return f"{bytes_size:.2f} {units[position]}"
        bytes_size /= 1024.0
        position += 1
    return f"{bytes_size:.2f} TB"

# --- Report (and optionally delete) on-disk caches (interactive) ---
import shutil
import sys


def _resolve_pip_cache_dir(default):
    """Ask pip where its cache lives; return *default* if pip cannot tell."""
    try:
        proc = subprocess.run(
            [sys.executable, "-m", "pip", "cache", "dir"],
            capture_output=True, text=True, check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        return default
    reported = proc.stdout.strip()
    return Path(reported.splitlines()[-1]) if reported else default


# Define cache locations
home = Path.home()
cache_locations = {
    "HuggingFace Models": home / ".cache" / "huggingface",
    "PyTorch Models": home / ".cache" / "torch",
    # pip's cache location is platform dependent (e.g. ~/Library/Caches/pip on
    # macOS), so ask pip itself and fall back to the Linux-style default.
    "Pip Cache": _resolve_pip_cache_dir(home / ".cache" / "pip"),
}

print("="*60)
print("MODEL CACHE INFORMATION")
print("="*60)

total_cache_size = 0
existing_caches = {}  # name -> (path, size in bytes), only for caches that exist

for name, path in cache_locations.items():
    if path.exists():
        size = get_dir_size(path)
        total_cache_size += size
        existing_caches[name] = (path, size)
        print(f"\n{name}:")
        print(f"  Location: {path}")
        print(f"  Size: {format_size(size)}")
    else:
        print(f"\n{name}: Not found")

print(f"\n{'='*60}")
print(f"Total cache size: {format_size(total_cache_size)}")
print(f"{'='*60}")

if existing_caches:
    print("\n⚠️  You can delete these caches to free up storage space.")
    print("Note: Models will be re-downloaded when needed.")

    choice = input("\nDo you want to delete caches? (yes/no): ").strip().lower()

    if choice == 'yes':
        print("\nSelect which caches to delete:")
        print("  [1] HuggingFace only")
        print("  [2] PyTorch only")
        print("  [3] Pip only")
        print("  [4] All caches")
        print("  [5] Cancel")

        delete_choice = input("\nEnter choice (1-5): ").strip()

        if delete_choice == "5":
            # Explicit cancel; previously this was reported as an invalid choice.
            print("\n✗ Deletion cancelled")
        else:
            single_cache_by_choice = {
                "1": "HuggingFace Models",
                "2": "PyTorch Models",
                "3": "Pip Cache",
            }

            to_delete = []
            if delete_choice in single_cache_by_choice:
                selected = single_cache_by_choice[delete_choice]
                if selected in existing_caches:
                    to_delete = [(selected, existing_caches[selected])]
            elif delete_choice == "4":
                to_delete = list(existing_caches.items())

            if to_delete:
                # Second, case-sensitive confirmation because rmtree is irreversible.
                confirm = input(f"\n⚠️  Really delete {len(to_delete)} cache(s)? Type 'DELETE' to confirm: ").strip()

                if confirm == "DELETE":
                    freed_space = 0
                    for name, (path, size) in to_delete:
                        print(f"\n→ Deleting {name}...")
                        try:
                            shutil.rmtree(path)
                            freed_space += size
                            print(f"✓ Deleted {name} ({format_size(size)})")
                        except Exception as e:
                            # Keep going: one undeletable cache should not block the others.
                            print(f"✗ Failed to delete {name}: {e}")

                    print(f"\n{'='*60}")
                    print(f"✓ Total space freed: {format_size(freed_space)}")
                    print(f"{'='*60}")
                else:
                    print("\n✗ Deletion cancelled")
            else:
                print("\n✗ Invalid choice or cache not found")
    else:
        print("\n✓ Caches preserved")
else:
    print("\n✓ No caches found")