# TTS v4 - Apple Vision Framework + Kokoro (macOS)
- Uses Apple Vision Framework for fast, accurate PDF text extraction with locations
- Uses Kokoro for high-quality text-to-speech synthesis
- Includes sentence tracking and timeline manifest generation
- **Requires macOS** (Vision Framework is Apple-only)

## 0) Environment Setup (Optional)

**This step helps you manage Python packages and avoid conflicts with your system installation.**

- If you have **conda** installed, you can create a fresh environment for this notebook
- Or use an existing environment by providing its name
- At the end of the notebook, you can easily clean up and delete the environment to free storage

In [None]:
import subprocess
import sys
import os

# Flags read by the cleanup cell at the end of the notebook: that cell only
# offers to delete an environment when this cell recorded that it created one.
environment_created_by_notebook = False
environment_name = None

# Check if conda is installed. A missing executable (FileNotFoundError) and a
# non-zero exit status (CalledProcessError) both count as "not available".
try:
    result = subprocess.run(['conda', '--version'], capture_output=True, text=True, check=True)
    conda_available = True
    print(f"✓ Conda detected: {result.stdout.strip()}")
except (subprocess.CalledProcessError, FileNotFoundError):
    conda_available = False
    print("✗ Conda not found - skipping environment management")
    print("Packages will be installed in your current Python environment")

if conda_available:
    print("\n" + "="*60)
    print("ENVIRONMENT SETUP OPTIONS")
    print("="*60)

    choice = input("\nDo you want to:\n  [1] Create a NEW conda environment (recommended)\n  [2] Use an EXISTING environment\n  [3] Skip and use current environment\n\nEnter choice (1/2/3): ").strip()

    if choice == "1":
        # Create new environment (blank answer falls back to the default name)
        env_name = input("\nEnter name for new environment (default: kokoro_vision): ").strip()
        if not env_name:
            env_name = "kokoro_vision"

        print(f"\n→ Creating conda environment: {env_name}")
        print("  This may take a few minutes...")

        try:
            # Python 3.10 plus ipykernel: an environment without ipykernel
            # cannot be used as a Jupyter kernel, so "Change Kernel" below
            # would have nothing to select.
            subprocess.run(['conda', 'create', '-n', env_name, 'python=3.10', 'ipykernel', '-y'],
                           check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to create environment: {e}")
            # Output was captured, so surface conda's own explanation.
            if e.stderr:
                print(e.stderr.strip())
            print("Continuing with current environment...")
        else:
            # Only mark the environment as ours once conda reported success.
            environment_created_by_notebook = True
            environment_name = env_name

            print(f"✓ Environment '{env_name}' created successfully!")
            print(f"\n{'='*60}")
            print("IMPORTANT: Restart your Jupyter kernel and select the new environment:")
            print(f"  Kernel → Change Kernel → {env_name}")
            print(f"  If '{env_name}' is not listed, register it first with:")
            print(f"    conda run -n {env_name} python -m ipykernel install --user --name {env_name}")
            print(f"{'='*60}\n")

    elif choice == "2":
        # Use existing environment: only the name is recorded; nothing is
        # created, so the cleanup cell will not offer to delete it.
        env_name = input("\nEnter name of existing environment: ").strip()
        if env_name:
            environment_name = env_name
            print(f"\n✓ Using existing environment: {env_name}")
            print(f"\n{'='*60}")
            print("IMPORTANT: Make sure your kernel is using this environment:")
            print(f"  Kernel → Change Kernel → {env_name}")
            print(f"{'='*60}\n")
        else:
            print("✗ No environment name provided - using current environment")

    else:
        # "3" and any unrecognised answer both keep the current environment.
        print("\n✓ Using current environment")

print("\nYou can now proceed with the rest of the notebook.")

## 1) Install Dependencies

In [None]:
import logging
import subprocess
import sys

# Packages are installed with "<this interpreter> -m pip" so they always land in
# the environment that backs the running kernel.

# Core TTS + I/O deps
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "kokoro>=0.9.4", "soundfile", "misaki[en]", "pypdf", "ebooklib", "pydub",
])

# PyObjC for Apple Vision Framework access
subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "pyobjc-framework-Vision", "pyobjc-framework-Quartz", "pyobjc-framework-Cocoa",
])

# Note: ffmpeg should be installed on your system for MP3 encoding
# macOS: brew install ffmpeg

# Silence overly chatty logs
logging.getLogger("phonemizer").setLevel(logging.ERROR)
logging.getLogger("pypdf").setLevel(logging.CRITICAL)

## 2) Configuration and Setup

In [None]:
import os
import sys
from pathlib import Path

# Check if running on macOS
if sys.platform != "darwin":
    print("WARNING: This notebook requires macOS for Vision Framework support!")
    print("For other platforms, use TTS_Kokoro_Local.ipynb instead.")

# --- MPS Fallback for Apple Silicon ---
# Enable CPU fallback for operations not yet implemented on MPS
# (specifically torch.angle used in Kokoro's STFT operations).
# Kept ahead of `import torch` below, matching the original cell order.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# --- Output directory setup ---
OUTPUT_DIR = Path(".")  # Use current directory (same as notebook location)
print(f"Output directory: {OUTPUT_DIR.resolve()}")

# --- Device selection ---
# DEVICE_MODE: "auto" (default), "cuda", "cpu", or "mps" (Apple Silicon)
DEVICE_MODE = "auto"

import torch


def _pick_device():
    """Return the torch device string selected by DEVICE_MODE.

    An explicit "cuda", "cpu" or "mps" is returned unchanged. Any other value
    (including "auto") prefers MPS when available, then CUDA, then CPU.
    """
    if DEVICE_MODE in ("cuda", "cpu", "mps"):
        return DEVICE_MODE
    # Auto mode: prefer MPS on Apple Silicon, then CUDA, then CPU.
    # getattr() keeps this working on torch builds without an MPS backend module.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cuda" if torch.cuda.is_available() else "cpu"


DEVICE = _pick_device()
print(f"Using device: {DEVICE}")
if DEVICE == "mps":
    print("Note: MPS will fall back to CPU for unsupported operations (like torch.angle)")

## 3) Helper Functions (PDF/EPUB extraction & TTS synthesis)

In [None]:
import numpy as np
import soundfile as sf
import re, io, zipfile, torch
from pathlib import Path
from typing import List, Tuple, Dict, Union, Optional
from functools import lru_cache

from ebooklib import epub
from kokoro import KPipeline
from pydub import AudioSegment

# Apple Vision Framework imports
import Quartz
import Vision
from Foundation import NSURL

# Sentence-ish split: break after '.', '?' or '!' followed by whitespace, or on
# a run of two or more newlines. Keeps chunks small (avoids 510-phoneme truncation).
SPLIT_PATTERN = r"[.?!]\s+|[\n]{2,}"
# Same pattern wrapped in a single capture group, so re.split() also returns
# the delimiter that was matched between chunks.
SPLIT_PATTERN_CAP = f"({SPLIT_PATTERN})"


# --- PDF Extraction using Apple Vision Framework ---
def extract_text_from_pdf_vision(pdf_path: str, pages: Optional[List[int]] = None) -> List[Dict]:
    """
    Extract text from PDF using Apple Vision Framework with precise bounding boxes.

    Each selected page is drawn into an NSImage at 2x scale, converted to PNG
    data, and handed to a VNRecognizeTextRequest (accurate recognition level,
    language correction enabled). A page whose processing raises is reported
    with a traceback and skipped; the remaining pages are still processed.

    Args:
        pdf_path: Path to the PDF file
        pages: Optional list of page numbers to extract (1-indexed). If None, extracts all pages.
               Examples: [1, 2, 3] for pages 1-3, [5] for page 5 only, [1, 3, 5] for specific pages
               An empty list is falsy and is therefore treated like None (all pages).

    Returns:
        List of dictionaries containing text and metadata. Each entry has the form
        {"text": str, "metadata": {"page_number": int, "bounds": {...}}}, where
        "bounds" holds "x", "y", "width" and "height" copied from the
        observation's boundingBox() plus the top candidate's "confidence".
        If the PDF cannot be loaded, or no text element is found on any page,
        a single-element list is returned whose "text" is an error/warning
        message and whose "bounds" is None.
    """
    print(f"Parsing PDF with Apple Vision Framework: {pdf_path}")
    if pages:
        print(f"  Page filter active: extracting only pages {sorted(pages)}")
    
    # Load PDF
    pdf_url = NSURL.fileURLWithPath_(pdf_path)
    pdf_doc = Quartz.PDFDocument.alloc().initWithURL_(pdf_url)
    
    if pdf_doc is None:
        # Callers receive a placeholder element rather than an exception.
        print("Error: Could not load PDF")
        return [{"text": "Error: Could not load PDF.", "metadata": {"page_number": 1, "bounds": None}}]
    
    # NOTE: this is the document's total page count, even when a page filter is active.
    page_count = pdf_doc.pageCount()
    print(f"Processing {page_count} pages...")

    # Convert pages to set for faster lookup
    pages_set = set(pages) if pages else None

    # Accumulates the recognized elements of every processed page, in page order.
    element_list = []

    for page_idx in range(page_count):
        page = pdf_doc.pageAtIndex_(page_idx)
        page_num = page_idx + 1  # the public page numbers are 1-indexed

        # Skip if page filtering is enabled and current page not in list
        if pages_set and page_num not in pages_set:
            continue
        
        try:
            # Get page bounds for rendering
            page_rect = page.boundsForBox_(Quartz.kPDFDisplayBoxMediaBox)
            
            # Render PDF page to image data using NSImage/NSBitmapImageRep
            # This is more reliable than manual CGContext creation
            from AppKit import NSImage, NSBitmapImageRep, NSCompositingOperationCopy
            from Cocoa import NSMakeRect, NSZeroPoint
            
            # Scale for better OCR quality (2x)
            scale = 2.0
            width = int(page_rect.size.width * scale)
            height = int(page_rect.size.height * scale)
            
            # Create an NSImage and render the PDF page into it
            ns_image = NSImage.alloc().initWithSize_((width, height))
            ns_image.lockFocus()
            
            # Set up transform for rendering
            import AppKit
            context = AppKit.NSGraphicsContext.currentContext().graphicsPort()
            # Save/restore brackets the scale so it applies only to this page's drawing.
            Quartz.CGContextSaveGState(context)
            Quartz.CGContextScaleCTM(context, scale, scale)
            
            # Draw the PDF page
            page.drawWithBox_(Quartz.kPDFDisplayBoxMediaBox)
            
            Quartz.CGContextRestoreGState(context)
            ns_image.unlockFocus()
            
            # Get TIFF representation and create bitmap
            tiff_data = ns_image.TIFFRepresentation()
            bitmap = NSBitmapImageRep.imageRepWithData_(tiff_data)
            
            # Convert to PNG data for Vision Framework
            from AppKit import NSPNGFileType
            png_data = bitmap.representationUsingType_properties_(NSPNGFileType, None)
            
            # Create a list to collect results from completion handler
            page_results = []
            
            def make_handler(results_container):
                """Create completion handler that captures results."""
                def handler(request, error):
                    # On error, or when there are no observations, the
                    # container is simply left untouched.
                    if error:
                        print(f"  OCR error: {error}")
                        return
                    
                    observations = request.results()
                    if not observations:
                        return
                    
                    # Sort by Y coordinate (top to bottom)
                    # (descending origin.y of each bounding box)
                    sorted_obs = sorted(observations, key=lambda obs: -obs.boundingBox().origin.y)
                    
                    for observation in sorted_obs:
                        # Only the single best candidate per observation is used.
                        top_candidates = observation.topCandidates_(1)
                        if not top_candidates:
                            continue
                        
                        text = top_candidates[0].string()
                        confidence = top_candidates[0].confidence()
                        bbox = observation.boundingBox()
                        
                        # Whitespace-only strings are dropped.
                        if text.strip():
                            results_container.append({
                                "text": text,
                                "confidence": confidence,
                                "bbox": bbox
                            })
                
                return handler
            
            # Create Vision request with completion handler
            handler = make_handler(page_results)
            vision_request = Vision.VNRecognizeTextRequest.alloc().initWithCompletionHandler_(handler)
            vision_request.setRecognitionLevel_(Vision.VNRequestTextRecognitionLevelAccurate)
            vision_request.setUsesLanguageCorrection_(True)
            
            # Create Vision handler from image data
            vision_handler = Vision.VNImageRequestHandler.alloc().initWithData_options_(
                png_data, None
            )
            
            # Perform OCR
            # The call is unpacked as a (success, error) pair; page_results is
            # read immediately afterwards.
            success, error = vision_handler.performRequests_error_([vision_request], None)
            
            if error:
                print(f"  Warning: Vision request failed for page {page_num}: {error}")
                continue
            
            if not success:
                print(f"  Warning: Vision request returned false for page {page_num}")
                continue
            
            # Process results from completion handler
            # Flatten each bbox into plain numbers; "confidence" is stored
            # inside the "bounds" dict alongside the coordinates.
            for result in page_results:
                element_list.append({
                    "text": result["text"],
                    "metadata": {
                        "page_number": page_num,
                        "bounds": {
                            "x": result["bbox"].origin.x,
                            "y": result["bbox"].origin.y,
                            "width": result["bbox"].size.width,
                            "height": result["bbox"].size.height,
                            "confidence": result["confidence"]
                        }
                    }
                })
            
            print(f"  Page {page_num}: Found {len(page_results)} text elements")
        
        except Exception as e:
            # Best-effort per page: report the failure and move on to the next page.
            print(f"  Error processing page {page_num}: {e}")
            import traceback
            traceback.print_exc()
            continue
    
    print(f"Vision Framework: Found {len(element_list)} total text elements.")
    if not element_list:
        # Same placeholder convention as the "could not load" case above.
        return [{"text": "Warning: Vision Framework found no text elements.", "metadata": {"page_number": 1, "bounds": None}}]
    
    return element_list


# --- EPUB Extraction ---
def _html_to_text(markup: str) -> str:
    """Reduce an (X)HTML document to plain text with blank-line paragraph breaks."""
    from html import unescape

    # Drop <script>/<style> elements together with their contents.
    text = re.sub(r"<(script|style).*?>.*?</\1>", " ", markup, flags=re.S|re.I)
    # Line breaks and block-closing tags become newlines before tags are stripped.
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.I)
    text = re.sub(r"</p>|</div>|</h\d>", "\n\n", text, flags=re.I)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities (&amp;, &#8217;, ...) so they are not read out literally;
    # non-breaking spaces become ordinary spaces so they collapse below.
    text = unescape(text).replace("\xa0", " ")
    text = re.sub(r"[ \t]+", " ", text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def extract_chapters_from_epub(file_like: io.BytesIO):
    """Split an EPUB into (title, plain_text) chapters, one per document item.

    Args:
        file_like: Source handed unchanged to ``epub.read_epub``.

    Returns:
        A list of ``(title, text)`` tuples in the order the book yields its
        document items. Navigation items and items with no text are skipped.
        The title is the item's file stem, unless the first line of the text
        starts with "chapter", "part" or "book" (case-insensitive), in which
        case the first 60 characters of that line are used. If no item yields
        text, all items are concatenated and returned as a single
        ``("Chapter 1", text)`` entry when that text is non-empty; otherwise
        the list is empty.
    """
    # The ITEM_* constants are defined on the top-level ebooklib package.
    import ebooklib

    bk = epub.read_epub(file_like)

    # Decode every non-navigation document once; both passes below reuse it.
    documents = [
        (item, item.get_content().decode("utf-8", errors="ignore"))
        for item in bk.get_items_of_type(ebooklib.ITEM_DOCUMENT)
        if not getattr(item, "is_nav", False)
    ]

    chapters = []
    for item, markup in documents:
        text = _html_to_text(markup)
        if not text:
            continue
        title = Path(item.file_name).stem
        first = text.splitlines()[0]
        # Prefer an in-text heading such as "Chapter 3 ..." over the file name.
        if re.match(r"(?i)\s*(chapter|part|book)\b[^\n]{0,80}", first):
            title = first[:60]
        chapters.append((title, text))

    if not chapters:
        # Fallback: treat the whole book as one chapter.
        text = _html_to_text(" ".join(markup for _, markup in documents))
        if text:
            chapters = [("Chapter 1", text)]
    return chapters

def safe_name(s: str) -> str:
    """Return *s* reduced to a filename-friendly token.

    Every run of characters other than word characters and '-' becomes a single
    underscore, and underscores at either end are trimmed. When nothing is
    left, the fallback name "chapter" is returned.
    """
    collapsed = re.sub(r"[^\w\-]+", "_", s)
    trimmed = collapsed.strip("_")
    if trimmed:
        return trimmed
    return "chapter"

# --- Pipeline cache ---
@lru_cache(maxsize=4)
def get_pipeline(lang_code='a', device=DEVICE):
    """Return a KPipeline for the given language code and device.

    Results are memoized by lru_cache (up to 4 distinct argument combinations),
    so repeated calls with the same arguments reuse one pipeline object. The
    default for ``device`` is the value DEVICE had when this cell was run.
    """
    pipeline = KPipeline(lang_code=lang_code, device=device)
    return pipeline

def _synthesize_sentence(pipe: KPipeline, sentence: str, voice='af_heart', speed=1.0) -> np.ndarray:
    """Synthesize one sentence and return its audio as a single 1-D array.

    The pipeline is called with ``split_pattern=None``; the third item of every
    result it yields is collected and the pieces are concatenated along axis 0.
    If the pipeline yields nothing, an empty float32 array is returned.
    """
    pieces = [
        chunk
        for _, _, chunk in pipe(sentence, voice=voice, speed=speed, split_pattern=None)
    ]
    if pieces:
        return np.concatenate(pieces, axis=0)
    return np.zeros((0,), dtype=np.float32)

def split_sentences_keep_delim(text: str, pattern: Optional[str] = None) -> List[str]:
    """Split *text* into sentences, keeping each sentence's closing punctuation.

    Args:
        text: Text to split.
        pattern: Regex whose whole body is wrapped in exactly one capture
            group, so that ``re.split`` returns the matched delimiters between
            the chunks. Defaults to the module-level ``SPLIT_PATTERN_CAP``.

    Returns:
        Stripped, non-empty sentences in order. A delimiter that contains
        punctuation is appended directly to the sentence before it
        ("Hello world."); whitespace-only delimiters such as paragraph breaks
        are dropped.
    """
    if pattern is None:
        pattern = SPLIT_PATTERN_CAP

    # With one capture group re.split() alternates: chunk, sep, chunk, ..., chunk.
    parts = re.split(pattern, text)
    bodies = parts[0::2]
    seps = parts[1::2] + [""]  # the final chunk has no trailing delimiter

    sents = []
    for body, sep in zip(bodies, seps):
        chunk = (body or "").strip()
        if not chunk:
            continue
        # Re-attach the punctuation without inserting a space in front of it.
        chunk += (sep or "").strip()
        sents.append(chunk)
    return sents

# --- Synthesizer ---
def synth_text_to_wav_and_manifest(
    text_or_elements: Union[str, List[Dict]],
    voice='af_heart',
    speed=1.0,
    lang_code='a',
    device=DEVICE) -> Tuple[bytes, Dict]:
    """Synthesize text sentence by sentence and build a timeline manifest.

    Args:
        text_or_elements: Either a plain string, or a list of element dicts
            with a "text" entry and an optional "metadata" entry.
        voice: Voice name forwarded to the pipeline.
        speed: Speed value forwarded to the pipeline.
        lang_code: Language code used to obtain the pipeline.
        device: Device used to obtain the pipeline.

    Returns:
        ``(wav_bytes, manifest)``. ``manifest["sentences"]`` has one entry per
        synthesized sentence with its index "i", "start"/"end" in seconds
        (rounded to 3 decimals, computed at a 24000 Hz sample rate), the
        sentence "text" and the owning element's metadata as "location".
        ``manifest["audioUrl"]`` is left empty for the caller to fill in.
        When nothing is synthesized the audio is a tenth of a second of silence.
    """
    sample_rate = 24000
    pipeline = get_pipeline(lang_code=lang_code, device=device)

    # A bare string is wrapped as a single element on page 1 without bounds.
    if isinstance(text_or_elements, str):
        items = [{"text": text_or_elements, "metadata": {"page_number": 1, "bounds": None}}]
    else:
        items = text_or_elements

    audio_chunks = []
    entries = []
    clock = 0.0  # running start time of the next sentence, in seconds

    print(f"Synthesizing {len(items)} text elements...")
    for item in items:
        body = item.get("text", "")
        location = item.get("metadata", {})
        for sentence in split_sentences_keep_delim(body):
            if not sentence:
                continue
            samples = _synthesize_sentence(pipeline, sentence, voice=voice, speed=speed)
            length = samples.shape[0] / sample_rate
            entries.append({
                "i": len(entries),  # equals the running sentence counter
                "start": round(clock, 3),
                "end": round(clock + length, 3),
                "text": sentence.strip(),
                "location": location
            })
            audio_chunks.append(samples)
            clock += length

    if audio_chunks:
        waveform = np.concatenate(audio_chunks, axis=0)
    else:
        waveform = np.zeros((sample_rate // 10,), dtype=np.float32)

    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, waveform, sample_rate, format='WAV')
    return wav_buffer.getvalue(), {"audioUrl": "", "sentences": entries}

def wav_to_mp3_bytes(wav_bytes: bytes, bitrate="128k") -> bytes:
    """Re-encode in-memory WAV data as MP3 and return the encoded bytes.

    The WAV bytes are loaded into a pydub AudioSegment and exported to an
    in-memory buffer as MP3 at the requested bitrate.
    """
    segment = AudioSegment.from_file(io.BytesIO(wav_bytes), format="wav")
    mp3_buffer = io.BytesIO()
    segment.export(mp3_buffer, format="mp3", bitrate=bitrate)
    return mp3_buffer.getvalue()

## 4) High-Level Synthesis Wrappers

In [None]:
def synth_string(text: str,
                 voice="af_heart",
                 speed=1.0,
                 out_format="wav",
                 lang_code="a",
                 device=None,
                 basename="kokoro_text",
                 output_dir=None):
    """Synthesize a plain string and write the audio plus a JSON manifest.

    Args:
        text: Text to synthesize.
        voice: Voice name forwarded to the synthesizer.
        speed: Speed value forwarded to the synthesizer.
        out_format: "mp3" (case-insensitive) writes MP3; anything else writes WAV.
        lang_code: Language code forwarded to the synthesizer.
        device: Device to use; falls back to the global DEVICE when falsy.
        basename: Output file name without extension.
        output_dir: Target directory; falls back to the global OUTPUT_DIR when
            falsy. It is created if it does not exist yet.

    Returns:
        Tuple of (audio_path, manifest_path) as strings.
    """
    import json

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else Path(OUTPUT_DIR)
    # Create the target folder up front so the writes below cannot fail with
    # FileNotFoundError after the (slow) synthesis has already run.
    output_dir.mkdir(parents=True, exist_ok=True)

    elements = [{
        "text": text,
        "metadata": {"page_number": 1, "source": "string", "bounds": None}
    }]

    wav_bytes, manifest = synth_text_to_wav_and_manifest(
        elements,
        voice=voice, speed=speed, lang_code=lang_code, device=device
    )

    # Extensions are appended as text so a basename containing dots stays intact.
    out_base = output_dir / basename

    if out_format.lower() == "mp3":
        audio_path = str(out_base) + ".mp3"
        payload = wav_to_mp3_bytes(wav_bytes)
    else:
        audio_path = str(out_base) + ".wav"
        payload = wav_bytes
    with open(audio_path, "wb") as f:
        f.write(payload)

    # The manifest refers to the audio by file name only (same directory).
    manifest_path = str(out_base) + "_manifest.json"
    manifest["audioUrl"] = Path(audio_path).name
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

def synth_pdf(file_path: str,
              voice="af_heart",
              speed=1.0,
              out_format="wav",
              lang_code="a",
              device=None,
              basename=None,
              output_dir=None,
              pages=None):
    """
    Extract text from PDF using Vision Framework and synthesize with Kokoro.

    Args:
        file_path: Path to the PDF file
        voice: Voice to use for synthesis (default: 'af_heart')
        speed: Speech speed multiplier (default: 1.0)
        out_format: Output format 'wav' or 'mp3' (default: 'wav'); any value
                    other than 'mp3' (case-insensitive) produces WAV
        lang_code: Kokoro language code (default: 'a', American English)
        device: Device to use (default: None, uses global DEVICE)
        basename: Output filename base (default: derived from input filename);
                  '_tts' is appended to it
        output_dir: Output directory (default: None, uses global OUTPUT_DIR).
                    Created if it does not exist yet.
        pages: Optional list of page numbers to extract (1-indexed). If None, extracts all pages.
               Examples: [1, 2, 3] for pages 1-3, [5] for page 5 only, [1, 3, 5, 7] for specific pages

    Returns:
        Tuple of (audio_path, manifest_path)
    """
    import json

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else Path(OUTPUT_DIR)
    # Create the target folder up front so the writes below cannot fail with
    # FileNotFoundError after OCR and synthesis have already run.
    output_dir.mkdir(parents=True, exist_ok=True)

    stem = Path(file_path).stem

    # Extract text using Vision Framework
    elements = extract_text_from_pdf_vision(file_path, pages=pages)

    wav_bytes, manifest = synth_text_to_wav_and_manifest(
        elements,
        voice=voice, speed=speed, lang_code=lang_code, device=device
    )

    # Extensions are appended as text so a name containing dots stays intact.
    out_base = output_dir / f"{(basename or stem)}_tts"

    if out_format.lower() == "mp3":
        audio_path = str(out_base) + ".mp3"
        payload = wav_to_mp3_bytes(wav_bytes)
    else:
        audio_path = str(out_base) + ".wav"
        payload = wav_bytes
    with open(audio_path, "wb") as f:
        f.write(payload)

    # The manifest refers to the audio by file name only (same directory).
    manifest_path = str(out_base) + "_manifest.json"
    manifest["audioUrl"] = Path(audio_path).name
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

def synth_epub(file_path_or_bytes,
               voice="af_heart",
               speed=1.0,
               per_chapter_format="wav",
               lang_code="a",
               device=None,
               zip_name=None,
               output_dir=None):
    """Synthesize every chapter of an EPUB and bundle the results in a ZIP.

    Args:
        file_path_or_bytes: Path (str or Path) to an EPUB file, raw EPUB
            bytes, or an already-open binary file-like object.
        voice: Voice name forwarded to the synthesizer.
        speed: Speed value forwarded to the synthesizer.
        per_chapter_format: "mp3" (case-insensitive) stores MP3 per chapter;
            anything else stores WAV.
        lang_code: Language code forwarded to the synthesizer.
        device: Device to use; falls back to the global DEVICE when falsy.
        zip_name: ZIP file name without extension. Defaults to
            "<input stem>_chapters", or "book_chapters" for non-path input.
        output_dir: Target directory; falls back to the global OUTPUT_DIR when
            falsy. It is created if it does not exist yet.

    Returns:
        Path of the written ZIP file as a string. For each chapter the archive
        holds "<NN>_<title>.<ext>" and "<NN>_<title>_manifest.json".

    Raises:
        AssertionError: If no chapters could be extracted from the EPUB.
    """
    import json

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else Path(OUTPUT_DIR)
    # Create the target folder up front so the final write cannot fail with
    # FileNotFoundError after every chapter has been synthesized.
    output_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(file_path_or_bytes, (str, Path)):
        with open(file_path_or_bytes, "rb") as fh:
            epub_bytes = io.BytesIO(fh.read())
        stem = Path(file_path_or_bytes).stem
    elif isinstance(file_path_or_bytes, (bytes, bytearray)):
        # Raw bytes are wrapped so the reader gets a seekable file-like object.
        epub_bytes = io.BytesIO(file_path_or_bytes)
        stem = "book"
    else:
        epub_bytes = file_path_or_bytes
        stem = "book"

    chapters = extract_chapters_from_epub(epub_bytes)
    if not chapters:
        # Explicit raise (same exception type and message as the former
        # assert) so the check is not stripped when Python runs with -O.
        raise AssertionError("No chapters detected in EPUB.")

    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for idx, (title, body) in enumerate(chapters, 1):
            # Zero-padded index keeps archive entries in reading order.
            name = f"{idx:02d}_{safe_name(title)[:40]}"

            chapter_elements = [{
                "text": body,
                "metadata": {
                    "chapter_index": idx,
                    "chapter_title": title,
                    "page_number": 1,
                    "bounds": None
                }
            }]

            wav_bytes, manifest = synth_text_to_wav_and_manifest(
                chapter_elements,
                voice=voice, speed=speed, lang_code=lang_code, device=device
            )

            if per_chapter_format.lower() == "mp3":
                audio_name = f"{name}.mp3"
                zf.writestr(audio_name, wav_to_mp3_bytes(wav_bytes))
            else:
                audio_name = f"{name}.wav"
                zf.writestr(audio_name, wav_bytes)

            manifest["audioUrl"] = audio_name
            zf.writestr(f"{name}_manifest.json", json.dumps(manifest, ensure_ascii=False, indent=2))

    zpath = str(output_dir / f"{zip_name or (stem + '_chapters')}.zip")
    with open(zpath, "wb") as f:
        f.write(zip_buf.getvalue())
    return zpath

## Usage Examples

Below are examples for synthesizing text, PDFs, and EPUBs locally using Vision Framework + Kokoro.

### A) String → Audio

In [None]:
# Configuration
# These module-level names are re-assigned by the later example cells as well.
VOICE = "af_heart"       # passed to synth_string as `voice`
SPEED = 1.0              # passed as `speed`
FORMAT = "mp3"  # "wav" or "mp3"
LANG = "a"               # passed as `lang_code`
BASENAME = "kokoro_text"  # output file name without extension

# Text to synthesize
TEXT = """Paste or type your text here.
It can be multiple paragraphs. Chapters aren't needed for this path.
"""

# Run synthesis
# No output_dir is given, so files go to the global OUTPUT_DIR.
audio_path, manifest_path = synth_string(
    TEXT, 
    voice=VOICE, 
    speed=SPEED,
    out_format=FORMAT, 
    lang_code=LANG,
    basename=BASENAME
)

print(f"Audio saved to: {audio_path}")
print(f"Manifest saved to: {manifest_path}")

### B) PDF → Audio (with Vision Framework OCR)

**Vision Framework advantages:**
- Native macOS integration (very fast)
- Precise bounding boxes for each text element
- High accuracy text recognition
- No heavy dependencies (no detectron2, unstructured, etc.)

In [None]:
# Configuration
VOICE = "af_heart"  # passed to synth_pdf as `voice`
SPEED = 1.0         # passed as `speed`
FORMAT = "mp3"  # "wav" or "mp3"
LANG = "a"          # passed as `lang_code`

# Specify the path to your PDF file (relative to notebook location)
PDF_PATH = "document.pdf"  # Change this to your PDF filename

# Page selection (optional)
# None = all pages (default)
# [1, 2, 3] = only pages 1, 2, and 3
# [5] = only page 5
# [1, 3, 5, 7] = only odd pages 1, 3, 5, 7
# Page numbers are 1-indexed.
PAGES = None  # Change to a list like [1, 2, 3] to select specific pages

# Run synthesis (Vision Framework will extract text with locations)
# No basename is given, so the output name is derived from the PDF file name.
audio_path, manifest_path = synth_pdf(
    PDF_PATH,
    voice=VOICE,
    speed=SPEED,
    out_format=FORMAT,
    lang_code=LANG,
    pages=PAGES
)

print(f"Audio saved to: {audio_path}")
print(f"Manifest saved to: {manifest_path}")

### C) EPUB → ZIP (Per-Chapter Audio + Manifests)

In [None]:
# Configuration
VOICE = "af_heart"
SPEED = 1.0
CHAPTER_FORMAT = "wav"  # "wav" or "mp3"
LANG = "a"
ZIP_NAME = ""  # Optional: custom name for the output ZIP file

# Specify the path to your EPUB file (relative to notebook location)
EPUB_PATH = "book.epub"  # Change this to your EPUB filename

# Run synthesis
# An empty ZIP_NAME is turned into None so synth_epub picks its default name.
zip_path = synth_epub(
    EPUB_PATH,
    voice=VOICE,
    speed=SPEED,
    per_chapter_format=CHAPTER_FORMAT,
    lang_code=LANG,
    zip_name=(ZIP_NAME or None)
)

print(f"ZIP archive saved to: {zip_path}")

## Notes

### Vision Framework Benefits:
- **Speed**: Native macOS framework - much faster than unstructured.io
- **Accuracy**: Excellent OCR quality with language correction
- **Bounding Boxes**: Precise normalized coordinates (0-1) for each text element
- **Confidence Scores**: Each text element includes confidence level
- **No Heavy Dependencies**: No need for detectron2 or complex ML models

### System Requirements:
- **macOS only** (Vision Framework is Apple-exclusive)
- For Apple Silicon Macs, the notebook will automatically use MPS acceleration
- Requires PyObjC for Vision Framework access

### Apple Silicon (M1/M2/M3) Performance:
- **MPS Backend**: Uses Apple's Metal Performance Shaders for GPU acceleration
- **CPU Fallback**: Some operations (like `torch.angle` in STFT) aren't yet implemented on MPS and will automatically fall back to CPU
- **Overall Performance**: Still faster than pure CPU mode due to GPU acceleration for supported operations
- To force CPU-only mode, set `DEVICE_MODE = "cpu"` in the Configuration cell

### Output:
- **Output Directory**: By default, all outputs are saved to the same directory as the notebook
- **Input Files**: Place your PDF/EPUB files in the same directory as the notebook, or provide relative/absolute paths
- **Device Selection**: Auto-detects MPS (Apple Silicon), CUDA, or CPU
- **Manifest Format**: JSON files with precise bounding box coordinates for each sentence

### Comparison with Other Approaches:
- **Vision Framework** (this notebook): Best for general documents on macOS, fastest OCR
- **Nougat** (TTS_Nougat.ipynb): Best for scientific papers with equations
- **Unstructured.io** (TTS_Kokoro_Local.ipynb): Cross-platform, slower but works anywhere

## Cleanup: Delete Environment (Optional)

**If you created a new environment at the beginning of this notebook**, you can delete it here to free up storage space.

⚠️ **Warning**: This will permanently delete the environment and all installed packages!

In [None]:
import subprocess

# Check if we created an environment in this notebook.
# The two tracking variables are set by the environment setup cell; if that
# cell never ran in this kernel session they do not exist at all.
if 'environment_created_by_notebook' not in globals():
    print("✗ No environment tracking found")
    print("This cell only works if you ran the environment setup cell at the beginning")
elif not environment_created_by_notebook:
    # Covers "use existing" and "skip": only environments this notebook
    # created itself are ever offered for deletion.
    print("✗ No environment was created by this notebook")
    print("You can only delete environments that were created in this session")
else:
    print(f"Environment '{environment_name}' was created by this notebook")
    print(f"\n{'='*60}")
    print("DELETE ENVIRONMENT")
    print(f"{'='*60}")

    # Deletion is irreversible, so require the literal word "yes".
    confirm = input(f"\nAre you sure you want to DELETE '{environment_name}'?\nType 'yes' to confirm: ").strip().lower()

    if confirm == 'yes':
        print(f"\n→ Deleting environment '{environment_name}'...")
        print("  This may take a moment...")

        try:
            subprocess.run(['conda', 'env', 'remove', '-n', environment_name, '-y'],
                           check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to delete environment: {e}")
            # Output was captured, so surface conda's own explanation.
            if e.stderr:
                print(e.stderr.strip())
            print(f"You may need to delete it manually with: conda env remove -n {environment_name}")
        else:
            print(f"✓ Environment '{environment_name}' deleted successfully!")
            print("  Storage space has been freed.")

            # Reset the flag so re-running this cell does not try again
            environment_created_by_notebook = False
            environment_name = None
    else:
        print("\n✗ Deletion cancelled - environment preserved")