# TTS v4 - PyMuPDF Version (Kokoro)
- Uses PyMuPDF for fast, accurate PDF text extraction with precise coordinates
- Direct text extraction from PDF (no image conversion or OCR)
- Block-level bounding boxes (one per PDF text block) in native PDF coordinate space
- Includes sentence tracking and timeline manifest generation

## 1) Install Dependencies

In [None]:
# Core TTS + I/O deps
!pip install "kokoro>=0.9.4" soundfile misaki[en] ebooklib pydub

# PyMuPDF for fast, accurate PDF extraction with precise coordinates
!pip install pymupdf

# Note: ffmpeg should be installed on your system for MP3 encoding
# Linux: sudo apt-get install ffmpeg
# macOS: brew install ffmpeg
# Windows: Download from https://ffmpeg.org/

# Silence overly chatty logs: only records at ERROR level or above are let
# through on these two named loggers.
import logging
logging.getLogger("phonemizer").setLevel(logging.ERROR)
# NOTE(review): presumably targets PyMuPDF (imported as `fitz`) - confirm the
# library actually logs under this logger name.
logging.getLogger("fitz").setLevel(logging.ERROR)

## 2) Configuration and Setup

In [None]:
import os
from pathlib import Path

# --- MPS Fallback for Apple Silicon ---
# Enable CPU fallback for operations not yet implemented on MPS
# (specifically torch.angle used in Kokoro's STFT operations).
# Set here, ahead of the `import torch` further down in this cell, so the flag
# is already in the environment when PyTorch starts up.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# --- Output directory setup ---
OUTPUT_DIR = Path.home() / "kokoro_outputs"  # Change this to your preferred output directory
# parents=True so that a nested custom path (e.g. ~/audio/kokoro/out) is
# created in one go instead of failing on the missing parent directory.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"Output directory: {OUTPUT_DIR}")

# --- Device selection ---
# DEVICE_MODE: "auto" (default), "cuda", "cpu", or "mps" (Apple Silicon)
# Any other value is handled by _pick_device() the same way as "auto".
DEVICE_MODE = "auto"

import torch
def _pick_device():
    """Return the torch device string selected by ``DEVICE_MODE``.

    "cuda", "cpu" and "mps" are returned as-is, without checking that the
    device is actually available. Any other value (including "auto") triggers
    auto-detection: MPS first, then CUDA, then CPU.
    """
    if DEVICE_MODE in ("cuda", "cpu", "mps"):
        return DEVICE_MODE

    # Auto mode: prefer MPS on Apple Silicon, then CUDA, then CPU.
    # `torch.backends.mps` is only present on PyTorch builds that ship the MPS
    # backend, so look it up defensively instead of risking an AttributeError.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    return "cuda" if torch.cuda.is_available() else "cpu"

# Resolve the device once; DEVICE is the default `device` argument of the
# pipeline and synthesis helpers defined further down.
DEVICE = _pick_device()
print(f"Using device: {DEVICE}")
if DEVICE == "mps":
    # Reminder that goes with the PYTORCH_ENABLE_MPS_FALLBACK flag set above.
    print("Note: MPS will fall back to CPU for unsupported operations (like torch.angle)")

## 3) Helper Functions (PDF/EPUB extraction & TTS synthesis)

In [None]:
import numpy as np
import soundfile as sf
import re, io, zipfile, torch
from pathlib import Path
from typing import List, Tuple, Dict, Union
from functools import lru_cache

import fitz  # PyMuPDF
from ebooklib import epub
from kokoro import KPipeline
from pydub import AudioSegment

# Sentence-ish split; keeps chunks small (avoids 510-phoneme truncation)
# Matches ".", "?" or "!" followed by whitespace, or a run of two or more
# newlines (a paragraph break).
SPLIT_PATTERN = r"[.?!]\s+|[\n]{2,}"
# Same pattern wrapped in a capture group, so that re.split() also returns
# the delimiters it split on (used by split_sentences_keep_delim).
SPLIT_PATTERN_CAP = r"([.?!]\s+|[\n]{2,})"


# --- PDF Extraction using PyMuPDF ---
def extract_text_from_pdf_pymupdf(file_path_or_bytes: Union[str, Path, io.BytesIO]) -> List[Dict]:
    """Extract the text blocks of a PDF with PyMuPDF, one element per block.

    Args:
        file_path_or_bytes: Path to a PDF file (``str`` or ``Path``), or a
            binary file-like object such as ``BytesIO`` whose ``read()``
            returns the PDF bytes.

    Returns:
        List of dicts with 'text' (the block's non-blank lines joined by
        newlines) and 'metadata' holding the 1-based 'page_number' and
        'points', the four corners of the block's bounding box as reported by
        PyMuPDF (points format for HTML compatibility). If the document yields
        no text at all, a single placeholder element whose 'points' is None is
        returned instead.
    """
    print("Parsing PDF with PyMuPDF (direct text extraction)...")

    # Open PDF
    if isinstance(file_path_or_bytes, (str, Path)):
        doc = fitz.open(file_path_or_bytes)
    else:
        doc = fitz.open(stream=file_path_or_bytes.read(), filetype="pdf")

    element_list = []

    # try/finally so the document handle is released even if a page fails
    # to parse part-way through.
    try:
        print(f"Processing {len(doc)} pages...")

        for page_num in range(len(doc)):
            page = doc[page_num]
            page_number = page_num + 1

            # Extract text blocks with detailed information
            blocks = page.get_text("dict", flags=fitz.TEXT_PRESERVE_WHITESPACE)["blocks"]

            block_count = 0

            for block in blocks:
                # Skip non-text blocks (images, etc.)
                if block.get("type") != 0:  # 0 = text block
                    continue

                block_bbox = block.get("bbox")  # (x0, y0, x1, y1)

                block_lines = []
                for line in block.get("lines", []):
                    # Join every span, including whitespace-only ones: a lone
                    # " " span can be the only separator between two
                    # differently styled words, and dropping it would glue
                    # them together ("Helloworld").
                    line_text = "".join(
                        span.get("text", "") for span in line.get("spans", [])
                    )
                    # Lines that are blank overall are still skipped.
                    if line_text.strip():
                        block_lines.append(line_text)

                block_text = "\n".join(block_lines).strip()
                if not block_text:
                    continue

                # Convert bbox to points format for HTML player compatibility
                # Format: [[x0, y0], [x1, y0], [x1, y1], [x0, y1]]
                # Using absolute coordinates (HTML will infer page dimensions)
                x0, y0, x1, y1 = block_bbox
                points = [
                    [x0, y0],  # top-left
                    [x1, y0],  # top-right
                    [x1, y1],  # bottom-right
                    [x0, y1],  # bottom-left
                ]

                element_list.append({
                    "text": block_text,
                    "metadata": {
                        "page_number": page_number,
                        "points": points
                    }
                })
                block_count += 1

            print(f"  Page {page_number}: Found {block_count} text blocks")
    finally:
        doc.close()

    print(f"PyMuPDF: Found {len(element_list)} total text elements.")
    if not element_list:
        # Placeholder so downstream synthesis still receives one element.
        return [{"text": "Warning: PyMuPDF found no text elements.", "metadata": {"page_number": 1, "points": None}}]

    return element_list


# --- EPUB Extraction ---
def _epub_html_to_text(markup: str) -> str:
    """Reduce one (X)HTML document to plain text with paragraph breaks."""
    # Function-scope import: the file has no module-level `html` import.
    from html import unescape

    # Drop script/style elements together with their content.
    text = re.sub(r"<(script|style).*?>.*?</\1>", " ", markup, flags=re.S|re.I)
    # Line breaks and block-level closers become newlines / blank lines.
    text = re.sub(r"<br\s*/?>", "\n", text, flags=re.I)
    text = re.sub(r"</p>|</div>|</h\d>", "\n\n", text, flags=re.I)
    # Every remaining tag becomes a space.
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities (&amp;, &#8217;, &nbsp; ...) only after the tags are
    # gone, so escaped markup such as "&lt;b&gt;" survives as literal text.
    # Non-breaking spaces are folded into ordinary spaces so the whitespace
    # collapsing below sees them.
    text = unescape(text).replace("\xa0", " ")
    text = re.sub(r"[ \t]+", " ", text)
    return re.sub(r"\n{3,}", "\n\n", text).strip()


def extract_chapters_from_epub(file_like: io.BytesIO) -> List[Tuple[str, str]]:
    """Read an EPUB and return its document items as ``(title, text)`` pairs.

    Args:
        file_like: Object passed straight to ``epub.read_epub`` (the callers
            in this file pass a ``BytesIO`` holding the EPUB).

    Returns:
        One ``(title, text)`` tuple per document item that contains text. The
        title is the item's file-name stem, unless the first line of the text
        starts with "chapter", "part" or "book" (case-insensitive), in which
        case the first 60 characters of that line are used. If no item yields
        text, all documents are joined and converted as a single
        "Chapter 1"; the list is empty if that yields nothing either.
    """
    bk = epub.read_epub(file_like)

    # NOTE(review): ITEM_DOCUMENT is documented on the top-level `ebooklib`
    # package; confirm `epub.ITEM_DOCUMENT` resolves with the installed version.
    raw_documents = [
        (item, item.get_content().decode("utf-8", errors="ignore"))
        for item in bk.get_items_of_type(epub.ITEM_DOCUMENT)
        if not getattr(item, "is_nav", False)  # skip navigation documents
    ]

    chapters = []
    for item, markup in raw_documents:
        text = _epub_html_to_text(markup)
        if not text:
            continue
        title = Path(item.file_name).stem
        first = text.splitlines()[0]
        if re.match(r"(?i)\s*(chapter|part|book)\b[^\n]{0,80}", first):
            title = first[:60]
        chapters.append((title, text))

    if not chapters:
        # Fallback: treat the whole book as one chapter.
        text = _epub_html_to_text(" ".join(markup for _, markup in raw_documents))
        if text:
            chapters = [("Chapter 1", text)]
    return chapters

def safe_name(s: str) -> str:
    """Turn *s* into a file-name-safe token.

    Every run of characters other than word characters and "-" becomes a
    single "_"; leading and trailing underscores are removed. Falls back to
    "chapter" when nothing is left.
    """
    cleaned = re.sub(r"[^\w\-]+", "_", s)
    cleaned = cleaned.strip("_")
    if cleaned:
        return cleaned
    return "chapter"

# --- Pipeline cache ---
@lru_cache(maxsize=4)
def get_pipeline(lang_code='a', device=DEVICE):
    """Return a Kokoro ``KPipeline`` for *lang_code* running on *device*.

    Results are memoised by ``lru_cache`` (at most 4 entries, keyed by the
    call arguments), so repeated calls with the same arguments reuse the same
    pipeline object. The default *device* is the value ``DEVICE`` had when
    this function was defined.
    """
    pipeline = KPipeline(lang_code=lang_code, device=device)
    return pipeline

def _synthesize_sentence(pipe: KPipeline, sentence: str, voice='af_heart', speed=1.0) -> np.ndarray:
    """Synthesize one sentence and return its audio as a single array.

    The pipeline is called with ``split_pattern=None``; the audio item of
    every 3-tuple it yields is collected and the pieces are concatenated along
    axis 0. If the pipeline yields nothing, an empty float32 array is returned.
    """
    generated = pipe(sentence, voice=voice, speed=speed, split_pattern=None)
    pieces = [audio for _, _, audio in generated]
    if pieces:
        return np.concatenate(pieces, axis=0)
    return np.zeros((0,), dtype=np.float32)

def split_sentences_keep_delim(text: str) -> List[str]:
    """Split *text* into sentences, keeping each sentence's end punctuation.

    The text is split on ``SPLIT_PATTERN_CAP``. The punctuation of each
    delimiter is appended directly to the sentence it terminates
    ("Hello world." rather than "Hello world ."); delimiters that are
    whitespace only (paragraph breaks) contribute nothing.

    Returns:
        The stripped, non-empty sentences in their original order.
    """
    # With a capturing pattern re.split() alternates text and delimiter:
    # [text0, delim0, text1, delim1, ..., textN].
    parts = re.split(SPLIT_PATTERN_CAP, text)
    bodies = parts[0::2]
    delims = parts[1::2] + [""]  # the last chunk has no trailing delimiter

    sents = []
    for body, delim in zip(bodies, delims):
        chunk = (body or "").strip()
        if not chunk:
            continue
        # strip() removes the whitespace part of the delimiter and leaves
        # only the punctuation (or nothing for a paragraph break).
        sents.append(chunk + (delim or "").strip())
    return sents

# --- Synthesizer ---
def synth_text_to_wav_and_manifest(
    text_or_elements: Union[str, List[Dict]],
    voice='af_heart',
    speed=1.0,
    lang_code='a',
    device=DEVICE) -> Tuple[bytes, Dict]:
    """Synthesize text sentence by sentence and build a timing manifest.

    Args:
        text_or_elements: Either a plain string (wrapped into one element on
            page 1 with no points) or a list of element dicts with 'text' and
            'metadata' keys.
        voice, speed: Passed through to the Kokoro pipeline for each sentence.
        lang_code, device: Select the cached pipeline via ``get_pipeline``.

    Returns:
        ``(wav_bytes, manifest)``. ``wav_bytes`` is the concatenated audio
        written as WAV at 24000 Hz (0.1 s of silence when nothing was
        synthesized). ``manifest`` is ``{"audioUrl": "", "sentences": [...]}``
        where each sentence entry has its index 'i', 'start'/'end' in seconds
        rounded to 3 decimals, the sentence 'text' and the originating
        element's metadata under 'location'.
    """
    # NOTE(review): 24000 is assumed to be the sample rate of the audio the
    # pipeline returns - confirm against the Kokoro documentation.
    sample_rate = 24000
    pipeline = get_pipeline(lang_code=lang_code, device=device)

    elements = text_or_elements
    if isinstance(text_or_elements, str):
        elements = [{"text": text_or_elements, "metadata": {"page_number": 1, "points": None}}]

    audio_chunks = []
    sentence_entries = []
    clock = 0.0  # running position in the output audio, in seconds
    print(f"Synthesizing {len(elements)} text elements...")

    for element in elements:
        body = element.get("text", "")
        location = element.get("metadata", {})

        for sentence in split_sentences_keep_delim(body):
            if not sentence:
                continue
            samples = _synthesize_sentence(pipeline, sentence, voice=voice, speed=speed)
            finish = clock + samples.shape[0] / sample_rate
            sentence_entries.append({
                "i": len(sentence_entries),
                "start": round(clock, 3),
                "end": round(finish, 3),
                "text": sentence.strip(),
                "location": location  # Note: renamed from "metadata" to "location" for HTML compatibility
            })
            audio_chunks.append(samples)
            clock = finish

    if audio_chunks:
        waveform = np.concatenate(audio_chunks, axis=0)
    else:
        # Nothing synthesized: emit a tenth of a second of silence.
        waveform = np.zeros((sample_rate // 10,), dtype=np.float32)

    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, waveform, sample_rate, format='WAV')
    return wav_buffer.getvalue(), {"audioUrl": "", "sentences": sentence_entries}

def wav_to_mp3_bytes(wav_bytes: bytes, bitrate="128k") -> bytes:
    """Transcode in-memory WAV data to MP3 with pydub and return the MP3 bytes.

    Args:
        wav_bytes: Complete WAV file contents.
        bitrate: Bitrate string handed to ``AudioSegment.export``.
    """
    segment = AudioSegment.from_file(io.BytesIO(wav_bytes), format="wav")
    mp3_buffer = io.BytesIO()
    segment.export(mp3_buffer, format="mp3", bitrate=bitrate)
    return mp3_buffer.getvalue()

## 4) High-Level Synthesis Wrappers

In [None]:
def synth_string(text: str,
                 voice="af_heart",
                 speed=1.0,
                 out_format="wav",
                 lang_code="a",
                 device=None,
                 basename="kokoro_text",
                 output_dir=None):
    """Synthesize a plain string and write the audio plus its manifest.

    Args:
        text: Text to speak.
        voice, speed, lang_code: Passed to ``synth_text_to_wav_and_manifest``.
        out_format: "mp3" (case-insensitive) writes MP3; any other value
            writes WAV.
        device: Torch device string; falls back to ``DEVICE`` when falsy.
        basename: Stem of the output files.
        output_dir: Target directory; falls back to ``OUTPUT_DIR`` when
            falsy. It is created if it does not exist.

    Returns:
        ``(audio_path, manifest_path)`` as strings. The manifest is written
        to ``<basename>_manifest.json`` and its "audioUrl" is the audio
        file's name.
    """
    import json  # function-scope: the file has no module-level json import

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else OUTPUT_DIR
    # A caller-supplied directory may not exist yet; without this the open()
    # calls below fail with FileNotFoundError after synthesis has already run.
    output_dir.mkdir(parents=True, exist_ok=True)

    elements = [{
        "text": text,
        "metadata": {"page_number": 1, "source": "string", "points": None}
    }]

    wav_bytes, manifest = synth_text_to_wav_and_manifest(
        elements,
        voice=voice, speed=speed, lang_code=lang_code, device=device
    )

    out_base = output_dir / basename

    if out_format.lower() == "mp3":
        audio_path = str(out_base) + ".mp3"
        audio_bytes = wav_to_mp3_bytes(wav_bytes)
    else:
        audio_path = str(out_base) + ".wav"
        audio_bytes = wav_bytes
    with open(audio_path, "wb") as f:
        f.write(audio_bytes)

    manifest_path = str(out_base) + "_manifest.json"
    # Relative reference: the manifest is written next to the audio file.
    manifest["audioUrl"] = Path(audio_path).name
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

def synth_pdf(file_path_or_bytes,
              voice="af_heart",
              speed=1.0,
              out_format="wav",
              lang_code="a",
              device=None,
              basename=None,
              output_dir=None):
    """Extract text from PDF using PyMuPDF and synthesize with Kokoro.

    Args:
        file_path_or_bytes: PDF path (``str``/``Path``) or a binary file-like
            object, as accepted by ``extract_text_from_pdf_pymupdf``.
        voice, speed, lang_code: Passed to ``synth_text_to_wav_and_manifest``.
        out_format: "mp3" (case-insensitive) writes MP3; any other value
            writes WAV.
        device: Torch device string; falls back to ``DEVICE`` when falsy.
        basename: Output stem. Defaults to the PDF's file stem for path
            input, or "document" for file-like input. "_tts" is appended.
        output_dir: Target directory; falls back to ``OUTPUT_DIR`` when
            falsy. It is created if it does not exist.

    Returns:
        ``(audio_path, manifest_path)`` as strings.
    """
    import json  # function-scope: the file has no module-level json import

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else OUTPUT_DIR
    # A caller-supplied directory may not exist yet; without this the open()
    # calls below fail with FileNotFoundError after synthesis has already run.
    output_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(file_path_or_bytes, (str, Path)):
        stem = Path(file_path_or_bytes).stem
    else:
        stem = "document"  # file-like input carries no name

    # Extract text using PyMuPDF
    elements = extract_text_from_pdf_pymupdf(file_path_or_bytes)

    wav_bytes, manifest = synth_text_to_wav_and_manifest(
        elements,
        voice=voice, speed=speed, lang_code=lang_code, device=device
    )

    out_base = output_dir / f"{(basename or stem)}_tts"

    if out_format.lower() == "mp3":
        audio_path = str(out_base) + ".mp3"
        audio_bytes = wav_to_mp3_bytes(wav_bytes)
    else:
        audio_path = str(out_base) + ".wav"
        audio_bytes = wav_bytes
    with open(audio_path, "wb") as f:
        f.write(audio_bytes)

    manifest_path = str(out_base) + "_manifest.json"
    # Relative reference: the manifest is written next to the audio file.
    manifest["audioUrl"] = Path(audio_path).name
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

def synth_epub(file_path_or_bytes,
               voice="af_heart",
               speed=1.0,
               per_chapter_format="wav",
               lang_code="a",
               device=None,
               zip_name=None,
               output_dir=None):
    """Synthesize every chapter of an EPUB into one ZIP archive.

    Args:
        file_path_or_bytes: EPUB path (``str``/``Path``) or a binary
            file-like object.
        voice, speed, lang_code: Passed to ``synth_text_to_wav_and_manifest``.
        per_chapter_format: "mp3" (case-insensitive) stores MP3 files; any
            other value stores WAV.
        device: Torch device string; falls back to ``DEVICE`` when falsy.
        zip_name: Archive stem. Defaults to "<epub stem>_chapters" for path
            input, or "book_chapters" for file-like input.
        output_dir: Target directory; falls back to ``OUTPUT_DIR`` when
            falsy. It is created if it does not exist.

    Returns:
        Path of the written ZIP file as a string. For each chapter the
        archive holds "<NN>_<title>.<ext>" and "<NN>_<title>_manifest.json".

    Raises:
        AssertionError: If no chapters are found in the EPUB.
    """
    import json  # function-scope: the file has no module-level json import

    device = device or DEVICE
    output_dir = Path(output_dir) if output_dir else OUTPUT_DIR
    # A caller-supplied directory may not exist yet; without this the final
    # write fails with FileNotFoundError after all chapters were synthesized.
    output_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(file_path_or_bytes, (str, Path)):
        with open(file_path_or_bytes, "rb") as fh:
            epub_bytes = io.BytesIO(fh.read())
        stem = Path(file_path_or_bytes).stem
    else:
        epub_bytes = file_path_or_bytes
        stem = "book"

    chapters = extract_chapters_from_epub(epub_bytes)
    if not chapters:
        # Raised explicitly (not via `assert`) so the check survives
        # `python -O`; the exception type is kept for existing callers.
        raise AssertionError("No chapters detected in EPUB.")

    want_mp3 = per_chapter_format.lower() == "mp3"

    # The archive is assembled in memory and written in one go at the end.
    zip_buf = io.BytesIO()
    with zipfile.ZipFile(zip_buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for idx, (title, body) in enumerate(chapters, 1):
            # The index prefix keeps entry names unique and ordered.
            name = f"{idx:02d}_{safe_name(title)[:40]}"

            chapter_elements = [{
                "text": body,
                "metadata": {
                    "chapter_index": idx,
                    "chapter_title": title,
                    "page_number": 1,
                    "points": None
                }
            }]

            wav_bytes, manifest = synth_text_to_wav_and_manifest(
                chapter_elements,
                voice=voice, speed=speed, lang_code=lang_code, device=device
            )

            if want_mp3:
                audio_name = f"{name}.mp3"
                zf.writestr(audio_name, wav_to_mp3_bytes(wav_bytes))
            else:
                audio_name = f"{name}.wav"
                zf.writestr(audio_name, wav_bytes)

            manifest["audioUrl"] = audio_name
            zf.writestr(f"{name}_manifest.json", json.dumps(manifest, ensure_ascii=False, indent=2))

    zpath = str(output_dir / f"{zip_name or (stem + '_chapters')}.zip")
    with open(zpath, "wb") as f:
        f.write(zip_buf.getvalue())
    return zpath

## Usage Examples

Below are examples for synthesizing text, PDFs, and EPUBs locally.

### A) String → Audio

In [None]:
# Configuration
VOICE = "af_heart"
SPEED = 1.0
FORMAT = "mp3"  # "wav" or "mp3"
LANG = "a"
# Stem of the output files: <BASENAME>.<ext> and <BASENAME>_manifest.json
BASENAME = "kokoro_text"

# Text to synthesize
TEXT = """Paste or type your text here.
It can be multiple paragraphs. Chapters aren't needed for this path.
"""

# Run synthesis (files are written to OUTPUT_DIR, since output_dir is not passed)
audio_path, manifest_path = synth_string(
    TEXT, 
    voice=VOICE, 
    speed=SPEED,
    out_format=FORMAT, 
    lang_code=LANG,
    basename=BASENAME
)

print(f"Audio saved to: {audio_path}")
print(f"Manifest saved to: {manifest_path}")

### B) PDF → Audio (with PyMuPDF extraction)

**PyMuPDF advantages:**
- Direct text extraction (no OCR needed)
- Precise block-level bounding boxes (one per text block)
- Native PDF coordinate space (no scaling issues)
- Fast and lightweight
- Cross-platform support

In [None]:
# Configuration
VOICE = "af_heart"
SPEED = 1.0
FORMAT = "mp3"  # "wav" or "mp3"
LANG = "a"

# Specify the path to your PDF file
PDF_PATH = "/path/to/your/document.pdf"  # Change this to your PDF path

# Run synthesis (PyMuPDF will extract text with precise coordinates)
# With no basename given, the outputs are named after the PDF:
# <pdf stem>_tts.<ext> and <pdf stem>_tts_manifest.json
audio_path, manifest_path = synth_pdf(
    PDF_PATH, 
    voice=VOICE, 
    speed=SPEED,
    out_format=FORMAT, 
    lang_code=LANG
)

print(f"Audio saved to: {audio_path}")
print(f"Manifest saved to: {manifest_path}")

### C) EPUB → ZIP (Per-Chapter Audio + Manifests)

In [None]:
# Configuration
VOICE = "af_heart"
SPEED = 1.0
CHAPTER_FORMAT = "wav"  # "wav" or "mp3"
LANG = "a"
ZIP_NAME = ""  # Optional: custom name for the output ZIP file

# Specify the path to your EPUB file
EPUB_PATH = "/path/to/your/book.epub"  # Change this to your EPUB path

# Run synthesis
# An empty ZIP_NAME is passed as None, so synth_epub falls back to
# "<epub stem>_chapters.zip".
zip_path = synth_epub(
    EPUB_PATH, 
    voice=VOICE, 
    speed=SPEED,
    per_chapter_format=CHAPTER_FORMAT,
    lang_code=LANG,
    zip_name=(ZIP_NAME or None)
)

print(f"ZIP archive saved to: {zip_path}")

## Notes

### PyMuPDF Benefits:
- **Direct Text Extraction**: No OCR overhead - extracts text directly from PDF structure
- **Precise Coordinates**: Block-level bounding boxes (one per text block) in native PDF coordinate space
- **No Scaling Issues**: Uses absolute PDF coordinates - no normalization problems
- **Fast**: Built on the MuPDF C library, much faster than image-based OCR approaches
- **Lightweight**: ~15MB vs detectron2's ~500MB
- **Cross-Platform**: Works on Linux, macOS, Windows
- **HTML Compatible**: Outputs coordinates in the same format as unstructured.io

### Coordinate Format:
Each text element includes coordinates compatible with the HTML player:
```json
{
  "metadata": {
    "page_number": 1,
    "points": [
      [x0, y0],  // top-left
      [x1, y0],  // top-right
      [x1, y1],  // bottom-right
      [x0, y1]   // bottom-left
    ]
  }
}
```
- Absolute coordinates in PDF points
- HTML player automatically infers page dimensions
- Compatible with existing highlighting system

### Apple Silicon (M1/M2/M3) Performance:
- **MPS Backend**: Uses Apple's Metal Performance Shaders for GPU acceleration
- **CPU Fallback**: Some operations (like `torch.angle` in STFT) aren't yet implemented on MPS and will automatically fall back to CPU
- **Overall Performance**: Still faster than pure CPU mode due to GPU acceleration for supported operations
- To force CPU-only mode, set `DEVICE_MODE = "cpu"` in the Configuration cell

### System Requirements:
- Works on all platforms (Linux, macOS, Windows)
- Automatic device detection (CUDA, MPS for Apple Silicon, or CPU)

### Output:
- **Output Directory**: By default, all outputs are saved to `~/kokoro_outputs/`
- **Device Selection**: Auto-detects best available device
- **Manifest Format**: JSON files listing each sentence with its start/end time and the bounding box of the text block it came from

### Comparison with Other Approaches:
- **PyMuPDF** (this notebook): Best for PDFs with text layer, fastest, most accurate coordinates
- **Unstructured.io** (TTS_Kokoro_Local.ipynb): Best for complex layout analysis, slower
- **Vision Framework** (TTS_Kokoro_Vision.ipynb): OCR-based, macOS only, for scanned PDFs
- **Nougat** (TTS_Nougat.ipynb): Best for scientific papers with equations