# TTS with Nougat OCR
Uses Nougat (Neural Optical Understanding for Academic Documents) for PDF text extraction with Kokoro TTS.

**Best for:** Scientific papers, academic documents, and PDFs with complex layouts or mathematical notation.

## 0) Environment Setup (Optional)

**This step helps you manage Python packages and avoid conflicts with your system installation.**

- If you have **conda** installed, you can create a fresh environment for this notebook
- Or use an existing environment by providing its name
- At the end of the notebook, you can easily clean up and delete the environment to free storage

In [None]:
import subprocess
import sys
import os

# State shared with the cleanup cell at the end of the notebook:
#   environment_created_by_notebook -> True only if THIS cell created the env
#   environment_name                -> name of the env created/selected here
environment_created_by_notebook = False
environment_name = None

# Check if conda is installed by asking it for its version.
try:
    result = subprocess.run(['conda', '--version'], capture_output=True, text=True, check=True)
    conda_available = True
    print(f"✓ Conda detected: {result.stdout.strip()}")
except (subprocess.CalledProcessError, FileNotFoundError):
    # FileNotFoundError: no `conda` executable on PATH.
    conda_available = False
    print("✗ Conda not found - skipping environment management")
    print("Packages will be installed in your current Python environment")

if conda_available:
    print("\n" + "="*60)
    print("ENVIRONMENT SETUP OPTIONS")
    print("="*60)

    choice = input("\nDo you want to:\n  [1] Create a NEW conda environment (recommended)\n  [2] Use an EXISTING environment\n  [3] Skip and use current environment\n\nEnter choice (1/2/3): ").strip()

    if choice == "1":
        # Create new environment
        env_name = input("\nEnter name for new environment (default: kokoro_nougat): ").strip()
        if not env_name:
            env_name = "kokoro_nougat"

        print(f"\n→ Creating conda environment: {env_name}")
        print("  This may take a few minutes...")

        try:
            # Create environment with Python 3.10. ipykernel is included so the
            # environment is able to host a Jupyter kernel at all; text=True so
            # conda's stderr can be shown as a string on failure.
            subprocess.run(['conda', 'create', '-n', env_name, 'python=3.10', 'ipykernel', '-y'],
                           check=True, capture_output=True, text=True)

            environment_created_by_notebook = True
            environment_name = env_name

            print(f"✓ Environment '{env_name}' created successfully!")
            print(f"\n{'='*60}")
            print("IMPORTANT: Restart your Jupyter kernel and select the new environment:")
            print(f"  Kernel → Change Kernel → {env_name}")
            print("If it is not listed, register it first with:")
            print(f"  conda run -n {env_name} python -m ipykernel install --user --name {env_name}")
            print(f"{'='*60}\n")

        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to create environment: {e}")
            # The exception text only carries the exit status; conda's own
            # explanation is in the captured stderr.
            if e.stderr:
                print(e.stderr.strip())
            print("Continuing with current environment...")

    elif choice == "2":
        # Use existing environment (only recorded; nothing is created or checked)
        env_name = input("\nEnter name of existing environment: ").strip()
        if env_name:
            environment_name = env_name
            print(f"\n✓ Using existing environment: {env_name}")
            print(f"\n{'='*60}")
            print("IMPORTANT: Make sure your kernel is using this environment:")
            print(f"  Kernel → Change Kernel → {env_name}")
            print(f"{'='*60}\n")
        else:
            print("✗ No environment name provided - using current environment")

    else:
        # Choice "3" and any unrecognised input both land here.
        print("\n✓ Using current environment")

print("\nYou can now proceed with the rest of the notebook.")

## 1) Installation

In [None]:
# Core TTS + I/O deps
!pip -q install "kokoro>=0.9.4" soundfile pydub

# Nougat OCR dependencies
!pip -q install "transformers[torch]" "nougat-ocr"
!apt-get -yqq install poppler-utils
!pip -q install pdf2image

# MP3 encoder (pydub uses ffmpeg)
!apt-get -yqq install ffmpeg

# Silence overly chatty logs
import logging
logging.getLogger("phonemizer").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)

## 2) Config & Device Selection

In [None]:
# --- Output destination ---
# SAVE_TO_DRIVE = False -> results are offered as a browser download
# SAVE_TO_DRIVE = True  -> results are stored under DRIVE_DIR in Google Drive
SAVE_TO_DRIVE = False
DRIVE_DIR = "/content/drive/MyDrive/TTS/nougat_outputs"

if SAVE_TO_DRIVE:
    # Drive is only mounted (and the target folder created) when requested.
    import os
    from google.colab import drive

    drive.mount('/content/drive')
    os.makedirs(DRIVE_DIR, exist_ok=True)

import os
from pathlib import Path
from google.colab import files

def deliver(path: str):
    """Hand a generated file to the user.

    When ``SAVE_TO_DRIVE`` is true the file is moved into ``DRIVE_DIR``
    (keeping its file name); otherwise a browser download is triggered via
    ``google.colab.files.download``.

    Args:
        path: Path of the local file to deliver.
    """
    if SAVE_TO_DRIVE:
        # Local import keeps this cell self-contained.
        import shutil

        dest = f"{DRIVE_DIR}/{Path(path).name}"
        # os.replace cannot cross filesystem boundaries, and a mounted Drive
        # folder is normally a different filesystem from the local disk.
        # shutil.move renames when it can and falls back to copy + delete.
        shutil.move(path, dest)
        print("Saved to Drive:", dest)
    else:
        files.download(path)

# --- Device selection ---
DEVICE_MODE = "auto"  # "auto", "cuda", or "cpu"

import torch


def _pick_device():
    """Translate DEVICE_MODE into the device string used by the notebook.

    An explicit "cuda" or "cpu" is returned as-is (no availability check);
    any other value means "use CUDA when torch reports it, else CPU".
    """
    if DEVICE_MODE in ("cuda", "cpu"):
        return DEVICE_MODE
    return "cuda" if torch.cuda.is_available() else "cpu"


DEVICE = _pick_device()
print("Using device:", DEVICE)

## 3) Nougat PDF Extraction + TTS Helpers

In [None]:
import numpy as np
import soundfile as sf
import re, io, torch
from pathlib import Path
from typing import List, Tuple, Dict
from functools import lru_cache

from kokoro import KPipeline
from pydub import AudioSegment
from transformers import pipeline as hf_pipeline
from pdf2image import convert_from_bytes

# Sentence split pattern.
# Matches either sentence-ending punctuation (. ? !) followed by whitespace,
# or a run of two or more newlines. The whole pattern is one capturing group,
# so re.split() also returns the matched delimiters between the text pieces.
SPLIT_PATTERN_CAP = r"([.?!]\s+|[\n]{2,})"

# --- Nougat Pipeline Cache ---
@lru_cache(maxsize=1)
def get_nougat_pipeline(device=DEVICE):
    """Build the Hugging Face image-to-text pipeline for facebook/nougat-base.

    The result is memoised by ``lru_cache`` (one entry), so calling again
    with the same argument reuses the already loaded pipeline.

    Args:
        device: Requested device string. ``device=0`` is passed to the
            pipeline only when this is "cuda" AND torch reports CUDA as
            available; in every other case the pipeline is created without
            a device argument.
    """
    print(f"Loading Nougat model to {device}... (this may take a moment on first run)")
    use_gpu = device == "cuda" and torch.cuda.is_available()
    placement = {"device": 0} if use_gpu else {}
    return hf_pipeline("image-to-text", model="facebook/nougat-base", **placement)

def extract_text_from_pdf_nougat(file_like: io.BytesIO, device=DEVICE) -> List[Dict]:
    """
    Run every page of a PDF through the Nougat pipeline.

    Args:
        file_like: Readable binary stream holding the PDF; it is consumed
            with a single ``read()``.
        device: Device string forwarded to ``get_nougat_pipeline``.

    Returns:
        One dict per page, ``{"text": ..., "metadata": {"page_number": n}}``
        with 1-based page numbers. If the PDF cannot be rasterised, a single
        placeholder element is returned; a page that fails during OCR gets a
        placeholder text instead of aborting the whole document.
    """
    ocr = get_nougat_pipeline(device=device)
    print("Nougat: Converting PDF to images...")

    raw_pdf = file_like.read()
    try:
        pages = convert_from_bytes(raw_pdf)
    except Exception as e:
        print(f"Nougat: pdf2image conversion failed: {e}. Cannot process.")
        return [{"text": "Error: Could not convert PDF.", "metadata": {"page_number": 1}}]

    print(f"Nougat: Processing {len(pages)} pages with model...")

    elements = []
    for page_no, page_img in enumerate(pages, start=1):
        try:
            page_text = ocr(page_img)[0]['generated_text']
        except Exception as e:
            # Best effort: note the failure and keep going with the next page.
            print(f"Nougat: Error on page {page_no}: {e}")
            page_text = f"\n[Error processing page {page_no}]\n"
        elements.append({"text": page_text, "metadata": {"page_number": page_no}})

    print("Nougat: Processing complete.")
    return elements

# --- TTS Pipeline Cache ---
@lru_cache(maxsize=4)
def get_tts_pipeline(lang_code='a', device=DEVICE):
    """Return a Kokoro ``KPipeline`` for the given language code and device.

    Memoised with ``lru_cache`` (up to four argument combinations), so the
    same combination is only constructed once.
    """
    tts = KPipeline(lang_code=lang_code, device=device)
    return tts

def _synthesize_sentence(pipe: KPipeline, sentence: str, voice='af_heart', speed=1.0) -> np.ndarray:
    """Render one sentence with the given pipeline and return its samples.

    The pipeline is called with ``split_pattern=None`` and may yield several
    pieces; only the third item of each yielded triple (the audio) is kept
    and the pieces are joined along axis 0. If nothing is yielded, an empty
    float32 array is returned.
    """
    pieces = [
        audio
        for _, _, audio in pipe(sentence, voice=voice, speed=speed, split_pattern=None)
    ]
    if pieces:
        return np.concatenate(pieces, axis=0)
    return np.zeros((0,), dtype=np.float32)

def split_sentences(text: str) -> List[str]:
    """Split text into sentences, keeping the closing punctuation attached.

    The text is split with ``SPLIT_PATTERN_CAP``. Because that pattern is a
    capturing group, ``re.split`` returns text pieces and delimiters
    alternately; each text piece is re-joined with the punctuation of the
    delimiter that followed it, with no space in between ("Hello." rather
    than "Hello ."). Whitespace-only delimiters (paragraph breaks) contribute
    nothing.

    Args:
        text: Input text; ``None`` or an empty string yields an empty list.

    Returns:
        Non-empty, stripped sentences in their original order.
    """
    if not text:
        return []

    parts = re.split(SPLIT_PATTERN_CAP, text)
    bodies = parts[0::2]
    # The last text piece has no delimiter after it; pad so the pairs line up.
    delimiters = parts[1::2] + [""]

    sentences = []
    for body, delimiter in zip(bodies, delimiters):
        body = (body or "").strip()
        if not body:
            continue
        # Stripping leaves just the punctuation mark, or "" for pure whitespace.
        sentences.append(body + (delimiter or "").strip())
    return sentences

def synth_elements_to_wav_and_manifest(
    elements: List[Dict],
    voice='af_heart',
    speed=1.0,
    lang_code='a',
    device=DEVICE) -> Tuple[bytes, Dict]:
    """
    Turn extracted text elements into one WAV file plus a sentence timeline.

    Each element's "text" is split into sentences; every sentence is
    synthesised separately so its start/end time (in seconds, rounded to
    milliseconds) can be recorded together with the element's "metadata".

    Returns:
        ``(wav_bytes, manifest)`` where manifest is
        ``{"audioUrl": "", "sentences": [...]}``. If no audio was produced,
        the WAV contains a tenth of a second of silence.
    """
    sample_rate = 24000
    tts = get_tts_pipeline(lang_code=lang_code, device=device)

    chunks = []
    sentence_rows = []
    cursor = 0.0  # running position in the output audio, in seconds

    print(f"Synthesizing {len(elements)} text elements...")

    for element in elements:
        text = element.get("text", "")
        location = element.get("metadata", {})

        for sentence in split_sentences(text):
            if not sentence:
                continue
            samples = _synthesize_sentence(tts, sentence, voice=voice, speed=speed)
            seconds = samples.shape[0] / sample_rate
            sentence_rows.append({
                # one row per synthesised sentence, so the row count is the index
                "i": len(sentence_rows),
                "start": round(cursor, 3),
                "end": round(cursor + seconds, 3),
                "text": sentence.strip(),
                "location": location
            })
            chunks.append(samples)
            cursor += seconds

    if chunks:
        waveform = np.concatenate(chunks, axis=0)
    else:
        waveform = np.zeros((sample_rate // 10,), dtype=np.float32)

    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, waveform, sample_rate, format='WAV')
    return wav_buffer.getvalue(), {"audioUrl": "", "sentences": sentence_rows}

def wav_to_mp3_bytes(wav_bytes: bytes, bitrate="128k") -> bytes:
    """Re-encode in-memory WAV data as MP3 via pydub and return the MP3 bytes."""
    segment = AudioSegment.from_file(io.BytesIO(wav_bytes), format="wav")
    mp3_buffer = io.BytesIO()
    segment.export(mp3_buffer, format="mp3", bitrate=bitrate)
    return mp3_buffer.getvalue()

## 4) High-Level Synthesis Function

In [None]:
import json

def synth_pdf_nougat(
    file_path_or_bytes,
    voice="af_heart",
    speed=1.0,
    out_format="wav",
    lang_code="a",
    device=None,
    basename=None,
    out_dir="/content"):
    """
    Extract text from PDF using Nougat OCR and synthesize to audio.

    Args:
        file_path_or_bytes: Path to a PDF file (str or Path), the raw PDF
            content as bytes/bytearray, or a readable binary stream such as
            io.BytesIO.
        voice: Kokoro voice to use (default: 'af_heart')
        speed: Speech speed multiplier (default: 1.0)
        out_format: 'mp3' (case-insensitive) writes MP3; any other value
            writes WAV (default: 'wav')
        lang_code: Kokoro language code (default: 'a', American English)
        device: Device to use (default: None, uses global DEVICE)
        basename: Output filename base (default: the input file's stem, or
            'document' when the input is not a path)
        out_dir: Directory the output files are written to; created if
            missing (default: '/content', the Colab working directory)

    Returns:
        Tuple of (audio_path, manifest_path), both as strings.
    """
    device = device or DEVICE

    if isinstance(file_path_or_bytes, (str, Path)):
        with open(file_path_or_bytes, "rb") as fh:
            pdf_stream = io.BytesIO(fh.read())
        stem = Path(file_path_or_bytes).stem
    elif isinstance(file_path_or_bytes, (bytes, bytearray)):
        # Raw PDF content: wrap it so the extractor can call .read() on it.
        pdf_stream = io.BytesIO(bytes(file_path_or_bytes))
        stem = "document"
    else:
        pdf_stream = file_path_or_bytes
        stem = "document"

    # Extract text using Nougat
    elements = extract_text_from_pdf_nougat(pdf_stream, device=device)

    # Synthesize to audio
    wav_bytes, manifest = synth_elements_to_wav_and_manifest(
        elements,
        voice=voice,
        speed=speed,
        lang_code=lang_code,
        device=device
    )

    target_dir = Path(out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    # Built by string concatenation (not with_suffix) so dots inside the
    # base name are left untouched.
    out_base = str(target_dir / f"{(basename or stem)}_nougat_tts")

    if out_format.lower() == "mp3":
        audio_path = out_base + ".mp3"
        audio_bytes = wav_to_mp3_bytes(wav_bytes)
    else:
        audio_path = out_base + ".wav"
        audio_bytes = wav_bytes
    with open(audio_path, "wb") as f:
        f.write(audio_bytes)

    manifest_path = out_base + "_manifest.json"
    # The manifest refers to the audio by file name only, so the two files
    # can be moved together.
    manifest["audioUrl"] = Path(audio_path).name
    with open(manifest_path, "w", encoding="utf-8") as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    return audio_path, manifest_path

## 5) Quick-Call: PDF → Audio (Nougat OCR)

In [None]:
# @title PDF → Audio (Nougat OCR)
# @markdown ### Options (edit here)
VOICE = "af_heart"  # @param {type:"string"}
SPEED = 1.0          # @param {type:"number"}
FORMAT = "mp3"       # @param ["wav", "mp3"]
LANG = "a"           # @param {type:"string"}
DEVICE_OVERRIDE = "None"  # @param ["None", "cuda", "cpu"]

# @markdown **Upload a PDF when prompted.**
from google.colab import files
print("Upload a PDF (academic papers work best with Nougat)...")
_uploaded = files.upload()
if not _uploaded:
    # Without this guard a cancelled upload surfaces as a bare StopIteration.
    raise RuntimeError("No file was uploaded - re-run this cell and choose a PDF.")
# The first key of the upload result is used as the path of the PDF to open.
_pdf_key = next(iter(_uploaded))
if len(_uploaded) > 1:
    print(f"Multiple files uploaded - only the first one is processed: {_pdf_key}")

# ---- Run (no edits needed below) ----
# The form offers the string "None"; translate it to a real None so the
# notebook-wide DEVICE is used.
_dev = None if DEVICE_OVERRIDE == "None" else DEVICE_OVERRIDE
audio_path, manifest_path = synth_pdf_nougat(
    _pdf_key,
    voice=VOICE,
    speed=SPEED,
    out_format=FORMAT,
    lang_code=LANG,
    device=_dev
)
deliver(audio_path)
deliver(manifest_path)
print("Done:", audio_path, manifest_path)

## Notes

**When to use Nougat:**
- Academic papers with equations and special notation
- Documents with complex layouts
- PDFs where standard text extraction fails

**Limitations:**
- Slower than standard extraction (processes each page as an image)
- Requires more GPU memory
- May hallucinate or misread some text

**Performance:**
- First run downloads the model (~1.5GB)
- Processing time: ~5-15 seconds per page (GPU)
- Best results with clear, high-quality scans

## Cleanup: Delete Environment (Optional)

**If you created a new environment at the beginning of this notebook**, you can delete it here to free up storage space.

⚠️ **Warning**: This will permanently delete the environment and all installed packages!

In [None]:
import subprocess

# Check if we created an environment in this notebook.
# NOTE(review): these variables live in the kernel's memory, so they are gone
# after a kernel restart - confirm this matches the intended workflow.
if 'environment_created_by_notebook' not in globals():
    print("✗ No environment tracking found")
    print("This cell only works if you ran the environment setup cell at the beginning")
elif not environment_created_by_notebook:
    print("✗ No environment was created by this notebook")
    print("You can only delete environments that were created in this session")
else:
    print(f"Environment '{environment_name}' was created by this notebook")
    print(f"\n{'='*60}")
    print("DELETE ENVIRONMENT")
    print(f"{'='*60}")

    confirm = input(f"\nAre you sure you want to DELETE '{environment_name}'?\nType 'yes' to confirm: ").strip().lower()

    if confirm == 'yes':
        print(f"\n→ Deleting environment '{environment_name}'...")
        print("  This may take a moment...")

        try:
            # text=True so conda's stderr can be shown as a string on failure.
            subprocess.run(['conda', 'env', 'remove', '-n', environment_name, '-y'],
                           check=True, capture_output=True, text=True)
        except FileNotFoundError:
            # No `conda` executable on PATH.
            print("✗ Failed to delete environment: conda executable not found")
            print(f"You may need to delete it manually with: conda env remove -n {environment_name}")
        except subprocess.CalledProcessError as e:
            print(f"✗ Failed to delete environment: {e}")
            # The exception text only carries the exit status; conda's own
            # explanation is in the captured stderr.
            if e.stderr:
                print(e.stderr.strip())
            print(f"You may need to delete it manually with: conda env remove -n {environment_name}")
        else:
            print(f"✓ Environment '{environment_name}' deleted successfully!")
            print("  Storage space has been freed.")

            # Reset the flag so re-running this cell does not try again.
            environment_created_by_notebook = False
            environment_name = None
    else:
        print("\n✗ Deletion cancelled - environment preserved")