# Notebook 1: PDF Ingestion

## Purpose

This notebook extracts text, images, and metadata from O'Reilly AI books in PDF format.

## Process

1. Load all PDFs from the Books_pdf directory
2. Extract text content page by page
3. Extract images and diagrams
4. Extract metadata (book title, chapter, page numbers)
5. Save extracted data in structured format

## Output

- Extracted text with metadata (JSON format) saved to Books_pdf/data/extracted/
- Extracted images saved to Books_pdf/data/images/
- Summary statistics of extraction process


In [1]:
# Import required libraries
import os
import json
import fitz  # PyMuPDF
import pdfplumber
from pathlib import Path
from PIL import Image
from tqdm import tqdm
import re
from typing import List, Dict, Any
import hashlib

In [2]:
# Display the current working directory of the notebook process
os.getcwd()

'c:\\Users\\jagth\\Downloads\\New folder\\ai-books-rag-chatbot\\notebooks'

In [None]:
# Build a Path object for the Books_pdf folder and display it as the cell result
test =Path(r"c:\\Users\\jagth\\Downloads\\New folder\\ai-books-rag-chatbot\\Books_pdf")
test
            

WindowsPath('c:/Users/jagth/Downloads/New folder/ai-books-rag-chatbot/Books_pdf')

In [13]:
# Configuration: input folder for the PDFs and the two output folders
BASE_DIR = Path(r"c:\\Users\\jagth\\Downloads\\New folder\\ai-books-rag-chatbot\\Books_pdf")
PDF_DIR = BASE_DIR  # PDFs are read directly from the base folder
OUTPUT_DIR = BASE_DIR.joinpath("data", "extracted")
IMAGE_DIR = BASE_DIR.joinpath("data", "images")

# Make sure both output folders exist (no error if they are already there)
IMAGE_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations
print(f"PDF Directory: {PDF_DIR}")
print(f"Output Directory: {OUTPUT_DIR}")
print(f"Image Directory: {IMAGE_DIR}")

PDF Directory: c:\Users\jagth\Downloads\New folder\ai-books-rag-chatbot\Books_pdf
Output Directory: c:\Users\jagth\Downloads\New folder\ai-books-rag-chatbot\Books_pdf\data\extracted
Image Directory: c:\Users\jagth\Downloads\New folder\ai-books-rag-chatbot\Books_pdf\data\images


In [14]:
# Helper function: Extract book title from filename
def extract_book_title(filename: str) -> str:
    """
    Extract a clean book title from the PDF filename.

    Strips a trailing ``.pdf`` extension (any letter case), every
    parenthesised group that mentions ``Z-Library``, and one remaining
    trailing parenthesised group, then collapses runs of whitespace.

    Args:
        filename: File name of the PDF, e.g. "Some Book (Z-Library).pdf"

    Returns:
        The cleaned title, single-spaced and without surrounding whitespace
    """
    # Strip the extension only where it ends the name, so a ".pdf" that
    # appears inside the title is preserved and ".PDF" is handled too
    title = re.sub(r'\.pdf$', '', filename, flags=re.IGNORECASE)

    # Remove common suffixes
    title = re.sub(r'\s*\([^)]*Z-Library[^)]*\)', '', title)
    title = re.sub(r'\s*\([^)]*\)\s*$', '', title)

    # Collapse internal whitespace; split() also drops leading/trailing blanks
    return ' '.join(title.split())

In [15]:
# Helper function: Detect chapter from page content
def detect_chapter(text: str, prev_chapter: str = "Introduction") -> str:
    """
    Guess the chapter heading for a page from the start of its text.

    Only the first 500 characters are examined. A "Chapter N" / "CHAPTER N"
    heading (arabic digits or I/V/X numerals) is tried first, then a
    numbered section line such as "1.2 Some Title". When neither is found
    the caller-supplied previous chapter is returned unchanged.

    Args:
        text: Full text of the page
        prev_chapter: Value to fall back on when nothing is detected

    Returns:
        "Chapter N: Title", "Chapter N", the matched section line, or prev_chapter
    """
    head = text[:500]

    # Chapter heading, optionally followed by ':' or '-' and a title
    heading = re.search(r'(?:Chapter|CHAPTER)\s+(\d+|[IVX]+)\s*[:\-]?\s*([^\n]{0,100})', head)
    if heading is not None:
        number, raw_title = heading.groups()
        title = raw_title.strip()
        if title:
            return f"Chapter {number}: {title}"
        return f"Chapter {number}"

    # Numbered section occupying a whole line, starting with a capital letter
    numbered = re.search(r'^(\d+\.\d+|\d+)\s+([A-Z][^\n]{10,80})$', head, re.MULTILINE)
    if numbered is None:
        return prev_chapter
    return numbered.group(0).strip()

In [16]:
# Helper function: Generate unique image ID
def generate_image_id(book_title: str, page_num: int, img_index: int) -> str:
    """
    Build an identifier for one image of one book.

    The identifier is the first 8 hex characters of the MD5 digest of the
    book title, followed by the page number and the image index:
    "<hash8>_p<page_num>_img<img_index>".
    """
    title_bytes = book_title.encode()
    prefix = hashlib.md5(title_bytes).hexdigest()[:8]
    return f"{prefix}_p{page_num}_img{img_index}"

In [17]:
# Main function: Extract text and metadata from a single PDF
def extract_pdf_content(pdf_path: Path, book_title: str) -> Dict[str, Any]:
    """
    Read a PDF page by page and collect its text with per-page metadata.

    Pages whose stripped text is shorter than 50 characters are skipped.
    The chapter label starts as "Introduction" and is replaced whenever
    detect_chapter() returns a different value containing "Chapter".

    Args:
        pdf_path: Path to the PDF file
        book_title: Title of the book

    Returns:
        Dictionary with "book_title", "total_pages" (the number of pages
        that were kept, not the page count of the PDF) and "pages", a list
        of per-page dictionaries
    """
    kept_pages = []
    chapter = "Introduction"

    # pdfplumber handles the text extraction
    with pdfplumber.open(pdf_path) as pdf:
        print(f"\nProcessing: {book_title}")
        print(f"Total pages: {len(pdf.pages)}")

        progress = tqdm(pdf.pages, desc="Extracting text")
        for page_num, page in enumerate(progress, start=1):
            # extract_text() may yield nothing; normalise that to ""
            text = page.extract_text() or ""

            # Ignore pages with (almost) no text
            if len(text.strip()) < 50:
                continue

            # Only switch chapters on an actual "Chapter ..." detection
            candidate = detect_chapter(text, chapter)
            if "Chapter" in candidate and candidate != chapter:
                chapter = candidate

            kept_pages.append({
                "book_title": book_title,
                "chapter": chapter,
                "page_number": page_num,
                "text": text,
                "char_count": len(text),
                "word_count": len(text.split())
            })

    return {
        "book_title": book_title,
        "total_pages": len(kept_pages),
        "pages": kept_pages
    }

In [18]:
# Main function: Extract images from a single PDF
def extract_pdf_images(pdf_path: Path, book_title: str) -> List[Dict[str, Any]]:
    """
    Extract the images referenced by each page of a PDF and save them to IMAGE_DIR.

    Every image listed by a page is written to IMAGE_DIR under a name built
    from generate_image_id() plus the extension reported by PyMuPDF.

    Args:
        pdf_path: Path to the PDF file
        book_title: Title of the book

    Returns:
        List of dictionaries containing image metadata (image_id, book_title,
        1-based page_number, image_index, filename, path, format)
    """
    images_data = []

    # Open PDF with PyMuPDF for image extraction. The context manager makes
    # sure the document is closed even if extracting or writing an image raises.
    with fitz.open(pdf_path) as pdf_document:
        print(f"\nExtracting images from: {book_title}")

        # Iterate through each page (page numbers are reported 1-based)
        pages = tqdm(pdf_document, desc="Extracting images")
        for page_num, page in enumerate(pages, start=1):
            # Extract each image from the page
            for img_index, img in enumerate(page.get_images(full=True)):
                # First entry of the image record is the xref passed to extract_image
                xref = img[0]

                # Get image data
                base_image = pdf_document.extract_image(xref)
                image_ext = base_image["ext"]

                # Generate unique image ID and filename
                image_id = generate_image_id(book_title, page_num, img_index)
                image_filename = f"{image_id}.{image_ext}"
                image_path = IMAGE_DIR / image_filename

                # Save image to disk
                image_path.write_bytes(base_image["image"])

                # Store image metadata
                images_data.append({
                    "image_id": image_id,
                    "book_title": book_title,
                    "page_number": page_num,
                    "image_index": img_index,
                    "filename": image_filename,
                    "path": str(image_path),
                    "format": image_ext
                })

    return images_data

In [19]:
# Process all PDFs in the Books_pdf directory
all_books_data = []
all_images_data = []

# Get list of PDF files. glob() does not guarantee any ordering, so sort
# the result to process (and report) the books in a reproducible order.
pdf_files = sorted(PDF_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDF files to process\n")

# Process each PDF file
for pdf_file in pdf_files:
    # Extract book title from filename
    book_title = extract_book_title(pdf_file.name)

    # Extract text content; without text there is nothing to keep for this book
    try:
        book_data = extract_pdf_content(pdf_file, book_title)
    except Exception as e:
        print(f"✗ Error processing {book_title}: {e}")
        continue
    all_books_data.append(book_data)

    # Extract images separately: a failure here is reported as an image
    # failure and the already-extracted text of the book is kept
    try:
        images_data = extract_pdf_images(pdf_file, book_title)
    except Exception as e:
        print(f"✗ Error extracting images from {book_title}: {e}")
        images_data = []
    all_images_data.extend(images_data)

    print(f"✓ Completed: {book_title}")
    print(f"  - Pages extracted: {book_data['total_pages']}")
    print(f"  - Images extracted: {len(images_data)}")

print(f"\n{'='*60}")
print("Extraction complete!")
print(f"Total books processed: {len(all_books_data)}")
print(f"Total images extracted: {len(all_images_data)}")
print(f"{'='*60}")

Found 6 PDF files to process


Processing: AI Engineering
Total pages: 535


Extracting text: 100%|██████████| 535/535 [00:35<00:00, 15.26it/s]



Extracting images from: AI Engineering


Extracting images: 100%|██████████| 535/535 [00:06<00:00, 76.72it/s] 


✓ Completed: AI Engineering
  - Pages extracted: 529
  - Images extracted: 224

Processing: Applied-Machine-Learning-and-AI-for-Engineers
Total pages: 666


Extracting text: 100%|██████████| 666/666 [00:35<00:00, 19.01it/s]



Extracting images from: Applied-Machine-Learning-and-AI-for-Engineers


Extracting images: 100%|██████████| 666/666 [00:00<00:00, 3443.51it/s]


✓ Completed: Applied-Machine-Learning-and-AI-for-Engineers
  - Pages extracted: 661
  - Images extracted: 227

Processing: Hands-On Large Language Models Language Understanding and Generation
Total pages: 428


Extracting text: 100%|██████████| 428/428 [00:21<00:00, 19.64it/s]



Extracting images from: Hands-On Large Language Models Language Understanding and Generation


Extracting images: 100%|██████████| 428/428 [00:07<00:00, 60.35it/s]


✓ Completed: Hands-On Large Language Models Language Understanding and Generation
  - Pages extracted: 413
  - Images extracted: 329

Processing: Hands-On Machine Learning with Scikit-Learn and PyTorch
Total pages: 608


Extracting text: 100%|██████████| 608/608 [01:01<00:00,  9.91it/s]



Extracting images from: Hands-On Machine Learning with Scikit-Learn and PyTorch


Extracting images: 100%|██████████| 608/608 [00:10<00:00, 60.18it/s] 


✓ Completed: Hands-On Machine Learning with Scikit-Learn and PyTorch
  - Pages extracted: 532
  - Images extracted: 168

Processing: LLM Engineers Handbook
Total pages: 523


Extracting text: 100%|██████████| 523/523 [00:34<00:00, 14.95it/s]



Extracting images from: LLM Engineers Handbook


Extracting images: 100%|██████████| 523/523 [00:01<00:00, 351.83it/s] 


✓ Completed: LLM Engineers Handbook
  - Pages extracted: 513
  - Images extracted: 116

Processing: NLP with Transformer models
Total pages: 409


Extracting text: 100%|██████████| 409/409 [00:35<00:00, 11.38it/s]



Extracting images from: NLP with Transformer models


Extracting images: 100%|██████████| 409/409 [00:07<00:00, 55.41it/s]

✓ Completed: NLP with Transformer models
  - Pages extracted: 403
  - Images extracted: 272

Extraction complete!
Total books processed: 6
Total images extracted: 1336





In [20]:
# Write the extraction results to OUTPUT_DIR as JSON
print("\nSaving extracted data...")

# One file per book, named after the title reduced to letters, digits and underscores
for book_data in all_books_data:
    alnum_title = re.sub(r'[^a-zA-Z0-9\s]', '', book_data['book_title'])
    book_title_safe = '_'.join(alnum_title.split())

    output_file = OUTPUT_DIR / f"{book_title_safe}.json"
    output_file.write_text(
        json.dumps(book_data, indent=2, ensure_ascii=False),
        encoding='utf-8',
    )

    print(f"✓ Saved: {output_file.name}")

# All books together in a single file
combined_file = OUTPUT_DIR / "all_books_combined.json"
combined_file.write_text(
    json.dumps(all_books_data, indent=2, ensure_ascii=False),
    encoding='utf-8',
)
print(f"✓ Saved: {combined_file.name}")

# Metadata of every extracted image
images_file = OUTPUT_DIR / "images_metadata.json"
images_file.write_text(
    json.dumps(all_images_data, indent=2, ensure_ascii=False),
    encoding='utf-8',
)
print(f"✓ Saved: {images_file.name}")

print("\nAll data saved successfully!")


Saving extracted data...
✓ Saved: AI_Engineering.json
✓ Saved: AppliedMachineLearningandAIforEngineers.json
✓ Saved: HandsOn_Large_Language_Models_Language_Understanding_and_Generation.json
✓ Saved: HandsOn_Machine_Learning_with_ScikitLearn_and_PyTorch.json
✓ Saved: LLM_Engineers_Handbook.json
✓ Saved: NLP_with_Transformer_models.json
✓ Saved: all_books_combined.json
✓ Saved: images_metadata.json

All data saved successfully!


In [21]:
# Summarise the extraction per book and print it as a table
import pandas as pd

# One row per book: retained pages, character/word totals, image count
stats = []
for book_data in all_books_data:
    title = book_data['book_title']
    pages = book_data['pages']

    char_total = sum(entry['char_count'] for entry in pages)
    word_total = sum(entry['word_count'] for entry in pages)
    image_total = sum(1 for img in all_images_data if img['book_title'] == title)

    row = {
        'Book Title': title,
        'Pages': book_data['total_pages'],
        # Totals are stored as strings with thousands separators
        'Total Characters': f"{char_total:,}",
        'Total Words': f"{word_total:,}",
        'Images': image_total
    }
    stats.append(row)

# Render the rows as a DataFrame between two rules
df_stats = pd.DataFrame(stats)
rule = "="*80
print("\n" + rule)
print("EXTRACTION SUMMARY")
print(rule)
print(df_stats.to_string(index=False))
print(rule)


EXTRACTION SUMMARY
                                                          Book Title  Pages Total Characters Total Words  Images
                                                      AI Engineering    529        1,080,290     173,374     224
                       Applied-Machine-Learning-and-AI-for-Engineers    661          730,604     108,504     227
Hands-On Large Language Models Language Understanding and Generation    413          623,488      95,614     329
             Hands-On Machine Learning with Scikit-Learn and PyTorch    532          825,706     131,905     168
                                              LLM Engineers Handbook    513          869,917     128,613     116
                                         NLP with Transformer models    403          747,135     114,067     272


In [11]:
# Sample: Display first page of first book
# Guard against both "no books" and "first book has no retained pages"
# (every page of a PDF can be skipped during text extraction).
if all_books_data and all_books_data[0]['pages']:
    first_book = all_books_data[0]
    first_page = first_book['pages'][0]
    page_text = first_page['text']

    print("\n" + "="*80)
    print("SAMPLE: First Page of First Book")
    print("="*80)
    print(f"Book: {first_page['book_title']}")
    print(f"Chapter: {first_page['chapter']}")
    print(f"Page: {first_page['page_number']}")
    print("\nText Preview (first 500 characters):")
    print("-" * 80)
    print(page_text[:500])
    # Only show the ellipsis when the preview really cuts the text short
    if len(page_text) > 500:
        print("...")
    print("="*80)
else:
    print("No extracted pages available to preview.")


SAMPLE: First Page of First Book
Book: AI Engineering
Chapter: Introduction
Page: 1

Text Preview (first 500 characters):
--------------------------------------------------------------------------------
AI Engineering
Building Applications
with Foundation Models
Chip Huyen
...
