# Simple RAG for Book Processing

Starting simple: chunk text → store embeddings → retrieve context → enhance NER extraction

This notebook demonstrates:
1. Basic text chunking
2. Simple embedding storage
3. Context retrieval
4. Enhanced entity extraction using retrieved context

In [1]:
# Core imports - same as 03_validating_NER.ipynb
import difflib
import json
import sqlite3
from contextlib import closing
from pathlib import Path
from typing import List, Optional

import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from outlines import Generator, from_transformers, Template
from pydantic import BaseModel, Field
from rich.console import Console
from rich.table import Table
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Same Person model as 03_validating_NER.ipynb
# One person found in a piece of text.
class Person(BaseModel):
    # Canonical form of the name.
    display_name: str = Field(description="The canonical name of the person.")
    # Other forms of the name; no default is given, so the field is required
    # and "no alternatives" has to be expressed as an explicit empty list.
    display_name_alternatives: List[str] = Field(description="Other ways this person's name is displayed.")

# Top-level extraction result: wraps the list of Person entries.
class PersonExtraction(BaseModel):
    persons: List[Person] = Field(description="List of all persons found in the text.")

print("Person schema defined")

Person schema defined


In [3]:
class SimplePersonsKB:
    """SQLite-backed store of extracted persons and the text chunks they came from.

    Every method opens a short-lived connection to ``db_path`` and closes it
    again, even when a statement in between raises.
    """

    def __init__(self, db_path: str = "persons_kb.db"):
        """Remember the database location and create the tables if missing."""
        self.db_path = db_path
        self.init_database()

    def _connect(self):
        """Open a connection wrapped so that a ``with`` block always closes it."""
        # sqlite3's own context manager only commits / rolls back; it never
        # closes the connection, hence the extra closing() wrapper.
        return closing(sqlite3.connect(self.db_path))

    def init_database(self):
        """Initialize simple SQLite database for persons only"""
        with self._connect() as conn, conn:
            # Simple persons table
            conn.execute("""
                CREATE TABLE IF NOT EXISTS persons (
                    id INTEGER PRIMARY KEY,
                    display_name TEXT UNIQUE,
                    alternatives TEXT,  -- JSON list of alternative names
                    first_seen_chunk INTEGER,
                    mention_count INTEGER DEFAULT 1,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Simple chunks table for provenance
            conn.execute("""
                CREATE TABLE IF NOT EXISTS chunks (
                    id INTEGER PRIMARY KEY,
                    text TEXT,
                    chunk_index INTEGER,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

    def add_chunk(self, text: str, chunk_index: int) -> int:
        """Add text chunk and return chunk ID"""
        with self._connect() as conn, conn:
            cursor = conn.execute("""
                INSERT INTO chunks (text, chunk_index)
                VALUES (?, ?)
            """, (text, chunk_index))
            return cursor.lastrowid

    def add_person(self, person: "Person", chunk_id: int) -> int:
        """Record one mention of ``person`` and return the id of its row.

        If a stored person has a sufficiently similar display name (see
        ``_find_similar_person``), that row's mention count is incremented and
        the new alternative names are merged into the ones already stored.
        Otherwise a new row is inserted with ``chunk_id`` as first_seen_chunk.

        Args:
            person: Object exposing ``display_name`` and
                ``display_name_alternatives``.
            chunk_id: Id of the chunk the mention was found in.

        Returns:
            The id of the updated or newly inserted persons row.
        """
        existing_id = self._find_similar_person(person.display_name)

        with self._connect() as conn, conn:
            # `is not None` rather than truthiness: an id is a number, not a flag.
            if existing_id is not None:
                stored = conn.execute(
                    "SELECT alternatives FROM persons WHERE id = ?", (existing_id,)
                ).fetchone()
                # Merge instead of overwrite, so alternatives seen in earlier
                # chunks are not lost when a later chunk lists different ones.
                merged = self._merge_alternatives(
                    stored[0] if stored else None,
                    person.display_name_alternatives,
                )
                conn.execute("""
                    UPDATE persons 
                    SET mention_count = mention_count + 1,
                        alternatives = ?
                    WHERE id = ?
                """, (json.dumps(merged), existing_id))
                return existing_id

            # Insert new person
            cursor = conn.execute("""
                INSERT INTO persons (display_name, alternatives, first_seen_chunk)
                VALUES (?, ?, ?)
            """, (
                person.display_name,
                json.dumps(person.display_name_alternatives),
                chunk_id
            ))
            return cursor.lastrowid

    @staticmethod
    def _merge_alternatives(stored_json: Optional[str], new_names: List[str]) -> List[str]:
        """Return the stored alternatives followed by any names not yet present."""
        merged = json.loads(stored_json) if stored_json else []
        for candidate in new_names:
            if candidate not in merged:
                merged.append(candidate)
        return merged

    def _find_similar_person(self, name: str, threshold: float = 0.85) -> Optional[int]:
        """Find existing person using fuzzy string matching

        Names are compared case-insensitively with difflib.SequenceMatcher.

        Returns:
            The id of the stored person whose display name is most similar to
            ``name`` among those with a ratio of at least ``threshold`` (the
            lowest id wins a tie), or None when no stored name qualifies.
        """
        with self._connect() as conn:
            existing_persons = conn.execute(
                "SELECT id, display_name FROM persons ORDER BY id"
            ).fetchall()

        wanted = name.lower()
        best_id = None
        best_score = 0.0
        for person_id, existing_name in existing_persons:
            score = difflib.SequenceMatcher(None, wanted, existing_name.lower()).ratio()
            # Keep the best candidate rather than the first one over the
            # threshold, so a near-exact match is not shadowed by an older,
            # weaker one.
            if score >= threshold and (best_id is None or score > best_score):
                best_id, best_score = person_id, score
        return best_id

    def get_stats(self):
        """Get simple statistics

        Returns:
            dict with 'unique_persons', 'total_mentions' (0 when the persons
            table is empty) and 'chunks_processed'.
        """
        with self._connect() as conn:
            persons_count = conn.execute("SELECT COUNT(*) FROM persons").fetchone()[0]
            chunks_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
            # SUM over an empty table yields NULL, hence the `or 0`.
            total_mentions = conn.execute(
                "SELECT SUM(mention_count) FROM persons"
            ).fetchone()[0] or 0

        return {
            'unique_persons': persons_count,
            'total_mentions': total_mentions,
            'chunks_processed': chunks_count
        }

    def search_persons(self, query: str):
        """Search persons by name

        Returns a DataFrame of the persons whose display_name contains
        ``query`` (SQL LIKE), most mentioned first.
        """
        with self._connect() as conn:
            return pd.read_sql_query("""
                SELECT display_name, alternatives, mention_count, first_seen_chunk
                FROM persons 
                WHERE display_name LIKE ? 
                ORDER BY mention_count DESC
            """, conn, params=[f"%{query}%"])

    def get_top_persons(self, limit: int = 10):
        """Get most mentioned persons

        Returns a DataFrame with at most ``limit`` rows, most mentioned first.
        """
        with self._connect() as conn:
            return pd.read_sql_query("""
                SELECT display_name, alternatives, mention_count
                FROM persons 
                ORDER BY mention_count DESC 
                LIMIT ?
            """, conn, params=[limit])

# Initialize simple KB
# NOTE: the tables are created with IF NOT EXISTS and chunks are always
# inserted, so an existing persons_kb.db keeps accumulating across runs;
# delete the file first when a fresh knowledge base is wanted.
kb = SimplePersonsKB(db_path="persons_kb.db")
print("Simple Persons Knowledge Base initialized")

Simple Persons Knowledge Base initialized


## Step 3: Process Chunks with Context

Extract entities from each chunk, using context from previous chunks

In [4]:
# Load model (same as 03_validating_NER.ipynb)
model_path = "/gpfs1/llm/llama-3.2-hf/Meta-Llama-3.2-3B-Instruct"

# Load the causal LM onto the GPU, then the tokenizer from the same
# checkpoint directory, and wrap the pair for use with outlines.
hf_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda")
hf_tokenizer = AutoTokenizer.from_pretrained(model_path)
model = from_transformers(hf_model, hf_tokenizer)

print("Model loaded successfully")

Loading checkpoint shards: 100%|██████████| 2/2 [00:22<00:00, 11.45s/it]


Model loaded successfully


In [5]:
# Use the exact same template from 03_validating_NER.ipynb
# Few-shot prompt for person extraction: two worked TEXT/RESULT examples
# followed by the output instructions. `{{ text }}` is the only template
# variable; the single braces in the examples are literal JSON.
template_ner = Template.from_string(
    """You are an experienced history of science professor.

Given some text, extract ALL persons mentioned or cited with their canonical and alternative names.

IMPORTANT: Only include alternative names that actually appear in the text. If no alternatives are found, use an empty list.

# Examples

TEXT: It fell to John F. Carrington to explain. An English missionary, born in 1914 in
Northamptonshire, Carrington left for Africa. Marshall McLuhan was mentioned.
RESULT: {
  "persons": [
    {"display_name": "John F. Carrington", "display_name_alternatives": ["Carrington"]},
    {"display_name": "Marshall McLuhan", "display_name_alternatives": []}
  ]
}

TEXT: "The information circle becomes the unit of life," says Werner Loewenstein after thirty years spent studying intercellular communication.
RESULT: {
  "persons": [
    {"display_name": "Werner Loewenstein", "display_name_alternatives": []}
  ]
}

# OUTPUT INSTRUCTIONS

Answer in valid JSON with the following structure:
PersonExtraction:
    persons (list[Person]): List of all persons found in the text

CRITICAL: Only include display_name_alternatives that literally appear in the provided text. Do not infer or generate alternatives.

# OUTPUT

TEXT: {{ text }}
RESULT: """)

# Create generator
# Pairs the loaded model with the PersonExtraction schema as its output type
# (presumably so outlines restricts generation to that schema; see outlines docs).
generator = Generator(model, PersonExtraction)
print("Generator ready for person extraction")

Generator ready for person extraction


## Simple Processing Pipeline

Process text chunks and extract persons

In [13]:
# Load data (same as previous notebooks)
data_file = Path("../data/output_03e48481195ba4783678f1ae446b40a7f6f12791.jsonl")

def read_jsonl(file_path):
    """Return the first JSON record of a JSON Lines file.

    Only the first line is parsed; any further lines are ignored.

    Args:
        file_path: Path (str or Path) of the .jsonl file.

    Returns:
        The decoded JSON value of the first line.

    Raises:
        ValueError: If the first line is empty or is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        first_line = file.readline()

    if not first_line.strip():
        raise ValueError(f"{file_path} has no JSON record on its first line")
    return json.loads(first_line)

# Read the record and take its full text
data = read_jsonl(data_file)
full_text = data['text']

# Get chapter 2 section (same as other notebooks)
# The lookup maps the last element of each pdf_page_numbers entry to its first
# element; presumably page number -> character offset into full_text
# (TODO confirm against the data file).
pagelookup = dict(
    (page[-1], page[0]) for page in data['attributes']['pdf_page_numbers']
)
chapter_bounds = slice(pagelookup[22], pagelookup[69] - 1)
text_section = full_text[chapter_bounds]

print(f"Processing text section of {len(text_section):,} characters")

Processing text section of 129,613 characters


In [14]:
# Simple chunking
# Split on paragraph breaks first, then single newlines, then sentence ends,
# with a 200-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". "],
    chunk_size=1200,
    chunk_overlap=200,
)
chunks = text_splitter.split_text(text_section)

print(f"Created {len(chunks)} chunks")
print(f"Average chunk size: {sum(len(c) for c in chunks) // len(chunks)} characters")

Created 164 chunks
Average chunk size: 808 characters


In [16]:
# Simple processing function
def process_chunk_for_persons(chunk_text: str, chunk_index: int):
    """Extract persons from one chunk and record them in the knowledge base.

    The chunk is stored first, so it is kept for provenance even when the
    extraction itself fails.

    Args:
        chunk_text: Text of the chunk to analyse.
        chunk_index: Position of the chunk within the processed section.

    Returns:
        ``{'chunk_id', 'persons_found', 'persons'}`` on success, or
        ``{'chunk_id', 'error'}`` when generation, parsing or storage failed.
    """
    # Add chunk to KB
    chunk_id = kb.add_chunk(chunk_text, chunk_index)

    # Generate prompt
    prompt = template_ner(text=chunk_text)

    try:
        # Extract persons
        raw_output = generator(prompt, max_new_tokens=400, temperature=0.0, do_sample=False)

        # Validate against the same schema the generator was built with, rather
        # than trusting the shape of a bare json.loads() result.
        extraction = PersonExtraction.model_validate_json(raw_output)

        # Add persons to KB
        persons_added = []
        for person in extraction.persons:
            # A blank name carries no information and would otherwise be
            # stored as a person of its own.
            if not person.display_name.strip():
                continue
            kb.add_person(person, chunk_id)
            persons_added.append(person.display_name)

        return {
            'chunk_id': chunk_id,
            'persons_found': len(persons_added),
            'persons': persons_added
        }

    except Exception as e:
        # Deliberately broad: one bad chunk must not abort the whole run.
        # The failure is reported to the caller through the 'error' key.
        print(f"Error processing chunk {chunk_index}: {e}")
        return {'chunk_id': chunk_id, 'error': str(e)}

print("Processing function ready")

Processing function ready


In [17]:
# Process chunks
console = Console()
results = []

for i, chunk in enumerate(chunks):
    console.print(f"\n[bold blue]Processing chunk {i+1}[/bold blue]")
    # markup=False wherever book text, extracted names or exception messages
    # are printed: Rich would otherwise read "[...]" inside them as a style
    # tag, dropping the bracketed text or raising MarkupError.
    console.print(f"Preview: {chunk[:80]}...", markup=False)

    result = process_chunk_for_persons(chunk, i)
    results.append(result)

    if 'persons_found' in result:
        console.print(
            f"✓ Found {result['persons_found']} persons: {', '.join(result['persons'])}",
            style="green",
            markup=False,
        )
    else:
        console.print(
            f"✗ Error: {result.get('error', 'Unknown error')}",
            style="red",
            markup=False,
        )

console.print("\n[bold]Processing complete![/bold]")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


## Knowledge Base Analysis

Simple queries and statistics

In [18]:
# Get statistics
stats = kb.get_stats()

console = Console()
console.print("\n[bold cyan]Knowledge Base Statistics:[/bold cyan]")
# One line per counter, in the same order as before.
for label, stat_key in (
    ("Unique persons", 'unique_persons'),
    ("Total mentions", 'total_mentions'),
    ("Chunks processed", 'chunks_processed'),
):
    console.print(f"{label}: {stats[stat_key]}")

In [19]:
# Show top mentioned persons
top_persons = kb.get_top_persons(10)

table = Table(title="Most Mentioned Persons")
for header, colour in (("Name", "cyan"), ("Mentions", "green"), ("Alternatives", "yellow")):
    table.add_column(header, style=colour)

# Walk the three columns in parallel; `alternatives` holds a JSON list (or nothing).
for name, mentions, raw_alternatives in zip(
    top_persons['display_name'],
    top_persons['mention_count'],
    top_persons['alternatives'],
):
    alternatives = json.loads(raw_alternatives) if raw_alternatives else []
    table.add_row(
        name,
        str(mentions),
        ", ".join(alternatives) if alternatives else "None",
    )

console.print(table)

In [20]:
# Simple search examples
def search_demo(query: str):
    """Print the stored persons whose display name contains ``query``.

    Each hit is shown with its alternative names (if any) and its mention
    count; a placeholder line is printed when nothing matches.
    """
    matches = kb.search_persons(query)
    console.print(f"\n[bold]Search results for '{query}':[/bold]")

    # Nothing found: say so and stop.
    if len(matches) == 0:
        console.print("  No results found")
        return

    for _, hit in matches.iterrows():
        raw_alternatives = hit['alternatives']
        also_known_as = json.loads(raw_alternatives) if raw_alternatives else []
        suffix = f" (also: {', '.join(also_known_as)})" if also_known_as else ""
        console.print(f"  {hit['display_name']}{suffix} - {hit['mention_count']} mentions")

# Try some searches
for demo_query in ("Ong", "McLuhan", "Plato"):
    search_demo(demo_query)