# Twilight Imperium PDF Processor
## Step 1: Extract Text from PDFs for LangChain Assistant

This notebook processes the Twilight Imperium PDFs and extracts text content that will be used to build a LangChain-powered rules assistant.


In [9]:
# Import necessary libraries
import fitz  # PyMuPDF for PDF processing
import os
import json
from pathlib import Path
import re
from typing import List, Dict


In [10]:
# Define paths to our PDF files
dataset_path = Path("dataset")
learn_to_play_pdf = dataset_path / "ti-k0289_learn_to_playcompressed.pdf"
rulebook_pdf = dataset_path / "ti10_rulebook_web-good.pdf"

# Verify files exist
print(f"Learn to Play PDF exists: {learn_to_play_pdf.exists()}")
print(f"Rulebook PDF exists: {rulebook_pdf.exists()}")

# Create output directory for processed text
output_dir = Path("processed_rules")
output_dir.mkdir(exist_ok=True)
print(f"Output directory created: {output_dir}")


Learn to Play PDF exists: True
Rulebook PDF exists: True
Output directory created: processed_rules


In [11]:
def extract_text_from_pdf(pdf_path: Path, source_name: str) -> Dict[str, any]:
    """
    Extract text from a PDF file and return structured data
    
    Args:
        pdf_path: Path to the PDF file
        source_name: Name to identify the source (e.g., 'learn_to_play' or 'rulebook')
    
    Returns:
        Dictionary containing extracted text and metadata
    """
    print(f"\nProcessing {source_name}...")
    
    # Open the PDF
    doc = fitz.open(pdf_path)
    
    # Get total pages before processing (needed before closing document)
    total_pages = len(doc)
    
    # Extract text from each page
    pages_text = []
    full_text = ""
    
    for page_num in range(total_pages):
        page = doc[page_num]
        text = page.get_text()
        
        # Clean up the text - remove excessive whitespace
        text = re.sub(r'\n\s*\n', '\n\n', text)  # Replace multiple newlines with double newline
        text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
        text = text.strip()
        
        if text:  # Only add non-empty pages
            pages_text.append({
                'page_number': page_num + 1,
                'text': text
            })
            full_text += f"\n\n--- Page {page_num + 1} ---\n\n{text}"
    
    doc.close()
    
    result = {
        'source': source_name,
        'total_pages': total_pages,
        'pages_with_text': len(pages_text),
        'full_text': full_text.strip(),
        'pages': pages_text
    }
    
    print(f"Extracted text from {len(pages_text)} pages out of {total_pages} total pages")
    print(f"Total characters extracted: {len(full_text)}")
    
    return result


In [12]:
# Extract text from Learn to Play PDF
learn_to_play_data = extract_text_from_pdf(learn_to_play_pdf, "learn_to_play")



Processing learn_to_play...
Extracted text from 25 pages out of 25 total pages
Total characters extracted: 100025


In [13]:
# Extract text from Rulebook PDF
rulebook_data = extract_text_from_pdf(rulebook_pdf, "rulebook")



Processing rulebook...
Extracted text from 16 pages out of 16 total pages
Total characters extracted: 57697


In [14]:
# Save extracted text to files for later use
def save_extracted_data(data: Dict, filename: str):
    """
    Save extracted text data to both JSON and plain text files
    """
    # Save as JSON for structured access
    json_path = output_dir / f"{filename}.json"
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    
    # Save as plain text for easy reading
    txt_path = output_dir / f"{filename}.txt"
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(data['full_text'])
    
    print(f"Saved {filename} data to:")
    print(f"  JSON: {json_path}")
    print(f"  Text: {txt_path}")

# Save both extracted datasets
save_extracted_data(learn_to_play_data, "learn_to_play")
save_extracted_data(rulebook_data, "rulebook")


Saved learn_to_play data to:
  JSON: processed_rules\learn_to_play.json
  Text: processed_rules\learn_to_play.txt
Saved rulebook data to:
  JSON: processed_rules\rulebook.json
  Text: processed_rules\rulebook.txt


In [15]:
# Display summary statistics
print("\n" + "="*50)
print("EXTRACTION SUMMARY")
print("="*50)

print(f"\nLearn to Play:")
print(f"  Total pages: {learn_to_play_data['total_pages']}")
print(f"  Pages with text: {learn_to_play_data['pages_with_text']}")
print(f"  Characters extracted: {len(learn_to_play_data['full_text']):,}")

print(f"\nRulebook:")
print(f"  Total pages: {rulebook_data['total_pages']}")
print(f"  Pages with text: {rulebook_data['pages_with_text']}")
print(f"  Characters extracted: {len(rulebook_data['full_text']):,}")

total_chars = len(learn_to_play_data['full_text']) + len(rulebook_data['full_text'])
print(f"\nTotal characters from both PDFs: {total_chars:,}")



EXTRACTION SUMMARY

Learn to Play:
  Total pages: 25
  Pages with text: 25
  Characters extracted: 100,023

Rulebook:
  Total pages: 16
  Pages with text: 16
  Characters extracted: 57,695

Total characters from both PDFs: 157,718


In [16]:
# Preview some of the extracted content
print("\n" + "="*50)
print("CONTENT PREVIEW")
print("="*50)

print("\n🔹 Learn to Play (first 500 characters):")
print("-" * 40)
print(learn_to_play_data['full_text'][:500] + "...")

print("\n🔹 Rulebook (first 500 characters):")
print("-" * 40)
print(rulebook_data['full_text'][:500] + "...")



CONTENT PREVIEW

🔹 Learn to Play (first 500 characters):
----------------------------------------
--- Page 1 ---

®

--- Page 2 ---

2 My name is Mahthom Iq Seerva. I am the Winnaran keeper of the Custodian Chronicle, and I write this from the ancient Tower of Annals in old Mecatol City. Since inheriting the duties of the chronicle from my father, I have enjoyed the inspiring views of great buildings, ancient towers, and the bright lights of life that stretch into the distance. Yet, like the shadow at my feet, I can never escape or forget the lethally finite borders of this city. Less than a...

🔹 Rulebook (first 500 characters):
----------------------------------------
--- Page 1 ---

®

--- Page 2 ---

2 Teallian shook her head. “No, researcher. It’s not wasteful at all.” “What do you mean?” Hiari asked. The Barony officer gave her a small, superior smile. “It’s arrogant, Doctor. Pure arrogance to demonstrate their power by reshaping a planet to get what they want. And I approve.” —

In [17]:
# Prepare data structure for next steps (chunking and embedding)
combined_data = {
    'learn_to_play': learn_to_play_data,
    'rulebook': rulebook_data,
    'processing_complete': True,
    'next_steps': [
        'Chunk the text using RecursiveCharacterTextSplitter',
        'Generate embeddings for each chunk',
        'Store in vector database (FAISS or Chroma)',
        'Create LangChain tool for rule search',
        'Build the chatbot agent'
    ]
}

# Save combined data for easy access in next notebook
combined_path = output_dir / "combined_extracted_data.json"
with open(combined_path, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2, ensure_ascii=False)

print(f"\n✅ PDF processing complete!")
print(f"📁 All extracted data saved to: {output_dir}")
print(f"🚀 Ready for Step 2: Text chunking and embedding")
print(f"\nCombined data saved to: {combined_path}")



✅ PDF processing complete!
📁 All extracted data saved to: processed_rules
🚀 Ready for Step 2: Text chunking and embedding

Combined data saved to: processed_rules\combined_extracted_data.json
