In [None]:
# Install required packages
!pip install langchain langchain-text-splitters pymupdf sentence-transformers chromadb -q

import pymupdf
from pathlib import Path
from typing import List, Dict
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import json


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.4/21.4 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.2/278.2 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m65.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:

In [None]:

class InstructionalPDFChunker:
    """
    Chunker optimized for instructional design materials with semantic awareness.

    Each PDF page is extracted with PyMuPDF, optionally divided at
    heading-like markers (``Module 3``, ``Objectives:``, ``1. Title`` ...) and
    then split into overlapping chunks with a RecursiveCharacterTextSplitter.
    Every chunk is returned as a LangChain ``Document`` carrying source/page
    metadata.
    """

    # Sections whose stripped length is not greater than this are discarded by
    # detect_section_boundaries().
    MIN_SECTION_LENGTH = 100

    # Font-size threshold (strictly greater than) above which a span is
    # treated as a heading candidate.
    HEADING_FONT_SIZE = 12

    # Bit tested in a span's "flags" value to recognise bold text.
    _BOLD_FLAG = 2 ** 4

    # Keyword-led section markers; matched case-insensitively.
    _KEYWORD_BOUNDARY_PATTERNS = (
        r'\n\s*(?:Chapter|Module|Section|Lesson|Unit)\s+\d+',
        r'\n\s*(?:Objective|Learning Outcome|Goal)s?:',
        r'\n\s*(?:Procedure|Steps|Instructions):',
        r'\n\s*(?:Example|Exercise|Practice):',
        r'\n\s*(?:Summary|Conclusion|Review):',
    )

    # Numbered sections with a capitalized start. Matched case-sensitively so
    # that "[A-Z]" really means "capital letter" and ordinary list items such
    # as "1. click the button" do not open a new section.
    _NUMBERED_BOUNDARY_PATTERN = r'\n\s*\d+\.\s+[A-Z]'

    def __init__(self, chunk_size=800, chunk_overlap=150):
        """
        Args:
            chunk_size: Maximum chunk length in characters (measured by ``len``).
            chunk_overlap: Number of characters shared by consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Text splitter with separators optimized for training content,
        # tried from coarsest to finest.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[
                "\n\n\n",  # Section breaks
                "\n\n",    # Paragraph breaks
                "\n",      # Line breaks
                ". ",      # Sentences
                "! ",
                "? ",
                " ",       # Words
                ""
            ],
            length_function=len,
        )

    def extract_text_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract text from PDF with metadata.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            One dict per page with the keys ``page_num`` (1-based), ``text``,
            ``headings`` (non-empty heading candidates in reading order) and
            ``source`` (file name without directories).
        """
        source_name = Path(pdf_path).name
        pages_data = []

        doc = pymupdf.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()

                # Heading candidates: spans in a larger font or flagged bold.
                headings = []
                for block in page.get_text("dict")["blocks"]:
                    # Blocks without a "lines" entry carry no text spans.
                    for line in block.get("lines", ()):
                        for span in line["spans"]:
                            is_large = span["size"] > self.HEADING_FONT_SIZE
                            is_bold = bool(span["flags"] & self._BOLD_FLAG)
                            if not (is_large or is_bold):
                                continue
                            heading = span["text"].strip()
                            # Whitespace-only spans would otherwise become an
                            # empty "section_heading" in the chunk metadata.
                            if heading:
                                headings.append(heading)

                pages_data.append({
                    "page_num": page_num,
                    "text": text,
                    "headings": headings,
                    "source": source_name
                })
        finally:
            # Release the file handle even when a page fails to parse.
            doc.close()

        return pages_data

    def detect_section_boundaries(self, text: str) -> List[str]:
        """Detect natural section boundaries in instructional content.

        All marker patterns are searched first and their positions merged in
        document order, so a marker is honoured regardless of which pattern
        found it.

        Args:
            text: Text of a single page.

        Returns:
            The stripped sections in document order, excluding any section
            whose length is not greater than ``MIN_SECTION_LENGTH``.
        """
        boundaries = set()
        for pattern in self._KEYWORD_BOUNDARY_PATTERNS:
            boundaries.update(
                match.start() for match in re.finditer(pattern, text, re.IGNORECASE)
            )
        boundaries.update(
            match.start()
            for match in re.finditer(self._NUMBERED_BOUNDARY_PATTERN, text)
        )
        # A marker at offset 0 does not separate anything.
        boundaries.discard(0)

        cut_points = [0, *sorted(boundaries), len(text)]
        sections = [
            text[start:end].strip()
            for start, end in zip(cut_points, cut_points[1:])
        ]

        # Filter very short sections
        return [s for s in sections if len(s) > self.MIN_SECTION_LENGTH]

    def chunk_documents(self, pdf_paths: List[str]) -> "List[Document]":
        """Main chunking method for multiple PDFs.

        Args:
            pdf_paths: Paths of the PDF files to process, in order.

        Returns:
            One ``Document`` per chunk. Metadata keys: ``source``, ``page``,
            ``chunk_id`` (unique within a page), ``section_heading`` (first
            heading candidate on the page, or "No Title") and
            ``total_chunks_in_page``.
        """
        all_chunks = []

        for pdf_path in pdf_paths:
            print(f"Processing: {pdf_path}")

            for page_data in self.extract_text_from_pdf(pdf_path):
                text = page_data["text"]

                if not text.strip():
                    continue

                # Try semantic sectioning first; fall back to the whole page
                # when fewer than two sections were found.
                sections = self.detect_section_boundaries(text)
                segments = sections if len(sections) > 1 else [text]

                # Gather every chunk of the page before numbering, so that
                # chunk ids never repeat across sections and the page total
                # is the real total.
                page_chunks = [
                    chunk
                    for segment in segments
                    for chunk in self.splitter.split_text(segment)
                ]

                # Extract first heading as context
                headings = page_data["headings"]
                heading = headings[0] if headings else "No Title"

                for i, chunk in enumerate(page_chunks):
                    all_chunks.append(
                        Document(
                            page_content=chunk,
                            metadata={
                                "source": page_data["source"],
                                "page": page_data["page_num"],
                                "chunk_id": f"{page_data['source']}_p{page_data['page_num']}_c{i}",
                                "section_heading": heading,
                                "total_chunks_in_page": len(page_chunks)
                            }
                        )
                    )

        return all_chunks

    def save_chunks(self, chunks: "List[Document]", output_path: str = "chunks_output.json"):
        """Save chunks to JSON for inspection.

        Args:
            chunks: Chunks to serialise; each contributes its ``page_content``
                and ``metadata``.
            output_path: Destination file, written as UTF-8.
        """
        chunks_data = [
            {
                "content": chunk.page_content,
                "metadata": chunk.metadata
            }
            for chunk in chunks
        ]

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks_data, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(chunks)} chunks to {output_path}")

# Example usage: upload PDFs in Colab, chunk them, preview the result and
# download the chunks as JSON.
if __name__ == "__main__":
    # Upload your PDFs first (use Colab's file upload)
    from google.colab import files

    # Single source of truth for the output file, so the file that is saved
    # is always the file that is downloaded.
    output_path = "training_modules_chunks2.json"

    print("Upload your instructional PDF files...")
    uploaded = files.upload()

    pdf_paths = list(uploaded.keys())
    print(f"\nUploaded files: {pdf_paths}")

    # Initialize chunker
    chunker = InstructionalPDFChunker(
        chunk_size=800,      # Adjust based on your needs
        chunk_overlap=150     # Maintain context between chunks
    )

    # Process PDFs
    print("\n" + "="*50)
    print("Chunking documents...")
    print("="*50)

    chunks = chunker.chunk_documents(pdf_paths)

    # Display statistics. Report an average of 0 rather than dividing by zero
    # when nothing was uploaded or no page produced any text.
    total_chars = sum(len(c.page_content) for c in chunks)
    average_length = total_chars / len(chunks) if chunks else 0
    print(f"\nTotal chunks created: {len(chunks)}")
    print(f"Average chunk length: {average_length:.0f} characters")

    # Show sample chunks
    print("\n" + "="*50)
    print("SAMPLE CHUNKS (first 3):")
    print("="*50)

    for i, chunk in enumerate(chunks[:3], 1):
        print(f"\n--- Chunk {i} ---")
        print(f"Source: {chunk.metadata['source']}")
        print(f"Page: {chunk.metadata['page']}")
        print(f"Section: {chunk.metadata['section_heading']}")
        print(f"Content preview: {chunk.page_content[:200]}...")

    # Save to file
    chunker.save_chunks(chunks, output_path)

    # Download the results
    print("\nDownloading chunks file...")
    files.download(output_path)

    print("\n✅ Chunking complete!")

Upload your instructional PDF files...


Saving Time Management, LEAP Online.pdf to Time Management, LEAP Online.pdf

Uploaded files: ['Time Management, LEAP Online.pdf']

Chunking documents...
Processing: Time Management, LEAP Online.pdf

Total chunks created: 123
Average chunk length: 592 characters

SAMPLE CHUNKS (first 3):

--- Chunk 1 ---
Source: Time Management, LEAP Online.pdf
Page: 1
Section: 
Content preview: Time Management 
LEAP Online 
 
University of Bolton 2024 
 
 
 
 
 
 
 
 
 
 
 
Time Management 
LEAP Online...

--- Chunk 2 ---
Source: Time Management, LEAP Online.pdf
Page: 2
Section: Contents
Content preview: Time Management 
LEAP Online 
 
University of Bolton 2024 
 
 
Contents 
 
Introduction ...................................................................................................................

--- Chunk 3 ---
Source: Time Management, LEAP Online.pdf
Page: 2
Section: Contents
Content preview: Time management in Higher Education ................................................................

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Chunking complete!


In [None]:

import pymupdf
from pathlib import Path
from typing import List, Dict
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import json

class InstructionalPDFChunker:
    """
    Chunker optimized for instructional design materials with semantic awareness.

    Each PDF page is extracted with PyMuPDF, cleaned of page furniture,
    skipped when it looks like front/back matter or a reference list,
    optionally divided at heading-like markers and finally split into
    overlapping chunks with a RecursiveCharacterTextSplitter. Every chunk is
    returned as a LangChain ``Document`` carrying source/page metadata.
    """

    # Sections whose stripped length is not greater than this are discarded by
    # detect_section_boundaries().
    MIN_SECTION_LENGTH = 100

    # Font-size threshold (strictly greater than) above which a span is
    # treated as a heading candidate.
    HEADING_FONT_SIZE = 12

    # Bit tested in a span's "flags" value to recognise bold text.
    _BOLD_FLAG = 2 ** 4

    # Non-blank lines shorter than this (after stripping) are dropped by
    # clean_text().
    MIN_LINE_LENGTH = 5

    # Citation share of the word count above which a text is treated as a
    # reference list, and the minimum word count for the check to apply.
    CITATION_RATIO = 0.15
    MIN_WORDS_FOR_CITATION_CHECK = 20

    # Headings of sections to ignore. Anchored to the start of a line (with an
    # optional "3." / "3.1" style number) and closed by a word boundary, so
    # body text that merely mentions "index" or "references" is not skipped.
    # Applied to lower-cased text.
    _IRRELEVANT_HEADING_RE = re.compile(
        r'^\s*(?:\d+(?:\.\d+)*\.?\s+)?'
        r'(?:references|bibliography|works cited|about the author|'
        r'author information|author bio|acknowledge?ments?|'
        r'table of contents|index|appendi(?:x|ces))\b',
        re.MULTILINE,
    )

    # Publication-data markers; these may appear anywhere in the inspected
    # prefix. Applied to lower-cased text.
    _PUBLICATION_MARKER_RE = re.compile(r'\b(?:copyright|isbn|published by)\b')

    # Tokens typical of bibliography entries. Applied to lower-cased text.
    _CITATION_PATTERNS = (
        r'\([12]\d{3}\)',
        r'\bet\s+al\.',
        r'\bpp?\.\s*\d+',
        r'doi:',
        r'http[s]?://(?:dx\.)?doi\.org',
    )

    # Keyword-led section markers; matched case-insensitively.
    _KEYWORD_BOUNDARY_PATTERNS = (
        r'\n\s*(?:Chapter|Module|Section|Lesson|Unit)\s+\d+',
        r'\n\s*(?:Objective|Learning Outcome|Goal)s?:',
        r'\n\s*(?:Procedure|Steps|Instructions):',
        r'\n\s*(?:Example|Exercise|Practice):',
        r'\n\s*(?:Summary|Conclusion|Review):',
    )

    # Numbered sections with a capitalized start. Matched case-sensitively so
    # that "[A-Z]" really means "capital letter" and ordinary list items such
    # as "1. click the button" do not open a new section.
    _NUMBERED_BOUNDARY_PATTERN = r'\n\s*\d+\.\s+[A-Z]'

    def __init__(self, chunk_size=800, chunk_overlap=150):
        """
        Args:
            chunk_size: Maximum chunk length in characters (measured by ``len``).
            chunk_overlap: Number of characters shared by consecutive chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # Text splitter with separators optimized for training content,
        # tried from coarsest to finest.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=[
                "\n\n\n",
                "\n\n",
                "\n",
                ". ",
                "! ",
                "? ",
                " ",
                ""
            ],
            length_function=len,
        )

    def extract_text_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract text from PDF with metadata.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            One dict per page with the keys ``page_num`` (1-based), ``text``,
            ``headings`` (non-empty heading candidates in reading order) and
            ``source`` (file name without directories).
        """
        source_name = Path(pdf_path).name
        pages_data = []

        doc = pymupdf.open(pdf_path)
        try:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text()

                # Heading candidates: spans in a larger font or flagged bold.
                headings = []
                for block in page.get_text("dict")["blocks"]:
                    # Blocks without a "lines" entry carry no text spans.
                    for line in block.get("lines", ()):
                        for span in line["spans"]:
                            is_large = span["size"] > self.HEADING_FONT_SIZE
                            is_bold = bool(span["flags"] & self._BOLD_FLAG)
                            if not (is_large or is_bold):
                                continue
                            heading = span["text"].strip()
                            # Whitespace-only spans would otherwise become an
                            # empty "section_heading" in the chunk metadata.
                            if heading:
                                headings.append(heading)

                pages_data.append({
                    "page_num": page_num,
                    "text": text,
                    "headings": headings,
                    "source": source_name
                })
        finally:
            # Release the file handle even when a page fails to parse.
            doc.close()

        return pages_data

    def is_irrelevant_section(self, text: str) -> bool:
        """Detect if text is from irrelevant sections.

        A text is irrelevant when, within its first 100 characters, a line
        starts with one of the ignored headings or a publication marker
        appears, or when the whole text is citation-heavy.

        Args:
            text: Page or section text.

        Returns:
            True when the text should be skipped.
        """
        text_lower = text.lower()

        # Only the opening of the text is inspected for headings/markers.
        text_start = text_lower[:100]
        if self._IRRELEVANT_HEADING_RE.search(text_start):
            return True
        if self._PUBLICATION_MARKER_RE.search(text_start):
            return True

        # Check for citation-heavy text (likely references section)
        citation_count = sum(
            len(re.findall(pattern, text_lower))
            for pattern in self._CITATION_PATTERNS
        )
        words = len(text.split())

        # If more than 15% are citations, likely a reference section
        return (
            words > self.MIN_WORDS_FOR_CITATION_CHECK
            and citation_count > words * self.CITATION_RATIO
        )

    def clean_text(self, text: str) -> str:
        """Clean text by removing headers, footers, and page numbers.

        Short fragments, standalone numbers and copyright lines are dropped.
        Runs of blank lines between kept lines collapse to a single blank
        line, so paragraph breaks remain available to the splitter; leading
        and trailing blank lines are removed.

        Args:
            text: Raw page text.

        Returns:
            The cleaned text with the kept lines joined by newlines.
        """
        cleaned_lines = []
        pending_break = False

        for line in text.split('\n'):
            line_stripped = line.strip()

            # Remember a paragraph break, but only once real content exists.
            if not line_stripped:
                pending_break = bool(cleaned_lines)
                continue

            # Skip very short lines
            if len(line_stripped) < self.MIN_LINE_LENGTH:
                continue

            # Skip page numbers (standalone numbers)
            if re.match(r'^\d+$', line_stripped):
                continue

            # Skip copyright notices
            if re.match(r'^©|^copyright', line_stripped, re.IGNORECASE):
                continue

            if pending_break:
                cleaned_lines.append('')
                pending_break = False
            cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    def detect_section_boundaries(self, text: str) -> List[str]:
        """Detect natural section boundaries in instructional content.

        All marker patterns are searched first and their positions merged in
        document order, so a marker is honoured regardless of which pattern
        found it.

        Args:
            text: Text of a single page.

        Returns:
            The stripped sections in document order, excluding any section
            whose length is not greater than ``MIN_SECTION_LENGTH``.
        """
        boundaries = set()
        for pattern in self._KEYWORD_BOUNDARY_PATTERNS:
            boundaries.update(
                match.start() for match in re.finditer(pattern, text, re.IGNORECASE)
            )
        boundaries.update(
            match.start()
            for match in re.finditer(self._NUMBERED_BOUNDARY_PATTERN, text)
        )
        # A marker at offset 0 does not separate anything.
        boundaries.discard(0)

        cut_points = [0, *sorted(boundaries), len(text)]
        sections = [
            text[start:end].strip()
            for start, end in zip(cut_points, cut_points[1:])
        ]

        return [s for s in sections if len(s) > self.MIN_SECTION_LENGTH]

    def chunk_documents(self, pdf_paths: List[str]) -> "List[Document]":
        """Main chunking method for multiple PDFs.

        Args:
            pdf_paths: Paths of the PDF files to process, in order.

        Returns:
            One ``Document`` per chunk. Metadata keys: ``source``, ``page``,
            ``chunk_id`` (unique within a page), ``section_heading`` (first
            heading candidate on the page, or "No Title") and
            ``total_chunks_in_page``.
        """
        all_chunks = []

        for pdf_path in pdf_paths:
            print(f"Processing: {pdf_path}")

            for page_data in self.extract_text_from_pdf(pdf_path):
                # Clean the text; a page may be empty before or after cleaning.
                text = self.clean_text(page_data["text"])
                if not text.strip():
                    continue

                # Skip if this is an irrelevant section
                if self.is_irrelevant_section(text):
                    print(f"  Skipping irrelevant section on page {page_data['page_num']}")
                    continue

                # Try semantic sectioning first; fall back to the whole page
                # when fewer than two sections were found.
                sections = self.detect_section_boundaries(text)
                if len(sections) > 1:
                    segments = [
                        section for section in sections
                        if not self.is_irrelevant_section(section)
                    ]
                else:
                    segments = [text]

                # Gather every chunk of the page before numbering, so that
                # chunk ids never repeat across sections and the page total
                # is the real total.
                page_chunks = [
                    chunk
                    for segment in segments
                    for chunk in self.splitter.split_text(segment)
                ]

                headings = page_data["headings"]
                heading = headings[0] if headings else "No Title"

                for i, chunk in enumerate(page_chunks):
                    all_chunks.append(
                        Document(
                            page_content=chunk,
                            metadata={
                                "source": page_data["source"],
                                "page": page_data["page_num"],
                                "chunk_id": f"{page_data['source']}_p{page_data['page_num']}_c{i}",
                                "section_heading": heading,
                                "total_chunks_in_page": len(page_chunks)
                            }
                        )
                    )

        return all_chunks

    def save_chunks(self, chunks: "List[Document]", output_path: str = "chunks_output.json"):
        """Save chunks to JSON for inspection.

        Args:
            chunks: Chunks to serialise; each contributes its ``page_content``
                and ``metadata``.
            output_path: Destination file, written as UTF-8.
        """
        chunks_data = [
            {
                "content": chunk.page_content,
                "metadata": chunk.metadata
            }
            for chunk in chunks
        ]

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(chunks_data, f, indent=2, ensure_ascii=False)

        print(f"Saved {len(chunks)} chunks to {output_path}")

# Example usage: upload PDFs in Colab, chunk them, preview the result and
# download the chunks as JSON.
if __name__ == "__main__":
    from google.colab import files

    # Single source of truth for the output file, so the file that is saved
    # is always the file that is downloaded.
    output_path = "training_modules_chunks.json"

    print("Upload your instructional PDF files...")
    uploaded = files.upload()

    pdf_paths = list(uploaded.keys())
    print(f"\nUploaded files: {pdf_paths}")

    # Initialize chunker
    chunker = InstructionalPDFChunker(
        chunk_size=800,
        chunk_overlap=150
    )

    # Process PDFs
    print("\n" + "="*50)
    print("Chunking documents...")
    print("="*50)

    chunks = chunker.chunk_documents(pdf_paths)

    # Display statistics. Report an average of 0 rather than dividing by zero
    # when nothing was uploaded or every page was filtered out.
    total_chars = sum(len(c.page_content) for c in chunks)
    average_length = total_chars / len(chunks) if chunks else 0
    print(f"\nTotal chunks created: {len(chunks)}")
    print(f"Average chunk length: {average_length:.0f} characters")

    # Show sample chunks
    print("\n" + "="*50)
    print("SAMPLE CHUNKS (first 3):")
    print("="*50)

    for i, chunk in enumerate(chunks[:3], 1):
        print(f"\n--- Chunk {i} ---")
        print(f"Source: {chunk.metadata['source']}")
        print(f"Page: {chunk.metadata['page']}")
        print(f"Section: {chunk.metadata['section_heading']}")
        print(f"Content preview: {chunk.page_content[:200]}...")

    # Save to file
    chunker.save_chunks(chunks, output_path)

    # Download the results
    print("\nDownloading chunks file...")
    files.download(output_path)

    print("\n✅ Chunking complete!")

Upload your instructional PDF files...


Saving Time Switching The Advanced Method of Time Management for Self-Education (Sazonov, Victor) (Z-Library).pdf to Time Switching The Advanced Method of Time Management for Self-Education (Sazonov, Victor) (Z-Library).pdf

Uploaded files: ['Time Switching The Advanced Method of Time Management for Self-Education (Sazonov, Victor) (Z-Library).pdf']

Chunking documents...
Processing: Time Switching The Advanced Method of Time Management for Self-Education (Sazonov, Victor) (Z-Library).pdf
  Skipping irrelevant section on page 7
  Skipping irrelevant section on page 33
  Skipping irrelevant section on page 34

Total chunks created: 56
Average chunk length: 589 characters

SAMPLE CHUNKS (first 3):

--- Chunk 1 ---
Source: Time Switching The Advanced Method of Time Management for Self-Education (Sazonov, Victor) (Z-Library).pdf
Page: 2
Section: 
Content preview: TIME SWITCHING
The Advanced Method of Time Management
for Self-Education
Victor Sazonov...

--- Chunk 2 ---
Source: Time Switchi

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Chunking complete!
