In [1]:
from test_sematic_chunking import pdf_path, extract_book_text, chunk_pdf_content

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\olanr\AppData\Local\sagemaker\sagemaker\config.yaml


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Extract the text of the PDF at `pdf_path` with the helper imported from test_sematic_chunking.
pdf_content = extract_book_text(pdf_path)

In [3]:
# Chunk the extracted text (slow step: the recorded run took about 3 min 22 s).
chunks = chunk_pdf_content(pdf_content)

[32m2024-10-02 23:07:52 INFO semantic_chunkers.utils.logger Single document exceeds the maximum token limit of 300. Splitting to sentences before semantically merging.[0m
100%|██████████| 337/337 [03:22<00:00,  1.67it/s]


In [4]:
# Inspect the sentence-level splits of the second chunk (index 1) inside chunks[0].
chunks[0][1].splits

['If you have',
 'purchased a license to use this document from IIBA',
 '®, you may transfer ownership to a third party.',
 'IIBA® members may not',
 'transfer ownership of their complimentary copy.',
 'This document is provided to the busine ss analysis community for educational purpos es.',
 'IIBA® does not warrant that it is',
 'suitable for any other purpose and makes no expressed or impl ied warranty of any kind and assumes no responsibility for',
 'errors or omissions.',
 'No liability is assumed for incidental or consequential damages in connection with or arising out of the',
 'use of the information contained herein.IIBA',
 '®, the IIBA® logo, BABOK® and Business Analysis Body of Knowledge® are registered trademarks owned by',
 'International Institute of Business Analysis.',
 'CBAP® is a registered certification mark owned by International Institute of',
 'Business Analysis.',
 'Certified Business Analysis Professional,  EEP and the EEP logo are trademarks owned by Internatio

In [5]:
import fitz  # PyMuPDF
from collections import Counter


def gather_all_search_texts(chunks: list) -> list:
    """Return the `.splits` of every chunk found in `chunks[0]`.

    Only the first element of `chunks` is read. Each item in it must expose a
    `.splits` attribute (the sub-texts of that chunk). The result holds one
    entry per chunk, in the original order.
    """
    return [chunk_item.splits for chunk_item in chunks[0]]


def get_text_to_page_mapping(pdf_path: str) -> dict:
    """Get a mapping of 1-based page numbers to the text of each page.

    Newlines and tabs are stripped from every page's text so that sentences
    which wrap across lines in the PDF can still be found by substring search.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict `{page_number: page_text}` with page numbers starting at 1, or
        an empty dict (after printing a message) when the file does not exist.
    """
    import os

    # Check for the file up front instead of relying only on the `except`
    # below. NOTE(review): PyMuPDF appears to signal a missing file with its
    # own `FileNotFoundError` class (a RuntimeError subclass), which the
    # built-in `FileNotFoundError` handler would not catch -- confirm against
    # the installed PyMuPDF version.
    if not os.path.exists(pdf_path):
        print(f"PDF Filepath: {pdf_path} does not exist")
        return {}

    page_text_map = {}

    try:
        with fitz.open(pdf_path) as pdf:
            # Loop through all pages and get their text
            for page_index in range(len(pdf)):
                page = pdf.load_page(page_index)
                page_text = page.get_text().replace("\n", "").replace("\t", "")
                page_text_map[page_index + 1] = page_text  # Store 1-based page number
    except FileNotFoundError:
        # Kept for the case where the file disappears between the check and the open.
        print(f"PDF Filepath: {pdf_path} does not exist")
        return {}

    return page_text_map


def get_text_to_page_map(page_text_map: dict, search_texts: list) -> dict:
    """Map one chunk (given as its sentences) to the page where it most likely sits.

    Every sentence is searched case-insensitively in every page's text. Each
    hit is one vote for that page, and the page with the most votes wins. On a
    tie, the page that received its first vote earliest wins.

    Args:
        page_text_map: Mapping of page number to that page's text.
        search_texts: The sentences (sub-texts) that make up one chunk.

    Returns:
        A single-entry dict whose key is the sentences joined by a space and
        whose value is the winning page number, or -1 when no sentence was
        found on any page.
    """
    chunk_text = " ".join(search_texts)

    # Lowercase each page once instead of once per sentence/page pair.
    lowered_pages = [
        (page_num, page_text.lower()) for page_num, page_text in page_text_map.items()
    ]

    page_votes = Counter()
    for sentence in search_texts:
        needle = sentence.lower()
        # A blank sentence is a substring of every page; counting it would make
        # an otherwise unmatched chunk resolve to the first page instead of -1.
        if not needle.strip():
            continue
        for page_num, lowered_text in lowered_pages:
            if needle in lowered_text:
                page_votes[page_num] += 1

    best = page_votes.most_common(1)
    return {chunk_text: best[0][0] if best else -1}


def texts_to_page_mapping(pdf_path: str, chunks: list) -> dict:
    """Build the chunk-text -> page-number mapping for all chunks of a PDF.

    Returns an empty dict when `chunks` or its first element is empty.
    Otherwise the sentence lists are collected from `chunks`, the page texts
    are read from `pdf_path`, and each sentence list is resolved to a page.
    Progress is reported with two print statements.
    """
    if not (chunks and chunks[0]):
        return {}

    # Collect every chunk's sentences once, before touching the PDF.
    sentence_groups = gather_all_search_texts(chunks)
    print(f"Collected {len(sentence_groups)} search texts.")

    # Read all page texts a single time; they are reused for every chunk.
    pages_by_number = get_text_to_page_mapping(pdf_path)
    print("Page text mapping complete.")

    # Resolve each chunk and merge its single-entry result into the output.
    merged_mapping = {}
    for sentences in sentence_groups:
        merged_mapping.update(get_text_to_page_map(pages_by_number, sentences))

    return merged_mapping


# Example usage: build the chunk-text -> page-number mapping for the chunks of `pdf_path`.

pages2 = texts_to_page_mapping(pdf_path, chunks)


Collected 1349 search texts.
Page text mapping complete.


In [23]:
# Number of chunk-text entries in the page mapping.
len(pages2.values())

1349

In [22]:
# Entry count of the dict returned by normalize_pages (defined in a later cell, In [25]).
len(normalize_pages(pages2).values())

1349

In [27]:
def derive_final_maps(texts_to_page_map: Dict) -> Tuple[Dict, Dict]:
    """Split a text-to-page mapping into two dicts keyed by a synthetic chunk id.

    Args:
        texts_to_page_map: Mapping of chunk text to page number.

    Returns:
        A 2-tuple `(final_map, pages_map)`:
        - `final_map["chunk_<i>"]` is a one-element list holding the i-th text.
        - `pages_map["chunk_<i>"]` is the page number stored for that text.
        `<i>` is the 0-based position of the text in `texts_to_page_map`.
    """
    final_map = {}
    pages_map = {}

    # Iterate over items so the page number comes with the text (no second lookup).
    for idx, (text, page_number) in enumerate(texts_to_page_map.items()):
        chunk_id = f"chunk_{idx}"
        final_map[chunk_id] = [text]
        pages_map[chunk_id] = page_number

    return final_map, pages_map

# x receives final_map (chunk id -> [text]); y receives pages_map (chunk id -> page number).
x, y = derive_final_maps(pages2)

In [29]:
# Display the chunk id -> page number mapping (second value returned by derive_final_maps).
y

{'chunk_0': 4,
 'chunk_1': 4,
 'chunk_2': 4,
 'chunk_3': 5,
 'chunk_4': 6,
 'chunk_5': 6,
 'chunk_6': 6,
 'chunk_7': 7,
 'chunk_8': 8,
 'chunk_9': 8,
 'chunk_10': 9,
 'chunk_11': 9,
 'chunk_12': 9,
 'chunk_13': 9,
 'chunk_14': 10,
 'chunk_15': 10,
 'chunk_16': 11,
 'chunk_17': 11,
 'chunk_18': 12,
 'chunk_19': 12,
 'chunk_20': 13,
 'chunk_21': 13,
 'chunk_22': 14,
 'chunk_23': 14,
 'chunk_24': 14,
 'chunk_25': 15,
 'chunk_26': 15,
 'chunk_27': 16,
 'chunk_28': 16,
 'chunk_29': 16,
 'chunk_30': 17,
 'chunk_31': 17,
 'chunk_32': 17,
 'chunk_33': 18,
 'chunk_34': 18,
 'chunk_35': 19,
 'chunk_36': 19,
 'chunk_37': 19,
 'chunk_38': 19,
 'chunk_39': 20,
 'chunk_40': 21,
 'chunk_41': 21,
 'chunk_42': 22,
 'chunk_43': 22,
 'chunk_44': 22,
 'chunk_45': 23,
 'chunk_46': 23,
 'chunk_47': 23,
 'chunk_48': 24,
 'chunk_49': 25,
 'chunk_50': 25,
 'chunk_51': 25,
 'chunk_52': 25,
 'chunk_53': 26,
 'chunk_54': 26,
 'chunk_55': 26,
 'chunk_56': 27,
 'chunk_57': 27,
 'chunk_58': 27,
 'chunk_59': 28,
 'ch

In [28]:
# Display the chunk id -> [text] mapping (first value returned by derive_final_maps).
x

{'chunk_0': ['A GUIDE TO THE BUSINESS ANALYSIS BODY OF KNOWLEDGE®v3BABOK® v3 A GUIDE TO THE BUSINESS ANALYSIS BODY OF KNOWLEDGE®Complimentary IIBA® Member Copy. Not for Distribution or Resale.International Institute of Business Analysis, Toronto, Ontario, Canada. ©2005, 2006, 2008, 2009, 2015 International Institut e of Business Analysis. All rights reserved. Version 1.0 and 1.4 published 2005. Version 1.6 Draft publ ished 2006. Version 1.6 Final published 2008. Version 2.0 published 2009. Version 3.0 published 2015.ISBN-13: 978-1-927584-03-3 Permission is granted to reproduce this document for your own personal, professi onal, or educational use.'],
 'chunk_1': ['If you have purchased a license to use this document from IIBA ®, you may transfer ownership to a third party. IIBA® members may not transfer ownership of their complimentary copy. This document is provided to the busine ss analysis community for educational purpos es. IIBA® does not warrant that it is suitable for any other 

In [25]:
from typing import Dict, Tuple
def normalize_pages(texts_to_page_map: Dict) -> Dict:
    """Replace negative (unresolved) page numbers with the last resolved page.

    Entries are visited in the dict's insertion order. A negative page number
    is overwritten with the most recent non-negative page seen so far. Entries
    that come before any resolved page receive 0.

    Args:
        texts_to_page_map: Mapping of chunk text to page number. It is
            modified in place.

    Returns:
        The same dict object that was passed in, after normalization.
    """
    previous_page = 0

    # Only values of existing keys are reassigned, so mutating during iteration is safe.
    for text, page_number in texts_to_page_map.items():
        if page_number < 0:
            # Was a bare `<` comparison (a no-op); assignment is what fills the gap.
            texts_to_page_map[text] = previous_page
        else:
            previous_page = page_number

    return texts_to_page_map

# Run normalize_pages on the chunk-text -> page mapping and display the returned dict.
normalize_pages(pages2)

{'A GUIDE TO THE BUSINESS ANALYSIS BODY OF KNOWLEDGE®v3BABOK® v3 A GUIDE TO THE BUSINESS ANALYSIS BODY OF KNOWLEDGE®Complimentary IIBA® Member Copy. Not for Distribution or Resale.International Institute of Business Analysis, Toronto, Ontario, Canada. ©2005, 2006, 2008, 2009, 2015 International Institut e of Business Analysis. All rights reserved. Version 1.0 and 1.4 published 2005. Version 1.6 Draft publ ished 2006. Version 1.6 Final published 2008. Version 2.0 published 2009. Version 3.0 published 2015.ISBN-13: 978-1-927584-03-3 Permission is granted to reproduce this document for your own personal, professi onal, or educational use.': 4,
 'If you have purchased a license to use this document from IIBA ®, you may transfer ownership to a third party. IIBA® members may not transfer ownership of their complimentary copy. This document is provided to the busine ss analysis community for educational purpos es. IIBA® does not warrant that it is suitable for any other purpose and makes no e

In [10]:
# Scratch check: a dict's `values` attribute is read-only, so the assignment
# below raises AttributeError (see the recorded error) and `d` is never displayed.
d = {"one":1, "two": 2, "three": 3}

d.values = [5, 6, 7]

d

AttributeError: 'dict' object attribute 'values' is read-only