# Phase 1: The PDF parser


**Goal:** Extract clean text and page numbers from a user-uploaded PDF.

In [124]:
import fitz  # Import the PyMuPDF library
# Define a function to accept pdf with its path
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Location of the PDF file; passed straight to ``fitz.open``.

    Returns:
        A list with one dictionary per page, in document order, of the form
        ``{"page_no": <page number, counted from 1>, "text": <page text>}``.
    """
    # Using the document as a context manager closes it (and releases the
    # underlying file) even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return [
            {"page_no": page_no, "text": page.get_text()}
            for page_no, page in enumerate(doc, start=1)
        ]

In [125]:
# Smoke test: parse the sample book and inspect a single page.
path = "/Users/tushar04master/Documents/Project QnA/samples/Atomic habits.pdf"
extracted_data = extract_text_from_pdf(path)
page_count = len(extracted_data)
print(f"The PDF has {page_count} pages.")
print("\nData from the first page:")
# NOTE: the labels say "first page", but index 26 is the entry with page_no 27.
sample_page = extracted_data[26]
print(sample_page)
print("\nJust the text from the first page:")
print(sample_page['text'])

The PDF has 256 pages.

Data from the first page:
{'page_no': 27, 'text': 'Chapter Summary\nHabits are the compound interest of selfimprovement. Getting 1\npercent better every day counts for a lot in the long-run.\nHabits are a double-edged sword. They can work for you or against\nyou, which is why understanding the details is essential.\nSmall changes often appear to make no difference until you cross a\ncritical threshold. The most powerful outcomes of any compounding\nprocess are delayed. You need to be patient.\nAn atomic habit is a little habit that is part of a larger system. Just as\natoms are the building blocks of molecules, atomic habits are the\nbuilding blocks of remarkable results.\nIf you want better results, then forget about setting goals. Focus on\nyour system instead.\nYou do not rise to the level of your goals. You fall to the level of your\nsystems.\n'}

Just the text from the first page:
Chapter Summary
Habits are the compound interest of selfimprovement. Getting 

# Phase 2: The Document Processor (The "Chunker")

#### Problem:
The transformer models we use for Question Answering have a "short attention span." They can only read a certain amount of text at one time (typically around 512 tokens, which is roughly 300-400 words). If you give them an entire page from a book, which could have thousands of words, they get overwhelmed and can't process it.

#### Solution: 
We need to be like a book editor. We will take the long scroll of text from each page and cut it into smaller, standard-sized paragraphs or "chunks." We'll also make these chunks overlap slightly, so if an important sentence gets cut in half at the end of one chunk, it will be complete at the beginning of the next.

#### Goal:
Write a function that takes the `pages_data` list we created in Phase 1 and returns a new, much longer list. Each item in this new list will be a dictionary containing a `page_num` and a small `text_chunk`.

In [126]:
# Creating function for text chunker
def chunk_text(pages_data, chunk_size, chunk_overlap):
    """Cut each page's text into fixed-size, overlapping chunks.

    A window of ``chunk_size`` characters slides over the text of every page,
    advancing by ``chunk_size - chunk_overlap`` each time, so the start of one
    chunk repeats the last ``chunk_overlap`` characters of the previous one.
    Chunks never span two pages; the last chunk of a page may be shorter than
    ``chunk_size``.

    Args:
        pages_data: List of dictionaries with ``'page_no'`` and ``'text'``
            keys (the output of the Phase 1 parser).
        chunk_size: Maximum length of a chunk, in characters. Must be > 0.
        chunk_overlap: Number of characters shared by consecutive chunks.
            Must satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of ``{"page_num": ..., "text_chunk": ...}`` dictionaries, in
        page order. Pages with empty text contribute no chunks.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
    """
    # Validate up front: an overlap larger than the chunk size would make the
    # step negative and silently produce no chunks at all, an equal overlap
    # makes the step zero, and a negative overlap would skip text.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            "chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size, "
            f"got chunk_overlap={chunk_overlap} with chunk_size={chunk_size}"
        )

    # Distance between the starts of two consecutive chunks.
    step = chunk_size - chunk_overlap

    chunks = []
    for page_item in pages_data:
        page_num = page_item['page_no']
        page_text = page_item['text']

        # Sliding window over the text of this page only.
        for start in range(0, len(page_text), step):
            chunks.append(
                {
                    "page_num": page_num,
                    "text_chunk": page_text[start:start + chunk_size],
                }
            )

    return chunks

If we just jumped by the full chunk_size, there would be no overlap. By jumping forward by chunk_size - chunk_overlap, we ensure that each new chunk begins inside the end of the previous one.

In [127]:
# Chunking parameters handed to chunk_text below.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

print("Chunking the document with the new function...")
final_chunked_data = chunk_text(extracted_data, CHUNK_SIZE, CHUNK_OVERLAP)
print("Chunking complete! ✅")

# --- Compare the sizes before and after chunking ---
print(f"\nOriginal number of pages: {len(extracted_data)}")
print(f"Number of chunks after processing: {len(final_chunked_data)}")

# Show up to the first 50 chunks, but only when more than 5 chunks exist
print("\n--- Example Chunks with Page Numbers ---")
preview = final_chunked_data[:50] if len(final_chunked_data) > 5 else []
for chunk in preview:
    print(chunk)



Chunking the document with the new function...
Chunking complete! ✅

Original number of pages: 256
Number of chunks after processing: 1158

--- Example Chunks with Page Numbers ---
{'page_num': 3, 'text_chunk': 'AN IMPRINT OF PENGUIN RANDOM HOUSE LLC\n375 Hudson Street\nNew York, New York 10014\nCopyright © 2018 by James Clear\nPenguin supports copyright. Copyright fuels creativity, encourages diverse voices, promotes free speech, and creates a vibrant culture. Thank you for buying an authorized edition of this book and for\ncomplying with copyright laws by not reproducing, scanning, or distributing any part of it in any form without permission. You are supporting writers and allowing Penguin to continue to '}
{'page_num': 3, 'text_chunk': 'rting writers and allowing Penguin to continue to publish books\nfor every reader.\n'}
{'page_num': 4, 'text_chunk': 'Ebook ISBN 9780735211308\nWhile the author has made every effort to provide accurate Internet addresses at the time of publication,

# Phase 3: The **QA Engine** 

#### Hugging Face Pipeline

Goal: To create a QA pipeline and test it on a single text chunk from your PDF to see it in action.

In [128]:
# Import the Tool
from transformers import pipeline

# Build the question-answering pipeline once; later cells reuse `qa_pipe`.
qa_pipe = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
)

# Phase 3 smoke test: one question against one chunk
question = "What is the author's full name?"
# Default context: the text of the chunk at index 1. It stays in effect only
# when no chunk from page 236 is found below.
context = final_chunked_data[1]['text_chunk']

# Take the first chunk that belongs to page 236 (None if there is none)
context_chunk = next(
    (item for item in final_chunked_data if item['page_num'] == 236),
    None,
)

# Swap in the text of that chunk when one was found
if not context_chunk:
    print("Could not find a chunk for page 236.")
else:
    context = context_chunk['text_chunk']
    print("--- Context for QA Test (from Page 236) ---")
    print(context)

print("\nThis is the chunk☝🏼\n")

# Ask the model and show its raw result
result = qa_pipe(question=question, context=context)
print(result)


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Device set to use mps:0


--- Context for QA Test (from Page 236) ---
About the Author
James Clear's work has appeared in the New York Times, Time, and
Entrepreneur, and on CBS This Morning, and is taught in colleges around the
world. His website, jamesclear.com, receives millions of visitors each month,
and hundreds of thousands subscribe to his email newsletter. He is the creator of
The Habits Academy, the premier training platform for organizations and
individuals that are interested in building better habits in life and work.


This is the chunk☝🏼

{'score': 0.9961872696876526, 'start': 17, 'end': 28, 'answer': 'James Clear'}


# Phase 4: The Answer Aggregator

 Goal: Write a function that takes a question and your final_chunked_data list as input, and returns the single best answer found anywhere in the document.

In [129]:
# We'll use tqdm for a progress bar, as this will be a slow process.
from tqdm.auto import tqdm
# For voting
from collections import Counter # a tool that is a  specialized dictionary subclass designed for easily counting hashable objects. It stores elements as dictionary keys and their counts as dictionary values.
# Function
def get_answer(question, final_chunked_data, confidence_threshold=0.10):
    """Return the single best answer to ``question`` found in any chunk.

    Every chunk is passed as context to the module-level ``qa_pipe``. The
    candidates are then narrowed down in three steps:

    1. Filter: drop candidates whose score is not above
       ``confidence_threshold``.
    2. Vote: pick the answer text that occurs most often among the rest.
    3. Select: return the highest-scoring occurrence of that text.

    Args:
        question: The question to ask.
        final_chunked_data: List of dictionaries with ``'page_num'`` and
            ``'text_chunk'`` keys.
        confidence_threshold: Minimum score (exclusive) a candidate needs to
            take part in the vote.

    Returns:
        The winning result dictionary from the pipeline (with keys such as
        ``'answer'`` and ``'score'``) extended with the ``'page_num'`` of the
        chunk it came from. If no candidate clears the threshold, a
        dictionary with the keys ``'answer'`` (the message
        "Sorry , No answers found"), ``'score'`` (0.0) and ``'page_num'``
        (None) is returned, so callers can always use ``.get``.
    """
    candidate_answers = []
    for chunk in tqdm(final_chunked_data):
        context = chunk['text_chunk']
        # A blank context has nothing to extract; skipping it also avoids
        # handing the pipeline an empty string (assumed to be rejected there).
        if not context.strip():
            continue

        result = qa_pipe(question=question, context=context)

        # The pipeline can hand back one dictionary or a list of them;
        # in the list case only the first entry is used.
        if isinstance(result, list):
            result = result[0] if result else None
        if not isinstance(result, dict) or not result:
            continue

        # Copy instead of mutating the pipeline's own result object.
        candidate_answers.append({**result, 'page_num': chunk['page_num']})

    # Filtering phase: discard low-confidence answers before voting.
    confident_answers = [
        ans for ans in candidate_answers if ans['score'] > confidence_threshold
    ]
    if not confident_answers:
        # Must be a dictionary (not a set) so that the caller's .get() works.
        return {
            "answer": "Sorry , No answers found",
            "score": 0.0,
            "page_num": None,
        }

    # Voting phase: most_common(1) yields e.g. [('James Clear', 5)].
    answer_counts = Counter(ans['answer'] for ans in confident_answers)
    most_common_answer = answer_counts.most_common(1)[0][0]

    # Final selection: the best-scoring occurrence of the winning text.
    return max(
        (ans for ans in confident_answers if ans['answer'] == most_common_answer),
        key=lambda ans: ans['score'],
    )

In [131]:

# --- Run the full aggregation over every chunk for one question ---
question = "How should I remain focused all day"
final_answer = get_answer(question, final_chunked_data)

# --- Report the answer, its score and the page it came from ---
print("\n--- The Final, Best Answer ---")
print(
    f"Answer: '{final_answer.get('answer', 'N/A')}'",
    f"Confidence Score: {final_answer.get('score', 0):.4f}",
    f"Found on Page: {final_answer.get('page_num', 'N/A')}",
    sep="\n",
)

  0%|          | 0/1158 [00:00<?, ?it/s]


--- The Final, Best Answer ---
Answer: 'relaxing'
Confidence Score: 0.9022
Found on Page: 182
