# PDF reader and embedder via Azure / OpenAI tools for MM-RAG
## 📘 Tanat Piumsuwan's Customized Version

This notebook also contains scripts for reading and extracting content from PDF files.

### ✅ Setup Before Running This Notebook:

- Ensure the **corresponding PDF file** is in the `Data` folder inside the notebook's working directory.
- Add your own `.env` file, based on the empty template I provided.

## Environment setup:

In [15]:
%load_ext autoreload
%autoreload 2
import os
from glob import glob

from dotenv import load_dotenv

# Load settings from the local .env file into os.environ before reading them.
load_dotenv()

# Document name taken from the DATA_NAME environment variable; it is not used
# to select the input file below.
document_basename = os.environ.get("DATA_NAME")

# The input PDF is expected in the "Data" folder under the current working directory.
pdf_dir = os.path.join(os.getcwd(), 'Data')

# Match the extension case-insensitively: glob patterns are case-sensitive on
# POSIX file systems, so "*.PDF" alone would miss "report.pdf". Sorting makes
# the selected file deterministic when the folder holds more than one PDF.
pdf_candidates = sorted(
    path
    for path in glob(os.path.join(pdf_dir, "*"))
    if path.lower().endswith(".pdf")
)
if not pdf_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No PDF file found in {pdf_dir!r}; place the document to analyze there."
    )

# glob already returns the full path, so no further join with pdf_dir is needed.
document_name = pdf_candidates[0]
pdf_doc = document_name

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### OpenAI Models information

In [16]:
# Azure Document Intelligence connection settings, read from the environment.
# Put the endpoint and key values from the Azure portal into DI_ENDPOINT / DI_KEY.
# Either variable is None when the corresponding entry is not set.
DI_endpoint = os.getenv('DI_ENDPOINT')
key = os.getenv("DI_KEY")

## Azure DI

In [17]:
# import libraries
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, DocumentContentFormat
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest


# helper functions

def get_words(page, line):
    """Collect the words of `page` that belong to `line`.

    A word belongs to the line when its span lies entirely inside at least
    one of the line's spans (see `_in_span`).

    Args:
        page: object exposing an iterable `words`; each word has a `span`
            with `offset` and `length`.
        line: object exposing an iterable `spans`; each span has `offset`
            and `length`.

    Returns:
        A list of the matching word objects, in the order they appear in
        `page.words`.
    """
    return [candidate for candidate in page.words if _in_span(candidate, line.spans)]


def _in_span(word, spans):
    """Return True if `word.span` is fully contained in any span of `spans`.

    Containment is inclusive on both ends: the word must start at or after
    the span's offset and end at or before the span's end.
    """
    return any(
        span.offset <= word.span.offset
        and word.span.offset + word.span.length <= span.offset + span.length
        for span in spans
    )



def analyze_layout(pdf_doc):
    """Analyze a local PDF with the "prebuilt-layout" model and return the result.

    A DocumentIntelligenceClient is created from the module-level
    `DI_endpoint` and `key`, the file at `pdf_doc` is submitted in binary
    mode, and the poller's result is awaited via `result()`.

    Args:
        pdf_doc: path of the PDF file to open and submit.

    Returns:
        The object returned by the analysis poller's `result()`.

    Side effects:
        Prints the keys of the result.
    """
    client = DocumentIntelligenceClient(
        endpoint=DI_endpoint,
        credential=AzureKeyCredential(key),
    )

    # The file only needs to stay open while the analysis request is started.
    with open(pdf_doc, "rb") as pdf_stream:
        analysis_poller = client.begin_analyze_document(
            model_id="prebuilt-layout",
            body=pdf_stream,
        )

    layout = analysis_poller.result()
    print("Extracted keys: ",layout.keys())
    return layout

In [18]:
# Module-level Document Intelligence client built from the configured endpoint and key.
# analyze_layout() constructs its own client and does not reference this one.
document_intelligence_client = DocumentIntelligenceClient(endpoint=DI_endpoint, credential=AzureKeyCredential(key))

In [19]:
# Run layout analysis on the selected PDF; the cells below read DI_result.pages.
DI_result = analyze_layout(pdf_doc)

Extracted keys:  dict_keys(['apiVersion', 'modelId', 'stringIndexType', 'content', 'pages', 'tables', 'paragraphs', 'styles', 'contentFormat', 'sections'])


## Extract text

In [21]:
import csv
import os

def polygon_center(polygon):
    """Return the (x, y) mean of a polygon's vertices.

    Args:
        polygon: flat coordinate sequence [x1, y1, x2, y2, ...]; values at
            even indices are x coordinates, values at odd indices are y.

    Returns:
        Tuple `(mean_x, mean_y)`.

    Raises:
        ZeroDivisionError: if `polygon` holds no coordinates.
    """
    x_coords, y_coords = polygon[::2], polygon[1::2]
    center_x = sum(x_coords) / len(x_coords)
    center_y = sum(y_coords) / len(y_coords)
    return center_x, center_y

def distance(p1, p2):
    """Return the Euclidean distance between two points given as (x, y) pairs."""
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return (dx ** 2 + dy ** 2) ** 0.5

def is_near(poly1, poly2, threshold=0.5):
    """Tell whether the centers of two polygons are closer than `threshold`.

    Args:
        poly1, poly2: flat coordinate sequences [x1, y1, x2, y2, ...].
        threshold: exclusive upper bound on the center-to-center distance,
            expressed in the same unit as the polygon coordinates.

    Returns:
        True when the distance between the two centers is strictly below
        `threshold`, otherwise False.
    """
    center_gap = distance(polygon_center(poly1), polygon_center(poly2))
    return center_gap < threshold

# Folder to save CSVs
csv_dir = os.path.join(os.getcwd(), 'selection_mark_text')
os.makedirs(csv_dir, exist_ok=True)

# Maximum center-to-center distance for a text line to count as "near" a
# selection mark, in the same unit as the polygon coordinates.
NEARBY_THRESHOLD = 0.75


def _line_polygon(page, line):
    """Return a flat [x1, y1, ...] polygon locating `line` on `page`, or None.

    Layout lines carry their own `polygon`, which is used when present. Lines
    do not hold their words directly, so the fallback looks the words up on
    the page by span (via `get_words`) and builds the rectangle that encloses
    all of their polygons.
    """
    own_polygon = getattr(line, "polygon", None)
    if own_polygon:
        return own_polygon

    words = getattr(line, "words", None)
    if not words:
        words = get_words(page, line) if page.words else []

    xs, ys = [], []
    for word in words:
        if word.polygon:
            xs.extend(word.polygon[0::2])
            ys.extend(word.polygon[1::2])
    if not xs:
        return None
    # Axis-aligned rectangle around every word of the line.
    return [min(xs), min(ys), max(xs), min(ys), max(xs), max(ys), min(xs), max(ys)]


def _nearby_texts(sel_mark, located_lines):
    """Return the content of every located line whose center is near `sel_mark`.

    `located_lines` is a list of `(polygon, content)` pairs. A selection mark
    without a polygon cannot be positioned and therefore matches nothing.
    """
    if not sel_mark.polygon:
        return []
    return [
        content
        for polygon, content in located_lines
        if is_near(sel_mark.polygon, polygon, threshold=NEARBY_THRESHOLD)
    ]


for page in DI_result.pages:
    print(f"----Analyzing layout from page #{page.page_number}----")

    if not page.selection_marks:
        print(f"No selection marks found on page {page.page_number}, skipping CSV save.")
        continue

    # Locate each line once per page rather than once per selection mark.
    located_lines = []
    for line in page.lines or []:
        line_polygon = _line_polygon(page, line)
        if line_polygon:
            located_lines.append((line_polygon, line.content))

    # List of rows for CSV
    csv_rows = []
    for sel_mark in page.selection_marks:
        sel_state = sel_mark.state
        sel_conf = sel_mark.confidence

        # Join all nearby lines into one string
        nearby_texts = _nearby_texts(sel_mark, located_lines)
        nearby_text = " ".join(nearby_texts) if nearby_texts else "(No nearby text found)"

        csv_rows.append({
            "Page Number": page.page_number,
            "Selection Mark State": sel_state,
            "Selection Mark Confidence": sel_conf,
            "Nearby Text": nearby_text
        })

    # Save CSV for this page
    csv_path = os.path.join(csv_dir, f"page_{page.page_number}_selection_marks.csv")
    with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = ["Page Number", "Selection Mark State", "Selection Mark Confidence", "Nearby Text"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(csv_rows)

    print(f"📝 Saved selection mark texts to CSV: {csv_path}")


----Analyzing layout from page #1----
No selection marks found on page 1, skipping CSV save.
----Analyzing layout from page #2----
📝 Saved selection mark texts to CSV: e:\Work\KAsset\Form to csv\selection_mark_text\page_2_selection_marks.csv
----Analyzing layout from page #3----
📝 Saved selection mark texts to CSV: e:\Work\KAsset\Form to csv\selection_mark_text\page_3_selection_marks.csv
----Analyzing layout from page #4----
📝 Saved selection mark texts to CSV: e:\Work\KAsset\Form to csv\selection_mark_text\page_4_selection_marks.csv
----Analyzing layout from page #5----
📝 Saved selection mark texts to CSV: e:\Work\KAsset\Form to csv\selection_mark_text\page_5_selection_marks.csv
----Analyzing layout from page #6----
📝 Saved selection mark texts to CSV: e:\Work\KAsset\Form to csv\selection_mark_text\page_6_selection_marks.csv
----Analyzing layout from page #7----
📝 Saved selection mark texts to CSV: e:\Work\KAsset\Form to csv\selection_mark_text\page_7_selection_marks.csv
----Analyzing