#### Installations

In [5]:
!pip install pymupdf pytesseract pdf2image opencv-python pillow langchain openai
!apt install poppler-utils tesseract-ocr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
zsh:1: command not found: apt


#### Extract Text from PDFs

In [6]:
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        A list with one dict per page, in document order, of the form
        ``{"page": <1-based page number>, "text": <page.get_text() result>}``.
        An empty document yields an empty list.
    """
    # Open the document as a context manager so it is closed when extraction
    # finishes, including when get_text() raises part-way through.
    with fitz.open(pdf_path) as doc:
        return [
            {"page": page_number, "text": page.get_text()}
            for page_number, page in enumerate(doc, start=1)
        ]

# Pull the text of every page out of the specifications PDF.
spec_text = extract_text_from_pdf("./Input - Specifications.pdf")

# Sanity check: show the page number, the first 1000 characters of text and
# a separator rule for each of the first three pages.
for section in spec_text[:3]:
    print(
        f"Page {section['page']}",
        section['text'][:1000],
        "="*80,
        sep="\n",
    )

Page 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
                                                                                             
 
FOULGER-PRATT HEADQUARTERS 
PROJECT MANUAL 
 
 
12435 PARK POTOMAC AVENUE 
POTOMAC, MD 20852 
 
 
ISSUED FOR PERMIT AND BID 
May 28, 2015 
 
 
 
 
 
 
 
 
 
 
 
 

Page 2
FOULGER-PRATT HEADQUARTERS 
PERKINS+WILL 
 
860424 
ISSUE FOR PERMIT AND BID 
05/28/15 
 
 
 
Division 01 - General Requirements 
 
01 10 00 
Summary 
01 21 00 
Allowances 
01 22 00 
Unit Prices 
01 23 00 
Alternates 
01 25 00 
Substitution Procedures 
01 26 00 
Contract Modification Procedures 
01 29 00 
Payment Procedures 
01 31 00 
Project Management and Coordination 
01 32 00 
Construction Progress Documentation 
01 32 33 
Photographic Documentation 
01 33 00 
Submittal Procedures 
01 40 00 
Quality Requirements 
01 42 00 
References 
01 50 00 
Temporary Facilities and Controls 
01 60 00 
Product Requirements 
01 73 00 
Execution 
0

#### Convert Drawing PDF pages to Images

In [2]:
from pdf2image import convert_from_bytes

# Number of leading drawing sheets to rasterise and save as PNGs.
PREVIEW_PAGE_COUNT = 3

with open("Input - Construction_Drawings.pdf", "rb") as f:
    pdf_bytes = f.read()

# Render only the pages that are actually saved below. Rasterising the whole
# drawing set at 200 dpi just to keep the first few images wastes time and
# memory; first_page/last_page are 1-based and inclusive.
drawing_images = convert_from_bytes(
    pdf_bytes,
    dpi=200,
    first_page=1,
    last_page=PREVIEW_PAGE_COUNT,
)

# Save the rendered pages as drawing_page_1.png, drawing_page_2.png, ...
for idx, img in enumerate(drawing_images):
    img.save(f"drawing_page_{idx+1}.png")


KeyboardInterrupt: 

#### OCR on Drawings

In [3]:
import pytesseract
from PIL import Image

def ocr_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image; passed unchanged to ``Image.open``.

    Returns:
        The value returned by ``pytesseract.image_to_string`` for the image.
    """
    # Image.open keeps the file open until the image is closed; the context
    # manager releases it as soon as OCR has finished (or failed).
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

# Example: OCR the three PNGs written by the conversion step and record each
# result together with its 1-based page number.
drawing_ocr_data = [
    {"page": i+1, "text": ocr_image(f"drawing_page_{i+1}.png")}
    for i in range(3)
]


In [4]:
# Display the OCR records (one {"page", "text"} dict per processed drawing page).
drawing_ocr_data

[{'page': 1,
  'text': 'FOULGER-PRATT HEADQUARTERS\n\n12435 PARK POTOMAC AVE.,\n\nPOTOMAC, MD\n\nISSUED FOR PERMIT AND BID\n\n05.28.15\n\nCopyright © 2015 Perkins+Will\n\neeeeaee\neevereeen\nveveere J\nve\n\n.\neeeneecescaeos\nveeereooerererrer sere\nSAeeeeoresesoaaeeaaeen\nJ e 4 .\n\nsnateeenee\neeeceeece\nsevesece\n\neseeerce\n\neeace\n\neeoceeveceseccee\n\npaeenaeesae\n\nPERKINS\n+WILL\n\nTr\neseeeeuees\nsesseacecs\n9020090606\nesecesoces\ni) ¢aa6ee0\nseees ssesn\nOcececasoseaoace\nesoconvsececa\nsooceeoes\n6eceeese\neaaeece\n\nill.com\n\nINSWI\n\n+A www.perk\n\nsupad@siabouJoyluUal OH-d4 pZpogg\\duiey\\;o\nWd O-€¢-€ SL0Z/82/S\n\n'},
 {'page': 2,
  'text': 'c:\\temp\\860424_FP-HQ_jennifer.rogers@perkinswill.com.rvt\n\n5/28/2015 3:43:11 PM\n\nABBREVIATIONS LEGEND MATERIALS SYMBOLS LEGEND GENERAL PROJECT NOTES\n\nNOTE 1: ABBREVIATIONS WHEN USED IN COMPOSITION MAY INCLUDE PERIODS FOR CLARIFICATION\nNOTE 2: ABBREVIATIONS MAY BE DIFFERENT WHEN A PART OF A LEGEND\n\nEMER SHR\nENGR\n\nEO\n

#### Initial Chunking (Specs) and Search for Key Terms

In [9]:
import re

def chunk_spec_by_section(text_data):
    """Group consecutive pages of specification text into chunks.

    Pages are accumulated in order and joined with newlines. A chunk is
    closed right after any page that contains a line starting with three
    space-separated two-digit groups (for example ``01 10 00``); that page is
    the last page of its chunk. Pages left over after the final such page
    form one trailing chunk.

    NOTE(review): because the matching page *ends* a chunk, the page on which
    a section number first appears is grouped with the pages before it.
    Confirm this is the intended boundary.

    Args:
        text_data: Iterable of dicts that each have a ``"text"`` key holding
            one page of text.

    Returns:
        A list of chunk strings with surrounding whitespace stripped. A
        whitespace-only trailing remainder is dropped rather than returned.
    """
    # Compile once instead of re-resolving the pattern for every page.
    section_number = re.compile(r"^\d{2} \d{2} \d{2}", re.MULTILINE)

    chunks = []
    pending_pages = []  # pages collected since the last chunk was closed
    for page_data in text_data:
        page_text = page_data["text"]
        pending_pages.append(page_text)
        if section_number.search(page_text):
            chunks.append("\n".join(pending_pages).strip())
            pending_pages = []

    # Treat the remainder like every other chunk: strip it, and skip it
    # entirely when nothing but whitespace is left.
    tail = "\n".join(pending_pages).strip()
    if tail:
        chunks.append(tail)
    return chunks

# Split the extracted specification pages into chunks.
spec_chunks = chunk_spec_by_section(spec_text)

# Preview the first 2000 characters of the first chunk.
first_chunk_preview = spec_chunks[0][:2000]
print(first_chunk_preview)

FOULGER-PRATT HEADQUARTERS 
PROJECT MANUAL 
 
 
12435 PARK POTOMAC AVENUE 
POTOMAC, MD 20852 
 
 
ISSUED FOR PERMIT AND BID 
May 28, 2015 
 
 
 
 
 
 
 
 
 
 
 
 

FOULGER-PRATT HEADQUARTERS 
PERKINS+WILL 
 
860424 
ISSUE FOR PERMIT AND BID 
05/28/15 
 
 
 
Division 01 - General Requirements 
 
01 10 00 
Summary 
01 21 00 
Allowances 
01 22 00 
Unit Prices 
01 23 00 
Alternates 
01 25 00 
Substitution Procedures 
01 26 00 
Contract Modification Procedures 
01 29 00 
Payment Procedures 
01 31 00 
Project Management and Coordination 
01 32 00 
Construction Progress Documentation 
01 32 33 
Photographic Documentation 
01 33 00 
Submittal Procedures 
01 40 00 
Quality Requirements 
01 42 00 
References 
01 50 00 
Temporary Facilities and Controls 
01 60 00 
Product Requirements 
01 73 00 
Execution 
01 74 19 
Construction Waste Management and Disposal 
01 77 00 
Closeout Procedures 
01 78 23 
Operation and Maintenance Data 
01 78 39 
Project Record Documents 
01 79 00 
Demonstration and Tr

#### Create Structured Data

In [10]:
import json

# Combine both sources into one JSON-serialisable dict:
#   spec_sections - one record per spec chunk, numbered from 1 in list order
#   drawings      - one record per OCR'd drawing page
data_structure = {
    "spec_sections": [
        {"section_id": section_id, "raw_text": chunk_text}
        for section_id, chunk_text in enumerate(spec_chunks, start=1)
    ],
    "drawings": [
        {"page": drawing["page"], "ocr_text": drawing["text"]}
        for drawing in drawing_ocr_data
    ],
}

# Persist the combined structure to disk.
with open("structured_output.json", "w") as out_file:
    json.dump(data_structure, out_file, indent=2)


In [11]:
import pandas as pd

# Simple matching for demonstration: a keyword "hits" a spec section or a
# drawing page when it occurs in that text as a case-insensitive substring.
keywords = ["fire-rated door", "Hangers and Supports", "sprinkler", "concrete slab"]

# Lower-case every text once up front; doing it inside the keyword loop would
# re-lowercase the full text of every section for every keyword.
spec_texts_lower = [
    (sec["section_id"], sec["raw_text"].lower())
    for sec in data_structure["spec_sections"]
]
drawing_texts_lower = [
    (dwg["page"], dwg["ocr_text"].lower())
    for dwg in data_structure["drawings"]
]

cross_refs = []

for kw in keywords:
    needle = kw.lower()
    spec_ids = [section_id for section_id, text in spec_texts_lower if needle in text]
    drawing_pages = [page for page, text in drawing_texts_lower if needle in text]

    # Keywords that match nothing are left out of the result.
    if spec_ids or drawing_pages:
        cross_refs.append({
            "keyword": kw,
            "spec_sections": spec_ids,
            "drawing_pages": drawing_pages
        })

# Show matched results
pd.DataFrame(cross_refs)


Unnamed: 0,keyword,spec_sections,drawing_pages
0,fire-rated door,"[171, 173, 174, 192]",[]
1,Hangers and Supports,"[3, 4, 201]",[]
2,sprinkler,"[83, 85, 88, 91, 94, 96, 99, 102, 108, 201]","[2, 3]"
3,concrete slab,"[185, 189]","[2, 3]"


In [12]:
# Inspect only the first few page records; echoing the whole list would write
# the text of every page into the notebook output.
spec_text[:3]

[{'page': 1,
  'text': ' \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n                                                                                             \n \nFOULGER-PRATT HEADQUARTERS \nPROJECT MANUAL \n \n \n12435 PARK POTOMAC AVENUE \nPOTOMAC, MD 20852 \n \n \nISSUED FOR PERMIT AND BID \nMay 28, 2015 \n \n \n \n \n \n \n \n \n \n \n \n \n'},
 {'page': 2,
  'text': 'FOULGER-PRATT HEADQUARTERS \nPERKINS+WILL \n \n860424 \nISSUE FOR PERMIT AND BID \n05/28/15 \n \n \n \nDivision 01 - General Requirements \n \n01 10 00 \nSummary \n01 21 00 \nAllowances \n01 22 00 \nUnit Prices \n01 23 00 \nAlternates \n01 25 00 \nSubstitution Procedures \n01 26 00 \nContract Modification Procedures \n01 29 00 \nPayment Procedures \n01 31 00 \nProject Management and Coordination \n01 32 00 \nConstruction Progress Documentation \n01 32 33 \nPhotographic Documentation \n01 33 00 \nSubmittal Procedures \n01 40 00 \nQuality

In [13]:
# Save the per-page specification text as pretty-printed JSON.
specs_json = json.dumps(spec_text, indent=2)
with open("structured_specs.json", "w") as out_file:
    out_file.write(specs_json)