In [34]:
!pip install chromadb
!pip install langchain
!pip install pdfplumber
!pip install PyMuPDF
!pip install fitz
!pip install pdf2image
!pip install pytesseract
!pip install pdfreader
!pip install tabula-py



In [35]:
import fitz  

pdf_path = "ACI_318_19.pdf"

# Pages to pull text from, given as 1-based page numbers (17 through 33 inclusive).
symbol_pages = list(range(17, 34))
symbols_text = []

doc = fitz.open(pdf_path)
try:
    for page_num in symbol_pages:
        # The document is indexed from 0, hence the -1 on the 1-based page number.
        text = doc[page_num - 1].get_text("text")
        symbols_text.append(text)
finally:
    # Release the file handle even if a page lookup fails.
    doc.close()

# One newline between consecutive pages.
full_symbols_text = "\n".join(symbols_text)

with open("extracted_symbols.txt", "w", encoding="utf-8") as f:
    f.write(full_symbols_text)


In [36]:
import unicodedata
import re


# Load the text that the previous cell wrote to disk.
with open("extracted_symbols.txt", encoding="utf-8") as f:
    raw_text = f.read()


def clean_text(text):
    """Normalize extracted text into one whitespace-collapsed string.

    Steps, in order:
      1. NFKC-normalize the input.
      2. Replace every character outside U+0020-U+007E and U+00A0-U+FFFF
         (this includes newlines and tabs) with a space.
      3. Replace one specific two-character sequence with "Δ".
      4. Collapse each run of whitespace to a single space and trim both ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; it never contains a newline.
    """
    normalized = unicodedata.normalize("NFKC", text)
    in_range_only = re.sub(r"[^\x20-\x7E\u00A0-\uFFFF]", " ", normalized)
    delta_repaired = in_range_only.replace("̈[", "Δ")
    single_spaced = re.sub(r"\s+", " ", delta_repaired)
    return single_spaced.strip()

# Clean the raw extraction and persist it for the following cells.
cleaned_text = clean_text(raw_text)

with open("cleaned_symbols.txt", mode="w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("Cleaned text saved as cleaned_symbols.txt!")


Cleaned text saved as cleaned_symbols.txt!


In [37]:
import csv
def check_symbol_after_word(text, symbol):
    """Return every word in `text` that is immediately followed by `symbol`.

    Any amount of whitespace (including none) may sit between the word and the
    symbol. `symbol` is matched literally, so regex metacharacters are safe.

    Args:
        text: The string to search.
        symbol: The literal symbol to look for after a word.

    Returns:
        A list of the matched words, in order of appearance (empty if none).
    """
    word_before_symbol = re.compile(r"\b(\w+)\s*" + re.escape(symbol))
    return [hit.group(1) for hit in word_before_symbol.finditer(text)]


symbol = "="

with open("cleaned_symbols.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Gather the matches from every line so the final count covers the whole file,
# not just whichever line happened to be processed last.
all_words = []
for line in text.split("\n"):
    word = check_symbol_after_word(line, symbol)
    if word:
        print(word)
        all_words.extend(word)
print(len(all_words))
# with open("symbols.csv", "w", encoding="utf-8") as f:
#     writer = csv.writer(f)
#     writer.writerows(word)
# print("Symbols saved to symbols.csv!")


['a', 'av', 'Ab', 'Abp', 'Abrg', 'Ac', 'Acf', 'Ach', 'Acp', 'Acs', 'Acv', 'Acw', 'Af', 'Ag', 'Ah', 'Ahs', 'AƐ', 'PLQ', 'An', 'Anz', 'ANc', 'ANco', 'Aoh', 'Apd', 'Aps', 'Apt', 'As', 'Asc', 'Ash', 'Asi', 'Ast', 'At', 'Atp', 'Atr', 'Ats', 'Att', 'Av', 'Avd', 'Avf', 'PLQ', 'AVc', 'AVco', 'A1', 'A2', 'b', 'bc', 'bo', 'bs', 'bsl', 'bt', 'bv', 'bw', 'b1', 'b2', 'Bn', 'Bu', 'cac', 'cƍa1', 'C', 'dburst', 'PLQ', 'ca1', 'ca2', 'cb', 'cc', 'cNa', 'csl', 'ct', 'c1', 'c2', 'CP', 'da', 'dagg', 'db', 'eanc', 'dpile', 'eh', 'eƍN', 'eƍV', 'Ec', 'Ecb', 'Ecs', 'Ep', 'Es', 'fdc', 'fpc', 'fr', 'fs', 'Fnn', 'Fns', 'Fnt', 'Fun', 'Fus', 'Fut', 'h', 'ha', 'sl', 'hsl', 'hu', 'fsi', 'hanc', 'hƍef', 'fsif', 'hw', 'hwcs', 'Ib', 'Icr', 'Ig', 'Is', 'Ise', 'kf', 'Ɛa', 'Ɛbe', 'Ɛcb', 'Ɛd', 'Ɛdc', 'Ɛdb', 'Ɛanc', 'Ɛb', 'Ɛdh', 'Ɛdt', 'Ɛe', 'W', 'Ɛn', 'Ɛo', 'Ɛsc', 'Ɛst', 'Ɛt', 'Ɛtr', 'Ɛu', 'Ɛw', 'Ɛ1', 'Ɛ2', 'Ma', 'Mcr', 'M', 'Msa', 'Msc', 'Mu', 'Mua', 'M1', 'M1ns', 'M1s', 'M2', 'PLQ', 'M2ns', 'M2s', 'n', 'nƐ', 'ns', 'Na', '

In [38]:
import re
import pdfplumber
import pytesseract
from pdf2image import convert_from_path

# Maps each wrongly extracted glyph to the symbol it should be replaced with.
symbol_fixes = {"ɂ": "ψ", "Ȫ": "Ω"}


def fix_math_symbols(text):
    """Return `text` with every key of `symbol_fixes` replaced by its value.

    Replacements are applied one after another in the dictionary's order.
    """
    repaired = text
    for garbled in symbol_fixes:
        repaired = repaired.replace(garbled, symbol_fixes[garbled])
    return repaired


def cidToChar(cidx):
    """Convert the first ``(cid:N)`` token in `cidx` to a character.

    The character is ``chr(N + 29)``.
    NOTE(review): the offset of 29 is presumably specific to the font used in
    the source PDF - confirm before reusing on other documents.

    Args:
        cidx: A string containing at least one ``(cid:N)`` token.

    Returns:
        The single character for the first token found.

    Raises:
        IndexError: If `cidx` contains no ``(cid:N)`` token.
        ValueError, OverflowError: If ``N + 29`` is not a valid code point.
    """
    return chr(int(re.findall(r'\(cid\:(\d+)\)', cidx)[0]) + 29)


def clean_cid_text(text):
    """Replace every ``(cid:N)`` token in `text` with its mapped character.

    A token whose number does not map to a usable character (outside the
    Unicode range, or a lone surrogate that cannot be encoded as UTF-8) is
    left in the text unchanged instead of aborting the whole substitution.

    Args:
        text: The string to clean.

    Returns:
        The string with all decodable ``(cid:N)`` tokens substituted.
    """
    def _decode(match):
        token = match.group()
        try:
            char = cidToChar(token)
        except (ValueError, OverflowError):
            # chr() rejected the code point; keep the raw token visible.
            return token
        if 0xD800 <= ord(char) <= 0xDFFF:
            # Lone surrogates would break the later UTF-8 file write.
            return token
        return char

    return re.sub(r'\(cid:\d+\)', _decode, text)


def extract_text_with_math(pdf_path, symbol_pages):
    """Extract text from the given pages of a PDF, with an OCR fallback.

    For each requested page the layout-preserving text is read with
    pdfplumber. If that yields nothing, or fewer than 20 non-blank characters,
    the page is rendered to an image and read with Tesseract instead. Each
    page's text then has its ``(cid:N)`` tokens and known wrong glyphs fixed.

    Args:
        pdf_path: Path to the PDF file.
        symbol_pages: Iterable of 1-based page numbers to process.

    Returns:
        The processed text of all in-range pages, each followed by a blank
        line ("\\n\\n"). Pages outside 1..page-count are skipped with a
        printed warning.
    """
    page_texts = []

    with pdfplumber.open(pdf_path) as pdf:
        total_pages = len(pdf.pages)

        for page_num in symbol_pages:
            # Reject numbers below 1 as well: `pdf.pages[page_num - 1]` would
            # otherwise wrap around and silently return a page from the end.
            if page_num < 1 or page_num > total_pages:
                print(f"⚠️ Skipping page {page_num}: Out of range")
                continue

            page = pdf.pages[page_num - 1]
            extracted_text = page.extract_text(layout=True)

            # Too little text on the page: fall back to OCR of the rendered page.
            if not extracted_text or len(extracted_text.strip()) < 20:
                images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
                extracted_text = pytesseract.image_to_string(images[0], config="--psm 6")

            cleaned_text = clean_cid_text(extracted_text)
            cleaned_math_text = fix_math_symbols(cleaned_text)
            page_texts.append(cleaned_math_text + "\n\n")

    return "".join(page_texts)



pdf_file_path = "ACI_318_19.pdf"
symbol_pages = list(range(17, 34))  # 1-based page numbers 17 through 33, inclusive
extracted_text = extract_text_with_math(pdf_file_path, symbol_pages)

# Persist the extraction so the parsing cell can read it back from disk.
with open("certain_extraction.txt", mode="w", encoding="utf-8") as f:
    f.write(extracted_text)

print("✅ Extraction complete! Processed only symbol pages.")


✅ Extraction complete! Processed only symbol pages.


In [39]:
import re
import csv

def parse_variable_descriptions(input_file, output_csv):
    """Parse variable descriptions from a text file and save to a CSV file.

    The input is scanned line by line:
      * A line of the form ``<name> = <text>`` starts a new entry; ``<text>``
        becomes the start of its description.
      * The first following indented line is taken as the entry's subscript
        (its leading run of word characters and commas).
      * Every other indented line is appended to the description, separated
        by a single space.
    Lines seen before the first ``<name> =`` line are ignored.

    An entry is emitted as ``[name, description]`` where ``name`` is
    ``<name>_<subscript>`` when a subscript was found, else ``<name>``.

    Args:
        input_file: Path of the UTF-8 text file to parse.
        output_csv: Path of the CSV file to write (header: Variable, Description).

    Returns:
        The list of ``[name, description]`` entries, in file order.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    var_pattern = r"^\s*([\wΨΩɃψΩ]+)\s*="
    subscript_pattern = r"^\s+([\w\d,]+)"

    entries = []
    variable = None
    subscript = None
    description = ""

    def _store_current():
        """Append the entry being built, if it has any content."""
        if variable and (description or subscript):
            var_name = f"{variable}_{subscript}" if subscript else variable
            entries.append([var_name, description.strip()])

    for line in lines:
        base_var_match = re.match(var_pattern, line)

        if base_var_match:
            # A new definition begins: store the previous one first.
            _store_current()
            variable = base_var_match.group(1)
            subscript = None
            desc_start_match = re.search(r"=\s*(.*)", line)
            description = desc_start_match.group(1) if desc_start_match else ""
            continue

        if variable is None:
            # Nothing has been defined yet, so there is nothing to attach to.
            continue

        subscript_match = re.match(subscript_pattern, line)
        if subscript_match and not subscript:
            subscript = subscript_match.group(1).strip()
            continue

        # Any other indented line continues the description. This includes
        # lines that merely look like a subscript once the subscript is already
        # known; those were previously dropped, truncating the description.
        continuation = re.match(r"^\s+(.*)", line)
        if continuation:
            description += " " + continuation.group(1)

    _store_current()

    with open(output_csv, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Variable', 'Description'])
        writer.writerows(entries)

    print(f"✅ Extracted {len(entries)} variable descriptions to {output_csv}")
    return entries


if __name__ == "__main__":
    result = parse_variable_descriptions("certain_extraction.txt", "output.csv")
    # Preview at most the first ten entries, cutting each description at 50 characters.
    preview = result[:10]
    for var, desc in preview:
        print(f"{var}: {desc[:50]}...")



# TODO: Go through the full PDF and extract all math equations. Once all equations are found, use an AI reader or something similar to read all these
# horrible equations.

✅ Extracted 405 variable descriptions to output.csv
a: depth of equivalent rectangular stress block, in....
a_v: shear span, equal to distance from center of conce...
A_b: area of an individual bar or wire, in.2...
A_bp: area of the attachment base plate in contact with...
A_brg: net bearing area of the head of stud, anchor bolt,...
A_c: area of concrete section resisting shear transfer,...
A_cf: greater gross cross-sectional area of the two orth...
A_ch: cross-sectional area of a member mmeeaassuurreedd ...
A_cp: area enclosed by outsidee ppeerriimeter of concret...
A_cs: cross-sectional area at oonnee end ooff aa ssttrru...


In [45]:
import re
import fitz  # PyMuPDF
import os
import csv
from pdfminer.high_level import extract_text
from sympy import sympify, pretty
import pytesseract
from PIL import Image

# CID & Symbol Fixes
# Maps each wrongly extracted glyph to the symbol it should be replaced with.
symbol_fixes = {"ɂ": "ψ", "Ȫ": "Ω"}


def fix_math_symbols(text):
    """Return `text` with every key of `symbol_fixes` replaced by its value.

    Replacements are applied one after another in the dictionary's order.
    """
    repaired = text
    for garbled in symbol_fixes:
        repaired = repaired.replace(garbled, symbol_fixes[garbled])
    return repaired

def cidToChar(cidx):
    """Convert the first ``(cid:N)`` token in `cidx` to a character.

    The character is ``chr(N + 29)``.
    NOTE(review): the offset of 29 is presumably specific to the font used in
    the source PDF - confirm before reusing on other documents.

    Args:
        cidx: A string containing at least one ``(cid:N)`` token.

    Returns:
        The single character for the first token found.

    Raises:
        IndexError: If `cidx` contains no ``(cid:N)`` token.
        ValueError, OverflowError: If ``N + 29`` is not a valid code point.
    """
    return chr(int(re.findall(r'\(cid\:(\d+)\)', cidx)[0]) + 29)

def clean_cid_text(text):
    """Replace every ``(cid:N)`` token in `text` with its mapped character.

    A token whose number does not map to a usable character (outside the
    Unicode range, or a lone surrogate that cannot be encoded as UTF-8) is
    left in the text unchanged instead of aborting the whole substitution.

    Args:
        text: The string to clean.

    Returns:
        The string with all decodable ``(cid:N)`` tokens substituted.
    """
    def _decode(match):
        token = match.group()
        try:
            char = cidToChar(token)
        except (ValueError, OverflowError):
            # chr() rejected the code point; keep the raw token visible.
            return token
        if 0xD800 <= ord(char) <= 0xDFFF:
            # Lone surrogates would break a later UTF-8 file write.
            return token
        return char

    return re.sub(r'\(cid:\d+\)', _decode, text)

# Step 1: Extract Text-Based Equations
def extract_equations(pdf_path):
    """Collect equation-like substrings from the text of a PDF.

    The PDF's text is extracted, its ``(cid:N)`` tokens and known wrong glyphs
    are fixed, and then each regex below is run over the whole text.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A flat list of every match, grouped by pattern in the order the
        patterns are listed. The same text can appear more than once when
        several patterns match it.
    """
    document_text = fix_math_symbols(clean_cid_text(extract_text(pdf_path)))

    equation_patterns = (
        r"\b[A-Za-z0-9]+ *= *[^.\n]+",    # Variables assigned to formulas
        r"\([^)]*\)\s*=",                 # Parentheses with equation
        r"∑.*?=",                          # Summations
        r"∫.*?=",                          # Integrals
        r"lim.*?=",                         # Limits
        r"[\w\+\-\*/\^\(\)]+=[^.\n]+",     # General equations with arithmetic
    )

    return [
        found
        for pattern in equation_patterns
        for found in re.findall(pattern, document_text)
    ]

# Step 2: Format Equations
def format_equations(equations):
    """Pretty-print each equation string that SymPy can parse.

    Args:
        equations: Iterable of equation strings.

    Returns:
        A list of the same length and order. Each item is SymPy's pretty
        rendering of the string, or the original string when parsing or
        rendering raises any exception.

    NOTE(review): SymPy documents that `sympify` evaluates its input with
    `eval`; the strings passed here come from PDF text, so confirm that the
    source documents are trusted.
    """
    def _render(raw_equation):
        try:
            return pretty(sympify(raw_equation))
        except Exception:
            # Best effort: keep the unparsed text rather than losing the item.
            return raw_equation

    return [_render(raw_equation) for raw_equation in equations]


def extract_equation_images(pdf_path, output_folder="equation_images"):
    """Extract every embedded image (possible equations) from a PDF.

    Each image found on each page is written to
    ``<output_folder>/equation_<page>_<index>.png`` with 1-based page and
    index numbers. The folder is created if it does not exist.

    Args:
        pdf_path: Path to the PDF file.
        output_folder: Directory that receives the image files.

    Returns:
        None. The number of images written is printed.
    """
    os.makedirs(output_folder, exist_ok=True)

    image_count = 0
    doc = fitz.open(pdf_path)
    try:
        for page_number in range(len(doc)):
            for img_index, img in enumerate(doc[page_number].get_images(full=True)):
                xref = img[0]  # first field of the image entry is its xref
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]

                # NOTE(review): the bytes are written as extracted, so the data
                # is not necessarily PNG despite the ".png" name - confirm
                # whether the file extension matters to downstream consumers.
                image_filename = f"{output_folder}/equation_{page_number+1}_{img_index+1}.png"
                with open(image_filename, "wb") as img_file:
                    img_file.write(image_bytes)
                image_count += 1
    finally:
        # Release the PDF file handle even if extraction fails part-way.
        doc.close()

    print(f"✅ Extracted {image_count} possible equation images to {output_folder}")

# Step 4: Apply OCR to Extract Text from Images
def ocr_equations(image_folder="equation_images"):
    """Run OCR on extracted equation images with CID and symbol cleaning.

    Args:
        image_folder: Directory containing the image files to read.

    Returns:
        A dict mapping each file name to its recognized text (stripped, with
        ``(cid:N)`` tokens and known wrong glyphs fixed). Files are processed
        in sorted name order so the result order is reproducible.
    """
    extracted_texts = {}

    # os.listdir() order is arbitrary; sort it for deterministic output.
    for image_file in sorted(os.listdir(image_folder)):
        img_path = os.path.join(image_folder, image_file)
        if not os.path.isfile(img_path):
            # Sub-directories cannot be opened as images.
            continue

        # Close the image file handle as soon as OCR is done with it.
        with Image.open(img_path) as image:
            text = pytesseract.image_to_string(image, config="--psm 6")

        cleaned_text = clean_cid_text(text)  # Fix CID in OCR results
        cleaned_text = fix_math_symbols(cleaned_text)  # Fix math symbols

        extracted_texts[image_file] = cleaned_text.strip()

    return extracted_texts

# Step 5: Save Extracted Data to CSV
def save_to_csv(text_equations, ocr_results, output_csv="extracted_equations.csv"):
    """Write text-based and OCR-based equations to one CSV file.

    The file gets a ``Source, Equation`` header, then one row per text
    equation with source ``Text-Based``, then one row per OCR result with the
    image name as its source.

    Args:
        text_equations: Iterable of equation strings.
        ocr_results: Mapping of image name to recognized text.
        output_csv: Path of the CSV file to write.
    """
    with open(output_csv, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Source', 'Equation'])
        writer.writerows(["Text-Based", equation] for equation in text_equations)
        writer.writerows([image_name, recognized] for image_name, recognized in ocr_results.items())

    print(f"📄 Equations saved to {output_csv}")

# Step 6: Run the Full Pipeline
def extract_all_equations(pdf_path):
    """Run the whole equation pipeline on one PDF and print a summary.

    In order: extract and format text-based equations, dump the embedded
    images, OCR those images, save everything to CSV, then print the first
    ten text-based equations and every OCR result.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        A tuple ``(formatted_equations, ocr_results)``.
    """
    print("📄 Extracting text-based equations...")
    formatted_equations = format_equations(extract_equations(pdf_path))

    print("🖼 Extracting image-based equations...")
    extract_equation_images(pdf_path)

    print("🔍 Running OCR on extracted images...")
    ocr_results = ocr_equations()

    print("💾 Saving everything to CSV...")
    save_to_csv(formatted_equations, ocr_results)

    print("\n✅ All Extracted Equations:")
    print("\n📜 Text-Based Equations:")
    preview = formatted_equations[:10]  # Show first 10
    for equation in preview:
        print(equation)

    print("\n🖼 OCR-Based Equations:")
    for image_name in ocr_results:
        print(f"{image_name}: {ocr_results[image_name]}")

    return formatted_equations, ocr_results

# Run the full equation-extraction pipeline on the PDF below when this code is
# executed directly (the returned tuple is not kept).
if __name__ == "__main__":
    pdf_file = "ACI_318_19.pdf"  # Replace with actual file path
    extract_all_equations(pdf_file)


📄 Extracting text-based equations...
🖼 Extracting image-based equations...
✅ Extracted 2655 possible equation images to equation_images
🔍 Running OCR on extracted images...
💾 Saving everything to CSV...
📄 Equations saved to extracted_equations.csv

✅ All Extracted Equations:

📜 Text-Based Equations:
Abrg = net bearing area of the head of stud, anchor bolt, or 
Acw = area of concrete section of an individual pier, hori-
sl = eඒective bearing area of shear lug, in2
Ahs = total cross-sectional area of hooked or headed bars 
min = minimum  area  of  longitudinal  reinforcement  to 
ANa = projected inÝuence area of a single adhesive anchor 
ANao = projected  inÝuence  area  of  a  single  adhesive 
ANc = projected  concrete  failure  area  of  a  single  anchor 
ANco   = projected concrete failure area of a single anchor, 
N = eඒective cross-sectional area of anchor in tension, 

🖼 OCR-Based Equations:
equation_182_9.png: ~~
equation_268_23.png: 
equation_430_13.png: gt +
equation_505_3.png

In [40]:
import pandas as pd
# Read back the CSV written by the parsing cell; it must contain a "Variable" column.
variables_df = pd.read_csv("output.csv")
variable_list = variables_df.loc[:, "Variable"].tolist()
variable_list

['a',
 'a_v',
 'A_b',
 'A_bp',
 'A_brg',
 'A_c',
 'A_cf',
 'A_ch',
 'A_cp',
 'A_cs',
 'A_ct',
 'A_cv',
 'A_cw',
 'A_ef,sl',
 'A_f',
 'A_g',
 'A_h',
 'A_hs',
 'A_j',
 'A_ƭ',
 'A_ƭ,min',
 'A_n',
 'A_nz',
 'A_Na',
 'A_Nao',
 'A_Nc',
 'A_Nco',
 'A_o',
 'A_oh',
 'A_pd',
 'A_ps',
 'A_pt',
 'A_s',
 'Aƪ_s',
 'A_sc',
 'A_se,N',
 'A_se,V',
 'A_sh',
 'A_si',
 'A_s,min',
 'A_st',
 'A_t',
 'A_th',
 'A_tp',
 'A_tr',
 'A_ts',
 'A_parallel',
 'A_v',
 'A_onal',
 'A_vf',
 'A_vh',
 'A_v,min',
 'A_Vc',
 'A_Vco',
 'A_1',
 'A_2',
 'b',
 'b_c',
 'b_f',
 'b_o',
 'b_s',
 'b_sl',
 'b_slab',
 'b_t',
 'b_v',
 'b_w',
 'b_1',
 'b_2',
 'B_n',
 'B_u',
 'c_axis,',
 'c_ac',
 'c_a,max',
 'c_a,min',
 'c_a1',
 'cƪ_a1',
 'c_a2',
 'c_b',
 'c_c',
 'c_Na',
 'c_sl',
 'c_t',
 'c_1',
 'c_2',
 'C',
 'C_m',
 'd_of',
 'dƪ_of',
 'd_a',
 'd_agg',
 'd_b',
 'd_burst',
 'd_p',
 'd',
 'D',
 'D_s',
 'D_structural',
 'e_anc',
 'e_h',
 'eƪ_N',
 'eƪ_V',
 'E_forces',
 'E_c',
 'E_cb',
 'E_cs',
 'EI_eਙ',
 'E_p',
 'E_s',
 'fƪ_c',
 'f_ce',
 'f_d'

In [41]:
import chromadb
import langchain



# Open (or create) an on-disk ChromaDB store under ./chromadb and fetch the
# "design_codes" collection, creating it on first use.
client = chromadb.PersistentClient(path="./chromadb")
collection = client.get_or_create_collection(name="design_codes")

# Store the whole text as a single document.
# NOTE(review): `pdf_text` is not defined in any cell visible in this notebook,
# so this cell fails on a fresh kernel - confirm which cell/source provides it.
# NOTE(review): the id and metadata say "318-11" while the other cells read
# "ACI_318_19.pdf" - confirm which edition this text actually is.
collection.add(
    ids=["aci_318_11"],
    documents=[pdf_text],  
    metadatas=[{"source": "ACI 318-11"}]
)

print("Text stored in ChromaDB!")

KeyboardInterrupt: 