In [1]:
import cv2
import easyocr
import numpy as np
import pandas as pd
import torch
import os
import urllib.request

from fuzzywuzzy import process
from pdf2image import convert_from_path
from PIL import Image

from webscraping import extract_all_pdfs
from text_extraction import process_image, display_opencv_image

In [2]:
print(f"CUDA Available: {torch.cuda.is_available()}")

# Check if CuDNN is enabled in PyTorch
print(f"CuDNN Enabled: {torch.backends.cudnn.enabled}")

# Check CuDNN version in PyTorch
print(f"CuDNN Version: {torch.backends.cudnn.version()}")

CUDA Available: True
CuDNN Enabled: True
CuDNN Version: 90100


# Link and File Paths

In [3]:
ODCY_LINK = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d"

REL_PATH = "https://childcaresearch.ohio.gov/"

# Helper Functions

In [4]:
def group_into_rows(extracted_text, threshold=5):
    """
    Group extracted_text entries into rows based on their y-coordinates.
    
    :param extracted_text: A list of tuples of the form:
                          [ ( (x, y, w, h), [ (text, conf), ... ] ), ... ] 
                          assumed to be sorted top-to-bottom, left-to-right.
    :param threshold:    The distance in pixels to decide when to start a new row.
    :return:               A list (rows) of lists, each inner list is one row, 
                           containing the sub-rectangle data.
    """
    rows = []
    current_row = []
    if not extracted_text:
        return rows

    # start the first row’s baseline from the very first rectangle's y
    _, first_data = extracted_text[0]
    current_row_y = extracted_text[0][0][1]
    current_row_h = extracted_text[0][0][3]

    for ((x, y, w, h), text_data) in extracted_text:

        if abs(y - current_row_y) > threshold or abs(h - current_row_h) > threshold:
            # push the old row into rows
            rows.append(current_row)
            # start a new row
            current_row = []
            current_row_y = y
            current_row_h = h


        # add current bounding box/data to the current row
        current_row.append(((x, y, w, h), text_data))

    if current_row:
        rows.append(current_row)

    # sort each row by x-coordinate and remove empty rows
    rows = [sorted(r, key=lambda x: x[0][0]) for r in rows if not (len(r) == 1 and len(r[0][1]) == 0)]

    return rows

In [5]:
def process_ocr_to_dataframe(ocr_results, fields):
    """
    Process OCR results and flatten the extracted information into a CSV-ready format.

    Parameters:
    ocr_results (dict): Dictionary of OCR results containing bounding boxes and text.

    Returns:
    pd.DataFrame: A dataframe containing flattened, structured data for CSV storage.
    """
    extracted_data = {field: None for field in fields}

    for field in ocr_results:
        if len(field[1]) > 1:

            label, content = field[1][0][0], field[1][-1][0]
            confidence = field[1][0][1]

            best_match, score = process.extractOne(label, extracted_data.keys())
            if score > 95:
                if extracted_data[best_match] is not None and extracted_data[best_match] != content:
                    print(f"Field '{best_match}' field changed from '{extracted_data[best_match]}' to '{content}'")

                extracted_data[best_match] = content, confidence

    general_df = pd.DataFrame([extracted_data])

    general_df["Program Name"] = general_df.apply(lambda x: x["Program Name"][0], axis=1)
    general_df.set_index("Program Name", inplace=True)

    # general_df = general_df[sorted(general_df.columns)]

    return general_df

In [18]:
def get_row(rows, field, start_idx=0):
    idx = start_idx
    i = start_idx

    while i < len(rows) and idx == start_idx:
        row = rows[i][0][1][0][0]
        if field in row or row in field:
            idx = i

        # else: 
        # print(f"Field: {field} and Row: {row}")

        i += 1

    if idx == start_idx:
        print(f"Field '{field}' not found in rows.")

    return idx

def extract_license_capacity_table(extracted_text):
    rows = group_into_rows(extracted_text)
    # rows = extracted_text

    table_rows = [
        "Infant",
        "Young Toddler",
        "Total Under 2",
        "Older Toddler",
        "Preschool",
        "School",
        "Total Capacity/Enrollment"]

    columns = [
        "Full Time",
        "Part Time",
        "Total"
    ]

    table = {"License Capacity": {}}

    row_idx = 0

    for i, t_row in enumerate(table_rows):

        prev = row_idx
        row_idx = get_row(rows, t_row, row_idx)

        if row_idx == prev:
            continue

        current_row = rows[row_idx]

        # save license capacity totals
        if len(current_row) > len(columns) + 1:
            table["License Capacity"][t_row] = current_row[1][1]
            current_row = current_row[2:]
        else:
            current_row = current_row[1:]

        table[t_row] = {columns[i]: field[1][0] for i, field in enumerate(current_row)}

    return table

# Extract PDF Links

In [7]:
test_link = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d&p=1"

In [8]:
pdf_link_path = "pdf_links.csv"

if os.path.exists(pdf_link_path):
    pdf_links = pd.read_csv(pdf_link_path, index_col=0)
else:
    pdf_links = extract_all_pdfs(test_link, REL_PATH)
    pdf_links.to_csv(pdf_link_path, index=True)
pdf_links

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A BRIGHT START 4 KIDZ LEARNING CTR,https://childcaresearch.ohio.gov//pdf/00224002...,8211 PLATT,CLEVELAND,44104
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239
A CHILD'S GARDEN,https://childcaresearch.ohio.gov//pdf/00000020...,5427 JULMAR DRIVE,CINCINNATI,45238
A CHILD'S JOURNEY LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00217001...,846 S. YEARLING RD,WHITEHALL,43213
A CHILD'S PLACE LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00000040...,2010 OFFICEVIEW PLACE,REYNOLDSBURG,43068
A GREAT START PRESCHOOL INC,https://childcaresearch.ohio.gov//pdf/00000020...,7001 FAR HILLS AVE,DAYTON,45459
A JOYFUL JOURNEY ACADEMY,https://childcaresearch.ohio.gov//pdf/00222002...,1536 BARNETT ROAD,COLUMBUS,43227
A JUBILEE ACADEMY,https://childcaresearch.ohio.gov//pdf/00000030...,15751 LAKESHORE BLVD,CLEVELAND,44110
A KIDS ONLY EARLY LEARNING CENTER INC. 4,https://childcaresearch.ohio.gov//pdf/00219001...,2505 SOUTH RIDGE EAST,ASHTABULA,44004
A KIDS ONLY EARLY LEARNING CT INC,https://childcaresearch.ohio.gov//pdf/00000030...,2621 STATE ROAD,ASHTABULA,44004


In [9]:
local_file, _ = urllib.request.urlretrieve(pdf_links.iloc[1]['pdf'])
local_file

'C:\\Users\\WILLBL~1\\AppData\\Local\\Temp\\tmp1eecb9pt'

# Extract Text from PDF

In [10]:
images = convert_from_path(local_file, dpi=300)

In [11]:
image = images[0]

In [12]:
len(images)

12

In [13]:
ocr = easyocr.Reader(['en'], gpu=True)

In [14]:
extracted_text = process_image(image, ocr, verbose=True, display=True)
extracted_text

Detected 72 hierarchical sub-rectangles.
No text detected in sub-rectangle 6.
No text detected in sub-rectangle 39.
No text detected in sub-rectangle 53.
Extracted 69 / 72 text fields with an average confidence of 0.89.


[((152, 757, 2247, 59), [('Program Detalls', np.float64(1.0))]),
 ((152, 819, 858, 168),
  [('Program Name', np.float64(1.0)),
   ('BRIGHTER START CHILDCARE', np.float64(0.94)),
   ('4', np.float64(0.39))]),
 ((1012, 819, 805, 168),
  [('Program Number', np.float64(0.94)), ('000000200979', np.float64(0.58))]),
 ((1818, 819, 582, 168),
  [('Program Type', np.float64(1.0)), ('Child Care Center', np.float64(1.0))]),
 ((152, 989, 1664, 225),
  [('Address', np.float64(1.0)),
   ('2765 BLUE ROCK RD. CINCINNATI', np.float64(0.77)),
   ('OH', np.float64(0.98)),
   ('45239', np.float64(1.0))]),
 ((1818, 989, 582, 225),
  [('County', np.float64(1.0)), ('HAMILTON', np.float64(1.0))]),
 ((152, 1216, 2248, 55), []),
 ((152, 1273, 858, 112), [('Building Approval Date', np.float64(0.81))]),
 ((1012, 1273, 385, 112), [('Use Group/Code', np.float64(0.96))]),
 ((1399, 1273, 499, 112), [('Occupancy Limit', np.float64(1.0))]),
 ((1899, 1273, 501, 112), [('Maximum Under 2 Y', np.float64(0.69))]),
 ((152, 1

In [15]:
p1_fields = [ "Program Name",
        "Program Number",
        "Program Type",
        "County",
        "Building Approval Date",
        "Use Group/Code",
        "Occupancy Limit",
        "Maximum Under 2",   # under 2 1/2 but idk how this will be read...
        "Fire Inspection Approval Date",
        "Food Service Risk Level",
        "Inspection Type",
        "Inspection Scope",
        "Inspection Notice",
        "Inspection Date",
        "Begin Time",
        "End Time",
        "Reviewer",
        "No. Rules Verified",
        "No. Rules with Non-compliances",
        "No. Serious Risk",
        "No. Moderate Risk",
        "No. Low Risk"
]

In [16]:
process_ocr_to_dataframe(extracted_text, p1_fields)

Unnamed: 0_level_0,Program Number,Program Type,County,Building Approval Date,Use Group/Code,Occupancy Limit,Maximum Under 2,Fire Inspection Approval Date,Food Service Risk Level,Inspection Type,...,Inspection Notice,Inspection Date,Begin Time,End Time,Reviewer,No. Rules Verified,No. Rules with Non-compliances,No. Serious Risk,No. Moderate Risk,No. Low Risk
Program Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,"(000000200979, 0.94)","(Child Care Center, 1.0)","(HAMILTON, 1.0)",,,,,"(09/20/2024, 0.84)","(Level II, 0.87)","(Annual, 0.99)",...,"(Unannounced, 0.98)","(10/02/2024, 0.74)","(9:00 AM, 1.0)","(11.30 AM, 0.87)","(Kristin Blassingame, 0.76)","(58, 0.91)","(10, 0.74)",,,"(9, 0.77)"


In [19]:
extract_license_capacity_table(extracted_text)

{'License Capacity': {'Total Under 2': [('21', np.float64(1.0))],
  'Total Capacity/Enrollment': [('44', np.float64(0.96))]},
 'Infant': {'Full Time': ('11', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('11', np.float64(1.0))},
 'Young Toddler': {'Full Time': ('0', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('0', np.float64(0.53))},
 'Total Under 2': {'Full Time': ('11', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('11', np.float64(1.0))},
 'Older Toddler': {'Full Time': ('0', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('0', np.float64(0.95))},
 'Preschool': {'Full Time': ('21', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('21', np.float64(1.0))},
 'School': {'Full Time': ('0', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('0', np.float64(0.57))},
 'Total Capacity/Enrollment': {'Full Time': ('21', np.float64(1.0)),
  'Part Time': ('0', np.floa

In [None]:
rows = group_into_rows(extracted_text)
rows[15]

In [None]:
page_2 = process_image(images[1], ocr, display=True)
group_into_rows(page_2)

In [None]:
page_3 = process_image(images[2], ocr, display=True)
group_into_rows(page_3)

In [None]:
# rows = []
# for i in range(3, len(images)):
#     page = process_image(images[i], ocr, display=True)
#     rows.append(group_into_rows(page))
#    
# rows[-1]

In [None]:
local_file2, _ = urllib.request.urlretrieve(pdf_links.iloc[0]['pdf'])
# Extract Text from PDF
images2 = convert_from_path(local_file2, dpi=300)


rows2 = []
for i, image in enumerate(images2):
    page = process_image(image, ocr, display=True)
    rows2 += group_into_rows(page)
    
rows2

In [None]:
extract_license_capacity_table(rows2)