In [223]:
import cv2
import easyocr
import fuzzywuzzy.fuzz
import numpy as np
import pandas as pd
import torch
import os
import urllib.request

from fuzzywuzzy import process, fuzz
from pdf2image import convert_from_path
from PIL import Image

from webscraping import extract_all_pdfs
from text_extraction import process_image, display_opencv_image

In [98]:
print(f"CUDA Available: {torch.cuda.is_available()}")

# Check if CuDNN is enabled in PyTorch
print(f"CuDNN Enabled: {torch.backends.cudnn.enabled}")

# Check CuDNN version in PyTorch
print(f"CuDNN Version: {torch.backends.cudnn.version()}")

CUDA Available: True
CuDNN Enabled: True
CuDNN Version: 90100


# Link and File Paths

In [99]:
ODCY_LINK = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d"

REL_PATH = "https://childcaresearch.ohio.gov/"

In [352]:
PROGRAM_DETAILS = "Program Details"
LICENSE_CAPACITY = "License Capacity and Enrollment at the Time of Inspection"
RATIO_OBSERVED = "Staff-Child Ratios at the Time of Inspection"
SERIOUS_NC = "Serious Risk Non-Compliances"
MODERATE_NC = "Moderate Risk Non-Compliances"
LOW_NC = "Low Risk Non-Compliances"
IN_COMPLIANCE = "Rules in Compliance/Not Verified"

# Helper Functions

In [213]:
import logging

"""
For logging and scalability, I will need to do the following:
1. Create a parallelizable logger for each process
2. Have exceptions in each process be logged to the logger
    - Record the exception, the link/care center name, and the section being processed
    
Something like this:
logger.info(
            f"Processed {section_name} on page {page_id}\n"
            f"Rows processed: {len(rows)}\n"
            f"Link: {link}"
        )
"""
def configure_logger(log_file="process.log"):
    """Configure and return a logger."""
    # TODO: adapt for parallel processing
    
    logger = logging.getLogger(__name__)  # Use the module name
    logger.setLevel(logging.INFO)  # Set the logging level

    # Add a file handler
    file_handler = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

In [203]:
def group_into_rows(extracted_text, threshold=5):
    """
    Group extracted_text entries into rows based on their y-coordinates.
    
    :param extracted_text: A list of tuples of the form:
                          [ ( (x, y, w, h), [ (text, conf), ... ] ), ... ] 
                          assumed to be sorted top-to-bottom, left-to-right.
    :param threshold:    The distance in pixels to decide when to start a new row.
    :return:               A list (rows) of lists, each inner list is one row, 
                           containing the sub-rectangle data.
    """
    rows = []
    current_row = []
    if not extracted_text:
        return rows

    # start the first row’s baseline from the very first rectangle's y
    _, first_data = extracted_text[0]
    current_row_y = extracted_text[0][0][1]
    current_row_h = extracted_text[0][0][3]

    for ((x, y, w, h), text_data) in extracted_text:

        if abs(y - current_row_y) > threshold or abs(h - current_row_h) > threshold:
            # push the old row into rows
            rows.append(current_row)
            # start a new row
            current_row = []
            current_row_y = y
            current_row_h = h


        # add current bounding box/data to the current row
        current_row.append(((x, y, w, h), text_data))

    if current_row:
        rows.append(current_row)

    # sort each row by x-coordinate and remove empty rows
    rows = [sorted(r, key=lambda x: x[0][0]) for r in rows if not (len(r) == 1 and len(r[0][1]) == 0)]
    
    # remove rectangles coords from the rows
    rows = [[(data[1]) for data in row] for row in rows]

    return rows

In [350]:
def find_field_in_rows(
    rows, 
    field, 
    end_field=None, 
    start_idx=0, 
    thresh=95, 
    check_all_columns=False
):
    """
    Search through `rows` for `field`.
      - If `check_all_columns` = False, only the first column is checked.
      - If `check_all_columns` = True, all columns in each row are checked.
    Stops early if `end_field` is encountered in a row/column (using the same fuzzy logic).
    
    Returns:
      The row index where `field` is found, or -1 if not found.
    """
    
    i = start_idx
    before_end_field = True
    while i < len(rows) and before_end_field:
        row = rows[i]  

        # decide if we check just the first column or all columns
        columns_to_check = row if check_all_columns else row[:1]
        
        for j, col in enumerate(columns_to_check):
            text_in_col = col[0][0]  # Adjust to your data structure

            # check if we should stop early
            before_end_field = (
                    end_field is None or
                    len(columns_to_check) > 1 or    # the end fields should only be in the first column
                    fuzz.partial_ratio(end_field, text_in_col) <= thresh
            )
            
            # check if the field is in the column 
            if before_end_field and fuzz.partial_ratio(field, text_in_col) > thresh:
                return (i, j) if check_all_columns else i

        i += 1

    return (row_idx, -1) if check_all_columns else -1

In [215]:
def process_ocr_to_dataframe(ocr_results, fields):
    """
    Process OCR results and flatten the extracted information into a CSV-ready format.

    Parameters:
    ocr_results (dict): Dictionary of OCR results containing bounding boxes and text.

    Returns:
    pd.DataFrame: A dataframe containing flattened, structured data for CSV storage.
    """
    extracted_data = {field: None for field in fields}

    for field in ocr_results:
        if len(field[1]) > 1:

            label, content = field[1][0][0], field[1][1][0]
            confidence = field[1][0][1]

            best_match, score = process.extractOne(label, extracted_data.keys())
            if score > 95:
                if extracted_data[best_match] is not None and extracted_data[best_match] != content:
                    print(f"Field '{best_match}' field changed from '{extracted_data[best_match]}' to '{content}'")

                extracted_data[best_match] = content, confidence

    general_df = pd.DataFrame([extracted_data])

    # general_df = general_df[sorted(general_df.columns)]

    return general_df

In [331]:
def process_program_details(rows, thresh=95):
    """
    Process the OCR results for the first page (or first few pages) of the PDF.
    
    Output a dataframe containing the partial results and the index of the last row processed.
    
    Includes:
    - Program Details
    - Inspection Information
    - Summary of Findings

    :param rows the OCR results grouped by rows
    :param thresh the threshold for fuzzy matching
    :return: partial_df, last_row_idx
    """

    p1_fields = [
        "Program Number",
        "Program Type",
        "County",
        "Building Approval Date",
        "Use Group/Code",
        "Occupancy Limit",
        "Maximum Under 2",   # under 2 1/2 but idk how this will be read...
        "Fire Inspection Approval Date",
        "Food Service Risk Level",
        "Inspection Type",
        "Inspection Scope",
        "Inspection Notice",
        "Inspection Date",
        "Begin Time",
        "End Time",
        "Reviewer",
        "No. Rules Verified",
        "No. Rules with Non-compliances",
        "No. Serious Risk",
        "No. Moderate Risk",
        "No. Low Risk",
    ]

    extracted_data = {field: None for field in p1_fields}

    row_idx = 0

    # TODO: refactor to handle mutiple Reviewers, for example
    for field_name in p1_fields:
        row_idx, col_idx = find_field_in_rows(
            rows=rows,
            field=field_name,
            end_field=LICENSE_CAPACITY,
            start_idx=row_idx,
            thresh=thresh,
            check_all_columns=True # check all columns in each row, return the column index
        )

        if col_idx == -1 or len(rows[row_idx][col_idx]) <= 1:
            if col_idx == -1:
                print(f"Field '{field_name}' not found in rows.")
            extracted_data[field_name] = None
        else:
            extracted_data[field_name] = rows[row_idx][col_idx][1]

    p1_df = pd.DataFrame([extracted_data])

    return p1_df, row_idx + 1

In [347]:
def process_license_table(rows):
    
    table_rows = [
        "Infant",
        "Young Toddler",
        "Total Under 2",
        "Older Toddler",
        "Preschool",
        "School",
        "Total Capacity/Enrollment"]

    columns = [
        "Full Time",
        "Part Time",
        "Total"
    ]

    table =  {"License Capacity": {}}

    row_idx = 0

    for i, t_row in enumerate(table_rows):

        prev = row_idx
        # row_idx = get_row(rows, t_row, start_idx=row_idx, end_field=RATIO_OBSERVED)
        row_idx = find_field_in_rows(
            rows, 
            t_row, 
            end_field=RATIO_OBSERVED, 
            start_idx=row_idx, 
        )
        
        if row_idx == -1:
            row_idx = prev
            continue

        current_row = rows[row_idx]

        # save license capacity totals
        if len(current_row) > len(columns) + 1:
            table["License Capacity"][t_row] = current_row[1][0]
            current_row = current_row[2:]
        else:
            current_row = current_row[1:]

        table[t_row] = {columns[i]: field[0] for i, field in enumerate(current_row)}

    df = pd.DataFrame({center_df.index[0]: table}).T
    table_df = pd.concat([pd.json_normalize(df[col]).add_prefix(f"{col} ") for col in df.columns], axis=1)

    return table_df, row_idx + 1

In [296]:
def process_ratio_table(extracted_text):
    rows = group_into_rows(extracted_text)
    # rows = extracted_text

    table_rows = [
        "Infant/Toddler",
        "Preschool"
    ]

    table =  {"License Capacity": {}}

    row_idx = 0

    for i, t_row in enumerate(table_rows):

        prev = row_idx
        row_idx = get_row(rows, t_row, row_idx)

        if row_idx == prev:
            continue
            
        current_row = rows[row_idx][1]
        # print(current_row)

        table[t_row] = {"Ratio Observed": current_row[2]}

    df = pd.DataFrame({center_df.index[0]: table}).T
    table_df = pd.concat([pd.json_normalize(df[col]).add_prefix(f"{col} ") for col in df.columns], axis=1)

    return table_df


In [288]:
def 

# Extract PDF Links

In [103]:
test_link = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d&p=1"

In [104]:
pdf_link_path = "pdf_links.csv"

if os.path.exists(pdf_link_path):
    pdf_links = pd.read_csv(pdf_link_path, index_col=0)
else:
    pdf_links = extract_all_pdfs(test_link, REL_PATH)
    pdf_links.to_csv(pdf_link_path, index=True)
pdf_links

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A BRIGHT START 4 KIDZ LEARNING CTR,https://childcaresearch.ohio.gov//pdf/00224002...,8211 PLATT,CLEVELAND,44104
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239
A CHILD'S GARDEN,https://childcaresearch.ohio.gov//pdf/00000020...,5427 JULMAR DRIVE,CINCINNATI,45238
A CHILD'S JOURNEY LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00217001...,846 S. YEARLING RD,WHITEHALL,43213
A CHILD'S PLACE LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00000040...,2010 OFFICEVIEW PLACE,REYNOLDSBURG,43068
A GREAT START PRESCHOOL INC,https://childcaresearch.ohio.gov//pdf/00000020...,7001 FAR HILLS AVE,DAYTON,45459
A JOYFUL JOURNEY ACADEMY,https://childcaresearch.ohio.gov//pdf/00222002...,1536 BARNETT ROAD,COLUMBUS,43227
A JUBILEE ACADEMY,https://childcaresearch.ohio.gov//pdf/00000030...,15751 LAKESHORE BLVD,CLEVELAND,44110
A KIDS ONLY EARLY LEARNING CENTER INC. 4,https://childcaresearch.ohio.gov//pdf/00219001...,2505 SOUTH RIDGE EAST,ASHTABULA,44004
A KIDS ONLY EARLY LEARNING CT INC,https://childcaresearch.ohio.gov//pdf/00000030...,2621 STATE ROAD,ASHTABULA,44004


In [105]:
center_df = pd.DataFrame(pdf_links.iloc[1]).T
center_df.index.name = "program_name"
center_df

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239


In [106]:
processed_dfs = [center_df]

In [107]:
local_file, _ = urllib.request.urlretrieve(center_df['pdf'].iloc[0])
local_file

'C:\\Users\\WILLBL~1\\AppData\\Local\\Temp\\tmpq051rr9u'

# Extract Text from PDF

In [108]:
images = convert_from_path(local_file, dpi=300)

In [109]:
image = images[0]

In [110]:
len(images)

12

In [111]:
ocr = easyocr.Reader(['en'], gpu=True)

In [112]:
ocr_kwargs = {
    "width_ths": 1,
    "batch_size": 25,
}

DISPLAY = False

In [113]:
extracted_text = process_image(image, ocr, verbose=True, display=DISPLAY, ocr_kwargs=ocr_kwargs)
extracted_text

Detected 72 hierarchical sub-rectangles.
No text detected in sub-rectangle 6.
No text detected in sub-rectangle 39.
Extracted 70 / 72 text fields with an average confidence of 0.89.


[((152, 757, 2248, 60), [('Program Details', np.float64(1.0))]),
 ((152, 819, 858, 169),
  [('Program Name', np.float64(0.95)),
   ('BRIGHTER START CHILDCARE', np.float64(0.7)),
   ('A', np.float64(0.84))]),
 ((1012, 819, 805, 169),
  [('Program Number', np.float64(1.0)), ('000000200979', np.float64(1.0))]),
 ((1818, 819, 582, 169),
  [('Program Type', np.float64(0.73)),
   ('Child Care Center', np.float64(1.0))]),
 ((152, 989, 1665, 225),
  [('Address', np.float64(1.0)),
   ('2765 BLUE ROCK RD. CINCINNATI', np.float64(0.96)),
   ('OH', np.float64(0.99)),
   ('45239', np.float64(1.0))]),
 ((1818, 989, 582, 225),
  [('County', np.float64(1.0)), ('HAMILTON', np.float64(0.97))]),
 ((152, 1215, 2248, 56), []),
 ((152, 1273, 858, 112), [('Building Approval Date', np.float64(0.86))]),
 ((1012, 1273, 385, 112), [('Use Group/Code', np.float64(1.0))]),
 ((1399, 1273, 499, 112), [('Occupancy Limit', np.float64(0.98))]),
 ((1899, 1273, 501, 112), [('Maximum Under 2 Y', np.float64(0.62))]),
 ((152

In [313]:
rows = group_into_rows(extracted_text)
rows

[[[('Program Details', np.float64(1.0))]],
 [[('Program Name', np.float64(0.95)),
   ('BRIGHTER START CHILDCARE', np.float64(0.7)),
   ('A', np.float64(0.84))],
  [('Program Number', np.float64(1.0)), ('000000200979', np.float64(1.0))],
  [('Program Type', np.float64(0.73)),
   ('Child Care Center', np.float64(1.0))]],
 [[('Address', np.float64(1.0)),
   ('2765 BLUE ROCK RD. CINCINNATI', np.float64(0.96)),
   ('OH', np.float64(0.99)),
   ('45239', np.float64(1.0))],
  [('County', np.float64(1.0)), ('HAMILTON', np.float64(0.97))]],
 [[('Building Approval Date', np.float64(0.86))],
  [('Use Group/Code', np.float64(1.0))],
  [('Occupancy Limit', np.float64(0.98))],
  [('Maximum Under 2 Y', np.float64(0.62))]],
 [[('Fire Inspection Approval Date', np.float64(1.0)),
   ('09/20/2024', np.float64(0.76))],
  [('Food Service Risk Level', np.float64(0.86)),
   ('Level II', np.float64(0.61))]],
 [[('Inspection Information', np.float64(1.0))]],
 [[('Inspection Type', np.float64(1.0)), ('Annual', n

In [332]:
p1_df, row_idx = process_program_details(rows)
p1_df

Unnamed: 0,Program Number,Program Type,County,Building Approval Date,Use Group/Code,Occupancy Limit,Maximum Under 2,Fire Inspection Approval Date,Food Service Risk Level,Inspection Type,...,Inspection Notice,Inspection Date,Begin Time,End Time,Reviewer,No. Rules Verified,No. Rules with Non-compliances,No. Serious Risk,No. Moderate Risk,No. Low Risk
0,"(000000200979, 1.0)","(Child Care Center, 1.0)","(HAMILTON, 0.97)",,,,,"(09/20/2024, 0.76)","(Level II, 0.61)","(Annual, 1.0)",...,"(Unannounced, 1.0)","(10/02/2024, 0.87)","(9:00 AM, 0.95)","(11.30 AM, 0.84)","(Kristin Blassingame, 1.0)","(58, 1.0)","(10, 1.0)",,"(2, 1.0)","(9, 0.91)"


In [333]:
rows[row_idx]

[[('License Capacity and Enrollment at the Time of Inspection',
   np.float64(0.85))]]

In [None]:
rows = rows[row_idx:]

In [351]:
license_df, row_idx = process_license_table(rows)
license_df

Unnamed: 0,License Capacity Total Under 2,License Capacity Total Capacity/Enrollment,Infant Full Time,Infant Part Time,Infant Total,Young Toddler Full Time,Young Toddler Part Time,Young Toddler Total,Total Under 2 Full Time,Total Under 2 Part Time,...,Older Toddler Total,Preschool Full Time,Preschool Part Time,Preschool Total,School Full Time,School Part Time,School Total,Total Capacity/Enrollment Full Time,Total Capacity/Enrollment Part Time,Total Capacity/Enrollment Total
0,"(21, 1.0)","(44, 1.0)","(11, 1.0)","(0, 1.0)","(11, 1.0)","(0, 1.0)","(0, 1.0)","(0, 0.38)","(11, 1.0)","(0, 1.0)",...,"(0, 0.75)","(21, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(0, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(32, 1.0)"


In [304]:
rows[row_idx]

[[('Staff-Child Ratios at the Time of Inspection', np.float64(0.75))]]

In [None]:
page_2 = process_image(images[1], ocr, display=DISPLAY)

In [204]:
group_into_rows(page_2)

[[[('Infant/ Toddler', np.float64(0.81))],
  [('0to', np.float64(0.56)), ('12 months', np.float64(0.92))],
  [('1to 5', np.float64(0.59))],
  []],
 [[('Infant/ Toddler', np.float64(0.56))],
  [('0to', np.float64(0.77)), ('12 months', np.float64(0.82))],
  [('1to 4', np.float64(0.99))],
  []],
 [[('preschool', np.float64(1.0))],
  [('3 years to < 4 years', np.float64(0.95))],
  [('1t0 8', np.float64(0.61))],
  [("Preschool and 4's", np.float64(0.95)),
   ('were combined at', np.float64(0.72)),
   ('the time ratio was', np.float64(0.92)),
   ('taken.', np.float64(0.88))]],
 [[('preschool', np.float64(1.0))],
  [('3 years to < 4 years', np.float64(0.92))],
  [('1to 8', np.float64(0.99))],
  [("Preschool and 4's", np.float64(0.67)),
   ('were combined at', np.float64(0.75)),
   ('the time ratio was', np.float64(0.98)),
   ('taken.', np.float64(0.94))]],
 [[('Summary of Non-Complances', np.float64(0.73))]],
 [[('Serious Risk Non-Compliances', np.float64(0.84)),
   ('No Serious Risk Non-Comp

In [207]:
ratio_df = process_ratio_table(page_2) 
ratio_df

Field: infant/toddler


IndexError: list index out of range

In [356]:
SECTION_METHODS = [
    (PROGRAM_DETAILS, process_program_details),
    (LICENSE_CAPACITY, process_license_table),
]

In [360]:
extracted_text

[((152, 757, 2248, 60), [('Program Details', np.float64(1.0))]),
 ((152, 819, 858, 169),
  [('Program Name', np.float64(0.95)),
   ('BRIGHTER START CHILDCARE', np.float64(0.7)),
   ('A', np.float64(0.84))]),
 ((1012, 819, 805, 169),
  [('Program Number', np.float64(1.0)), ('000000200979', np.float64(1.0))]),
 ((1818, 819, 582, 169),
  [('Program Type', np.float64(0.73)),
   ('Child Care Center', np.float64(1.0))]),
 ((152, 989, 1665, 225),
  [('Address', np.float64(1.0)),
   ('2765 BLUE ROCK RD. CINCINNATI', np.float64(0.96)),
   ('OH', np.float64(0.99)),
   ('45239', np.float64(1.0))]),
 ((1818, 989, 582, 225),
  [('County', np.float64(1.0)), ('HAMILTON', np.float64(0.97))]),
 ((152, 1215, 2248, 56), []),
 ((152, 1273, 858, 112), [('Building Approval Date', np.float64(0.86))]),
 ((1012, 1273, 385, 112), [('Use Group/Code', np.float64(1.0))]),
 ((1399, 1273, 499, 112), [('Occupancy Limit', np.float64(0.98))]),
 ((1899, 1273, 501, 112), [('Maximum Under 2 Y', np.float64(0.62))]),
 ((152

In [368]:
def process_rows(center_df: pd.DataFrame, extracted_texts: list, section_methods: list[tuple[str, callable]]):
    '''
    Process the extracted text from the PDFs and return a DataFrame with the processed data.
    :param center_df: a DataFrame containing the center's information (e.g., name, address, etc.)
    :param extracted_texts: a list of extracted text from the PDFs 
    :param section_methods: a list of tuples containing the section name and the method to process the section
    :return: 
    '''
    rows = []

    # group the extracted text into rows to avoid page breaks 
    for extracted_text in extracted_texts:
        rows += group_into_rows(extracted_text)

    row_idx = 0
    processed_dfs = [center_df.reset_index(drop=False)]
    try:
        for i, (field, method) in enumerate(section_methods):
            print(f"Processing Section: {field}")
            df, row_idx = method(rows)
            processed_dfs.append(df)

            # ensure at end of the section
            row_idx, find_field_in_rows(rows,
                                        field,
                                        start_idx=row_idx,
                                        end_field=None if i == len(section_methods) - 1 else section_methods[i + 1][0]
                                        )
            rows = rows[row_idx:]

        final_df = pd.concat(processed_dfs, axis=1)
        final_df.set_index("program_name", inplace=True)

        return final_df
    except Exception as e:
        print(f"Error processing rows: {e}")
        print(f"Row Index: {row_idx}")
        print(f"Section: {section_methods[i][0]}")
        return None

In [369]:
process_rows(center_df, [extracted_text], SECTION_METHODS)

Processing Section: Program Details
Processing Section: License Capacity and Enrollment at the Time of Inspection


Unnamed: 0_level_0,pdf,Address,City,Zip,Program Number,Program Type,County,Building Approval Date,Use Group/Code,Occupancy Limit,...,Older Toddler Total,Preschool Full Time,Preschool Part Time,Preschool Total,School Full Time,School Part Time,School Total,Total Capacity/Enrollment Full Time,Total Capacity/Enrollment Part Time,Total Capacity/Enrollment Total
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239,"(000000200979, 1.0)","(Child Care Center, 1.0)","(HAMILTON, 0.97)",,,,...,"(0, 0.75)","(21, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(0, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(32, 1.0)"


In [175]:
page_3 = process_image(images[2], ocr, display=DISPLAY)
group_into_rows(page_3)

[[((152, 326, 2247, 2718),
   [('4. Child care staff were using a baby monitor to supervise children;',
     np.float64(0.75)),
    ('5. Child care staff were using a walkie talkie to', np.float64(0.8)),
    ('supervise children.', np.float64(0.82)),
    ('6. Child care staff were using mirrors to view children in another room:',
     np.float64(0.91)),
    ('7. Child care staff were using a video camera instead of physically being present in the room',
     np.float64(0.79)),
    ('8. Other [', np.float64(0.67)),
    ('1.', np.float64(0.91)),
    ('Children must be supervised and within sight and hearing of a child care staff member at all times Provide staff',
     np.float64(0.79)),
    ("training; Submit the program's corrective action plan, which includes a statement that training was provided, to",
     np.float64(0.68)),
    ('the Department to verify compliance with the requirements of this rule:',
     np.float64(0.64)),
    ('Corrective Action Plan Due: 11/01/2024', np.float6

In [123]:
# rows = []
# for i in range(3, len(images)):
#     page = process_image(images[i], ocr, display=True)
#     rows.append(group_into_rows(page))
#    
# rows[-1]

In [124]:
# local_file2, _ = urllib.request.urlretrieve(pdf_links.iloc[0]['pdf'])
# # Extract Text from PDF
# images2 = convert_from_path(local_file2, dpi=300)
# 
# 
# rows2 = []
# for i, image in enumerate(images2):
#     page = process_image(image, ocr, display=True)
#     rows2 += group_into_rows(page)
#     
# rows2

In [125]:
# extract_license_capacity_table(rows2)

# Extract Rules

**Notes**:
- Need separate columns for each rule
- Need separate columns for each rule for each compliance level as well
    - May be able to avoid compliance level columns by checking if the rule is in the non-compliance list
- Rule Data
    - List of the sub-rules that were violated (e.g., 1, 4, 8). Just numbers

In [127]:
final_df

Unnamed: 0_level_0,pdf,Address,City,Zip,Program Number,Program Type,County,Building Approval Date,Use Group/Code,Occupancy Limit,...,Older Toddler Total,Preschool Full Time,Preschool Part Time,Preschool Total,School Full Time,School Part Time,School Total,Total Capacity/Enrollment Full Time,Total Capacity/Enrollment Part Time,Total Capacity/Enrollment Total
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239,"(000000200979, 1.0)","(Child Care Center, 0.73)","(HAMILTON, 1.0)",,,,...,"(0, 0.75)","(21, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(0, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(32, 1.0)"
