In [1]:
import cv2
import easyocr
import fuzzywuzzy.fuzz
import numpy as np
import pandas as pd
import torch
import os
import urllib.request

from fuzzywuzzy import process, fuzz
from pdf2image import convert_from_path
from collections import defaultdict

from webscraping import extract_all_centers
from pdf_text_extraction import process_image, display_opencv_image

In [2]:
print(f"CUDA Available: {torch.cuda.is_available()}")

# Check if CuDNN is enabled in PyTorch
print(f"CuDNN Enabled: {torch.backends.cudnn.enabled}")

# Check CuDNN version in PyTorch
print(f"CuDNN Version: {torch.backends.cudnn.version()}")

CUDA Available: True
CuDNN Enabled: True
CuDNN Version: 90100


# Link and File Paths

In [4]:
ODCY_LINK = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d"

REL_PATH = "https://childcaresearch.ohio.gov/"

In [5]:
ODCY_LINK

'https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d'

In [107]:
PROGRAM_DETAILS = "Program Details"
LICENSE_CAPACITY = "License Capacity and Enrollment at the Time of Inspection"
RATIO_OBSERVED = "Staff-Child Ratios at the Time of Inspection"
SERIOUS_NC = "Serious Risk Non-Compliances"
MODERATE_NC = "Moderate Risk Non-Compliances"
LOW_NC = "Low Risk Non-Compliances"
IN_COMPLIANCE = "Rules in Compliance/Not Verified"

# Helper Functions

In [108]:
import logging

"""
For logging and scalability, I will need to do the following:
1. Create a parallelizable logger for each process
2. Have exceptions in each process be logged to the logger
    - Record the exception, the link/care center name, and the section being processed
    
Something like this:
logger.info(
            f"Processed {section_name} on page {page_id}\n"
            f"Rows processed: {len(rows)}\n"
            f"Link: {link}"
        )
"""
def configure_logger(log_file="process.log"):
    """Configure and return a logger."""
    # TODO: adapt for parallel processing
    
    logger = logging.getLogger(__name__)  # Use the module name
    logger.setLevel(logging.INFO)  # Set the logging level

    # Add a file handler
    file_handler = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    return logger

In [109]:
def group_into_rows(extracted_text, threshold=5):
    """
    Group extracted_text entries into rows based on their y-coordinates.
    
    :param extracted_text: A list of tuples of the form:
                          [ ( (x, y, w, h), [ (text, conf), ... ] ), ... ] 
                          assumed to be sorted top-to-bottom, left-to-right.
    :param threshold:    The distance in pixels to decide when to start a new row.
    :return:               A list (rows) of lists, each inner list is one row, 
                           containing the sub-rectangle data.
    """
    rows = []
    current_row = []
    if not extracted_text:
        return rows

    # start the first row’s baseline from the very first rectangle's y
    _, first_data = extracted_text[0]
    current_row_y = extracted_text[0][0][1]
    current_row_h = extracted_text[0][0][3]

    for ((x, y, w, h), text_data) in extracted_text:

        if abs(y - current_row_y) > threshold or abs(h - current_row_h) > threshold:
            # push the old row into rows
            rows.append(current_row)
            # start a new row
            current_row = []
            current_row_y = y
            current_row_h = h

        current_row.append(((x, y, w, h), text_data))

    if current_row:
        rows.append(current_row)

    # sort each row by x-coordinate and remove empty rows
    rows = [sorted(r, key=lambda x: x[0][0]) for r in rows if not (len(r) == 1 and len(r[0][1]) == 0)]
    
    # remove rectangles coords from the rows
    rows = [[(data[1]) for data in row] for row in rows]

    return rows

In [151]:
def find_field_in_rows(
    rows, 
    field, 
    end_field=None, 
    start_idx=0, 
    thresh=95, 
    check_all_columns=False
):
    """
    Find a field in a list of rows.
    Params:
        - `rows`: A list of rows, where each row is a list of columns.
        - `field`: The field to search for.
        - `end_field`: The field to stop searching at.
        - `start_idx`: The row index to start searching at.
        - `thresh`: The threshold for fuzzy matching.
        - `check_all_columns`: Whether to check all columns in each row. O.w., only the first column is checked.
    Search through `rows` for `field`.
      - If `check_all_columns` = False, only the first column is checked.
      - If `check_all_columns` = True, all columns in each row are checked.
    Stops early if `end_field` is encountered in a row/column (using the same fuzzy logic).
    
    Returns:
      The row index where `field` is found, or -1 if not found.
    """
    
    row_idx = start_idx
    before_end_field = True
    while row_idx < len(rows) and before_end_field:
        row = rows[row_idx]  

        # decide if we check just the first column or all columns
        columns_to_check = row if check_all_columns else row[:1]
        
        
        for col_idx, col in enumerate(columns_to_check):
            if col:
                text_in_col = col[0][0]  # Adjust to your data structure

                # check if we should stop early
                before_end_field = (
                        end_field is None or
                        len(columns_to_check) > 1 or    # the end fields should only be in the first column
                        fuzz.partial_ratio(end_field, text_in_col) <= thresh
                )
                
                # check if the field is in the column 
                if before_end_field and fuzz.partial_ratio(field, text_in_col) > thresh:
                    return (row_idx, col_idx) if check_all_columns else row_idx

        row_idx += 1

    return (row_idx, -1) if check_all_columns else -1

In [111]:
def process_program_details(rows, thresh=95):
    """
    Process the OCR results for the first page (or first few pages) of the PDF.
    
    Output a dataframe containing the partial results and the index of the last row processed.
    
    Includes:
    - Program Details
    - Inspection Information
    - Summary of Findings

    :param rows the OCR results grouped by rows
    :param thresh the threshold for fuzzy matching
    :return: partial_df, last_row_idx
    """

    p1_fields = [
        "Program Number",
        "Program Type",
        "County",
        "Building Approval Date",
        "Use Group/Code",
        "Occupancy Limit",
        "Maximum Under 2",   # under 2 1/2 but idk how this will be read...
        "Fire Inspection Approval Date",
        "Food Service Risk Level",
        "Inspection Type",
        "Inspection Scope",
        "Inspection Notice",
        "Inspection Date",
        "Begin Time",
        "End Time",
        "Reviewer",
        "No. Rules Verified",
        # "No. Rules with Non-compliance", Just use the non-compliance dataframe instead
        # "No. Serious Risk",
        # "No. Moderate Risk",
        # "No. Low Risk",
    ]

    extracted_data = {field: None for field in p1_fields}

    row_idx = 0

    # TODO: refactor to handle mutiple Reviewers, for example
    for field_name in p1_fields:
        row_idx, col_idx = find_field_in_rows(
            rows=rows,
            field=field_name,
            end_field=LICENSE_CAPACITY,
            start_idx=row_idx,
            thresh=thresh,
            check_all_columns=True # check all columns in each row, return the column index
        )

        if col_idx == -1 or len(rows[row_idx][col_idx]) <= 1:
            if col_idx == -1:
                print(f"Field '{field_name}' not found in rows.")
            extracted_data[field_name] = None
        else:
            extracted_data[field_name] = rows[row_idx][col_idx][-1]

    p1_df = pd.DataFrame([extracted_data])

    return p1_df, row_idx + 1

In [112]:
def process_license_table(rows):
    
    table_rows = [
        "Infant",
        "Young Toddler",
        "Total Under 2",
        "Older Toddler",
        "Preschool",
        "School",
        "Total Capacity/Enrollment"]

    columns = [
        "Full Time",
        "Part Time",
        "Total"
    ]

    table =  {"License Capacity": {}}

    row_idx = 0

    for i, t_row in enumerate(table_rows):

        prev = row_idx
        # row_idx = get_row(rows, t_row, start_idx=row_idx, end_field=RATIO_OBSERVED)
        row_idx = find_field_in_rows(
            rows, 
            t_row, 
            end_field=RATIO_OBSERVED, 
            start_idx=row_idx, 
        )
        
        if row_idx == -1:
            row_idx = prev
            continue

        current_row = rows[row_idx]

        # save license capacity totals
        if len(current_row) > len(columns) + 1:
            table["License Capacity"][t_row] = current_row[1][0]
            current_row = current_row[2:]
        else:
            current_row = current_row[1:]

        table[t_row] = {columns[i]: field[0] for i, field in enumerate(current_row)}

    df = pd.DataFrame({0: table}).T
    table_df = pd.concat([pd.json_normalize(df[col]).add_prefix(f"{col} ") for col in df.columns], axis=1)

    return table_df, row_idx + 1

In [185]:
from collections import defaultdict
def process_ratio_table(rows):

    table_rows = [
        # "Infant/Toddler",
        # "Preschool"
        "0 to < 12 months",
        "12 months to < 18 months",
        "18 months to < 30 months",
        "30 months to < 36 months", 
        "3 years to < 4 years",
        "4 years to < 5 years",
        "School Age to < 11 years"
    ]
    
    row_idx = 0
    table = defaultdict(list)
    
    end_row = find_field_in_rows(
        rows,
        "Summary of Non-Compliances",
        start_idx=row_idx,
    )
   
    if end_row == -1: 
        end_row = 0 
    else:
        rows = rows[:end_row]
    
    flattened = []
    for col in rows:
        for textbox in col:
            if len(textbox) > 0:
                it, conf = textbox[0]
                for item, confidence in textbox[1:]:
                    it += f" {item}"
                    conf += confidence
                conf /= len(textbox)
                flattened.append((it, conf))
                
    rows = flattened 

    for i, r in enumerate(rows):
        
        field = process.extractOne(r[0], table_rows, score_cutoff=85, scorer=fuzz.ratio)
        
        if field is not None and i + 1 < len(rows):
            ratio = rows[i + 1]
            match = re.search(r"(\d+)\s*\w+\s*(\d+)", ratio[0])
            if match:
                ratio = f"{match.group(1)}:{match.group(2)}", ratio[1]
            table[field[0]].append(ratio)
            

    # Convert the nested dictionary to a DataFrame
    df = pd.DataFrame({category: [values] for category, values in table.items()})

    return df, end_row

# ratio_df, r = process_ratio_table(rows)
# ratio_df

In [114]:
def flatten_rows(rows, thresh=95):
    """
    Flatten a nested list of tuples while keeping only unique items.
    
    :param rows: List of nested tuples with text and confidence values
    :param thresh: Threshold for fuzzy matching
    :return: List of tuples with unique items and their highest confidence
    """

    # use a fuzzy set
    seen = []
    flattened = []
    for col in rows:
        for textbox in col:
            for item, confidence in textbox:
                # if process.extractOne(item, seen, score_cutoff=thresh) is None:
                #     seen.append(item)
                flattened.append((item, confidence))

    return flattened


# Extract PDF Links

In [115]:
import re
def extract_numbers_with_letters(input_text):
    """
    Extract numbers and their optional associated letters (e.g., '5 (a &b)' -> ['5a', '5b']).
    Save as strings like '4a', '5', etc.

    Args:
        input_text (str): The text to process.

    Returns:
        list: A list of strings representing numbers and their associated letters.
    """

    # match all numbers and their associated letters
    matches = re.findall(r"(\d+[A-Za-z]?)(?:\s*\(([^()]*?)\))?", input_text)

    findings = []
    for m in matches:
        number = m[0]
        
        letters = re.findall(r"\b\w+\b", m[1])
        
        # only add the letter if it is a single character
        if letters and all(len(l) == 1 for l in letters):
            for letter in letters: 
                findings.append(f"{number}{letter}")
        else:
            findings.append(number)

    return findings
extract_numbers_with_letters("2, 3d, 4, 5 (b &c), 6 (a), 7 (A BABY GOO GOO GAGA)")

['2', '3d', '4', '5b', '5c', '6a', '7']

In [116]:
test_link = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d&p=1"

In [119]:
pdf_link_path = "pdf_links_first_50_pages.csv"
nc_path = "nc_first_50_pages.csv"

if os.path.exists(pdf_link_path):
    pdf_links = pd.read_csv(pdf_link_path, index_col=0)
    nc_df = pd.read_csv(nc_path, index_col=0)
else:
    pdf_links, nc_df = extract_all_centers(test_link, REL_PATH)
    pdf_links.to_csv(pdf_link_path, index=True)
    nc_df.to_csv(nc_path, index=True)
pdf_links

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING ENRICHMENT AFTER-SCHOOL PROGRAM",https://childcaresearch.ohio.gov//pdf/00219002...,2627 PARK AVE,CINCINNATI,45206
1ST CHOICE CHILD CARE,https://childcaresearch.ohio.gov//pdf/00219002...,4303 CLEVELAND AVE,COLUMBUS,43224
1ST FRIENDS LEARNING ACADEMY,https://childcaresearch.ohio.gov//pdf/00000030...,1930 PEARL RD,BRUNSWICK,44212
3MB AFTERSCHOOL,https://childcaresearch.ohio.gov//pdf/00219001...,18316 ST. CLAIR AVENUE,CLEVELAND,44110
3T LEARNING ACADEMY LLC 11,https://childcaresearch.ohio.gov//pdf/00000020...,7523 READING ROAD,CINCINNATI,45237
...,...,...,...,...
EARTH'S PRECIOUS JEWELS LITERACY AND FITNESS ACADEMY,https://childcaresearch.ohio.gov//pdf/00218001...,4600 VINE ST.,CINCINNATI,45217
EAST DAYTON CHRISTIAN SCHOOL PRESCHOOL AND DAYCARE,https://childcaresearch.ohio.gov//pdf/00000020...,999 SPINNING RD,DAYTON,45431
EAST END INDIVIDUAL LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00000030...,2749 WOODHILL RD,CLEVELAND,44104
EAST END UBUNTU LEARNING CENTER-HARVEY RICE,https://childcaresearch.ohio.gov//pdf/00218001...,2730 EAST 116TH STREET,CLEVELAND,44104


In [120]:
nc_df

Unnamed: 0_level_0,rule,occurrence,domain,compliance,findings
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING ENRICHMENT AFTER-SCHOOL PROGRAM",5101:2-12-07 Administrator Qualifications,0,,,['-1']
"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING ENRICHMENT AFTER-SCHOOL PROGRAM",5101:2-12-07 Written Program Policies and Proc...,0,,,"['1', '3', '4', '5', '6', '7', '9', '11', '12'..."
"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING ENRICHMENT AFTER-SCHOOL PROGRAM",5101:2-12-08 Medical Statement,0,,,['1']
"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING ENRICHMENT AFTER-SCHOOL PROGRAM",5101:2-12-09 Background Check Requirements,0,,,"['1', '2']"
"""BECOMING ME"" SOCIAL AND EMOTIONAL LEARNING ENRICHMENT AFTER-SCHOOL PROGRAM",5101:2-12-10 Health Training Requirements,0,,,"['1', '2', '3', '5', '6']"
...,...,...,...,...,...
EAST END YMCA CHILD CARE,5101:2-12-13 Sanitary Equipment and Environment,0,,,['5']
EAST END YMCA CHILD CARE,5101:2-12-15 Medical / Physical Care Plans,0,,,"['1', '7', '13', '18', '19', '30', '33']"
EAST END YMCA CHILD CARE,5101:2-12-16 Written Disaster Plan,0,,,['11']
EAST END YMCA CHILD CARE,"5101:2-12-16 Medical, Dental, and General Em...",0,,,['3']


In [125]:
center_df = pd.DataFrame(pdf_links.iloc[17]).T
center_df.index.name = "program_name"
center_df

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239


In [127]:
center_df['pdf'].iloc[0]

'https://childcaresearch.ohio.gov//pdf/000000200979_2024-10-02_ANNUAL.pdf'

In [128]:
local_file, _ = urllib.request.urlretrieve(center_df['pdf'].iloc[0])
local_file

'C:\\Users\\WILLBL~1\\AppData\\Local\\Temp\\tmpn0pmkqbv'

# Extract Text from PDF

In [129]:
images = convert_from_path(local_file, dpi=300)

In [130]:
len(images)

12

In [131]:
ocr = easyocr.Reader(['en'], gpu=True)

In [132]:
ocr_kwargs = {
    "width_ths": 1,
    "batch_size": 25,
}

DISPLAY = True

In [133]:
extracted_text = process_image(images[1], ocr, verbose=True, display=DISPLAY, ocr_kwargs=ocr_kwargs)
extracted_text

Detected 29 hierarchical sub-rectangles.
No text detected in sub-rectangle 3.
No text detected in sub-rectangle 7.
No text detected in sub-rectangle 20.
No text detected in sub-rectangle 21.
No text detected in sub-rectangle 22.
Extracted 24 / 29 text fields with an average confidence of 0.75.


[((152, 326, 709, 57), [('Infant/ Toddler', np.float64(0.81))]),
 ((862, 326, 711, 57),
  [('12 months', np.float64(0.92)), ('0to', np.float64(0.58))]),
 ((1574, 326, 440, 57), [('1to 5', np.float64(0.59))]),
 ((2015, 326, 384, 57), []),
 ((152, 385, 709, 56), [('Infant/ Toddler', np.float64(0.56))]),
 ((862, 385, 711, 56),
  [('12 months', np.float64(0.82)), ('0to', np.float64(0.79))]),
 ((1574, 385, 440, 56), [('1to 4', np.float64(0.99))]),
 ((2015, 385, 384, 56), []),
 ((152, 443, 709, 224), [('preschool', np.float64(1.0))]),
 ((862, 443, 711, 224), [('3 years to < 4 years', np.float64(0.95))]),
 ((1574, 443, 440, 224), [('1t0 8', np.float64(0.61))]),
 ((2015, 443, 384, 224),
  [("Preschool and 4's", np.float64(0.95)),
   ('were combined at', np.float64(0.72)),
   ('the time ratio was', np.float64(0.92)),
   ('taken.', np.float64(0.88))]),
 ((152, 669, 709, 224), [('preschool', np.float64(1.0))]),
 ((862, 669, 711, 224), [('3 years to < 4 years', np.float64(0.92))]),
 ((1574, 669, 4

In [134]:
rows = group_into_rows(extracted_text)
rows

[[[('Infant/ Toddler', np.float64(0.81))],
  [('12 months', np.float64(0.92)), ('0to', np.float64(0.58))],
  [('1to 5', np.float64(0.59))],
  []],
 [[('Infant/ Toddler', np.float64(0.56))],
  [('12 months', np.float64(0.82)), ('0to', np.float64(0.79))],
  [('1to 4', np.float64(0.99))],
  []],
 [[('preschool', np.float64(1.0))],
  [('3 years to < 4 years', np.float64(0.95))],
  [('1t0 8', np.float64(0.61))],
  [("Preschool and 4's", np.float64(0.95)),
   ('were combined at', np.float64(0.72)),
   ('the time ratio was', np.float64(0.92)),
   ('taken.', np.float64(0.88))]],
 [[('preschool', np.float64(1.0))],
  [('3 years to < 4 years', np.float64(0.92))],
  [('1to 8', np.float64(0.99))],
  [("Preschool and 4's", np.float64(0.7)),
   ('were combined at', np.float64(0.75)),
   ('the time ratio was', np.float64(0.98)),
   ('taken.', np.float64(0.84))]],
 [[('Summary of Non-Complances', np.float64(0.73))]],
 [[('Serious Risk Non-Compliances', np.float64(0.84)),
   ('No Serious Risk Non-Compl

In [135]:
center_rule_df = nc_df[nc_df.index == center_df.index[0]].copy()
center_rule_df.reset_index(inplace=True)
center_rule_df.set_index(["program_name", "rule", "occurrence"], inplace=True)
center_rule_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,domain,compliance,findings
program_name,rule,occurrence,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A BRIGHTER START CHILDCARE,5101:2-12-10 Professional Development Requirements,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Play Equipment,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Space Requirements,0,,,['-1']
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Environment,0,,,['16']
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Equipment,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-15 Child Medical and Enrollment Records,0,,,"['6', '14']"
A BRIGHTER START CHILDCARE,5101:2-12-18 Attendance Records,0,,,['3']
A BRIGHTER START CHILDCARE,5101:2-12-18 Group Size,0,,,['-1']
A BRIGHTER START CHILDCARE,5101:2-12-19 Supervision,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-22 Meal and Snack Requirements,0,,,['1']


In [178]:
def process_rules_in_compliance(rows, rule_df, compliance_level):
    """
    Process the OCR results for the last page of the PDF.
    
    Output a dataframe containing the partial results and the index of the last row processed.
    
    Includes:
    - Rules in Compliance/Not Verified

    :param rows the OCR results grouped by rows
    :param rule_df the dataframe containing the rules for the center along with partial results
    :param compliance_level the compliance level to process
    :return: partial_df, last_row_idx
    """
    
    compliance_dict = {
        "Serious": MODERATE_NC,
        "Moderate": LOW_NC,
        "Low": IN_COMPLIANCE
    } 

    next_section = compliance_dict[compliance_level]
    

    end_idx = find_field_in_rows(
        rows, 
        next_section, 
        start_idx=0, 
        thresh=95,
    )
    
    if end_idx == -1:
        end_idx = len(rows)

    # flatten the rows and remove duplicates 
    rows = flatten_rows(rows[:end_idx])
    
    
    domain = None
    rule = None
    rule_counts = defaultdict(int)
    
    for c, rule, _ in rule_df.index:
        rule_counts[rule] = 0
        center = c
        
    row_idx = 0 
    while row_idx < len(rows):
        row, conf = rows[row_idx] 
        
        if fuzz.partial_ratio("Domain", row) > 95:
            domain = row, conf
        elif fuzz.partial_ratio("Rule", row) > 95:
            # extract the rule
            rule = row[6:]
            
            # find the closest rule in the rule_df
            rule = process.extractOne(rule, rule_counts.keys(), score_cutoff=85)
             
            if rule is not None:
                rule = rule[0]
                
                rule_idx = (center, rule, rule_counts[rule])
                
                if rule_idx in rule_df.index:
                    rule_df.loc[rule_idx, "compliance"] = compliance_level
                    
                    # add the  
                    if domain is not None:
                        rule_df.loc[rule_idx, "domain"] = domain[0]
                
        elif fuzz.partial_ratio("Code", row) > 95:
            code = row
            
            row_idx += 1
            
            # go until the row containss Finding to indicate the end of the rule
            while row_idx < len(rows) and fuzz.partial_ratio("Finding", rows[row_idx]) < 95:
                code += rows[row_idx][0]
                row_idx += 1

            idx = (center, rule, rule_counts[rule])
            if idx in rule_df.index:
                rule_df.loc[idx, "code"] = code[6:]
            
            # update the rule count
            rule_counts[rule] += 1 
            
        row_idx += 1
    
    return None, end_idx
# process_rules_in_compliance(group_into_rows(extract[1:]), center_rule_df, "Moderate")

In [137]:
center_rule_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,domain,compliance,findings
program_name,rule,occurrence,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A BRIGHTER START CHILDCARE,5101:2-12-10 Professional Development Requirements,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Play Equipment,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Space Requirements,0,,,['-1']
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Environment,0,,,['16']
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Equipment,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-15 Child Medical and Enrollment Records,0,,,"['6', '14']"
A BRIGHTER START CHILDCARE,5101:2-12-18 Attendance Records,0,,,['3']
A BRIGHTER START CHILDCARE,5101:2-12-18 Group Size,0,,,['-1']
A BRIGHTER START CHILDCARE,5101:2-12-19 Supervision,0,,,['1']
A BRIGHTER START CHILDCARE,5101:2-12-22 Meal and Snack Requirements,0,,,['1']


In [138]:
extract = [process_image(image, ocr) for image in images]

In [186]:
SECTION_METHODS = [
    (PROGRAM_DETAILS, process_program_details, {}),
    (LICENSE_CAPACITY, process_license_table, {}),
    (RATIO_OBSERVED, process_ratio_table, {}),
    (SERIOUS_NC, process_rules_in_compliance, {"compliance_level": "Serious", "rule_df": center_rule_df}),
    (MODERATE_NC, process_rules_in_compliance, {"compliance_level": "Moderate", "rule_df": center_rule_df}),
    (LOW_NC, process_rules_in_compliance, {"compliance_level": "Low", "rule_df": center_rule_df}),
]
def process_ocr_results(center_df: pd.DataFrame, rule_df: pd.DataFrame, extracted_texts: list, section_methods: list[tuple[str, callable]]):
    '''
    Process the extracted text from the PDFs and return a DataFrame with the processed data.
    :param center_df: a DataFrame containing the center's information (e.g., name, address, etc.)
    :param rule_df: a DataFrame containing the rules for the center along with partial results
    :param extracted_texts: a list of extracted text from the PDFs 
    :param section_methods: a list of tuples containing the section name, the method to process the section, and any additional arguments
    :return: a tuple containing the processed DataFrame and the finding/code dataframe
    '''
    rows = []

    # group the extracted text into rows to avoid page breaks 
    for extracted_text in extracted_texts:
        rows += group_into_rows(extracted_text)

    # use a separate dataframe for the rules
    processed_dfs = [center_df.reset_index(drop=False)]
    
    # try:
    for i, (field, method, kwargs) in enumerate(section_methods):
            
        df, row_idx = method(rows, **kwargs)
        
        if df is not None:
            processed_dfs.append(df)
            
            
        # ensure at end of the section
        row_idx, find_field_in_rows(rows,
                                    field,
                                    start_idx=row_idx,
                                    end_field=None if i == len(section_methods) - 1 else section_methods[i + 1][0]
                                    )
        rows = rows[row_idx:]

    program_df = pd.concat(processed_dfs, axis=1)
    program_df.set_index("program_name", inplace=True)
    
    return program_df, rule_df
    # except Exception as e:
    #     print(f"Error processing rows: {e}")
    #     print(f"Row Index: {row_idx}")
    #     print(f"Section: {section_methods[i][0]}")
    #     return None
program, rules = process_ocr_results(center_df, center_rule_df, extract, SECTION_METHODS)
program

Unnamed: 0_level_0,pdf,Address,City,Zip,Program Number,Program Type,County,Building Approval Date,Use Group/Code,Occupancy Limit,...,Preschool Part Time,Preschool Total,School Full Time,School Part Time,School Total,Total Capacity/Enrollment Full Time,Total Capacity/Enrollment Part Time,Total Capacity/Enrollment Total,0 to < 12 months,3 years to < 4 years
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239,"(000000200979, 1.0)","(Child Care Center, 1.0)","(HAMILTON, 0.97)",,,,...,"(0, 1.0)","(21, 1.0)","(0, 1.0)","(0, 1.0)","(0, 1.0)","(21, 1.0)","(0, 1.0)","(32, 1.0)","[(1:5, 0.59), (1:4, 0.99)]","[(1:8, 0.61), (1:8, 0.99)]"


In [187]:
rules

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,domain,compliance,findings,code
program_name,rule,occurrence,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A BRIGHTER START CHILDCARE,5101:2-12-10 Professional Development Requirements,0,Domain: 08 Staff Files,Low,['1'],The program is required to ensure child care s...
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Play Equipment,0,Domain: 04 Indoor/Outdoor Space,Low,['1'],The program is required to provide equipment t...
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Space Requirements,0,Domain: 04 IndoorIOutdoor Space,Low,['-1'],The program is required to conduct and documen...
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Environment,0,Domain: 02 Safe & Sanitary Environment,Moderate,['16'],The program is required to provide an environm...
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Equipment,0,Domain: 02 Safe & Sanitary Environment,Low,['1'],The program is required to refrain from placin...
A BRIGHTER START CHILDCARE,5101:2-12-15 Child Medical and Enrollment Records,0,Domain: 09 Children's Files,Low,"['6', '14']",The program is required to use the updated JFS...
A BRIGHTER START CHILDCARE,5101:2-12-18 Attendance Records,0,Domain: 01 Ratio & Supervision,Low,['3'],The program is required to have a method for t...
A BRIGHTER START CHILDCARE,5101:2-12-18 Group Size,0,Domain: 01 Ratio & Supervision,Low,['-1'],The program may combine all age groups when th...
A BRIGHTER START CHILDCARE,5101:2-12-19 Supervision,0,Domain: 01 Ratio & Supervision,Moderate,['1'],The program staff is required to supervise chi...
A BRIGHTER START CHILDCARE,5101:2-12-22 Meal and Snack Requirements,0,Domain: 05 Health & Safety,Low,['1'],The program is required to post the current we...


In [141]:
center_rule_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,domain,compliance,findings,code
program_name,rule,occurrence,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
A BRIGHTER START CHILDCARE,5101:2-12-10 Professional Development Requirements,0,Domain: 08 Staff Files,Low,['1'],The program is required to ensure child care s...
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Play Equipment,0,Domain: 04 Indoor/Outdoor Space,Low,['1'],The program is required to provide equipment t...
A BRIGHTER START CHILDCARE,5101:2-12-11 Outdoor Space Requirements,0,Domain: 04 IndoorIOutdoor Space,Low,['-1'],The program is required to conduct and documen...
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Environment,0,Domain: 02 Safe & Sanitary Environment,Moderate,['16'],The program is required to provide an environm...
A BRIGHTER START CHILDCARE,5101:2-12-12 Safe Equipment,0,Domain: 02 Safe & Sanitary Environment,Low,['1'],The program is required to refrain from placin...
A BRIGHTER START CHILDCARE,5101:2-12-15 Child Medical and Enrollment Records,0,Domain: 09 Children's Files,Low,"['6', '14']",The program is required to use the updated JFS...
A BRIGHTER START CHILDCARE,5101:2-12-18 Attendance Records,0,Domain: 01 Ratio & Supervision,Low,['3'],The program is required to have a method for t...
A BRIGHTER START CHILDCARE,5101:2-12-18 Group Size,0,Domain: 01 Ratio & Supervision,Low,['-1'],The program may combine all age groups when th...
A BRIGHTER START CHILDCARE,5101:2-12-19 Supervision,0,Domain: 01 Ratio & Supervision,Moderate,['1'],The program staff is required to supervise chi...
A BRIGHTER START CHILDCARE,5101:2-12-22 Meal and Snack Requirements,0,Domain: 05 Health & Safety,Low,['1'],The program is required to post the current we...


In [142]:
from main import count_compliances
count_compliances(center_df, rules)

Unnamed: 0,program_name,pdf,Address,City,Zip,No. Rules with Non-Compliances,No. Low Risk,No. Moderate Risk
0,A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239,11,9,2


In [143]:
center_df.index[0]

'A BRIGHTER START CHILDCARE'

In [144]:
process_ocr_results(center_df, center_rule_df, [extracted_text], SECTION_METHODS)

Field 'Program Number' not found in rows.
Field 'Program Type' not found in rows.
Field 'County' not found in rows.
Field 'Building Approval Date' not found in rows.
Field 'Use Group/Code' not found in rows.
Field 'Occupancy Limit' not found in rows.
Field 'Maximum Under 2' not found in rows.
Field 'Fire Inspection Approval Date' not found in rows.
Field 'Food Service Risk Level' not found in rows.
Field 'Inspection Type' not found in rows.
Field 'Inspection Scope' not found in rows.
Field 'Inspection Notice' not found in rows.
Field 'Inspection Date' not found in rows.
Field 'Begin Time' not found in rows.
Field 'End Time' not found in rows.
Field 'Reviewer' not found in rows.
Field 'No. Rules Verified' not found in rows.


IndexError: list index out of range

In [None]:
# local_file2, _ = urllib.request.urlretrieve(pdf_links.iloc[0]['pdf'])
# # Extract Text from PDF
# images2 = convert_from_path(local_file2, dpi=300)
# 
# 
# rows2 = []
# for i, image in enumerate(images2):
#     page = process_image(image, ocr, display=True)
#     rows2 += group_into_rows(page)
#     
# rows2

In [None]:
# extract_license_capacity_table(rows2)

# Extract Rules

**Notes**:
- Need separate columns for each rule
- Need separate columns for each rule for each compliance level as well
    - May be able to avoid compliance level columns by checking if the rule is in the non-compliance list
- Rule Data
    - List of the sub-rules that were violated (e.g., 1, 4, 8). Just numbers