In [3]:
import cv2
import easyocr
import numpy as np
import pandas as pd
import requests
import torch
import os
import re
import urllib.request

from fuzzywuzzy import process
from pdf2image import convert_from_path
from bs4 import BeautifulSoup
from datetime import datetime
from PIL import Image
from io import BytesIO

In [4]:
print(f"CUDA Available: {torch.cuda.is_available()}")

# Check if CuDNN is enabled in PyTorch
print(f"CuDNN Enabled: {torch.backends.cudnn.enabled}")

# Check CuDNN version in PyTorch
print(f"CuDNN Version: {torch.backends.cudnn.version()}")

CUDA Available: False
CuDNN Enabled: True
CuDNN Version: None


# Link and File Paths

In [5]:
ODCY_LINK = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d"

REL_PATH = "https://childcaresearch.ohio.gov/"

# Helper Functions

In [6]:
def extract_html(url):
    """
    Parse the html at the given url into a beautiful soup object, for manipulation.
    
    :param url: Any valid URL
    :return: the parsed HTML or None if the request failed
    """ 
    
    response = requests.get(url)
    
    # check if the request was successful (status code 200)
    if response.status_code == 200:

        # parse the HTML content 
        soup = BeautifulSoup(response.text, 'html.parser')
        
        return soup
    else:
        print(f"Failed to retrieve page, status code: {response.status_code}")
        return None

In [7]:
def extract_inspection(url):
    """
    Extract the inspection link from the childcare center page
    
    :param url: the child care center page
    :return: the inspection page link
    """
    inspection_url = None

    # get the html for the program page
    program_page = extract_html(url)

    if program_page:
        inspection_button_span = program_page.find('span', class_='inspectionsButton')

        if inspection_button_span:
            inspection_link_tag = inspection_button_span.find_parent('a')

            if inspection_link_tag and 'href' in inspection_link_tag.attrs:
                inspection_url = REL_PATH + inspection_link_tag['href']
                
    return inspection_url

In [8]:
def extract_pdf(url) -> str:
    """
    Extract the pdf link from the inspection page
    :param url: inspection url
    :return: the pdf link
    """

    inspection_page = extract_html(url)
    most_recent_pdf_link = None
    most_recent_date = None
    
    if inspection_page is not None:
        rows = inspection_page.find_all('div', class_='resultsListRow')
        
        for row in rows:
            date_column = row.find('div', class_='resultsListColumn')

            pdf_col = row.find('span', class_='inspectionPDFlink') 
            pdf_link_tag = pdf_col.find('a', href=True)
            
            if pdf_link_tag and date_column:
                
                # format into a datetime object for date comparisons
                inspection_date = list(date_column)[2].strip()
                inspection_date = datetime.strptime(inspection_date, "%m/%d/%Y")
                pdf_link = pdf_link_tag['href']
                
                # only save the most recent date (may not be necessary, since all appear to be listed in order. Adds robustness though...)
                if most_recent_date is None or inspection_date > most_recent_date:
                    most_recent_date = inspection_date
                    most_recent_pdf_link = REL_PATH + pdf_link
    
    return most_recent_pdf_link

In [9]:
def extract_all_pdfs(url) -> pd.DataFrame:
    """
    Extract all pdf links and associated center info (e.g., name and address info) into a dataFrame for further parsing.
    
    :param url: The Ohio childcaresearch website URL (https://childcaresearch.ohio.gov/search for licensed childcare)
    :return: a dataframe containing the center name, address info, and link to the pdf for the most recent center licensing inspection
    """

    pdf_urls = []
    main_page = None
    page_num = 1
    
    # loop for all available pages
    while not (pdf_urls and main_page is None):
        
        # get the current page of results
        main_page = extract_html(f"{url}&{page_num}")
        
        if main_page is not None:
            # get all results rows for further processing
            results_list = main_page.find('div', class_='resultsList') 
            rows = results_list.find_all('div', class_='resultsListRow')
            
            for row in rows:
                program_name_column = row.find('div', class_='resultsListColumn programListColumnName')

                program_df = pd.DataFrame()
                if program_name_column:
                    
                    program_link_tag = program_name_column.find('a')
                    
                    if program_link_tag:
                        program_name = program_link_tag.text.strip()
                        program_url = REL_PATH + program_link_tag['href']
                        inspection_url = extract_inspection(program_url)  
                        program_pdf_link = extract_pdf(inspection_url) if inspection_url is not None else None
                        program_df['program_name'] = [program_name]
                        program_df['pdf'] = [program_pdf_link]
                        
                address_columns = row.findAll("div", class_="resultsListColumn")
                if address_columns:
                    program_df['Address'] = [address_columns[1].get_text(strip=True)]
                    program_df['City'] = [address_columns[2].get_text(strip=True)]
                    program_df['Zip'] = [address_columns[3].get_text(strip=True)]
                    
                # save the current row  information
                pdf_urls.append(program_df)

            break
            
        # next page 
        page_num += 1
        
    # combine into a single dataframe
    url_df = pd.concat(pdf_urls, axis=0) 
    
    # return with the program name as the index
    return url_df.set_index("program_name")


In [10]:
def download_pdf(pdf_url) -> BytesIO | None:
    """
    Create a temporary pdf file for data extraction 
    :param pdf_url: pdf to download
    :return: BytesIO object containing the pdf or None if invalid URL
    """

    response = requests.get(pdf_url)
    if response.status_code == 200:
        return BytesIO(response.content)
    else:
        print(f"Failed to download PDF: {response.status_code}")
        return None

# Extract PDF Links

In [11]:
test_link = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d&p=1"

In [12]:
pdf_link_path = "pdf_links.csv"

if os.path.exists(pdf_link_path):
    pdf_links = pd.read_csv(pdf_link_path, index_col=0)
else:
    pdf_links = extract_all_pdfs(test_link)
    pdf_links.to_csv(pdf_link_path, index=True)
pdf_links

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A BRIGHT START 4 KIDZ LEARNING CTR,https://childcaresearch.ohio.gov//pdf/00224002...,8211 PLATT,CLEVELAND,44104
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239
A CHILD'S GARDEN,https://childcaresearch.ohio.gov//pdf/00000020...,5427 JULMAR DRIVE,CINCINNATI,45238
A CHILD'S JOURNEY LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00217001...,846 S. YEARLING RD,WHITEHALL,43213
A CHILD'S PLACE LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00000040...,2010 OFFICEVIEW PLACE,REYNOLDSBURG,43068
A GREAT START PRESCHOOL INC,https://childcaresearch.ohio.gov//pdf/00000020...,7001 FAR HILLS AVE,DAYTON,45459
A JOYFUL JOURNEY ACADEMY,https://childcaresearch.ohio.gov//pdf/00222002...,1536 BARNETT ROAD,COLUMBUS,43227
A JUBILEE ACADEMY,https://childcaresearch.ohio.gov//pdf/00000030...,15751 LAKESHORE BLVD,CLEVELAND,44110
A KIDS ONLY EARLY LEARNING CENTER INC. 4,https://childcaresearch.ohio.gov//pdf/00219001...,2505 SOUTH RIDGE EAST,ASHTABULA,44004
A KIDS ONLY EARLY LEARNING CT INC,https://childcaresearch.ohio.gov//pdf/00000030...,2621 STATE ROAD,ASHTABULA,44004


In [13]:
local_file, _ = urllib.request.urlretrieve(pdf_links.iloc[1]['pdf'])
local_file

'C:\\Users\\wrb20\\AppData\\Local\\Temp\\tmpxcaes5yf'

# Extract Text from PDF

In [14]:
def display_opencv_image(image):
    """
    Convert an OpenCV image (NumPy array) to a PIL image and display it.
    
    :param image: (numpy array) OpenCV image in BGR format.
    """
    # Convert from BGR (OpenCV default) to RGB (PIL format)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Convert NumPy array to PIL Image
    pil_image = Image.fromarray(image_rgb)
    
    # Display the image
    pil_image.show()

In [15]:
def preprocess_image(image, display=False):
    
    if display: 
        display_opencv_image(np.array(image))

    image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    _, thresh = cv2.threshold(blur, 180, 255, cv2.THRESH_BINARY_INV)
    processed = thresh

    if display: 
        display_opencv_image(processed)
    
    return processed

In [16]:
def draw_hierarchical_rectangles(image, output_path=None):
    """
    Detect and draw hierarchical rectangles on a preprocessed binary image.
    
    :param image: (PIL.Image.Image) The preprocessed binary image (PIL format).
    :param output_path: (str) Path to save the annotated image with rectangles.
    """

    # find all rectangles (fields) in the image 
    contours, hierarchy = cv2.findContours(image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    if output_path:
        annotated_image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
        
    sub_rectangles = []

    for i, cnt in enumerate(contours):
        x, y, w, h = cv2.boundingRect(cnt)

        # Parent-child relationship: Sub-rectangles have a parent in the hierarchy
        if hierarchy[0][i][3] != -1:
            # filter by size to only keep document-field rectangles
            if w > 250 and h > 10:  
                sub_rectangles.append((x, y, w, h))
                if output_path:
                    cv2.rectangle(annotated_image, (x, y), (x + w, y + h), (0, 255, 0), 2)

    # convert to PIL image for display
    if output_path:
        annotated_image_pil = Image.fromarray(cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB))
        annotated_image_pil.save(output_path)
        annotated_image_pil.show()

    print(f"Detected {len(sub_rectangles)} hierarchical sub-rectangles.")
    sub_rectangles = sorted(sub_rectangles, key=lambda x: x[0])
    sub_rectangles = sorted(sub_rectangles, key=lambda x: x[1])
    return sub_rectangles

In [17]:
def extract_text_from_subrectangles(ocr, image, sub_rectangles):
    """
    Extract text from sub-rectangles of an image using EasyOCR.
    
    :param ocr: (easyocr.Reader) EasyOCR reader object.
    :param image: The preprocessed binary image (OpenCV format).
    :param sub_rectangles: (list) Sorted list of sub-rectangles to extract text from.
    :return: (dict) Extracted text from each sub-rectangle. Keys are positions from top-left to bottom-right.
    """
    
    if not isinstance(image, np.ndarray):
        image = np.array(image)
    
    # TODO: use a list instead...
    extracted_text = [None] * len(sub_rectangles)
    
    total = 0
    fields = 0
    total_confidence = 0

    for i, (x, y, w, h) in enumerate(sub_rectangles):
        sub_image = image[y:y+h, x:x+w]
        
        extracted_text[i] = ((x, y, w, h), [])

        if len(sub_image.shape) == 2:
            sub_image = cv2.cvtColor(sub_image, cv2.COLOR_GRAY2RGB)
            
        # skip empty sub-rectangles (background)
        if np.sum(sub_image==0) / sub_image.size < 0.02:
            fields += 1
            continue 

        results = ocr.readtext(sub_image)

        # try to preprocess the image to improve OCR performance (a bit arbitrary as of now)
        if not results:
            sub_image = cv2.GaussianBlur(sub_image, (5, 5), 0)
            sub_image = cv2.morphologyEx(sub_image, cv2.MORPH_CLOSE, (5, 5), iterations=2)
            sub_image = cv2.resize(sub_image, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
            results = ocr.readtext(sub_image)

        if results:
            fields += 1 
        else:
            # display_opencv_image(sub_image)
            print(f"No text detected in sub-rectangle {i}.") 
            
        total += max(1, len(results))

        # save the extracted text and confidence
        for result in results:
            bbox, text, confidence = result
            extracted_text[i][1].append((text, round(confidence, 2)))
            total_confidence += confidence 
            
    print(f"Extracted {fields} / {len(sub_rectangles)} text fields with an average confidence of {total_confidence / total:.2f}.")

    return extracted_text

In [18]:
images = convert_from_path(local_file, dpi=500)

In [19]:
image = images[0]

In [20]:
preprocessed_image = preprocess_image(image, display=False)

In [21]:
sub_rectangles = draw_hierarchical_rectangles(preprocessed_image, output_path="hierarchical_rectangles.png")

Detected 72 hierarchical sub-rectangles.


In [22]:
final_image = cv2.bitwise_not(preprocessed_image)
# display_opencv_image(final_image)

In [23]:
ocr = easyocr.Reader(['en'], gpu=True)
extracted_text = extract_text_from_subrectangles(ocr, final_image, sub_rectangles)
extracted_text

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


No text detected in sub-rectangle 6.
Extracted 71 / 72 text fields with an average confidence of 0.91.


[((254, 1261, 3744, 99), [('Program Details', np.float64(0.96))]),
 ((253, 1365, 1429, 280),
  [('Program Name', np.float64(0.65)),
   ('A BRIGHTER START CHILDCARE', np.float64(0.99))]),
 ((1686, 1365, 1340, 280),
  [('Program Number', np.float64(0.94)), ('000000200979', np.float64(1.0))]),
 ((3030, 1365, 969, 280),
  [('Program Type', np.float64(1.0)),
   ('Child Care Center', np.float64(0.92))]),
 ((253, 1648, 2773, 374),
  [('Address', np.float64(1.0)),
   ('2765 BLUE ROCK RD. CINCINNATI', np.float64(0.77)),
   ('OH', np.float64(1.0)),
   ('45239', np.float64(1.0))]),
 ((3030, 1648, 969, 374),
  [('County', np.float64(1.0)), ('HAMILTON', np.float64(1.0))]),
 ((254, 2026, 3744, 91), []),
 ((253, 2122, 1429, 186), [('Building Approval Date', np.float64(0.84))]),
 ((1686, 2122, 642, 186), [('Use Group/Code', np.float64(0.96))]),
 ((2332, 2122, 829, 186), [('Occupancy Limit', np.float64(1.0))]),
 ((3165, 2122, 834, 186), [('Maximum Under 2 Y', np.float64(0.58))]),
 ((253, 2312, 1429, 18

In [24]:
def group_into_rows(extracted_text, threshold=5):
    """
    Group extracted_text entries into rows based on their y-coordinates.
    
    :param extracted_text: A list of tuples of the form:
                          [ ( (x, y, w, h), [ (text, conf), ... ] ), ... ] 
                          assumed to be sorted top-to-bottom, left-to-right.
    :param threshold:    The distance in pixels to decide when to start a new row.
    :return:               A list (rows) of lists, each inner list is one row, 
                           containing the sub-rectangle data.
    """
    rows = []
    current_row = []
    if not extracted_text:
        return rows
    
    # start the first row’s baseline from the very first rectangle's y
    _, first_data = extracted_text[0]
    current_row_y = extracted_text[0][0][1]
    current_row_h = extracted_text[0][0][3]

    for ((x, y, w, h), text_data) in extracted_text:
        
        if abs(y - current_row_y) > threshold or abs(h - current_row_h) > threshold:
            # push the old row into rows
            rows.append(current_row)
            # start a new row
            current_row = []
            current_row_y = y
            current_row_h = h

        
        # add current bounding box/data to the current row
        current_row.append(((x, y, w, h), text_data))
    
    if current_row:
        rows.append(current_row)

    # sort each row by x-coordinate and remove empty rows
    rows = [sorted(r, key=lambda x: x[0][0]) for r in rows if not (len(r) == 1 and len(r[0][1]) == 0)]

    return rows

In [25]:
rows = group_into_rows(extracted_text)
rows

[[((254, 1261, 3744, 99), [('Program Details', np.float64(0.96))])],
 [((253, 1365, 1429, 280),
   [('Program Name', np.float64(0.65)),
    ('A BRIGHTER START CHILDCARE', np.float64(0.99))]),
  ((1686, 1365, 1340, 280),
   [('Program Number', np.float64(0.94)), ('000000200979', np.float64(1.0))]),
  ((3030, 1365, 969, 280),
   [('Program Type', np.float64(1.0)),
    ('Child Care Center', np.float64(0.92))])],
 [((253, 1648, 2773, 374),
   [('Address', np.float64(1.0)),
    ('2765 BLUE ROCK RD. CINCINNATI', np.float64(0.77)),
    ('OH', np.float64(1.0)),
    ('45239', np.float64(1.0))]),
  ((3030, 1648, 969, 374),
   [('County', np.float64(1.0)), ('HAMILTON', np.float64(1.0))])],
 [((253, 2122, 1429, 186), [('Building Approval Date', np.float64(0.84))]),
  ((1686, 2122, 642, 186), [('Use Group/Code', np.float64(0.96))]),
  ((2332, 2122, 829, 186), [('Occupancy Limit', np.float64(1.0))]),
  ((3165, 2122, 834, 186), [('Maximum Under 2 Y', np.float64(0.58))])],
 [((253, 2312, 1429, 186),
 

In [26]:
nnn = 14

In [27]:
nnn += 1
rows[nnn]

[((253, 4044, 932, 93),
  [('Infant', np.float64(1.0)), ('Birth to < 18 m)', np.float64(0.87))]),
 ((1783, 4044, 372, 93), [('11', np.float64(1.0))]),
 ((2158, 4044, 661, 93), [('0', np.float64(1.0))]),
 ((2822, 4044, 1177, 93), [('11', np.float64(1.0))])]

In [28]:
rows[1][0]

((253, 1365, 1429, 280),
 [('Program Name', np.float64(0.65)),
  ('A BRIGHTER START CHILDCARE', np.float64(0.99))])

In [29]:
p1_fields = [ "Program Name",
        "Program Number",
        "Program Type",
        "County",
        "Building Approval Date",
        "Use Group/Code",
        "Occupancy Limit",
        "Maximum Under 2",   # under 2 1/2 but idk how this will be read...
        "Fire Inspection Approval Date",
        "Food Service Risk Level",
        "Inspection Type",
        "Inspection Scope",
        "Inspection Notice",
        "Inspection Date",
        "Begin Time",
        "End Time",
        "Reviewer",
        "No. Rules Verified",
        "No. Rules with Non-compliances",
        "No. Serious Risk",
        "No. Moderate Risk",
        "No. Low Risk"
]

In [30]:
def process_ocr_to_dataframe(ocr_results, fields):
    """
    Process OCR results and flatten the extracted information into a CSV-ready format.

    Parameters:
    ocr_results (dict): Dictionary of OCR results containing bounding boxes and text.

    Returns:
    pd.DataFrame: A dataframe containing flattened, structured data for CSV storage.
    """
    extracted_data = {field: None for field in fields}

    for field in ocr_results:
        if len(field[1]) > 1:
            
            label, content = field[1][0][0], field[1][-1][0]
            confidence = field[1][0][1]

            best_match, score = process.extractOne(label, extracted_data.keys())
            if score > 95:
                if extracted_data[best_match] is not None and extracted_data[best_match] != content:
                    print(f"Field '{best_match}' field changed from '{extracted_data[best_match]}' to '{content}'")

                extracted_data[best_match] = content, confidence

    general_df = pd.DataFrame([extracted_data])

    general_df["Program Name"] = general_df.apply(lambda x: x["Program Name"][0], axis=1)
    general_df.set_index("Program Name", inplace=True)
    
    # general_df = general_df[sorted(general_df.columns)]

    return general_df

In [31]:
process_ocr_to_dataframe(extracted_text, p1_fields)

Unnamed: 0_level_0,Program Number,Program Type,County,Building Approval Date,Use Group/Code,Occupancy Limit,Maximum Under 2,Fire Inspection Approval Date,Food Service Risk Level,Inspection Type,...,Inspection Notice,Inspection Date,Begin Time,End Time,Reviewer,No. Rules Verified,No. Rules with Non-compliances,No. Serious Risk,No. Moderate Risk,No. Low Risk
Program Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A BRIGHTER START CHILDCARE,"(000000200979, 0.94)","(Child Care Center, 1.0)","(HAMILTON, 1.0)",,,,,"(09/20/2024, 0.95)","(Level II, 0.75)","(Annual, 1.0)",...,"(Unannounced, 1.0)","(10/02/2024, 1.0)","(9:00 AM, 0.96)","(11.30 AM, 0.96)","(Kristin Blassingame, 1.0)","(58, 0.63)","(10, 0.62)",,"(2, 0.98)","(9, 0.98)"


In [52]:
def get_row(rows, field, start_idx=0):
    idx = start_idx
    i = start_idx

    while i < len(rows) and idx == start_idx:
        row = rows[i][0][1][0][0]
        if field in row or row in field:
            idx = i

        # else: 
            # print(f"Field: {field} and Row: {row}")
        
        i += 1

    if idx == start_idx:
        print(f"Field '{field}' not found in rows.")

    return idx

def extract_license_capacity_table(extracted_text):
    rows = group_into_rows(extracted_text)

    table_rows = [
        "Infant",
        "Young Toddler",
        "Total Under 2",
        "Older Toddler",
        "Preschool",
        "School",
        "Total Capacity/Enrollment"]

    columns = [
        "Full Time",
        "Part Time",
        "Total"
    ]

    table = {"License Capacity": {}}

    row_idx = 0

    for i, t_row in enumerate(table_rows):

        prev = row_idx
        row_idx = get_row(rows, t_row, row_idx)

        if row_idx == prev:
            continue

        current_row = rows[row_idx]

        # save license capacity totals
        if len(current_row) > len(columns) + 1:
           table["License Capacity"][t_row] = current_row[1][1]
           current_row = current_row[2:]
        else:
            current_row = current_row[1:]

        table[t_row] = {columns[i]: field[1][0] for i, field in enumerate(current_row)}

    return table

extract_license_capacity_table(extracted_text)

{'License Capacity': {'Total Under 2': [('21', np.float64(1.0))],
  'Total Capacity/Enrollment': [('44', np.float64(1.0))]},
 'Infant': {'Full Time': ('11', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('11', np.float64(1.0))},
 'Young Toddler': {'Full Time': ('0', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('0', np.float64(1.0))},
 'Total Under 2': {'Full Time': ('11', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('11', np.float64(1.0))},
 'Older Toddler': {'Full Time': ('0', np.float64(1.0)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('0', np.float64(0.69))},
 'Preschool': {'Full Time': ('21', np.float64(1.0)),
  'Part Time': ('0', np.float64(0.67)),
  'Total': ('21', np.float64(1.0))},
 'School': {'Full Time': ('0', np.float64(0.99)),
  'Part Time': ('0', np.float64(1.0)),
  'Total': ('0', np.float64(1.0))},
 'Total Capacity/Enrollment': {'Full Time': ('21', np.float64(1.0)),
  'Part Time': ('0', np.float

In [33]:
rows[15]

[((253, 4044, 932, 93),
  [('Infant', np.float64(1.0)), ('Birth to < 18 m)', np.float64(0.87))]),
 ((1783, 4044, 372, 93), [('11', np.float64(1.0))]),
 ((2158, 4044, 661, 93), [('0', np.float64(1.0))]),
 ((2822, 4044, 1177, 93), [('11', np.float64(1.0))])]