In [2]:
import os
import re
from datetime import datetime
from io import BytesIO
from urllib.parse import urljoin

import pandas as pd
import pdfminer.high_level
import requests
from bs4 import BeautifulSoup

# Link and File Paths

In [3]:
# Search-results URL on childcaresearch.ohio.gov. The `q` value is a percent-encoded
# token that this notebook never interprets; no page number is included here.
ODCY_LINK = "https://childcaresearch.ohio.gov/search?q=fVLNbhMxEN40v0uapgKhHhAiBy6VQkUR1xwWN1VDIVl1V0gFcXDWk42FY6%2b83pS98Q6ICxdeg1fgyBvwJjB2uhCJqrPS2DOf59tvPPZqnuf9RrOrtd0ddCRhyRXMiVqtlBwO3oDOuZKj46On9hsOSCFMoWEkoTCaiuEgLOaCJ%2bdQxuoDyJEshGhaxkfbREcn8ewoAqqTJdHcgOb0MZ7ZC7VacwZ6WqzmoFtEFdKUDcJN2X7LM6IY7FZH4jKD1gU1XKbdkGrDqZjSFbQnckGlyXuXWJvGijGBknszYUuuIz%2fUkCdLpYQfuSVIoX9Cy3y2mGWgkVPJ%2fTNV6O1E8zVQke%2b%2fgIXSsCkjVEN3vAaJGuy%2bE13x1QqD%2b1EGiRUEwHKy5IJZeC%2fWVOaZ0sYR9oMFNv6PqTdbg5Y8XRob3TnlIFiseZYfjKkWpaPBs2z8MUMaJPDPgLLIYO%2f3Qs3X1MA5l9hmihmQB5tBiPK0wCT7q%2bLuK07nXOCVTmReoKIEmtNgfEma0zEJA9wTEqI%2fJaROZkEjINEEB5FKThsThLxaDce5eSPthnejNTudmrOH%2fw3dXesFlSm8e2%2ffV%2b1Gs8jOk%2bNW3bL59cpVL9MFm51V4DdvloHWt3h7q7TtztrIIo0Htwi0IvyWdZbA71jn2%2b5s7rZCp9cNJub4IseS2bUTgYDEAHM8rkUv%2b%2frymV1%2fPf%2fCnC77p9Z25vvn4JtDOhVynfl0%2bPPwh0P8CqmsYu3%2bAQ%3d%3d"

# Site root (note the trailing slash) used by the helper functions to turn
# hrefs scraped from the pages into absolute URLs.
REL_PATH = "https://childcaresearch.ohio.gov/"

# Helper Functions

In [4]:
def extract_html(url, timeout=30):
    """
    Parse the html at the given url into a beautiful soup object, for manipulation.

    :param url: Any valid URL
    :param timeout: seconds to wait for the server before giving up (passed to requests.get)
    :return: the parsed HTML or None if the request failed
    """

    try:
        # without a timeout a stalled server would block the whole scrape indefinitely
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # connection errors and timeouts are reported like a bad status code,
        # so callers only ever have to handle None
        print(f"Failed to retrieve page: {error}")
        return None

    # anything other than status code 200 is treated as a failed request
    if response.status_code != 200:
        print(f"Failed to retrieve page, status code: {response.status_code}")
        return None

    # parse the HTML content
    return BeautifulSoup(response.text, 'html.parser')

In [5]:
def extract_inspection(url):
    """
    Extract the inspection link from the childcare center page

    :param url: the child care center page
    :return: the absolute inspection page link, or None if the page could not
        be fetched or has no inspections button wrapped in a link
    """

    # get the html for the program page
    program_page = extract_html(url)
    if program_page is None:
        return None

    inspection_button_span = program_page.find('span', class_='inspectionsButton')
    if inspection_button_span is None:
        return None

    # the button span sits inside the anchor that carries the link
    inspection_link_tag = inspection_button_span.find_parent('a')
    if inspection_link_tag is None or 'href' not in inspection_link_tag.attrs:
        return None

    # urljoin avoids the doubled slash that plain concatenation produces when
    # the href is root-relative, and leaves already-absolute hrefs intact
    return urljoin(REL_PATH, inspection_link_tag['href'])

In [6]:
def extract_pdf(url) -> str:
    """
    Extract the pdf link from the inspection page
    :param url: inspection url
    :return: the pdf link
    """

    inspection_page = extract_html(url)
    most_recent_pdf_link = None
    most_recent_date = None
    
    if inspection_page is not None:
        rows = inspection_page.find_all('div', class_='resultsListRow')
        
        for row in rows:
            date_column = row.find('div', class_='resultsListColumn')

            pdf_col = row.find('span', class_='inspectionPDFlink') 
            pdf_link_tag = pdf_col.find('a', href=True)
            
            if pdf_link_tag and date_column:
                
                # format into a datetime object for date comparisons
                inspection_date = list(date_column)[2].strip()
                inspection_date = datetime.strptime(inspection_date, "%m/%d/%Y")
                pdf_link = pdf_link_tag['href']
                
                # only save the most recent date (may not be necessary, since all appear to be listed in order. Adds robustness though...)
                if most_recent_date is None or inspection_date > most_recent_date:
                    most_recent_date = inspection_date
                    most_recent_pdf_link = REL_PATH + pdf_link
    
    return most_recent_pdf_link

In [7]:
def _page_url(url, page_num):
    """Return url with its `p` query parameter set to page_num (appended if absent)."""
    if re.search(r"[?&]p=\d+", url):
        return re.sub(r"([?&])p=\d+", lambda match: f"{match.group(1)}p={page_num}", url, count=1)
    separator = '&' if '?' in url else '?'
    return f"{url}{separator}p={page_num}"


def _program_record(row):
    """Collect name, pdf link and address fields from one results row into a dict."""
    record = {}

    program_name_column = row.find('div', class_='resultsListColumn programListColumnName')
    program_link_tag = program_name_column.find('a', href=True) if program_name_column is not None else None

    if program_link_tag is not None:
        program_url = urljoin(REL_PATH, program_link_tag['href'])
        inspection_url = extract_inspection(program_url)
        record['program_name'] = program_link_tag.text.strip()
        record['pdf'] = extract_pdf(inspection_url) if inspection_url is not None else None

    # columns 1-3 hold address, city and zip; rows with fewer columns are left without them
    address_columns = row.find_all("div", class_="resultsListColumn")
    if len(address_columns) > 3:
        record['Address'] = address_columns[1].get_text(strip=True)
        record['City'] = address_columns[2].get_text(strip=True)
        record['Zip'] = address_columns[3].get_text(strip=True)

    return record


def extract_all_pdfs(url, max_pages=1) -> pd.DataFrame:
    """
    Extract all pdf links and associated center info (e.g., name and address info) into a dataFrame for further parsing.

    :param url: The Ohio childcaresearch website URL (https://childcaresearch.ohio.gov/search for licensed childcare)
    :param max_pages: how many result pages to read, starting at page 1. The default of 1
        keeps the previous behaviour of reading only the first page; pass None to keep
        going until a page fails to load or contains no result rows.
    :return: a dataframe containing the center name, address info, and link to the pdf for the most recent center licensing inspection
    """

    records = []
    page_num = 1

    # NOTE(review): the page number is sent as the `p` query parameter because the
    # search links used in this notebook end in "&p=1" - confirm against the site.
    while max_pages is None or page_num <= max_pages:

        # get the current page of results; a failed request ends the scrape
        # instead of being retried forever
        main_page = extract_html(_page_url(url, page_num))
        if main_page is None:
            break

        # get all results rows for further processing
        results_list = main_page.find('div', class_='resultsList')
        rows = results_list.find_all('div', class_='resultsListRow') if results_list is not None else []
        if not rows:
            break

        for row in rows:
            record = _program_record(row)
            # rows that yield neither a program link nor an address carry no information
            if record:
                records.append(record)

        # next page
        page_num += 1

    # build the frame once; naming the columns keeps the layout stable even with no rows
    url_df = pd.DataFrame(records, columns=['program_name', 'pdf', 'Address', 'City', 'Zip'])

    # return with the program name as the index
    return url_df.set_index("program_name")


In [8]:
def download_pdf(pdf_url) -> BytesIO | None:
    """
    Create a temporary pdf file for data extraction 
    :param pdf_url: pdf to download
    :return: BytesIO object containing the pdf or None if invalid URL
    """

    response = requests.get(pdf_url)
    if response.status_code == 200:
        return BytesIO(response.content)
    else:
        print(f"Failed to download PDF: {response.status_code}")
        return None

# Extract PDF Links

In [9]:
# First results page of the configured search: ODCY_LINK already holds the
# search token, so only the page parameter is added here instead of repeating
# the whole literal.
test_link = ODCY_LINK + "&p=1"

In [10]:
# Scrape the search results reached through test_link into a DataFrame indexed
# by program name; the bare expression on the last line displays the frame as
# the cell output.
pdf_links = extract_all_pdfs(test_link)
pdf_links

Unnamed: 0_level_0,pdf,Address,City,Zip
program_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A BRIGHT START 4 KIDZ LEARNING CTR,https://childcaresearch.ohio.gov//pdf/00224002...,8211 PLATT,CLEVELAND,44104
A BRIGHTER START CHILDCARE,https://childcaresearch.ohio.gov//pdf/00000020...,2765 BLUE ROCK RD.,CINCINNATI,45239
A CHILD'S GARDEN,https://childcaresearch.ohio.gov//pdf/00000020...,5427 JULMAR DRIVE,CINCINNATI,45238
A CHILD'S JOURNEY LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00217001...,846 S. YEARLING RD,WHITEHALL,43213
A CHILD'S PLACE LEARNING CENTER,https://childcaresearch.ohio.gov//pdf/00000040...,2010 OFFICEVIEW PLACE,REYNOLDSBURG,43068
A GREAT START PRESCHOOL INC,https://childcaresearch.ohio.gov//pdf/00000020...,7001 FAR HILLS AVE,DAYTON,45459
A JOYFUL JOURNEY ACADEMY,https://childcaresearch.ohio.gov//pdf/00222002...,1536 BARNETT ROAD,COLUMBUS,43227
A JUBILEE ACADEMY,https://childcaresearch.ohio.gov//pdf/00000030...,15751 LAKESHORE BLVD,CLEVELAND,44110
A KIDS ONLY EARLY LEARNING CENTER INC. 4,https://childcaresearch.ohio.gov//pdf/00219001...,2505 SOUTH RIDGE EAST,ASHTABULA,44004
A KIDS ONLY EARLY LEARNING CT INC,https://childcaresearch.ohio.gov//pdf/00000030...,2621 STATE ROAD,ASHTABULA,44004


In [11]:
import urllib.request
# Save the first program's inspection pdf to disk. No target filename is given,
# so urlretrieve picks the location itself (a temp-file path in the recorded
# output); the second return value (response headers) is discarded.
local_file, _ = urllib.request.urlretrieve(pdf_links.iloc[0]['pdf'])
local_file

'C:\\Users\\WILLBL~1\\AppData\\Local\\Temp\\tmpagj7h1ve'

In [17]:
# try imageMagick instead?
from pdf2image import convert_from_path

In [28]:
import easyocr
import numpy as np
from pdf2image import convert_from_path

# Convert PDF to images and run OCR using EasyOCR
def convert_pdf_to_text_via_ocr(pdf_path, poppler_path=r"C:\Program Files\poppler-24.08.0\\bin", max_pages=1, reader=None):
    """
    Convert the pages of a pdf to images and read their text with EasyOCR.

    :param pdf_path: path of the pdf file on disk
    :param poppler_path: folder containing the poppler binaries used by pdf2image.
        The default is the install location this notebook was written against;
        pass None to let pdf2image look the binaries up itself.
    :param max_pages: number of leading pages to read. The default of 1 keeps the
        previous behaviour of reading only the first page; pass None for every page.
    :param reader: an existing easyocr.Reader to reuse across calls; when omitted an
        English reader is created for this call
    :return: the recognised text, one line per detection, with a
        "--- Page N ---" header line before each page ("" if no page was produced)
    """
    # Step 1: Convert PDF pages to images using pdf2image. last_page stops the
    # conversion after the pages that are actually going to be read.
    images = convert_from_path(pdf_path, poppler_path=poppler_path, last_page=max_pages)
    if not images:
        return ""

    # Step 2: Initialize EasyOCR Reader (for English language) unless the caller
    # supplied one, since creating a Reader on every call is wasteful in a loop
    if reader is None:
        reader = easyocr.Reader(['en'])

    # Step 3: Perform OCR on each image
    lines = []
    for page_number, image in enumerate(images, start=1):
        lines.append(f"--- Page {page_number} ---")

        # EasyOCR is given a numpy array rather than the PIL image; the second
        # element of each detection is the extracted text
        lines.extend(detection[1] for detection in reader.readtext(np.array(image)))

    return "".join(f"{line}\n" for line in lines)

# Example usage: run OCR on local_file, the pdf saved to disk by the
# urlretrieve cell above
extracted_text = convert_pdf_to_text_via_ocr(local_file)

# Print the extracted text (the raw OCR output, including any misread characters)
print(extracted_text)

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


--- Page 1 ---
Department of Education
Department of Job and Family Services
BOLD
Begtnng"
Center Licensing Inspection Full Report
AIl licensed child care programs are inspected at least once each year. Non-compliances are documented and grouped as Serious,
Moderate or Low risk violations. Documenting statements and supplemental information may be included in this report:
Licensing
inspection reports from the previous three years can be viewed on the child care website at http llifs ohiogovlCDCIchildcare stm 
This includes complaint investigation reports with substantiated allegations
For any other child care records, please contact the
Child Care Help Desk at 1-877-302-2347, option 4.
Program Details_
Program Name
Program Number
Program Type
A Bright Start 4 Kidz Learning Ctr
2240029481
Child Care Center
Address
County
8211 Platt Cleveland
CUYAHOGA
OH
44104
Building Approval Date
Use Group/Code
Occupancy Limit
Maximum Under 2 Yz
05/04/2022
E
36
0
Fire Inspection Approval Date
Food Ser