In [1]:
import csv
import os
import re
from typing import Optional

import pandas as pd
import pdfplumber

In [2]:
# Extract and parse text
# Collect the text of each page first and join once at the end instead of
# growing one string inside the loop.
with pdfplumber.open('SHA_PAID_FACILITIES_APRIL_2025.pdf') as pdf:
    page_texts = []
    for page in pdf.pages:
        # A page with no extractable text may yield None or '' (TODO confirm for
        # the installed pdfplumber version); `or ""` keeps both cases harmless.
        page_text = (page.extract_text() or "").strip()
        if page_text:
            page_texts.append(page_text)

# Pages are joined with a newline so the last record of one page is not fused
# with the first line of the next page. Empty pages are skipped above so that
# no blank line is introduced between pages.
file_text = "\n".join(page_texts)

In [114]:
def split_parse_text(text: str) -> str:
    """
    Insert a line break wherever an amount is immediately followed by a letter.

    This separates the last record of a page from the table header / first
    record of the following page when the two were merged into one line.

    Example: 'NELEPEK MEDICAL CARE CENTRE 63,469.44SORI LAKESIDE HOSPITAL- NDHIWA 11,725.79'
    becomes the two lines
        'NELEPEK MEDICAL CARE CENTRE 63,469.44'
        'SORI LAKESIDE HOSPITAL- NDHIWA 11,725.79'

    Note: the pattern does not require a decimal point, so any run of three or
    more digits directly followed by a letter is split as well.

    Inputs: str
        Text in which records may be fused together.
    Returns:
        The same text with a newline inserted between each matched amount and
        the letter that follows it.
    """
    # Group 1 is the numeric part, group 2 the first letter glued to it.
    fused_amount = re.compile(r'(\d{1,4}(?:,\d{1,4},?)*\.*\d{2})([A-Za-z])')
    return fused_amount.sub(r'\1\n\2', text)

In [115]:
# Parsed text
# Remove odd punctuation
# NOTE: this character class also contains the space, the comma and the full
# stop, so the stripped text has no word boundaries or amount separators left.
# `parsed_punt` is therefore kept for inspection only and is not parsed further.
punct_to_remove = r'[_ , ;`.\']'
parsed_punt = re.sub(punct_to_remove, '', file_text)

# Replace '$' with '&' -- presumably '&' in provider names is extracted as '$'
# (TODO confirm against the PDF).
parsed_amper = file_text.replace('$', '&')

# Split fused records on the repaired text rather than on the raw `file_text`,
# so that the replacement above actually reaches the rows.
parsed_parser = split_parse_text(parsed_amper)
my_rows = parsed_parser.split('\n')

In [None]:
def clean_amount(val: str, i: int) -> Optional[float]:
    """
    Function to clean mangled OCR outputs and convert them to a number.

    Leading letters, dots and hyphens are dropped first; then every character
    that is not a digit or a dot is removed (this includes thousands commas).
    Runs of consecutive dots are collapsed to a single one so that an amount
    such as '63,469..44' is still parsed.

    Inputs:
        val: str
            Alphanumeric string that may contain symbols (, and .)
        i: int
            Line number of the record; only used in the error message.
    Returns:
        The amount as a float, or None when the cleaned text is not a valid
        number or `val` is not a string. An error line is printed in that case.
    """
    try:
        # strip leading junk/scrambled letters.
        cleaned = re.sub(r'^[A-Za-z\.\-]*', '', val)
        # keep digits and dots only; commas are removed by this step already
        cleaned = re.sub(r'[^0-9.]+', '', cleaned)
        # a doubled decimal point would make float() fail; collapse it
        cleaned = re.sub(r'\.{2,}', '.', cleaned)
        return float(cleaned)
    except (TypeError, ValueError) as e:
        # TypeError: `val` is not a string; ValueError: nothing numeric is left.
        print(f" Error: {e} at line {i}")
        return None

In [122]:
# Process text: write one (provider_name, amount) CSV row per parsed line.
row_list = my_rows
# Repeated table headers; lines matching any of these carry no data.
pattern_to_skip = [
    r'Vendor Name\s*Claim\/s Amount',
    r'Hospital Name\s*Claim\/s Amount'
    ]

with open("sha_paid_facilities_april.csv", 'w', newline = "", encoding = "UTF-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['provider_name', 'amount'])

    for i, line in enumerate(row_list):
        line = line.strip()
        if not line:
            # A blank line is not the end of the data: skip it instead of
            # stopping, otherwise every record after it would be lost.
            continue

        # Check pattern match
        if any(re.search(pattern, line, re.IGNORECASE) for pattern in pattern_to_skip):
            print(f"Skipping line: {line}, {i}")
            continue

        # Parse content
        try:
            row = line.split()
            if len(row) < 2:
                raise ValueError(f"expected a provider name and an amount, got {line!r}")

            # The amount is the last token. When the token before it starts with
            # a digit (commas ignored) it is treated as the first half of an
            # amount that was split in two -- presumably something like
            # '63, 469.44' (TODO confirm). Otherwise it belongs to the name and
            # must not be mixed into the amount.
            if row[-2].replace(",", '')[:1].isdigit():
                split_at = -2
            else:
                split_at = -1

            provider_name = ' '.join(row[:split_at])
            amount = clean_amount(''.join(row[split_at:]), i)

            # `amount` is None when it could not be parsed; the row is still
            # written (with an empty amount cell) so the provider is not lost.
            writer.writerow([provider_name.title(), amount])

        except Exception as e:
            # Best effort: report the offending line and carry on with the rest.
            print(f"Error processing line {i}: {e}")

Skipping line: Vendor Name Claim/s Amount, 0
Skipping line: Vendor Name Claim/s Amount, 81
Skipping line: Vendor Name Claim/s Amount, 565
Skipping line: Vendor Name Claim/s Amount, 3791
Skipping line: Vendor Name Claim/s Amount, 7646
Skipping line: Vendor Name Claim/s Amount, 10274
Skipping line: Vendor Name Claim/s Amount, 13375
Skipping line: Vendor Name Claim/s Amount, 16418
Skipping line: Vendor Name Claim/s Amount, 19404
Skipping line: Vendor Name Claim/s Amount, 19654
