In [1]:
import pandas as pd 
import os
import pdfplumber 
import csv
import re 

In [3]:
# Parse text to remove joined records
def parse_text(text: str) -> str:
    """
    Insert a line break wherever an amount is glued to the text that follows it.

    A match is an amount with optional comma grouping and exactly two decimals
    (for example 63,469.44) immediately followed by an uppercase letter and one
    more letter. A newline is placed between the amount and those letters.
    Afterwards every '$' in the text is replaced with '&'.

    Example: 'NELEPEK MEDICAL CARE CENTRE 63,469.44SORI LAKESIDE HOSPITAL- NDHIWA 11,725.79'
    comes back as one string holding two lines:
        NELEPEK MEDICAL CARE CENTRE 63,469.44
        SORI LAKESIDE HOSPITAL- NDHIWA 11,725.79

    Args:
        text: Raw text to repair.

    Returns:
        The repaired text as a single newline-separated string.
    """
    glued_amount = re.compile(r'(\d{1,3}(?:,\d{3})*\.\d{2})([A-Z][a-zA-Z])')

    def _break_after_amount(match):
        # Group 1 is the amount, group 2 the first two letters of what follows.
        return match.group(1) + "\n" + match.group(2)

    separated = glued_amount.sub(_break_after_amount, text)
    return separated.replace('$', '&')

In [4]:
# Extract amount from scrambled text
def clean_amount(val):
    """Return the number embedded in ``val`` as a float rounded to 2 decimals.

    Every character other than the digits 0-9 and '.' is removed before the
    conversion, so thousands separators, spaces and stray letters are ignored.

    Args:
        val: String containing the amount.

    Returns:
        The amount as a float rounded to two decimal places.

    Raises:
        ValueError: If what remains after stripping is not a valid number
            (for example an empty string or a string with two dots).
    """
    non_numeric = re.compile('[^0-9.]')
    numeric_text = non_numeric.sub("", val)
    number = float(numeric_text)
    return round(number, 2)

In [8]:
# Read the text of every PDF page into one string, then repair merged records.
PDF_PATH = r"C:\Users\user\Desktop\Wizi Peupe\sha_payments\pdfs\SHA_PAID_FACILITIES_JULY_2025.pdf"

page_texts = []
with pdfplumber.open(PDF_PATH) as pdf:
    for page in pdf.pages:
        # `or ""` keeps the join below safe if a page yields no text
        # (NOTE(review): older pdfplumber versions returned None here — confirm).
        page_texts.append(page.extract_text() or "")

# Join pages with a newline so the last line of one page is not glued to the
# first line of the next; the next cell splits on "\n" and ignores blank lines.
file_text = "\n".join(page_texts)

# Parse once, after all pages are collected. Doing it inside the loop re-parsed
# the whole accumulated text per page and left `parsed` undefined for a PDF
# with no pages.
parsed = parse_text(file_text)
    


In [None]:
# Split the parsed text into lines and write (provider_name, amount) rows to CSV.
records = parsed.split("\n")

# Header/footer lines to drop. The patterns are applied with re.search, so the
# '^'-anchored ones still only match at the start of a line, while the
# unanchored 'Paid Facilities' pattern matches anywhere in the line.
patterns_to_skip = [
    r'^HOSPITAL\s?Name',
    r'^Vendor',
    r'^TOTAL',
    r'^SHA\s?Paid',
    r'^Provider\s?Name',
    r'^Page\s?\d+',
    r'^Grand\s?Total',
    r'Paid\s?Facilities',
]
skip_regexes = [re.compile(pattern, re.IGNORECASE) for pattern in patterns_to_skip]

# Lines with at least this many whitespace-separated tokens take the alternate
# parsing branch below.
LONG_LINE_TOKENS = 15

CSV_PATH = r"C:\Users\user\Desktop\Wizi Peupe\sha_payments\csvs\sha_paid_facilities_july_2025.csv"

with open(CSV_PATH, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['provider_name', 'amount'])

    for i, line in enumerate(records):
        line = line.strip()
        if not line:
            continue

        # Skip table or document headers
        if any(regex.search(line) for regex in skip_regexes):
            print(f"Skipping line: {line}")
            continue

        # Separate provider name and amount
        data = line.split()
        try:
            if len(data) < LONG_LINE_TOKENS:
                # Usual case: the last token is the amount, the rest is the name.
                provider_name = ' '.join(data[:-1])
                amount = clean_amount(data[-1])
            else:
                # NOTE(review): the name drops the last 3 tokens while the amount
                # is rebuilt from the last 5, so tokens -5 and -4 feed both
                # fields. Kept as-is; confirm against the PDF which is intended.
                provider_name = ' '.join(data[:-3])
                amount = clean_amount(''.join(data[-5:]))
        except ValueError as e:
            # clean_amount raises ValueError when no valid number can be built;
            # report the line and carry on with the next record.
            print(f"Error {e} on line: {line}, {i}")
            continue

        # Written outside the try so a failing file write is not reported as a
        # parse error and silently repeated for every line.
        writer.writerow([provider_name, amount])




