In [3]:
! pip install pdfplumber
! pip install pypdf2
!apt-get install tesseract-ocr
!apt-get install ghostscript
!pip install ocrmypdf

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ghostscript is already the newest version (9.55.0~dfsg1-0ubuntu5.9).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


## Import data

In [70]:
import PyPDF2 as pdf
import pandas as pd
import json
import pdfplumber
import ocrmypdf
import re
from datetime import datetime

## CPT code and Description

In [62]:
file_path = r"Mario_Gutierrez_BR.xlsx"
sheet_name = 'CPT DESC'  # Your sheet name

# Read the Excel file into a DataFrame
df_cpt = pd.read_excel(file_path, sheet_name=sheet_name, dtype={'Procedure Code': str, 'Description': str})

##

In [63]:
input_pdf = r'R000530 - R000547 M. Gutierrez_Allied Health_Billing.pdf'
data = []

# Open the PDF using pdfplumber
with pdfplumber.open(input_pdf) as pdf:
    for page in pdf.pages:
        text = page.extract_text()
        if text:
            data.append({
                'page': page.page_number,
                'text': text
            })

In [64]:
print(data)

[{'page': 1, 'text': 'PERTAIN TO : Mario Alberto Gutierrez\nFROM : Allied Health\n814 West Mcneese Street, Suite 100.\nLAKE CHARLES., LOUISIANA 70605\nPHONE : (337) 602-8686 FAX : No fax number on file\nRECORD TYPE : Billing\nDELIVER TO : Trent Shelton\nARNOLD & ITKIN, LLP\n6009 MEMORIAL DRIVE\nHOUSTON, TEXAS 77007\nPHONE : (713) 222-3800 FAX : (713) 222-3850\nOrder No : 225590-1-35\nMARIO GUTIERREZ\nR000530'}, {'page': 2, 'text': 'CAUSE NO.\nAFFIDAVIT\nRecords Pert ning fo. Marlo Alberto Gutierrez\nType of Re rdsiAny and all ailing records FROM 06:2 1202: PRESENT pectamning to\nii S Gutters DOB 2797 SEN 494 including but et finited to, any\naa Hemized line nts, ledger cards andlor sheats, insuranc:tocl Tai hm es, an any aaft\nuocuments canta ined Sor ite egarding charge: incurred taining examinatios\ntre: ment batent\nBetore me the w sunhonty. pers onaily 0 Laphe\nng by me duly svar capasen as\n;\nPa - a the Is\nase on fa " Fi Hi i wl : pa nic ih re\nVan - a cian of 2a for Alh z\nthis

## Function to Capture **HOSPITAL NAME** from page 1 PDF

In [65]:
def extract_hospital_name(text):
    # Adjusted regex pattern to capture the hospital name after "FROM :"
    match = re.search(r"FROM\s*:\s*([^\n]+)", text)
    if match:
        return match.group(1).strip()  # Return the captured group
    return "Unknown Hospital"

In [66]:
first_page = data[0] # First_page dic format
extract_hospital_name(first_page['text']) # extracting hospital name from first page

'Allied Health'

## Function for **HEALTH CLAIM FORM** from page 5 of PDF

In [77]:
def allied_health_claim_form(data, start_page=4):
    # Define regex patterns
    date_pattern = r'(\d{2} \d{2} \d{2})'  # Dates in MM DD YY format
    service_line_pattern = r'(\d{2} \d{2} \d{2}).+?(\b[A-Z]?\d{4,5}\b).+?(\d+)'  # Date, CPT, Charge

    # Extract hospital name from the first page
    hospital_name = extract_hospital_name(data[0]['text'])

    records = []

    # Iterate through pages starting from the specified page index
    for page_data in data[start_page:]:
        text = page_data['text']

        # Extract dates and convert them to datetime objects
        dates = re.findall(date_pattern, text)
        from_date = datetime.strptime(dates[0], '%m %d %y') if dates else None
        to_date = datetime.strptime(dates[-1], '%m %d %y') if len(dates) > 1 else None

        # Extract service lines: Date, CPT/HCPCS codes, and charges
        service_lines = re.findall(service_line_pattern, text)

        # Append extracted records to the list
        for service_date, code, charge in service_lines:
            service_date_obj = datetime.strptime(service_date, '%m %d %y')

            record = {
                "Page": page_data['page'],
                "Hospital": hospital_name,
                "TO Date": to_date,
                "CPT": str(code),
                "Charges": float(charge)  # Convert charge to float
            }
            records.append(record)

    # Create a DataFrame from the list of records
    df_claim_info = pd.DataFrame(records)

    return df_claim_info


In [78]:
df_extracted = allied_health_claim_form(data)

In [79]:
final_data = pd.merge(df_extracted,df_cpt,left_on='CPT', right_on='Procedure Code', how='left')
final_data.drop(columns=['Procedure Code'], inplace=True)
final_data.rename(columns={'Description': 'Provided_Description'}, inplace=True)

Unnamed: 0,Page,Hospital,TO Date,CPT,Charges,Provided_Description
0,5,Allied Health,2023-09-29,99213,500.0,Office Visit
1,6,Allied Health,2024-02-01,62323,5000.0,Njx interlaminar
2,7,Allied Health,2024-01-05,99213,500.0,Office Visit
3,8,Allied Health,2024-02-16,99213,500.0,Office Visit
4,9,Allied Health,2024-03-20,99442,500.0,Phone e/m


In [80]:
final_data

Unnamed: 0,Page,Hospital,TO Date,CPT,Charges,Provided_Description
0,5,Allied Health,2023-09-29,99213,500.0,Office Visit
1,6,Allied Health,2024-02-01,62323,5000.0,Njx interlaminar
2,7,Allied Health,2024-01-05,99213,500.0,Office Visit
3,8,Allied Health,2024-02-16,99213,500.0,Office Visit
4,9,Allied Health,2024-03-20,99442,500.0,Phone e/m
5,10,Allied Health,2024-04-12,99213,500.0,Office Visit
6,11,Allied Health,2024-03-28,62321,8000.0,Njx interlaminar
7,11,Allied Health,2024-03-28,Q9967,0.0,LOCM 300-399mg/m
8,11,Allied Health,2024-03-28,S0020,0.0,"InjeCTion, bupiv"
9,11,Allied Health,2024-03-28,J1100,0.0,Dexamethasone so


## Function for **Access Healthcare Management Invoice** page 4 PDF

In [82]:
page_5 = data[3]['text']

In [85]:
data[3]['text']

'f ALLIED\n82-2152448\nDate Invoice #\n814 W. McNeese Street 8/15/2023 28974\nSuite 100\nLake Charles, LA 70605\n337-602-8686\nBill To\nAccess Healthcare Management\nP.O. Box 4910\nLake Charles, LA 70606\nP.O. No. Terms Project\nMario A. Gutierrez\nQuantity Description Rate Amount\n1 Dr. Chaiban - Appt. No Show Fee - DOS:08/15/2023 250.00 250.00\nPayments/Credits\n$0.00\nBalance Due $250.00\n0002\nMARIO GUTIERREZ\nR000533'

In [92]:
def extract_invoice_details(text):
    # Define refined regex patterns
    date_pattern = r'\b(\d{1,2}/\d{1,2}/\d{4})\b'  # Matches dates in MM/DD/YYYY format
    description_pattern = r'([A-Za-z\s\.\-]+Appt\. No Show Fee)'  # Matches specific description pattern
    amount_pattern = r'\b(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)\b'  # Matches monetary values

    # Extract values using regex
    dates = re.findall(date_pattern, text)
    descriptions = re.findall(description_pattern, text)
    amounts = re.findall(amount_pattern, text)

    # Extract last amount as Balance Due (assuming the last occurrence is the total amount)
    balance_due = float(amounts[-1].replace(',', '')) if amounts else 0.0

    # Create records from extracted data
    records = []
    for i, description in enumerate(descriptions):
        record = {
            "Date": dates[i] if i < len(dates) else None,
            "Description": description.strip(),
            "Amount": balance_due if i == len(descriptions) - 1 else float(amounts[i].replace(',', ''))
        }
        records.append(record)

    # Create a DataFrame from the extracted data
    df_invoice = pd.DataFrame(records)

    return df_invoice

In [93]:
extract_invoice_details(page_5)

Unnamed: 0,Date,Description,Amount
0,8/15/2023,Dr. Chaiban - Appt. No Show Fee,250.0
