In [6]:
!pip install pymupdf



In [7]:
import fitz
import re
from datetime import datetime
import pandas as pd


### Creating the CPT Dictionary

In [17]:

# Workbook and worksheet holding the CPT code -> description lookup
# (point these at the actual file / sheet for a different case).
file_path = r"Mario_Gutierrez_BR.xlsx"
sheet_name = 'CPT DESC'  # Your sheet name

# Both columns are forced to str so procedure codes stay textual
# (no numeric coercion) when the sheet is loaded.
df_cpt = pd.read_excel(
    file_path,
    sheet_name=sheet_name,
    dtype={'Procedure Code': str, 'Description': str},
)


In [18]:
# Preview the first rows of the CPT lookup table
df_cpt.head() 

Unnamed: 0,Procedure Code,Description
0,95004,Percut allergy s
1,95010,PERCUT ALLERGY T
2,95012,Exhaled nitric o
3,95015,ID ALLERGY TITRA
4,95017,Perq & icut allg


### Hospital Name Extraction

In [19]:
# Billing-records PDF processed by this notebook
my_path = r'R000345 - R000349 M. Gutierrez_The Houston Spine & Rehabilitation Centers_Billing.pdf'
# Open the PDF with PyMuPDF; this handle is only used to inspect the first page below.
# NOTE(review): no doc.close() appears in the visible cells — confirm the handle is released.
doc = fitz.open(my_path)

In [20]:
# First page of the PDF (index 0)
page = doc[0]
# Plain text of that page, requested with sort=True
all_infos = page.get_text(sort=True)
# Print the raw text so the "FROM :" line used for the provider name can be inspected
print(all_infos)

      PERTAIN TO :    Mario Alberto Gutierrez

         FROM :    The Houston Spine & Rehabilitation Centers
                      3101 College Park Dr.
                   THE WOODLANDS., TEXAS 77384



        PHONE :    (281) 362-0006          FAX :  No fax number on file

   RECORD TYPE :     Billing


      DELIVER TO :    Trent Shelton

                 ARNOLD & ITKIN, LLP
                      6009 MEMORIAL DRIVE
                  HOUSTON, TEXAS 77007


        PHONE :    (713) 222-3800          FAX :   (713) 222-3850





Order No : 225590-1-32

                                                   MARIO GUTIERREZ
                                                                  R000345


In [42]:
def extract_hospital_name(text):
    """Return the provider name that follows the ``FROM :`` label in ``text``.

    The remainder of the first non-empty line after the label is returned
    with surrounding whitespace removed. When the label is not present,
    the placeholder ``"Unknown Hospital"`` is returned instead.
    """
    found = re.search(r"FROM\s*:\s*([^\n]+)", text)
    if found is None:
        return "Unknown Hospital"
    return found.group(1).strip()

In [43]:
# Check the extractor against the first-page text captured in all_infos
extract_hospital_name(all_infos)

'The Houston Spine & Rehabilitation Centers'

### Hospital Billing Extraction

In [47]:
import fitz  # PyMuPDF
import re
import pandas as pd
from datetime import datetime

def extract_hospital_name(text):
    """Pull the provider name from the ``FROM :`` line of a page's text.

    Returns the text after the label (up to the end of that line, stripped),
    or ``"Unknown Hospital"`` if no ``FROM :`` label is found.

    Note: this repeats the definition from the earlier cell with the same
    behaviour; running this cell simply rebinds the name.
    """
    label_hit = re.search(r"FROM\s*:\s*([^\n]+)", text)
    return label_hit.group(1).strip() if label_hit else "Unknown Hospital"

def extract_billing_info_pdf(file_path):
    """Extract billing line items from a PDF into a DataFrame.

    The provider name is taken from the text of the first page (via
    ``extract_hospital_name``) and attached to every record. Each page is
    then scanned for entries shaped like::

        MM/DD/YYYY  Charge  <letters/commas>  <code>  <description>  <units>  <charges>

    Parameters
    ----------
    file_path : str or path-like
        Location of the PDF; passed straight to ``fitz.open``.

    Returns
    -------
    pandas.DataFrame
        One row per matched entry with the columns ``Hospital``, ``Date``
        (``datetime``), ``CPT`` (str), ``Extracted_Description`` (str),
        ``Units`` (float) and ``Charges`` (float). The columns are present
        even when nothing matched, so downstream merges on ``CPT`` still work.

    Raises
    ------
    ValueError
        If a matched date has the ``dd/dd/dddd`` shape but is not a valid
        ``%m/%d/%Y`` date.
    """
    columns = ["Hospital", "Date", "CPT", "Extracted_Description", "Units", "Charges"]

    # Compiled once instead of being rebuilt for every page.
    # NOTE(review): units/charges are matched as plain `\d+\.\d{2}`; an amount
    # printed with a thousands separator (e.g. 1,250.00) would not match and
    # that entry would be skipped — confirm against the source statements.
    pattern = re.compile(
        r'(\d{2}/\d{2}/\d{4})\s+Charge\s+[A-Za-z,\s]+\s+([A-Z0-9]*[0-9]+[A-Z0-9]*)\s+([\w\s\+\-\.\,\(\)]+)\s+(\d+\.\d{2})\s+(\d+\.\d{2})'
    )

    billing_records = []

    # The context manager closes the document even if parsing raises.
    with fitz.open(file_path) as doc:
        # Guard against a document without pages instead of failing on doc[0].
        first_page_text = doc[0].get_text(sort=True) if len(doc) else ""
        hospital_name = extract_hospital_name(first_page_text)

        for page in doc:
            page_text = page.get_text(sort=True)

            for date_str, cpt, description, units, charges in pattern.findall(page_text):
                billing_records.append({
                    "Hospital": hospital_name,
                    "Date": datetime.strptime(date_str, '%m/%d/%Y'),
                    "CPT": cpt,
                    "Extracted_Description": description.strip(),
                    "Units": float(units),
                    "Charges": float(charges),
                })

    # Explicit columns keep the schema stable when no entry was found.
    df_billing_records = pd.DataFrame(billing_records, columns=columns)

    return df_billing_records


In [48]:
# Run the extraction over every page of the billing PDF
df_extracted = extract_billing_info_pdf(my_path)

In [49]:
# Show up to the first 50 extracted billing rows for a visual check
df_extracted.head(50)

Unnamed: 0,Hospital,Date,CPT,Extracted_Description,Units,Charges
0,The Houston Spine & Rehabilitation Centers,2024-01-24,97163,Complex Initial Evaluation,1.0,110.0
1,The Houston Spine & Rehabilitation Centers,2024-01-24,97014,APPL MODALITY 1+ AREAS ELEC STIMJ UNATTN,1.0,50.0
2,The Houston Spine & Rehabilitation Centers,2024-01-30,97113,AQUATIC THERAPY,4.0,500.0
3,The Houston Spine & Rehabilitation Centers,2024-02-02,97140,MANUAL THERAPY,1.0,100.0
4,The Houston Spine & Rehabilitation Centers,2024-02-02,97035,ULTRASOUND,1.0,38.0
5,The Houston Spine & Rehabilitation Centers,2024-02-02,97014,APPL MODALITY 1+ AREAS ELEC STIMJ UNATTN,1.0,50.0
6,The Houston Spine & Rehabilitation Centers,2024-02-06,97113,AQUATIC THERAPY,4.0,500.0
7,The Houston Spine & Rehabilitation Centers,2024-02-08,97113,AQUATIC THERAPY,3.0,375.0
8,The Houston Spine & Rehabilitation Centers,2024-02-08,97140,MANUAL THERAPY,1.0,100.0
9,The Houston Spine & Rehabilitation Centers,2024-02-13,97113,AQUATIC THERAPY,3.0,375.0


In [50]:
# Attach the reference description for each extracted CPT code
# (billing 'CPT' <-> lookup 'Procedure Code').
#
# The lookup is de-duplicated on 'Procedure Code' first: a code listed more
# than once would duplicate billing rows in the left join and inflate any
# later total of 'Charges'. validate='many_to_one' makes pandas raise if the
# right-hand keys are ever not unique.
cpt_lookup = df_cpt.drop_duplicates(subset='Procedure Code')

# Built as one chained expression (no inplace mutation) so re-running the
# cell always produces the same frame.
final_data = (
    pd.merge(
        df_extracted,
        cpt_lookup,
        left_on='CPT',
        right_on='Procedure Code',
        how='left',
        validate='many_to_one',
    )
    .drop(columns=['Procedure Code'])
    .rename(columns={'Description': 'Provided_Description'})
)
final_data.head()

Unnamed: 0,Hospital,Date,CPT,Extracted_Description,Units,Charges,Provided_Description
0,The Houston Spine & Rehabilitation Centers,2024-01-24,97163,Complex Initial Evaluation,1.0,110.0,Pt eval high com
1,The Houston Spine & Rehabilitation Centers,2024-01-24,97014,APPL MODALITY 1+ AREAS ELEC STIMJ UNATTN,1.0,50.0,Elec stim
2,The Houston Spine & Rehabilitation Centers,2024-01-30,97113,AQUATIC THERAPY,4.0,500.0,Aquatic therapy/
3,The Houston Spine & Rehabilitation Centers,2024-02-02,97140,MANUAL THERAPY,1.0,100.0,Man ther
4,The Houston Spine & Rehabilitation Centers,2024-02-02,97035,ULTRASOUND,1.0,38.0,US therapy


In [51]:
# Billing charges check: total of the 'Charges' column over all merged rows
# (presumably compared against the total on the source statement — confirm)

final_data['Charges'].sum()

21131.0

### Convert to CSV

In [None]:
# final_data.to_csv('R000345 - R000349.csv')