## OCR 

#### In this notebook I use EasyOCR to extract the data from my pdf file that contains patient information. I then use that to match with my excel file containing the prices of procedures and display if the charge is above or below the average.

In [1]:
pip install pytesseract pillow pdf2image

Note: you may need to restart the kernel to use updated packages.


In [1]:
from pdf2image import convert_from_path

# Path to your PDF file
pdf_path = 'ub04-mod.pdf'
poppler_path = r'C:\Program Files (x86)\poppler-24.07.0\Library\bin'

# Convert PDF to images
pages = convert_from_path(pdf_path, 300, poppler_path=poppler_path)  # 300 DPI is a good resolution for OCR

# Save the pages as images
for i, page in enumerate(pages):
    page.save(f'page_{i + 1}.png', 'PNG')


In [2]:
import easyocr

# Initialize EasyOCR Reader
reader = easyocr.Reader(['en'])

# List to store the OCR results
all_text = []

# Perform OCR on each page image
for i in range(len(pages)):
    image_path = f'page_{i + 1}.png'
    text = reader.readtext(image_path, detail=0)  # `detail=0` returns only the text
    all_text.append("\n".join(text))

# Combine all text into a single string for easier parsing
ocr_text = "\n".join(all_text)



Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


In [3]:
import re
import pandas as pd

# Extract Patient Name
name_match = re.search(r'PATIENT NAME\s+(.+)', ocr_text)
patient_name = name_match.group(1).strip() if name_match else "Not found"

# Extract Birthdate
dob_match = re.search(r'BIRTHDATE\s+(.+)', ocr_text)
birthdate = dob_match.group(1).strip() if dob_match else "Not found"

# Extract Sex
sex_match = re.search(r'SEX\s+([MF])', ocr_text)
sex = sex_match.group(1).strip() if sex_match else "Not found"

# Extract Patient Address
address_match = re.search(r'PATIENT ADDRESS\s+(.+)', ocr_text)
patient_address = address_match.group(1).strip() if address_match else "Not found"

# Extract HCPCS Codes and Charges
hcpcs_codes = re.findall(r'(\d{5})\n\S(\d+)', ocr_text)
hcpcs_codes_with_charges = [(code, charge) for code, charge in hcpcs_codes]

# Convert HCPCS Codes and Charges to a DataFrame
df_hcpcs = pd.DataFrame(hcpcs_codes_with_charges, columns=['HCPCS Code', 'Charge'])

# Extract Total Charges
total_match = re.search(r'TOTALS\s+S(\d+)', ocr_text)
total_charges = total_match.group(1).strip() if total_match else "Not found"

# Display Extracted Information
print(f"Patient Name: {patient_name}")
print(f"Date of Birth: {birthdate}")
print(f"Sex: {sex}")
print(f"Patient Address: {patient_address}")
print(f"Total Charges: ${total_charges}")

# Display the DataFrame
print("HCPCS Codes and Charges DataFrame:")
print(df_hcpcs)


Patient Name: Rodrigo Ameixa
Date of Birth: SEX
Sex: Not found
Patient Address: 82 cidermill DR
Total Charges: $3500
HCPCS Codes and Charges DataFrame:
  HCPCS Code Charge
0      10061    500
1      10120   1000
2      11102   2000


In [4]:
df_hcpcs
df_hcpcs['Charge'] = df_hcpcs['Charge'].apply(lambda x: f"${x}")
df_hcpcs

Unnamed: 0,HCPCS Code,Charge
0,10061,$500
1,10120,$1000
2,11102,$2000


In [5]:
procedural_costs = pd.read_csv('Procedural_costs.csv')
procedural_costs

Unnamed: 0,Code,Info,Hospital Outpatient Price
0,10005,"Fine needle aspiration biopsy, including ultra...",$148
1,10007,"Fine needle aspiration biopsy, including fluor...",$151
2,10009,"Fine needle aspiration biopsy, including ct gu...",$154
3,10021,"Fine needle aspiration biopsy, without imaging...",
4,10030,Image-guided fluid collection drainage by cath...,$159
...,...,...,...
2268,46257,"Hemorrhoidectomy, internal and external, singl...",$617
2269,46258,"Hemorrhoidectomy, internal and external, singl...",$630
2270,46260,"Hemorrhoidectomy, internal and external, 2 or ...",$630
2271,46261,"Hemorrhoidectomy, internal and external, 2 or ...",$640


In [6]:
# Remove the $ symbol and convert to numeric in df_hcpcs
df_hcpcs['Charge'] = df_hcpcs['Charge'].replace({'\$': ''}, regex=True).astype(float)

# Remove the $ symbol and convert to numeric in procedural_costs, handling errors
procedural_costs['Hospital Outpatient Price'] = pd.to_numeric(
    procedural_costs['Hospital Outpatient Price'].replace({'\$': ''}, regex=True), errors='coerce'
)

# Convert both 'HCPCS Code' and 'Code' to the same type (string) for merging
df_hcpcs['HCPCS Code'] = df_hcpcs['HCPCS Code'].astype(str)
procedural_costs['Code'] = procedural_costs['Code'].astype(str)

# Join the two DataFrames on the 'Code' and 'HCPCS Code' columns
merged_df = pd.merge(df_hcpcs, procedural_costs, left_on='HCPCS Code', right_on='Code', how='left')

# Calculate the Discrepancy as a percentage difference
merged_df['Discrepancy'] = ((merged_df['Charge'] - merged_df['Hospital Outpatient Price']) / merged_df['Hospital Outpatient Price']) * 100

# Format the Discrepancy with + or - and add a % sign
merged_df['Discrepancy'] = merged_df['Discrepancy'].apply(lambda x: f"{'+' if x > 0 else ''}{x:.2f}%" if pd.notnull(x) else 'NaN')

# Add the $ sign back to the Charge and Hospital Outpatient Price columns
merged_df['Charge'] = merged_df['Charge'].apply(lambda x: f"${x:,.2f}")
merged_df['Hospital Outpatient Price'] = merged_df['Hospital Outpatient Price'].apply(lambda x: f"${x:,.2f}" if pd.notnull(x) else 'NaN')

# Display the final DataFrame
print(merged_df)

  HCPCS Code     Charge   Code  \
0      10061    $500.00  10061   
1      10120  $1,000.00  10120   
2      11102  $2,000.00  11102   

                                                Info  \
0  Incision and drainage of abscess (eg, carbuncl...   
1  Incision and removal of foreign body, subcutan...   
2  Tangential biopsy of skin (eg, shave, scoop, s...   

  Hospital Outpatient Price Discrepancy  
0                   $111.00    +350.45%  
1                    $95.00    +952.63%  
2                    $45.00   +4344.44%  


In [7]:
merged_df

Unnamed: 0,HCPCS Code,Charge,Code,Info,Hospital Outpatient Price,Discrepancy
0,10061,$500.00,10061,"Incision and drainage of abscess (eg, carbuncl...",$111.00,+350.45%
1,10120,"$1,000.00",10120,"Incision and removal of foreign body, subcutan...",$95.00,+952.63%
2,11102,"$2,000.00",11102,"Tangential biopsy of skin (eg, shave, scoop, s...",$45.00,+4344.44%
