In [6]:
import PyPDF2
import re
from datetime import datetime
import os

In [10]:

def extract_information_from_pdf(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        
        text = ""
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
    
    patterns = {
        
        "Cause/ Civil action number": r"Cause\s*Number\s*\(if applicable\)\s*:?\s*(C\.A\.\s*No\.\s*[\w\-]+\s*[\w\-]+)",
        "Court": r"Jurisdiction\s*:?\s*([\w\s]+)",
        "Jurisdiction": r"Jurisdiction\s*:?\s*([\w\s]+)",
        "Number of Plaintiffs": r"Names of Plaintiffs\s*\(if applicable\)\s*:?\s*([\w\s]+)",
        "Case caption Plaintiffs": r"Claimant/Plaintiff\s*Name\s*:?\s*([\w\s]+)",
        "Date of Injury": r"Date of Injury\s*:?\s*([\d]{2}\s\w{3}\s\d{4})"
    }
    

    extracted_data = {}


    for field, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:

            extracted_data[field] = match.group(1).strip()
        else:
            extracted_data[field] = "Not found"
    
    return extracted_data




In [11]:

# Usage Example
pdf_path = 'OneDrive_2024-09-20 (1)/PMS BR-MEI Ref# 7301193380 DOI; 2023-06-26/2. Communication/Mario Gutierrez SRF_ MEI Ref _ 7301193380.pdf'
extracted_data = extract_information_from_pdf(pdf_path)

for key, value in extracted_data.items():
    print(f"{key}: {value}")


Cause/ Civil action number: Not found
Court: Southern District of Texas
Carrier Name
            Medical Equation Service Request Form
Jurisdiction: Southern District of Texas
Carrier Name
            Medical Equation Service Request Form
Number of Plaintiffs: Not found
Case caption Plaintiffs: Mario Gutierrez
Names of Plaintiffs
Date of Injury: 26 Jun 2023


In [28]:
def save_to_json(data, pdf_path):
   
    current_date = datetime.now().strftime("%Y-%m-%d")
    
    
    base_filename = os.path.basename(pdf_path).split('.')[0]
    
    
    json_filename = f"{base_filename}_{current_date}.json"
    
    
    with open(json_filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)
    
    print(f"Data saved to {json_filename}")


In [29]:
save_to_json(extracted_data, 'service_request')

Data saved to service_request_2024-09-25.json


In [30]:
extracted_data

{'Cause/ Civil action number': 'C.A. No. 23 -CV-03955',
 'Court': 'Southern District of Texas',
 'Jurisdiction': 'Southern District of Texas',
 'Number of Plaintiffs': 'Not found',
 'Case caption Plaintiffs': 'Mario Gutierrez  \nNames of Plaintiffs',
 'Date of Injury': '26 Jun 2023'}

In [12]:
def extract_billing_information_from_pdf(pdf_path):
   
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        
   
        text = ""
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()
    

    billing_pattern = re.compile(
        r"([\w\s&]+)\s+(\d{2}-\d{2}-\d{4})\s+(\w+)\s+([A-Za-z\s/]+)\s+\$([\d,]+\.\d{2})"
    )
    

    matches = billing_pattern.findall(text)
    

    billing_records = []
    

    for match in matches:
        record = {
            "Provider": match[0].strip(),
            "Date": match[1],
            "CPT": match[2],
            "Description": match[3].strip(),
            "Charges": match[4].replace(',', '')  # Remove commas from charges
        }
        billing_records.append(record)
    
    return billing_records


In [27]:
def extract_hospital_name(text):
    
    match = re.search(r"FROM\s*:\s*([^\n]+)", text)
    if match:
        return match.group(1).strip() 
    return "Unknown Hospital"

def extract_billing_information_from_pdf(pdf_path):
    # Open the PDF file
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        
        
        text = ""
        for page_num in range(num_pages):
            page = reader.pages[page_num]
            text += page.extract_text()


    hospital_name = extract_hospital_name(text)

   
    billing_pattern = re.compile(
        r"(\d{2}/\d{2}/\d{4})\s+Charge\s+[\w\s,]+\s+(\d{5})\s+([\w\s-]+)\s+(\d+\.\d{2})",
        re.MULTILINE
    )

    
    matches = billing_pattern.findall(text)

    
    billing_records = []
    
   
    for match in matches:
        record = {
            "Provider": hospital_name, 
            "Date": match[0],
            "CPT": match[1],
            "Description": match[2].strip(),
            "Charges": match[3]
        }
        billing_records.append(record)

    return billing_records

In [28]:
extracted_data = extract_billing_information_from_pdf('sample.pdf')

In [30]:
with open('sample.pdf', 'rb') as file:
    reader = PyPDF2.PdfReader(file)
    num_pages = len(reader.pages)


    text = ""
    for page_num in range(num_pages):
        page = reader.pages[page_num]
        text += page.extract_text()

In [31]:
text

'[Type here]  \n \n[Type here]  \n  PERTAIN TO :  Mario  Alberto Gutierrez  \nFROM :  The Houston Spine & Rehabilitation Centers \n3101 College Park Dr.  \nTHE WOODLANDS., TEXAS 77384  \n PHONE :  (281) 362 -0006  FAX :  No fax number on file  \n RECORD TYPE :  Billing  \n DELIVER TO :  Trent  Shelton  \nARNOLD & ITKIN, LLP  \n6009 MEMORIAL DRIVE  \nHOUSTON, TEXAS 77007  \n PHONE :  (713) 222 -3800  FAX :  (713) 222 -3850  [Type here]  \n \n[Type here]  \n 32 [Type here]  \n \n[Type here]  \n R000345  [Type here]  \n \n[Type here]  \n \n \n[Type here]  \n \n[Type here]  \n R000346  \nHOUSTON SPINE AND REHABILITATION CENTERS  \n3101 COLLEGE PARK DR    The Woodlands, TX 773844099  \nPhone: (281) 362 -0006  Fax: (281) 362 -0233  \nPatient Financial History  \n \nDate Type  Provider  Procedure  Units  Amount  \n \nGutierrez , Mario (pi)  \n    1.00 110.00  \n01/24/2024  Charge  Barton, Jenny  97014    APPL MODALITY 1+ AREAS ELEC STIMJ UNATT  1.00 50.00  \n01/30/2024  Charge  Barton, Jenny 

In [29]:
extracted_data

[{'Provider': 'The Houston Spine & Rehabilitation Centers',
  'Date': '01/30/2024',
  'CPT': '97113',
  'Description': 'AQUATIC THERAPY',
  'Charges': '4.00'},
 {'Provider': 'The Houston Spine & Rehabilitation Centers',
  'Date': '02/02/2024',
  'CPT': '97140',
  'Description': 'MANUAL THERAPY',
  'Charges': '1.00'},
 {'Provider': 'The Houston Spine & Rehabilitation Centers',
  'Date': '02/02/2024',
  'CPT': '97035',
  'Description': 'ULTRASOUND',
  'Charges': '1.00'},
 {'Provider': 'The Houston Spine & Rehabilitation Centers',
  'Date': '02/06/2024',
  'CPT': '97113',
  'Description': 'AQUATIC THERAPY',
  'Charges': '4.00'},
 {'Provider': 'The Houston Spine & Rehabilitation Centers',
  'Date': '02/08/2024',
  'CPT': '97113',
  'Description': 'AQUATIC THERAPY',
  'Charges': '3.00'},
 {'Provider': 'The Houston Spine & Rehabilitation Centers',
  'Date': '02/08/2024',
  'CPT': '97140',
  'Description': 'MANUAL THERAPY',
  'Charges': '1.00'},
 {'Provider': 'The Houston Spine & Rehabilitati

In [19]:
import pandas as pd

# Load the data
data = {
    'Information Availability': ['High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High', 'Medium'],
    'House Cost': ['High', 'Medium', 'High', 'Medium', 'Low', 'Medium', 'High', 'Medium', 'Low', 'Medium', 'High'],
    'School Quality': ['Good', 'Average', 'Bad', 'Excellent', 'Good', 'Bad', 'Excellent', 'Good', 'Bad', 'Excellent', 'Good'],
    'Trust in Police': ['High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High', 'Medium', 'Low', 'High', 'Medium'],
    'Street Quality': ['Good', 'Average', 'Bad', 'Excellent', 'Good', 'Bad', 'Excellent', 'Good', 'Bad', 'Excellent', 'Good'],
    'Events': ['Many', 'Some', 'Few', 'Many', 'Some', 'Few', 'Many', 'Some', 'Few', 'Many', 'Some'],
    'Happiness': ['Happy', 'Content', 'Sad', 'Very Happy', 'Happy', 'Sad', 'Very Happy', 'Happy', 'Sad', 'Very Happy', 'Happy']
}

# Create a DataFrame
df = pd.DataFrame(data)

# Filter the data where `Information Availability` is 'High'
high_info_df = df[df['Information Availability'] == 'High']

# Find the most frequent value in the `House Cost` column
most_common_house_cost = high_info_df['House Cost'].mode()

# Display the result
print(f"The most common house cost in cities with high information availability is: {most_common_house_cost}")

The most common house cost in cities with high information availability is: 0      High
1    Medium
Name: House Cost, dtype: object


In [20]:
high_info_df['House Cost']

0      High
3    Medium
6      High
9    Medium
Name: House Cost, dtype: object

In [21]:
most_common_house_cost

0      High
1    Medium
Name: House Cost, dtype: object