In [8]:
import os
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import csv
import re

In [9]:
# OCR every page of a PDF and return the combined text
def pdf_to_text(pdf_path):
    """Return the OCR text of all pages of the PDF at ``pdf_path``.

    The PDF is turned into page images with ``convert_from_path``; each
    image is passed to ``pytesseract.image_to_string`` and the per-page
    results are concatenated in page order with no extra separator.
    """
    page_images = convert_from_path(pdf_path)
    return "".join(pytesseract.image_to_string(page) for page in page_images)

# Function to extract text from an image file (like PNG)
def image_to_text(image_path):
    """Return the OCR text of the image stored at ``image_path``.

    Args:
        image_path: Path of an image file that PIL can open (e.g. a PNG).

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.
    """
    # Image.open keeps the underlying file open until the image is closed;
    # the context manager releases the handle as soon as OCR has finished,
    # which matters when looping over many files.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)


In [10]:
# Define the fields and their regex patterns.
# Each key is a CSV column name; each value is a regex whose group 1 captures
# the text that follows "<label> :" up to the end of that line.
fields = {
    # "Policy Number" is also the tail of "Expiring Policy Number". The
    # negative lookbehind stops this pattern from matching inside that longer
    # label and reporting the expiring number as the current one.
    "Policy Number": r"(?<!Expiring\s)Policy Number\s*:\s*(.*)",
    "Expiring Policy Number": r"Expiring Policy Number\s*:\s*(.*)",
    "Insured Name": r"Insured Name\s*:\s*(.*)",
    "Policy Effective Date": r"Policy Effective Date\s*:\s*(.*)",
    "Policy Expiration Date": r"Policy Expiration Date\s*:\s*(.*)",
    "Insurance Limit": r"Insurance Limit\s*:\s*(.*)",
    "Risk State": r"Risk State\s*:\s*(.*)",
    "Insured Address": r"Insured Address\s*:\s*(.*)",
    "Location Address": r"Location Address\s*:\s*(.*)",
    "Class of Business/Risk Code": r"Class of Business/Risk Code\s*:\s*(.*)",
    "Premium": r"Premium\s*:\s*(.*)",
    "Broker Fee": r"Broker Fee\s*:\s*(.*)",
    "Taxes": r"Taxes\s*:\s*(.*)",
    "Carrier/Insurer Name": r"Carrier/Insurer Name\s*:\s*(.*)",
    "Carrier Percentage": r"Carrier Percentage\s*:\s*(.*)",
    # Accept both the typographic (’) and the plain (') apostrophe, since the
    # text being searched may contain either form.
    "Lloyd’s Participation": r"Lloyd[’']s Participation\s*:\s*(.*)",
    "Inspection Fee": r"Inspection Fee\s*:\s*(.*)",
    "Stamping Fee": r"Stamping Fee\s*:\s*(.*)",
}

In [11]:
# Function to extract required fields using regex
def extract_fields(text, field_patterns=None):
    """Extract one value per configured field from a block of text.

    Args:
        text: The text to search. ``None`` is treated as an empty string so
            that every field is simply reported as missing.
        field_patterns: Optional mapping of field name -> regex pattern whose
            first capture group holds the value. When omitted, the
            module-level ``fields`` mapping is used.

    Returns:
        dict: One entry per field, in the mapping's iteration order. The value
        is the stripped group 1 of the first case-insensitive match, or
        ``None`` when the pattern does not match.
    """
    if field_patterns is None:
        field_patterns = fields
    if text is None:
        # re.search raises TypeError on None; an absent text has no fields.
        text = ""

    extracted_data = {}
    for field, pattern in field_patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        extracted_data[field] = match.group(1).strip() if match else None

    return extracted_data



In [13]:
# Directory containing PDF and PNG files
# NOTE(review): both paths below are absolute, machine-specific locations;
# adjust them before running this cell on another machine.
pdf_dir = r"C:\Users\Sanchana\Desktop\Extraction\SampleDoc"
# Compare extensions case-insensitively so files such as "SCAN.PDF" are not
# silently skipped, and sort the listing because os.listdir returns entries
# in arbitrary order (sorted order keeps the CSV rows reproducible).
pdf_files = sorted(
    os.path.join(pdf_dir, name)
    for name in os.listdir(pdf_dir)
    if name.lower().endswith(('.pdf', '.png'))
)
csv_file = r"C:\Users\Sanchana\Desktop\Extraction\extracted_text.csv"

# Open the CSV file in write mode and write one row of extracted fields per input file
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    writer.writerow(['File Name'] + list(fields.keys()))  # Write header

    for pdf_file in pdf_files:
        try:
            # PDFs are OCR'd page by page; everything else that passed the
            # extension filter is treated as a single image.
            if pdf_file.lower().endswith('.pdf'):
                extracted_text = pdf_to_text(pdf_file)
            else:
                extracted_text = image_to_text(pdf_file)

            # Extract fields from the text
            extracted_fields = extract_fields(extracted_text)

            # Write to CSV. Values are looked up by header key so each cell
            # lines up with its column regardless of the result's dict order.
            writer.writerow(
                [os.path.basename(pdf_file)]
                + [extracted_fields.get(column) for column in fields]
            )

            # Print the output to the console
            print(f"File Name: {os.path.basename(pdf_file)}")
            for field, value in extracted_fields.items():
                print(f"{field}: {value}")
            print("\n" + "-"*40 + "\n")  # Separator for clarity

        except Exception as e:
            # Best effort: report the failure and continue with the next file;
            # no CSV row is written for a file that failed.
            print(f"Error processing {pdf_file}: {e}")

File Name: Sample-document-1.pdf
Policy Number: None
Expiring Policy Number: None
Insured Name: None
Policy Effective Date: None
Policy Expiration Date: None
Insurance Limit: Two million USD ($2,000,000.00)
Risk State: None
Insured Address: None
Location Address: None
Class of Business/Risk Code: None
Premium: None
Broker Fee: None
Taxes: None
Carrier/Insurer Name: None
Carrier Percentage: None
Lloyd’s Participation: None
Inspection Fee: None
Stamping Fee: $100

----------------------------------------

File Name: Screenshot 2024-09-21 222056.png
Policy Number: None
Expiring Policy Number: None
Insured Name: None
Policy Effective Date: None
Policy Expiration Date: None
Insurance Limit: None
Risk State: None
Insured Address: None
Location Address: None
Class of Business/Risk Code: None
Premium: None
Broker Fee: None
Taxes: None
Carrier/Insurer Name: None
Carrier Percentage: None
Lloyd’s Participation: None
Inspection Fee: None
Stamping Fee: None

----------------------------------------