In [4]:
import os
import fitz

def convert_pdf_to_images(pdf_path, output_folder, dpi=600):
    """Render every page of a PDF to a PNG file inside ``output_folder``.

    Each page is saved as ``page_<n>.png`` (``n`` is 1-based) and one progress
    line per page is printed.

    Args:
        pdf_path: Path of the PDF, passed straight to ``fitz.open``.
        output_folder: Directory that receives the PNG files. It is created
            (including missing parents) when it does not exist yet.
        dpi: Target resolution. Pages are scaled by ``dpi / 72`` on both axes,
            72 DPI being PyMuPDF's unscaled rendering resolution.

    Returns:
        None. The results are the files written to ``output_folder``.
    """
    # exist_ok=True replaces the exists()/makedirs() pair, which could fail if
    # the folder appeared between the check and the creation.
    os.makedirs(output_folder, exist_ok=True)

    pdf_document = fitz.open(pdf_path)
    try:
        # The scaling matrix is the same for every page, so build it once.
        zoom = dpi / 72
        scale = fitz.Matrix(zoom, zoom)

        for page_number in range(len(pdf_document)):
            page = pdf_document.load_page(page_number)
            pix = page.get_pixmap(matrix=scale)

            image_path = os.path.join(output_folder, f"page_{page_number + 1}.png")
            pix.save(image_path)
            print(f"Page {page_number + 1} saved as {image_path}")
    finally:
        # Close the document even when rendering or saving a page raises,
        # so the file handle is not leaked in the long-lived kernel.
        pdf_document.close()

# Path to your PDF file
# NOTE(review): absolute, user-specific Windows path; as written this cell
# only runs on a machine that has this exact file.
pdf_path = "C:/Users/Shreshtha/Labelled_MNS_Sample.pdf"

# Output folder for images
# Relative path, so it is resolved against the kernel's current working directory.
output_folder = 'output_images2'

# Convert PDF to images with high quality (600 DPI, the value passed below)
convert_pdf_to_images(pdf_path, output_folder, dpi=600)


Page 1 saved as output_images2\page_1.png
Page 2 saved as output_images2\page_2.png
Page 3 saved as output_images2\page_3.png
Page 4 saved as output_images2\page_4.png
Page 5 saved as output_images2\page_5.png
Page 6 saved as output_images2\page_6.png
Page 7 saved as output_images2\page_7.png


In [7]:
# Export the service-account key location through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable for this kernel process.
# NOTE(review): hard-coded, user-specific absolute path to a credentials file;
# any value already present in the environment is overwritten unconditionally.
json_key_file_path = "C:/Users/Shreshtha/Downloads/balmy-outcome-412805-e9aa761e058c.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = json_key_file_path

In [9]:
from google.cloud import vision_v1
from google.cloud.vision_v1 import types
import os

# Update the function to accept a folder path instead of a single image path
def detect_text_folder(folder_path):
    """Run Cloud Vision text detection on each PNG/JPG in a folder and print it.

    For every matching file a header line is printed, followed by the
    ``description`` of each returned text annotation wrapped in double quotes.

    Args:
        folder_path: Directory whose ``.png`` / ``.jpg`` files (extension
            matched case-insensitively) are sent to the Vision API.

    Raises:
        RuntimeError: If the Vision API reports an error for an image.
    """
    client = vision_v1.ImageAnnotatorClient()

    # os.listdir returns entries in arbitrary order; sort so the printed
    # output is deterministic from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith((".png", ".jpg")):
            continue

        image_path = os.path.join(folder_path, filename)
        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        image = types.Image(content=content)
        response = client.text_detection(image=image)

        # API-level failures are reported in the response body, not raised;
        # without this check a failed image would look like an empty result.
        if response.error.message:
            raise RuntimeError(
                f"Vision API error for {filename}: {response.error.message}"
            )

        print(f"Text extracted from {filename}:")
        for text in response.text_annotations:
            print(f'"{text.description}"')

# Call the function with the folder path
detect_text_folder("C:/Users/Shreshtha/output_images2/")



Text extracted from page_1.png:
"Title ICICI Ltd.
=
Basic sum insured
Premium
Hosptalization benefits
1 Waiting period
2 Hospital accomodation
3 Pre-hospitalization
4 Post hospitalization
5 Hospital cash/Daily Cash
6 Emergency ambulance
7 Organ donor expense
Co-payment feature
8/Annual deductible
9 Day care procedures
10 Domicilliary hospitalization
Maternity benefits
12 (delivery expenses)
13 New born baby cover
Renewal benefits
14 Renewal benefits
15 Loading on claims
ICICI Lombard
Ihealth
10 lacs
10,643
16 Health checkup
Pre-existing diseases:
2 years
Specific illnesses/
treatments: 2 years
No restriction
/sub-limits
30 days
60 days after
No
Rs. 1500 per
hospitalization
Not covered
Not applicable
140 day care procedures
covered
No
Waiting period of 36 months
None
No Claim :
Additional 10% sum insured
at the time of renewal for
every claim free year.
In case of a claim,
cumulative additional sum
insured to go down by 50%
NA
Max Bupa
Heartbeat Gold
10 lacs
22,696
Yes
Pre-existing dise

Text extracted from page_2.png:
"Title = MNS-937434854 ABC Ltd.
Group Name
Group Number
Renewal Date
ABC Ltd
Copay
IN Coinsurance %
IN Deductible
Out of Network (OON):
OON Coinsurance %
OON Deductible
123
01-01-2024
No of employees enrolled
Group State
CT
Evaluation Riet: The document was created with SpirLS for Python
Large
In Network (IN):
20
30/45
100/0
2500/5000
100/0
2500/5000"
"Title"
"="
"MNS"
"-"
"937434854"
"ABC"
"Ltd."
"Group"
"Name"
"Group"
"Number"
"Renewal"
"Date"
"ABC"
"Ltd"
"Copay"
"IN"
"Coinsurance"
"%"
"IN"
"Deductible"
"Out"
"of"
"Network"
"("
"OON"
")"
":"
"OON"
"Coinsurance"
"%"
"OON"
"Deductible"
"123"
"01-01-2024"
"No"
"of"
"employees"
"enrolled"
"Group"
"State"
"CT"
"Evaluation"
"Riet"
":"
"The"
"document"
"was"
"created"
"with"
"SpirLS"
"for"
"Python"
"Large"
"In"
"Network"
"("
"IN"
")"
":"
"20"
"30/45"
"100/0"
"2500/5000"
"100/0"
"2500/5000"
Text extracted from page_3.png:
"Title = Apollo Pvt. Ltd
X
X
C:\Users\Shreshthal
Downloads\List of
Orders.csv
C:\Users\Sh

Text extracted from page_6.png:
"Title Piramal Finance Corp.
Plan Design:
Office Copay:
ER Copay:
PCP:
Specialist:
Hospital Copay
Inpatient:
Coinsurance:
Single Deductible:
Family Deductible:
Single M.O.O.P.
Family M.O.O.P.
Financial Accumulation Period:
No Charge after Deductible
No Charge after Deductible
No Charge after Deductible
$2,850
$5,700
In-Network
None
$4,000
$8,000
No Charge after Deductible
Outpatient Hospital Setting:
No Charge after Deductible
Calendar Year
KACTOHP63RD
********
Group Financial Information
Please see Rate Model for available plan combinations
Please see Rate Model for available plan combinations
Modified Cost Share:
III
UCR:
Prescription Plan
100% of Medicare
Single Deductible:
Family Deductible:
Coinsurance:
Out-of-Network
Single M.O.O.P.
Family M.O.O.P.
Link to PDML:
Rx Tracking ID:
$2,850
$5,700
30%
$5,850
http://pdml/
$11,700
Modified UCR:
Standard
Modified Cost Share:"
"Title"
"Piramal"
"Finance"
"Corp."
"Plan"
"Design"
":"
"Office"
"Copay"
":"
"ER"


In [10]:
import os
from google.cloud import vision
import io

# Initialize the Vision API client
client = vision.ImageAnnotatorClient()

# Path to the folder containing images
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
folder_path = "C:/Users/Shreshtha/output_images2/"

# Iterate through the image files in the folder. The listing is sorted because
# os.listdir returns entries in arbitrary order, and the extension is matched
# case-insensitively so ".PNG" / ".JPG" files are not skipped.
for filename in sorted(os.listdir(folder_path)):
    if not filename.lower().endswith(('.jpg', '.png')):
        continue

    image_path = os.path.join(folder_path, filename)

    # Load the raw image bytes
    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    # Perform text detection
    response = client.text_detection(image=image)

    # API-level failures are reported in the response body, not raised;
    # without this check a failed image would look like an empty result.
    if response.error.message:
        raise RuntimeError(
            "Vision API error for {}: {}".format(filename, response.error.message)
        )

    texts = response.text_annotations

    # Print each annotation's text followed by its bounding polygon corners
    print(f"Text detected in {filename}:")
    for text in texts:
        print('\n"{}"'.format(text.description))

        vertices = ['({},{})'.format(vertex.x, vertex.y)
                    for vertex in text.bounding_poly.vertices]

        print('bounds: {}'.format(','.join(vertices)))


Text detected in page_1.png:

"Title ICICI Ltd.
=
Basic sum insured
Premium
Hosptalization benefits
1 Waiting period
2 Hospital accomodation
3 Pre-hospitalization
4 Post hospitalization
5 Hospital cash/Daily Cash
6 Emergency ambulance
7 Organ donor expense
Co-payment feature
8/Annual deductible
9 Day care procedures
10 Domicilliary hospitalization
Maternity benefits
12 (delivery expenses)
13 New born baby cover
Renewal benefits
14 Renewal benefits
15 Loading on claims
ICICI Lombard
Ihealth
10 lacs
10,643
16 Health checkup
Pre-existing diseases:
2 years
Specific illnesses/
treatments: 2 years
No restriction
/sub-limits
30 days
60 days after
No
Rs. 1500 per
hospitalization
Not covered
Not applicable
140 day care procedures
covered
No
Waiting period of 36 months
None
No Claim :
Additional 10% sum insured
at the time of renewal for
every claim free year.
In case of a claim,
cumulative additional sum
insured to go down by 50%
NA
Max Bupa
Heartbeat Gold
10 lacs
22,696
Yes
Pre-existing diseas

Text detected in page_2.png:

"Title = MNS-937434854 ABC Ltd.
Group Name
Group Number
Renewal Date
ABC Ltd
Copay
IN Coinsurance %
IN Deductible
Out of Network (OON):
OON Coinsurance %
OON Deductible
123
01-01-2024
No of employees enrolled
Group State
CT
Evaluation Riet: The document was created with SpirLS for Python
Large
In Network (IN):
20
30/45
100/0
2500/5000
100/0
2500/5000"
bounds: (0,0),(1787,0),(1787,1980),(0,1980)

"Title"
bounds: (87,14),(254,14),(254,82),(87,82)

"="
bounds: (284,14),(327,14),(327,82),(284,82)

"MNS"
bounds: (363,14),(562,14),(562,82),(363,82)

"-"
bounds: (561,14),(591,14),(591,82),(561,82)

"937434854"
bounds: (597,14),(1049,14),(1049,82),(597,82)

"ABC"
bounds: (1082,14),(1261,14),(1261,82),(1082,82)

"Ltd."
bounds: (1297,14),(1443,14),(1443,82),(1297,82)

"Group"
bounds: (197,216),(430,210),(432,282),(199,288)

"Name"
bounds: (457,210),(676,205),(678,277),(459,282)

"Group"
bounds: (197,339),(430,333),(432,405),(199,411)

"Number"
bounds: (454,333),(769

Text detected in page_5.png:

"Title = ABC Ltd.
Please note that all this information is only for demonstration purposes.
STANDARD Plan Tracking ID- SP23704682627985
Group Name
Group Number
Renewal Date
ABC Ltd
In Network (IN):
Copay
IN Coinsurance %
IN Deductible
Out of Network (OON):
OON Coinsurance %
OON Deductible
123
01-01-2024
No of employees enrolled
20
Evaluation Group State document was created with Spire.XLS for Python
Market
Large
30/45
100/0
2500/5000
100/0
2500/5000"
bounds: (3,0),(3142,0),(3142,2743),(3,2743)

"Title"
bounds: (86,14),(254,14),(254,83),(86,83)

"="
bounds: (284,14),(326,14),(326,83),(284,83)

"ABC"
bounds: (362,14),(544,14),(544,83),(362,83)

"Ltd."
bounds: (578,14),(724,14),(724,83),(578,83)

"Please"
bounds: (200,199),(433,200),(433,278),(200,277)

"note"
bounds: (463,201),(628,202),(628,279),(463,278)

"that"
bounds: (650,202),(804,203),(804,281),(650,280)

"all"
bounds: (828,203),(909,203),(909,281),(828,281)

"this"
bounds: (933,203),(1065,204),(1065,

Text detected in page_7.png:

"Spire.XLS for Python
e-iceblue Inc. 2002-2024 All rights reserverd
Home page
https://www.e-iceblue.com
Contact US
mailto:support@e-iceblue.com
Buy Now!
https://www.e-iceblue.com/Buy/Spire.XLS-Python.html"
bounds: (8,633),(3278,633),(3278,1919),(8,1919)

"Spire.XLS"
bounds: (904,633),(1254,633),(1254,712),(904,712)

"for"
bounds: (1272,633),(1387,633),(1387,712),(1272,712)

"Python"
bounds: (1409,633),(1673,633),(1673,712),(1409,712)

"e"
bounds: (905,753),(948,753),(948,828),(905,828)

"-"
bounds: (952,753),(976,753),(976,828),(952,828)

"iceblue"
bounds: (979,753),(1249,753),(1249,828),(979,828)

"Inc."
bounds: (1275,753),(1406,753),(1406,828),(1275,828)

"2002-2024"
bounds: (1427,753),(1828,753),(1828,828),(1427,828)

"All"
bounds: (1849,753),(1953,753),(1953,828),(1849,828)

"rights"
bounds: (1974,753),(2186,753),(2186,828),(1974,828)

"reserverd"
bounds: (2213,753),(2566,753),(2566,828),(2213,828)

"Home"
bounds: (910,993),(1127,1005),(1123,1067),(906