In [9]:
import os
import zipfile
import mimetypes
import io
import csv
import re
from PIL import Image
import xml.etree.ElementTree as ET
from urllib.parse import unquote
import shutil

def process_embedded_excel_files(input_xlsx_path, output_folder):
    def extract_embedded_files(file_path, save_path):
        """
        Extracts arbitrary embedded files from an Excel (xlsx) file and saves them.

        Parameters:
        ----------
        file_path : str,
            The path to the xlsx file.

        save_path : str,
            Directory path to save the extracted files.
        """
            # Create the directory if it doesn't exist
        if not os.path.exists(save_path):
            os.makedirs(save_path)

        with zipfile.ZipFile(file_path, 'r') as zip_file:
            # List all files in the archive
            all_files = zip_file.namelist()

            # Iterate through all files in the archive
            for file_info in all_files:
                # Extract the file content
                file_content = zip_file.read(file_info)

                # Save the extracted file to the specified directory
                extracted_file_path = os.path.join(save_path, os.path.basename(file_info))
                with open(extracted_file_path, 'wb') as f:
                    f.write(file_content)

    def convert_files_to_txt(input_folder, output_folder):
        """
        Converts each file in the input folder to a plain text file and saves it in the output folder.

        Parameters:
        ----------
        input_folder : str,
            The path to the folder containing XML and RELS files.

        output_folder : str,
            The path to the folder where the converted text files will be saved.
        """
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        for filename in os.listdir(input_folder):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

            try:
                with open(input_file_path, 'r', encoding='utf-8') as input_file:
                    # Read the content of the input file
                    file_content = input_file.read()

                # Optionally, you can parse XML content if needed
                # For example, if the file is in XML format
                # tree = ET.fromstring(file_content)
                # parsed_content = ET.tostring(tree, encoding='utf-8').decode('utf-8')

            # Write the content to the output text file
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    output_file.write(file_content)
            except Exception as e:
                print(f"Error processing file {input_file_path}: {e}")
    def extract_text_from_folder(folder_path):
        """
        Extracts text content matching the pattern from text files within a folder.

        Parameters:
        ----------
        folder_path : str,
            The path to the folder containing text files.

        Returns:
        -------
        matching_paths : list,
            A list of concatenated file paths found within the text files.
        """
        matching_paths = []

        # Iterate through each file in the folder
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)

            # Check if it's a text file
            if os.path.isfile(file_path) and filename.lower().endswith('.txt'):
                try:
                    # Read the content of the text file
                    with open(file_path, 'r', encoding='utf-8') as file:
                        file_content = file.read()

                    # Use format() to build the string
                    matches = re.findall(r'file:///C:\\Users\\([^"]+)', file_content)
                    matching_paths.extend(["C:/Users/{}".format(match.replace('\\', '/')) for match in matches])
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")

        return matching_paths

    def fetch_files(matching_paths, output_folder):
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        for file_path in matching_paths:
            try:
                file_name = os.path.basename(unquote(file_path))
                shutil.copy(unquote(file_path), os.path.join(output_folder, file_name))
            except Exception as e:
                print(f"Error fetching file {file_path}: {e}")

    # Create output folders
    extracted_folder = os.path.join(output_folder, 'extracted_embedded_files')
    converted_folder = os.path.join(output_folder, 'converted_text_files')
    fetched_folder = os.path.join(output_folder, 'fetched_excel_files')

    # Extract embedded files
    extract_embedded_files(input_xlsx_path, extracted_folder)

    # Convert files to text
    convert_files_to_txt(extracted_folder, converted_folder)

    # Extract matching paths from text files
    matching_paths = extract_text_from_folder(converted_folder)

    # Fetch and copy Excel files
    fetch_files(matching_paths, fetched_folder)

    # Delete 'converted_text_files' and 'extracted_embedded_files' folders
    shutil.rmtree(converted_folder)
    shutil.rmtree(extracted_folder)
    
    

# Example usage:
input_xlsx_path = "C:/Users/Shreshtha/Downloads/Sample Source of Truth.xlsx"
output_folder = "C:/Users/Shreshtha/Downloads/output_final2"
process_embedded_excel_files(input_xlsx_path, output_folder)

Error processing file C:/Users/Shreshtha/Downloads/output_final2\extracted_embedded_files\image1.jpeg: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/output_final2\extracted_embedded_files\image2.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/output_final2\extracted_embedded_files\image3.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/output_final2\extracted_embedded_files\image4.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/output_final2\extracted_embedded_files\image5.png: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/output_final2\extracted_embedded_files\image6.jpeg: 'utf-8' codec can't decode b

In [13]:
import pandas as pd

def extract_and_print_table(excel_path):
    xls = pd.ExcelFile(excel_path)

    for sheet_name in xls.sheet_names:
        # Read each sheet from the Excel file
        df = pd.read_excel(xls, sheet_name)

        # Print the tabular data for each sheet
        print(f"\nSheet: {sheet_name}")
        print(df)

# Example usage
excel_path = "C:/Users/Shreshtha/Downloads/Sample Source of Truth.xlsx"
extract_and_print_table(excel_path)



Sheet: Sheet1
Empty DataFrame
Columns: []
Index: []

Sheet: Sheet2
                  Group Name              ABC Ltd
0               Group Number                  123
1               Renewal Date  2024-01-01 00:00:00
2                        NaN                  NaN
3                        NaN                  NaN
4   No of employees enrolled                   20
5                Group State                   CT
6                     Market                Large
7           In Network (IN):                  NaN
8                      Copay                30/45
9           IN Coinsurance %                100/0
10             IN Deductible            2500/5000
11     Out of Network (OON):                  NaN
12         OON Coinsurance %                100/0
13            OON Deductible            2500/5000

Sheet: Sheet3
                                                 Name  \
0                       10-Day Green Smoothie Cleanse   
1                                   11/22/63: A Novel

In [27]:
import pandas as pd
from IPython.display import display

def extract_and_print_table(excel_path):
    xls = pd.ExcelFile(excel_path)

    for sheet_name in xls.sheet_names:
        # Read each sheet from the Excel file
        df = pd.read_excel(xls, sheet_name)

        # Print the tabular data for each sheet using display for better formatting
        print(f"\nSheet: {sheet_name}")
        display(df)

# Example usage
excel_path = "C:/Users/Shreshtha/Downloads/Sample Source of Truth.xlsx"
extract_and_print_table(excel_path)



Sheet: Sheet1



Sheet: Sheet2


Unnamed: 0,Group Name,ABC Ltd
0,Group Number,123
1,Renewal Date,2024-01-01 00:00:00
2,,
3,,
4,No of employees enrolled,20
5,Group State,CT
6,Market,Large
7,In Network (IN):,
8,Copay,30/45
9,IN Coinsurance %,100/0



Sheet: Sheet3


Unnamed: 0,Name,Author,User Rating,Reviews,Price,Year,Genre
0,10-Day Green Smoothie Cleanse,JJ Smith,4.7,17350,8,2016,Non Fiction
1,11/22/63: A Novel,Stephen King,4.6,2052,22,2011,Fiction
2,12 Rules for Life: An Antidote to Chaos,Jordan B. Peterson,4.7,18979,15,2018,Non Fiction
3,1984 (Signet Classics),George Orwell,4.7,21424,6,2017,Fiction
4,"5,000 Awesome Facts (About Everything!) (Natio...",National Geographic Kids,4.8,7665,12,2019,Non Fiction
5,A Dance with Dragons (A Song of Ice and Fire),George R. R. Martin,4.4,12643,11,2011,Fiction
6,A Game of Thrones / A Clash of Kings / A Storm...,George R. R. Martin,4.7,19735,30,2014,Fiction
7,A Gentleman in Moscow: A Novel,Amor Towles,4.7,19699,15,2017,Fiction
8,"A Higher Loyalty: Truth, Lies, and Leadership",James Comey,4.7,5983,3,2018,Non Fiction
9,A Man Called Ove: A Novel,Fredrik Backman,4.6,23848,8,2016,Fiction



Sheet: Sheet4



Sheet: Sheet5



Sheet: Sheet6



Sheet: Sheet7


In [14]:
import fitz  # PyMuPDF
from PIL import Image
import pytesseract

def extract_text_from_pdf(pdf_path):
    text_content = []

    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    for page_number in range(pdf_document.page_count):
        # Get the page
        page = pdf_document[page_number]

        # Convert the page to an image
        image_list = page.get_pixmap()

        # Convert the image to a PIL Image
        pil_image = Image.frombytes("RGB", [image_list.width, image_list.height], image_list.samples)

        # Use Tesseract OCR to extract text from the image
        text = pytesseract.image_to_string(pil_image, lang='eng')

        # Append the extracted text to the result list
        text_content.append(text)

    # Close the PDF document
    pdf_document.close()

    return text_content

# Example usage
pdf_path = "C:/Users/Shreshtha/Downloads/Sample Source of Truth.pdf"
result = extract_text_from_pdf(pdf_path)

# Print the extracted text for each page
for i, page_text in enumerate(result):
    print(f"Page {i + 1} Text:\n{page_text}\n{'='*50}\n")


Page 1 Text:
TEC Lombard Wax Bupa ‘poli
theath Heartbeat Gold opti
Gasiesuminsured a0laes a0lacs a
Premium 10,643 72,656 1
| Aitesptatization benefits [PP
Pre-existing diseases: | Pre-existing disesres: | Pre-existi
2years 2ysars fi
specific ilnesses/ Specific specifi
alWatting periog treatments:2years __|itinesses/treatments: None| _treatme
No restriction ‘No restriction Nor
2|Hospital accomodation sibel Zsub-timts Jsu
[Pre-hospitalization a0days 20days a
<4lpost hospitalization codays after 0 days after 18
onivine,
Rs 800perd
[Hospital eash/Daily Cash No Not applicable ot
“tual cost at Network
Rs. 1500 per hospital, Re2000 per Rei
Emergency ambulance hospitalization | hosptalization otherwice hospi
Tlorgan donor expense Not covered ‘Covered a
[co-payment feature
|/Annual deductible Not applicable Not applicable Nota
Ta0 daycare procedures | Allday care procedures | All day ca
s|Day care procedures covered covered ce
40|Domiciliary hospitalization No Yes, upto Rs 50,000
11)atternative

In [17]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/Shreshtha/Downloads/balmy-outcome-412805-e9aa761e058c.json"


In [28]:
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Replace the placeholder values with your actual project and processor information
project_id = 'balmy-outcome-412805'
location = "us"  # Format is "us" or "eu"
processor_id = '1c327dd87f42b98b'  # Create processor before running sample
processor_version = "rc"  # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
file_path = "C:/Users/Shreshtha/Downloads/4F82B400.pdf"
mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types


def process_document_form_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Read the table and form fields output from the processor
    # The form processor also contains OCR data. For more information
    # on how to parse OCR data please see the OCR sample.

    text = document.text
    print(f"Full document text: {repr(text)}\n")
    print(f"There are {len(document.pages)} page(s) in this document.")

    # Read the form fields and tables output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"\nFound {len(page.tables)} table(s):")
        for table in page.tables:
            num_columns = len(table.header_rows[0].cells)
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows
            print("Columns:")
            print_table_rows(table.header_rows, text)
            # Print body rows
            print("Table body data:")
            print_table_rows(table.body_rows, text)

        print(f"\nFound {len(page.form_fields)} form field(s):")
        for field in page.form_fields:
            name = layout_to_text(field.field_name, text)
            value = layout_to_text(field.field_value, text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

    # Supported in version `pretrained-form-parser-v2.0-2022-11-10` and later.
    # For more information: https://cloud.google.com/document-ai/docs/form-parser
    if document.entities:
        print(f"Found {len(document.entities)} generic entities:")
        for entity in document.entities:
            print_entity(entity)
            # Print Nested Entities
            for prop in entity.properties:
                print_entity(prop)

    return document


def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    for table_row in table_rows:
        row_text = ""
        for cell in table_row.cells:
            cell_text = layout_to_text(cell.layout, text)
            row_text += f"{repr(cell_text.strip())} | "
        print(row_text)


def print_entity(entity: documentai.Document.Entity) -> None:
    # Fields detected. For a full list of fields for each processor see
    # the processor documentation:
    # https://cloud.google.com/document-ai/docs/processors-list
    key = entity.type_

    # Some other value formats in addition to text are available
    # e.g. dates: `entity.normalized_value.date_value.year`
    text_value = entity.text_anchor.content
    confidence = entity.confidence
    normalized_value = entity.normalized_value.text
    print(f"    * {repr(key)}: {repr(text_value)}({confidence:.1%} confident)")

    if normalized_value:
        print(f"    * Normalized Value: {repr(normalized_value)}")


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# Call the main function to process the document
document_result = process_document_form_sample(project_id, location, processor_id, processor_version, file_path, mime_type)

# The result is a `documentai.Document` object. You can access its attributes as needed.
# For example, you can print the extracted text and form fields using the provided functions.
print(f"Full document text: {repr(document_result.text)}")
print(f"There are {len(document_result.pages)} page(s) in this document.")

# Print form fields and tables
for page in document_result.pages:
    # ... (print form fields and tables as needed)
    pass

# Print generic entities (supported in version `pretrained-form-parser-v2.0-2022-11-10` and later)
if document_result.entities:
    print(f"Found {len(document_result.entities)} generic entities:")
    for entity in document_result.entities:
        print_entity(entity)
        # Print Nested Entities
        for prop in entity.properties:
            print_entity(prop)


Full document text: "ICICI Lombard\nMax Bupa\nApoll\nIhealth\nHeartbeat Gold\nOptin\nBasic sum insured\n10 lacs\n10 lacs\nPremium\n10,643\n22,696\n1\nHosptalization benefits\nPre-existing diseases:\nPre-existing diseases:\nPre-existi\n2 years\n2 years\n}Specifi\nSpecific illnesses/\nSpecificillnesses/treatments: None\n1 Waiting period\ntreatments: 2 years\ntreatme\nNo restriction\nNo restriction\nNo re\n2 Hospital accomodation\n/sub-limits\n/sub-limits\n/su\n3 Pre-hospitalization\n30 days\n30 days\n60\n4 Post hospitalization\n60 days after\n60 days after\n18\nOnly in c\naccon\nRs 800 per d\n5 Hospital cash/Daily Cash\nNo\nNot applicable\nof\nActual cost at Network\nRs. 1500 per\nhospital, Rs 2000 per\nRs 2\nhospitalization\nhosptalization otherwise\nhospi\nNot covered\nCovered\nCc\n6 Emergency ambulance7 Organ donor expenseCo-payment feature8/Annual deductible\nNot applicable\nNot applicable\nNot a\n140 day care procedures\nAll day care procedures\nAll day cal\ncovered\ncovered\nCO\n9 

In [29]:
from typing import Optional, Sequence

from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Replace the placeholder values with your actual project and processor information
project_id = 'balmy-outcome-412805'
location = "us"  # Format is "us" or "eu"
processor_id = '1c327dd87f42b98b'  # Create processor before running sample
processor_version = "rc"  # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
file_path = "C:/Users/Shreshtha/Downloads/4F82B400.pdf"
mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

def process_document_ocr_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Optional: Additional configurations for Document OCR Processor.
    # For more information: https://cloud.google.com/document-ai/docs/enterprise-document-ocr
    process_options = documentai.ProcessOptions(
        ocr_config=documentai.OcrConfig(
            enable_native_pdf_parsing=True,
            enable_image_quality_scores=True,
            enable_symbol=True,
            # Comment out the premium_features section to disable premium OCR features
            # premium_features=documentai.OcrConfig.PremiumFeatures(
            #     compute_style_info=True,
            #     enable_math_ocr=False,  # Enable to use Math OCR Model
            #     enable_selection_mark_detection=True,
            # ),
        )
    )
    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    text = document.text
    print(f"Full document text: {text}\n")
    print(f"There are {len(document.pages)} page(s) in this document.\n")

    for page in document.pages:
        print(f"Page {page.page_number}:")
        print_page_dimensions(page.dimension)
        print_detected_languages(page.detected_languages)

        print_blocks(page.blocks, text)
        print_paragraphs(page.paragraphs, text)
        print_lines(page.lines, text)
        print_tokens(page.tokens, text)

        if page.symbols:
            print_symbols(page.symbols, text)

        if page.image_quality_scores:
            print_image_quality_scores(page.image_quality_scores)

        if page.visual_elements:
            print_visual_elements(page.visual_elements, text)


def print_page_dimensions(dimension: documentai.Document.Page.Dimension) -> None:
    print(f"    Width: {str(dimension.width)}")
    print(f"    Height: {str(dimension.height)}")


def print_detected_languages(
    detected_languages: Sequence[documentai.Document.Page.DetectedLanguage],
) -> None:
    print("    Detected languages:")
    for lang in detected_languages:
        print(f"        {lang.language_code} ({lang.confidence:.1%} confidence)")


def print_blocks(blocks: Sequence[documentai.Document.Page.Block], text: str) -> None:
    print(f"    {len(blocks)} blocks detected:")
    first_block_text = layout_to_text(blocks[0].layout, text)
    print(f"        First text block: {repr(first_block_text)}")
    last_block_text = layout_to_text(blocks[-1].layout, text)
    print(f"        Last text block: {repr(last_block_text)}")


def print_paragraphs(
    paragraphs: Sequence[documentai.Document.Page.Paragraph], text: str
) -> None:
    print(f"    {len(paragraphs)} paragraphs detected:")
    first_paragraph_text = layout_to_text(paragraphs[0].layout, text)
    print(f"        First paragraph text: {repr(first_paragraph_text)}")
    last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)
    print(f"        Last paragraph text: {repr(last_paragraph_text)}")


def print_lines(lines: Sequence[documentai.Document.Page.Line], text: str) -> None:
    print(f"    {len(lines)} lines detected:")
    first_line_text = layout_to_text(lines[0].layout, text)
    print(f"        First line text: {repr(first_line_text)}")
    last_line_text = layout_to_text(lines[-1].layout, text)
    print(f"        Last line text: {repr(last_line_text)}")


def print_tokens(tokens: Sequence[documentai.Document.Page.Token], text: str) -> None:
    print(f"    {len(tokens)} tokens detected:")
    first_token_text = layout_to_text(tokens[0].layout, text)
    first_token_break_type = tokens[0].detected_break.type_.name
    print(f"        First token text: {repr(first_token_text)}")
    print(f"        First token break type: {repr(first_token_break_type)}")
    if tokens[0].style_info:
        print_style_info(tokens[0].style_info)

    last_token_text = layout_to_text(tokens[-1].layout, text)
    last_token_break_type = tokens[-1].detected_break.type_.name
    print(f"        Last token text: {repr(last_token_text)}")
    print(f"        Last token break type: {repr(last_token_break_type)}")
    if tokens[-1].style_info:
        print_style_info(tokens[-1].style_info)


def print_symbols(
    symbols: Sequence[documentai.Document.Page.Symbol], text: str
) -> None:
    print(f"    {len(symbols)} symbols detected:")
    first_symbol_text = layout_to_text(symbols[0].layout, text)
    print(f"        First symbol text: {repr(first_symbol_text)}")
    last_symbol_text = layout_to_text(symbols[-1].layout, text)
    print(f"        Last symbol text: {repr(last_symbol_text)}")


def print_image_quality_scores(
    image_quality_scores: documentai.Document.Page.ImageQualityScores,
) -> None:
    print(f"    Quality score: {image_quality_scores.quality_score:.1%}")
    print("    Detected defects:")

    for detected_defect in image_quality_scores.detected_defects:
        print(f"        {detected_defect.type_}: {detected_defect.confidence:.1%}")


def print_style_info(style_info: documentai.Document.Page.Token.StyleInfo) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    print(f"           Font Size: {style_info.font_size}pt")
    print(f"           Font Type: {style_info.font_type}")
    print(f"           Bold: {style_info.bold}")
    print(f"           Italic: {style_info.italic}")
    print(f"           Underlined: {style_info.underlined}")
    print(f"           Handwritten: {style_info.handwritten}")
    print(
        f"           Text Color (RGBa): {style_info.text_color.red}, {style_info.text_color.green}, {style_info.text_color.blue}, {style_info.text_color.alpha}"
    )


def print_visual_elements(
    visual_elements: Sequence[documentai.Document.Page.VisualElement], text: str
) -> None:
    """
    Only supported in version `pretrained-ocr-v2.0-2023-06-02`
    """
    checkboxes = [x for x in visual_elements if "checkbox" in x.type]
    math_symbols = [x for x in visual_elements if x.type == "math_formula"]

    if checkboxes:
        print(f"    {len(checkboxes)} checkboxes detected:")
        print(f"        First checkbox: {repr(checkboxes[0].type)}")
        print(f"        Last checkbox: {repr(checkboxes[-1].type)}")

    if math_symbols:
        print(f"    {len(math_symbols)} math symbols detected:")
        first_math_symbol_text = layout_to_text(math_symbols[0].layout, text)
        print(f"        First math symbol: {repr(first_math_symbol_text)}")


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# Call the function and capture the results
process_document_ocr_sample(project_id, location, processor_id, processor_version, file_path, mime_type)


Full document text: ICICI Lombard
Max Bupa
Apoll
Ihealth
Heartbeat Gold
Optin
Basic sum insured
10 lacs
10 lacs
Premium
10,643
22,696
1
Hosptalization benefits
Pre-existing diseases:
Pre-existing diseases:
Pre-existi
2 years
2 years
}Specifi
Specific illnesses/
Specificillnesses/treatments: None
1 Waiting period
treatments: 2 years
treatme
No restriction
No restriction
No re
2 Hospital accomodation
/sub-limits
/sub-limits
/su
3 Pre-hospitalization
30 days
30 days
60
4 Post hospitalization
60 days after
60 days after
18
Only in c
accon
Rs 800 per d
5 Hospital cash/Daily Cash
No
Not applicable
of
Actual cost at Network
Rs. 1500 per
hospital, Rs 2000 per
Rs 2
hospitalization
hosptalization otherwise
hospi
Not covered
Covered
Cc
6 Emergency ambulance7 Organ donor expenseCo-payment feature8/Annual deductible
Not applicable
Not applicable
Not a
140 day care procedures
All day care procedures
All day cal
covered
covered
CO
9 Day care procedures10 Domicilliary hospitalization
No
Yes, upto Rs 5

In [30]:
from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Replace the placeholder values with your actual project and processor information
project_id = 'balmy-outcome-412805'
location = "us"  # Format is "us" or "eu"
processor_id = '1c327dd87f42b98b'  # Create processor before running sample
processor_version = "rc"  # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
file_path = "C:/Users/Shreshtha/Downloads/4F82B400.pdf"
mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

def process_document_ocr_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Optional: Additional configurations for Document OCR Processor.
    # For more information: https://cloud.google.com/document-ai/docs/enterprise-document-ocr
    process_options = documentai.ProcessOptions(
        ocr_config=documentai.OcrConfig(
            enable_native_pdf_parsing=True,
            enable_image_quality_scores=True,
            enable_symbol=True,
            # Comment out the premium_features section to disable premium OCR features
            # premium_features=documentai.OcrConfig.PremiumFeatures(
            #     compute_style_info=True,
            #     enable_math_ocr=False,  # Enable to use Math OCR Model
            #     enable_selection_mark_detection=True,
            # ),
        )
    )
    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    text = document.text
    print(f"Full document text: {text}\n")
    print(f"There are {len(document.pages)} page(s) in this document.\n")

    for page in document.pages:
        print(f"Page {page.page_number} text:")
        print(layout_to_text(page.layout, text))


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# Call the function and capture the results
process_document_ocr_sample(project_id, location, processor_id, processor_version, file_path, mime_type)


Full document text: ICICI Lombard
Max Bupa
Apoll
Ihealth
Heartbeat Gold
Optin
Basic sum insured
10 lacs
10 lacs
Premium
10,643
22,696
1
Hosptalization benefits
Pre-existing diseases:
Pre-existing diseases:
Pre-existi
2 years
2 years
}Specifi
Specific illnesses/
Specificillnesses/treatments: None
1 Waiting period
treatments: 2 years
treatme
No restriction
No restriction
No re
2 Hospital accomodation
/sub-limits
/sub-limits
/su
3 Pre-hospitalization
30 days
30 days
60
4 Post hospitalization
60 days after
60 days after
18
Only in c
accon
Rs 800 per d
5 Hospital cash/Daily Cash
No
Not applicable
of
Actual cost at Network
Rs. 1500 per
hospital, Rs 2000 per
Rs 2
hospitalization
hosptalization otherwise
hospi
Not covered
Covered
Cc
6 Emergency ambulance7 Organ donor expenseCo-payment feature8/Annual deductible
Not applicable
Not applicable
Not a
140 day care procedures
All day care procedures
All day cal
covered
covered
CO
9 Day care procedures10 Domicilliary hospitalization
No
Yes, upto Rs 5

In [31]:
from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Replace the placeholder values with your actual project and processor information
project_id = 'balmy-outcome-412805'
location = "us"  # Format is "us" or "eu"
processor_id = '1c327dd87f42b98b'  # Create processor before running sample
processor_version = "rc"  # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
file_path = "C:/Users/Shreshtha/Downloads/4F82B400.pdf"
mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

def process_document_ocr_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    # Optional: Additional configurations for Document OCR Processor.
    # For more information: https://cloud.google.com/document-ai/docs/enterprise-document-ocr
    process_options = documentai.ProcessOptions(
        ocr_config=documentai.OcrConfig(
            enable_native_pdf_parsing=True,
            enable_image_quality_scores=True,
            enable_symbol=True,
            # Comment out the premium_features section to disable premium OCR features
            # premium_features=documentai.OcrConfig.PremiumFeatures(
            #     compute_style_info=True,
            #     enable_math_ocr=False,  # Enable to use Math OCR Model
            #     enable_selection_mark_detection=True,
            # ),
        )
    )
    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    text = document.text
    print(f"Full document text: {text}\n")
    print(f"There are {len(document.pages)} page(s) in this document.\n")

    for page_num, page in enumerate(document.pages, start=1):
        print(f"Page {page_num} text:")
        print(layout_to_text(page.layout, text))
        print("\n" + "="*50 + "\n")  # Adding a separator for better visibility


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    # You must set the `api_endpoint` if you use a location other than "us".
    client = documentai.DocumentProcessorServiceClient(
        client_options=ClientOptions(
            api_endpoint=f"{location}-documentai.googleapis.com"
        )
    )

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as image:
        image_content = image.read()

    # Configure the process request
    request = documentai.ProcessRequest(
        name=name,
        raw_document=documentai.RawDocument(content=image_content, mime_type=mime_type),
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    result = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return result.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Document AI identifies text in different parts of the document by their
    offsets in the entirety of the document"s text. This function converts
    offsets to a string.
    """
    # If a text segment spans several lines, it will
    # be stored in different text segments.
    return "".join(
        text[int(segment.start_index) : int(segment.end_index)]
        for segment in layout.text_anchor.text_segments
    )


# Call the function and capture the results
process_document_ocr_sample(project_id, location, processor_id, processor_version, file_path, mime_type)


Full document text: ICICI Lombard
Max Bupa
Apoll
Ihealth
Heartbeat Gold
Optin
Basic sum insured
10 lacs
10 lacs
Premium
10,643
22,696
1
Hosptalization benefits
Pre-existing diseases:
Pre-existing diseases:
Pre-existi
2 years
2 years
}Specifi
Specific illnesses/
Specificillnesses/treatments: None
1 Waiting period
treatments: 2 years
treatme
No restriction
No restriction
No re
2 Hospital accomodation
/sub-limits
/sub-limits
/su
3 Pre-hospitalization
30 days
30 days
60
4 Post hospitalization
60 days after
60 days after
18
Only in c
accon
Rs 800 per d
5 Hospital cash/Daily Cash
No
Not applicable
of
Actual cost at Network
Rs. 1500 per
hospital, Rs 2000 per
Rs 2
hospitalization
hosptalization otherwise
hospi
Not covered
Covered
Cc
6 Emergency ambulance7 Organ donor expenseCo-payment feature8/Annual deductible
Not applicable
Not applicable
Not a
140 day care procedures
All day care procedures
All day cal
covered
covered
CO
9 Day care procedures10 Domicilliary hospitalization
No
Yes, upto Rs 5