### Converting the Excel Workbook to PDF

In [2]:
from spire.xls import *
from spire.xls.common import *
# Convert every worksheet of the sample workbook into a single PDF.
# The workbook is disposed in a `finally` block so that it is released even
# when loading or saving fails.
workbook = Workbook()
try:
    # Load an Excel document
    workbook.LoadFromFile("C:/Users/Shreshtha/Downloads/Bartrack Sample.xlsx")

    # Apply the same narrow page margins to every worksheet in the workbook
    for sheet in workbook.Worksheets:
        # Get the PageSetup object
        pageSetup = sheet.PageSetup

        # Set page margins
        pageSetup.TopMargin = 0.3
        pageSetup.BottomMargin = 0.3
        pageSetup.LeftMargin = 0.3
        pageSetup.RightMargin = 0.3

    # Set worksheet to fit to page when converting
    workbook.ConverterSetting.SheetFitToPage = True

    # Convert to PDF file
    workbook.SaveToFile("C:/Users/Shreshtha/Downloads/Bartrack-Sample-pdf.pdf", FileFormat.PDF)
finally:
    workbook.Dispose()

### Initialising Document AI

In [3]:
import os
# Set GOOGLE_APPLICATION_CREDENTIALS to the path of a local service-account key file.
# NOTE(review): this is a user-specific absolute path; presumably the Document AI
# client created in later cells relies on it -- consider configuring it outside the notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "C:/Users/Shreshtha/Downloads/balmy-outcome-412805-e9aa761e058c.json"

### Using Document AI's OCR Processor

In [4]:
from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Configuration for the OCR run below.
# Replace the placeholder values with your actual project and processor information.
project_id = 'balmy-outcome-412805'
location = "us"  # Format is "us" or "eu"
processor_id = '1c327dd87f42b98b'  # Create processor before running sample
processor_version = "rc"  # Refer to https://cloud.google.com/document-ai/docs/manage-processor-versions for more information
# PDF written by the Excel-to-PDF conversion cell above
file_path = "C:/Users/Shreshtha/Downloads/Bartrack-Sample-pdf.pdf"
mime_type = "application/pdf"  # Refer to https://cloud.google.com/document-ai/docs/file-types for supported file types

def process_document_ocr_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> None:
    """
    Process a local file with explicit OCR options and print the text of each page.

    The six arguments are forwarded unchanged to `process_document`; the
    page count is printed first, then every page's text followed by a
    separator line. Nothing is returned.
    """
    # Optional: Additional configurations for Document OCR Processor.
    # For more information: https://cloud.google.com/document-ai/docs/enterprise-document-ocr
    ocr_config = documentai.OcrConfig(
        enable_native_pdf_parsing=True,
        enable_image_quality_scores=True,
        enable_symbol=True,
        # Uncomment the premium_features section to enable premium OCR features
        # premium_features=documentai.OcrConfig.PremiumFeatures(
        #     compute_style_info=True,
        #     enable_math_ocr=False,  # Enable to use Math OCR Model
        #     enable_selection_mark_detection=True,
        # ),
    )
    process_options = documentai.ProcessOptions(ocr_config=ocr_config)

    # Online processing request to Document AI
    document = process_document(
        project_id,
        location,
        processor_id,
        processor_version,
        file_path,
        mime_type,
        process_options=process_options,
    )

    print(f"There are {len(document.pages)} page(s) in this document.\n")

    page_num = 0
    for page in document.pages:
        page_num += 1
        print(f"Page {page_num} text:")
        page_text = layout_to_text(page.layout, document.text)
        print(page_text)
        # Separator between pages for better visibility
        print("\n" + "="*50 + "\n")


def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """
    Send a local file to a Document AI processor version and return the result.

    The file at `file_path` is read fully into memory, wrapped in a
    `RawDocument` with the given `mime_type`, and submitted as a single
    online `ProcessRequest`. The `document` attribute of the response is
    returned.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    resource_name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as source_file:
        raw_bytes = source_file.read()

    # Configure the process request
    raw_document = documentai.RawDocument(content=raw_bytes, mime_type=mime_type)
    request = documentai.ProcessRequest(
        name=resource_name,
        raw_document=raw_document,
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    response = client.process_document(request=request)

    # For a full list of `Document` object attributes, reference this page:
    # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document
    return response.document


def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Return the text that a layout element refers to.

    The layout's text anchor holds segments, each with a start and end
    offset into the full document text `text`. The slices for all segments
    are concatenated in order.
    """
    # A layout spanning several lines is stored as several text segments,
    # so collect one slice per segment before joining.
    pieces = []
    for segment in layout.text_anchor.text_segments:
        begin = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[begin:end])
    return "".join(pieces)


# Run the OCR sample on the converted PDF. The function prints the page count
# and each page's text; it returns None, so there is no result to capture.
process_document_ocr_sample(project_id, location, processor_id, processor_version, file_path, mime_type)

There are 8 page(s) in this document.

Page 1 text:
period 
of 
24
ICICI Lombard
Max Bupa
Apollo Munich
Tata AIG
Ihealth
Heartbeat Gold
Optima Restore
MediPrime
Star Health Comprehensive
Basic sum insured
10 lacs
10 lacs
10 lacs
10 lacs
10 lacs
Premium
10,643
22,696
13,607
12,205
17,483
Hosptalization benefits
Pre-existing diseases:
Pre-existing diseases:
Pre-existing diseases: 3
Pre-existing diseases:
Pre-existing diseases:
2 years
2 years
years
4 years
4 years
Specific illnesses/
Specific
Specific illnesses/
Specific illnesses/
Specific illnesses/
1 Waiting period
treatments: 2 years
illnesses/treatments: None
treatments: 2 years
treatments: 2 years
treatments: 2 years
No restriction
No restriction
No restriction
No restriction
No restriction
2 Hospital accomodation
/sub-limits
/sub-limits
/sub-limits
/sub-limits
/sub-limits
60 days if informed of
hospitalization 5 days in
3 Pre-hospitalization
30 days
30 days
60 days
advance, else 30 days
30 days
90 days if informed of
4 Post hospit

### Using Document AI's Form Parser

In [6]:
from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Configuration for the form-parsing run.
# Replace these values with your actual configuration.
# NOTE(review): processor_id is identical to the one used in the OCR cell --
# confirm that this processor is actually a Form Parser.
project_id = 'balmy-outcome-412805' 
location = 'us' 
processor_id = '1c327dd87f42b98b' 
processor_version = 'rc' 
# PDF written by the Excel-to-PDF conversion cell
local_file_path = "C:/Users/Shreshtha/Downloads/Bartrack-Sample-pdf.pdf" 
mime_type = 'application/pdf'

def process_document_form_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Process a local file and print the tables and form fields found on each page.

    All arguments are forwarded to `process_document`. For every page the
    detected tables (header rows, then body rows) and form fields
    (name/value pairs) are printed to standard output.

    Returns:
        The processed `documentai.Document`.
    """
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # The form processor output also contains OCR data; all table cells and
    # form fields reference offsets into this full document text.
    text = document.text

    # Read the form fields and tables output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"\nFound {len(page.tables)} table(s):")
        for table in page.tables:
            # A table may have no header rows; fall back to the first body row
            # (or 0 columns) instead of raising IndexError on header_rows[0].
            if len(table.header_rows) > 0:
                num_columns = len(table.header_rows[0].cells)
            elif len(table.body_rows) > 0:
                num_columns = len(table.body_rows[0].cells)
            else:
                num_columns = 0
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows
            print_table_rows(table.header_rows, text)
            # Print body rows
            print_table_rows(table.body_rows, text)

        print(f"\nFound {len(page.form_fields)} form field(s):")
        for field in page.form_fields:
            name = layout_to_text(field.field_name, text)
            value = layout_to_text(field.field_value, text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

    return document

def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    """
    Print each table row on one line, with cells rendered as repr strings
    separated by " | ".

    NOTE(review): this name is defined a second time later in the same cell,
    and that later definition is the one in effect at call time. This copy is
    kept behaviourally consistent with it (newlines inside a cell are
    replaced by spaces) so the two cannot drift apart.
    """
    for table_row in table_rows:
        row_text = ""
        for cell in table_row.cells:
            cell_text = layout_to_text(cell.layout, text)
            # Replace newline characters with spaces so a row stays on one line
            cell_text = cell_text.replace('\n', ' ')
            row_text += f"{repr(cell_text.strip())} | "
        print(row_text)

def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """
    Send a local file to a Document AI processor version and return the result.

    The file at `file_path` is read fully into memory, wrapped in a
    `RawDocument` with the given `mime_type`, and submitted as a single
    online `ProcessRequest`. The `document` attribute of the response is
    returned.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    resource_name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as source_file:
        raw_bytes = source_file.read()

    # Configure the process request
    raw_document = documentai.RawDocument(content=raw_bytes, mime_type=mime_type)
    request = documentai.ProcessRequest(
        name=resource_name,
        raw_document=raw_document,
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    response = client.process_document(request=request)

    return response.document

def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Return the text that a layout element refers to.

    The layout's text anchor holds segments, each with a start and end
    offset into the full document text `text`. The slices for all segments
    are concatenated in order.
    """
    # A layout spanning several lines is stored as several text segments,
    # so collect one slice per segment before joining.
    pieces = []
    for segment in layout.text_anchor.text_segments:
        begin = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[begin:end])
    return "".join(pieces)

def process_document_form_sample_and_save(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    output_file_path: str
) -> documentai.Document:
    """
    Process a document and write a table/form-field report to a text file.

    `process_document_form_sample` is called first (it prints its own report
    to standard output); the resulting document is then rendered again with
    `print_processed_document`, this time with standard output redirected
    into `output_file_path` (UTF-8, overwritten if it exists).

    Returns:
        The processed `documentai.Document`.
    """
    # Function-scope import, matching the local-import style this function
    # already used; redirect_stdout restores sys.stdout even on error.
    from contextlib import redirect_stdout

    processed_document = process_document_form_sample(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Save the output to a text file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        with redirect_stdout(output_file):
            # Everything printed here lands in the file instead of the console
            print_processed_document(processed_document)

    return processed_document

def print_processed_document(processed_document: documentai.Document) -> None:
    """
    Print a report of a processed document: page count, then per page the
    detected tables (header rows, then body rows) and form fields.

    Output goes to standard output, so callers may redirect it.
    """
    print(f"There are {len(processed_document.pages)} page(s) in this document.")

    # Print the content of each page
    for page in processed_document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        # Print tables in the document
        print(f"\nFound {len(page.tables)} table(s):")
        for table in page.tables:
            # A table may have no header rows; fall back to the first body row
            # (or 0 columns) instead of raising IndexError on header_rows[0].
            if len(table.header_rows) > 0:
                num_columns = len(table.header_rows[0].cells)
            elif len(table.body_rows) > 0:
                num_columns = len(table.body_rows[0].cells)
            else:
                num_columns = 0
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows
            print_table_rows(table.header_rows, processed_document.text)
            # Print body rows
            print_table_rows(table.body_rows, processed_document.text)

        # Print form fields in the document
        print(f"\nFound {len(page.form_fields)} form field(s):")
        for field in page.form_fields:
            name = layout_to_text(field.field_name, processed_document.text)
            value = layout_to_text(field.field_value, processed_document.text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    """
    Print each table row on its own line.

    Every cell's text is looked up via `layout_to_text`, newlines inside the
    cell are turned into spaces, and the stripped text is shown as a repr
    followed by " | ".
    """
    for table_row in table_rows:
        rendered_cells = []
        for cell in table_row.cells:
            # Flatten multi-line cell content so the row stays on one line
            cell_text = layout_to_text(cell.layout, text).replace('\n', ' ')
            rendered_cells.append(f"{repr(cell_text.strip())} | ")
        print("".join(rendered_cells))

In [7]:
# Replace these values with your actual configuration
project_id = 'balmy-outcome-412805' 
location = 'us' 
processor_id = '1c327dd87f42b98b' 
processor_version = 'rc' 
# Input: PDF written by the Excel-to-PDF conversion cell
local_file_path = "C:/Users/Shreshtha/Downloads/Bartrack-Sample-pdf.pdf"  
mime_type = 'application/pdf'
# Output: text report written by process_document_form_sample_and_save
output_file_path = "C:/Users/Shreshtha/Downloads/Bartrack-sample.txt"

# Process the document; the report is printed to the cell output and also
# saved to output_file_path. The processed document is kept for later use.
processed_document = process_document_form_sample_and_save(
    project_id, location, processor_id, processor_version, local_file_path, mime_type, output_file_path
)



**** Page 1 ****

Found 2 table(s):
Table with 7 columns and 13 rows:
'' | 'Basic sum insured' | 'ICICI Lombard Ihealth 10 lacs' | 'Max Bupa Heartbeat Gold 10 lacs' | 'Apollo Munich Optima Restore 10 lacs' | 'Tata AIG MediPrime 10 lacs' | 'Star Health Comprehensive 10 lacs' | 
'' | 'Premium' | '10,643' | '22,696' | '13,607' | '12,205' | '17,483' | 
'' | 'Hosptalization benefits' | '' | '' | '' | '' | '' | 
'' | '' | 'Pre-existing diseases:' | 'Pre-existing diseases:' | 'Pre-existing diseases: 3' | 'Pre-existing diseases:' | 'Pre-existing diseases:' | 
'1' | 'Waiting period' | '2 years Specific illnesses/ treatments: 2 years No restriction' | '2 years Specific illnesses/treatments: None No restriction' | 'years Specific illnesses/ treatments: 2 years No restriction' | '4 years Specific illnesses/ treatments: 2 years No restriction' | '4 years Specific illnesses/ treatments: 2 years No restriction' | 
'2' | 'Hospital accomodation' | '/sub-limits' | '/sub-limits' | '/sub-limits' | '/sub

### Extracting embedded files

In [1]:
import os
import zipfile
import mimetypes
import io
import csv
import re
from PIL import Image
import xml.etree.ElementTree as ET
from urllib.parse import unquote
import shutil

def process_embedded_excel_files(input_xlsx_path, output_folder):
    """
    Collects the Excel files that a workbook links to via local file paths.

    The xlsx archive is unpacked into a temporary folder, every unpacked
    member that can be read as UTF-8 is copied to a ``.txt`` file, those text
    files are scanned for ``file:///C:\\Users\\...`` references, and the
    referenced files are copied into ``<output_folder>/fetched_excel_files``.
    The two temporary folders are removed afterwards, even if a step fails.

    Parameters:
    ----------
    input_xlsx_path : str,
        The path to the xlsx file.

    output_folder : str,
        Directory under which the working folders and the final
        ``fetched_excel_files`` folder are created.
    """
    def extract_embedded_files(file_path, save_path):
        """
        Extracts every file member of an Excel (xlsx) archive and saves it.

        Parameters:
        ----------
        file_path : str,
            The path to the xlsx file.

        save_path : str,
            Directory path to save the extracted files.
        """
        # Create the directory if it doesn't exist
        os.makedirs(save_path, exist_ok=True)

        with zipfile.ZipFile(file_path, 'r') as zip_file:
            for member in zip_file.infolist():
                # Directory entries have an empty base name; writing them
                # would try to open the target folder itself as a file.
                if member.is_dir():
                    continue

                # NOTE(review): members are flattened by base name, so members
                # with the same file name in different archive folders
                # overwrite each other.
                extracted_file_path = os.path.join(save_path, os.path.basename(member.filename))
                with open(extracted_file_path, 'wb') as f:
                    f.write(zip_file.read(member))

    def convert_files_to_txt(input_folder, output_folder):
        """
        Converts each file in the input folder to a plain text file and saves it in the output folder.

        Files that cannot be decoded as UTF-8 (e.g. images) are reported and skipped.

        Parameters:
        ----------
        input_folder : str,
            The path to the folder containing XML and RELS files.

        output_folder : str,
            The path to the folder where the converted text files will be saved.
        """
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        for filename in os.listdir(input_folder):
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, f"{os.path.splitext(filename)[0]}.txt")

            try:
                # Read the content of the input file
                with open(input_file_path, 'r', encoding='utf-8') as input_file:
                    file_content = input_file.read()

                # Write the content to the output text file
                with open(output_file_path, 'w', encoding='utf-8') as output_file:
                    output_file.write(file_content)
            except Exception as e:
                print(f"Error processing file {input_file_path}: {e}")

    def extract_text_from_folder(folder_path):
        """
        Extracts text content matching the pattern from text files within a folder.

        Parameters:
        ----------
        folder_path : str,
            The path to the folder containing text files.

        Returns:
        -------
        matching_paths : list,
            A list of file paths (rebuilt with forward slashes) found within the text files.
        """
        matching_paths = []

        # Iterate through each file in the folder
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)

            # Check if it's a text file
            if os.path.isfile(file_path) and filename.lower().endswith('.txt'):
                try:
                    # Read the content of the text file
                    with open(file_path, 'r', encoding='utf-8') as file:
                        file_content = file.read()

                    # Capture everything after the C:\Users\ prefix up to the
                    # next double quote, then rebuild the path with forward slashes.
                    matches = re.findall(r'file:///C:\\Users\\([^"]+)', file_content)
                    matching_paths.extend(["C:/Users/{}".format(match.replace('\\', '/')) for match in matches])
                except Exception as e:
                    print(f"Error processing file {file_path}: {e}")

        return matching_paths

    def fetch_files(matching_paths, output_folder):
        """
        Copies each referenced file into the output folder.

        Paths are URL-decoded before use; files that cannot be copied are
        reported and skipped.
        """
        # Create the output folder if it doesn't exist
        os.makedirs(output_folder, exist_ok=True)

        for file_path in matching_paths:
            try:
                # Decode once and reuse for both the source and the target name
                decoded_path = unquote(file_path)
                file_name = os.path.basename(decoded_path)
                shutil.copy(decoded_path, os.path.join(output_folder, file_name))
            except Exception as e:
                print(f"Error fetching file {file_path}: {e}")

    # Create output folders
    extracted_folder = os.path.join(output_folder, 'extracted_embedded_files')
    converted_folder = os.path.join(output_folder, 'converted_text_files')
    fetched_folder = os.path.join(output_folder, 'fetched_excel_files')

    try:
        # Extract embedded files
        extract_embedded_files(input_xlsx_path, extracted_folder)

        # Convert files to text
        convert_files_to_txt(extracted_folder, converted_folder)

        # Extract matching paths from text files
        matching_paths = extract_text_from_folder(converted_folder)

        # Fetch and copy Excel files
        fetch_files(matching_paths, fetched_folder)
    finally:
        # Delete 'converted_text_files' and 'extracted_embedded_files' folders,
        # skipping any that were never created because an earlier step failed.
        for temp_folder in (converted_folder, extracted_folder):
            if os.path.isdir(temp_folder):
                shutil.rmtree(temp_folder)
    
    

# Example usage: scan the sample workbook for linked files and copy them into
# <output_folder>/fetched_excel_files. Files that cannot be read as UTF-8
# (e.g. embedded images) are reported in the cell output and skipped.
input_xlsx_path = "C:/Users/Shreshtha/Downloads/Bartrack Sample.xlsx"
output_folder = "C:/Users/Shreshtha/Downloads/embedded_files_directory"
process_embedded_excel_files(input_xlsx_path, output_folder)

Error processing file C:/Users/Shreshtha/Downloads/embedded_files_directory\extracted_embedded_files\image1.jpeg: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/embedded_files_directory\extracted_embedded_files\image2.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/embedded_files_directory\extracted_embedded_files\image3.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/embedded_files_directory\extracted_embedded_files\image4.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/embedded_files_directory\extracted_embedded_files\image5.emf: 'utf-8' codec can't decode byte 0x8e in position 16: invalid start byte
Error processing file C:/Users/Shreshtha/Downloads/embedded_files_directory

### Combining all embedded files and converting to PDF

In [12]:
from spire.xls import *
from spire.xls.common import *
import os

def combine_and_convert_to_pdf(input_folder, output_directory, output_file_name):
    """
    Merge all Excel files in a folder into one workbook and convert it to PDF.

    Every worksheet of every ``.xlsx``/``.xls`` file in `input_folder` is
    copied into a new workbook, which is saved as ``CombinedExcelFiles1.xlsx``
    in `output_directory`. That combined workbook is then converted to a PDF
    named `output_file_name` in the same directory.

    Args:
        input_folder: Folder containing the Excel files to merge.
        output_directory: Folder for the combined workbook and the PDF
            (created if missing).
        output_file_name: File name of the resulting PDF.
    """
    # Single source of truth for the intermediate workbook's file name
    combined_file_name = "CombinedExcelFiles1.xlsx"

    def combine_excel_files(input_folder, output_directory, output_file_name):
        """Copy every worksheet of every Excel file in input_folder into one workbook and save it."""
        # Create a new workbook and a temporary workbook used for loading each input
        newbook = Workbook()
        tempbook = Workbook()

        try:
            newbook.Version = ExcelVersion.Version2013
            # Clear all default worksheets
            newbook.Worksheets.Clear()

            # Iterate through each file in the folder
            for file in os.listdir(input_folder):
                if file.endswith(('.xlsx', '.xls')):
                    file_path = os.path.join(input_folder, file)

                    # Load the file into the temporary workbook
                    tempbook.LoadFromFile(file_path)

                    # Copy each worksheet from the temporary workbook to the new workbook
                    for sheet_index in range(tempbook.Worksheets.Count):
                        newbook.Worksheets.AddCopy(tempbook.Worksheets[sheet_index], WorksheetCopyType.CopyAll)

            # Specify the output file path
            output_path = os.path.join(output_directory, output_file_name)

            # Create the output directory if it doesn't exist
            os.makedirs(output_directory, exist_ok=True)

            # Save the merged file to the specified directory
            # NOTE(review): if no Excel file was found the workbook has no
            # worksheets at this point -- confirm how SaveToFile handles that.
            newbook.SaveToFile(output_path, ExcelVersion.Version2013)
        finally:
            # Dispose of the workbooks even when loading, copying or saving fails
            newbook.Dispose()
            tempbook.Dispose()

    def convert_excel_to_pdf(input_excel_path, output_pdf_path):
        """Convert one Excel workbook to PDF; failures are reported via print instead of raised."""
        # Create a Workbook object
        workbook = Workbook()

        try:
            # Load an Excel document
            workbook.LoadFromFile(input_excel_path)

            # Iterate through the worksheets in the workbook
            for sheet in workbook.Worksheets:
                # Get the PageSetup object
                pageSetup = sheet.PageSetup

                # Set page margins
                pageSetup.TopMargin = 0.3
                pageSetup.BottomMargin = 0.3
                pageSetup.LeftMargin = 0.3
                pageSetup.RightMargin = 0.3

            # Set worksheet to fit to page when converting
            workbook.ConverterSetting.SheetFitToPage = True

            # Convert to PDF file
            workbook.SaveToFile(output_pdf_path, FileFormat.PDF)
            print(f"Conversion successful. PDF saved to {output_pdf_path}")

        except Exception as e:
            print(f"Error during conversion: {e}")

        finally:
            # Dispose of the workbook
            workbook.Dispose()

    # Combine Excel files
    combined_excel_path = os.path.join(output_directory, combined_file_name)
    combine_excel_files(input_folder, output_directory, combined_file_name)

    # Convert combined Excel file to PDF
    output_pdf_path = os.path.join(output_directory, output_file_name)
    convert_excel_to_pdf(combined_excel_path, output_pdf_path)

# Example usage: merge the workbooks copied into 'fetched_excel_files' by the
# embedded-file extraction cell and convert the merged workbook to one PDF.
input_folder_path = "C:/Users/Shreshtha/Downloads/embedded_files_directory/fetched_excel_files"
output_directory_path = "C:/Users/Shreshtha/Downloads/Embedded_files_workbook2/"
output_file_name = "Embedded Bartrack Sample Data1.pdf"

combine_and_convert_to_pdf(input_folder_path, output_directory_path, output_file_name)

Conversion successful. PDF saved to C:/Users/Shreshtha/Downloads/Embedded_files_workbook2/Embedded Bartrack Sample Data1.pdf


### Using Form Parser on Combined Embedded Files PDF

In [41]:
from typing import Optional, Sequence
from google.api_core.client_options import ClientOptions
from google.cloud import documentai

# Configuration for form parsing of the combined embedded-files PDF.
# Replace these values with your actual configuration.
project_id = 'balmy-outcome-412805' 
location = 'us' 
processor_id = '1c327dd87f42b98b' 
processor_version = 'rc' 
# PDF written by combine_and_convert_to_pdf in the previous section
local_file_path = "C:/Users/Shreshtha/Downloads/Embedded_files_workbook2/Embedded Bartrack Sample Data1.pdf"
mime_type = 'application/pdf'

def process_document_form_sample(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Process a local file and print the tables and form fields found on each page.

    All arguments are forwarded to `process_document`. For every page the
    detected tables (header rows, then body rows) and form fields
    (name/value pairs) are printed to standard output.

    Returns:
        The processed `documentai.Document`.
    """
    # Online processing request to Document AI
    document = process_document(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # The form processor output also contains OCR data; all table cells and
    # form fields reference offsets into this full document text.
    text = document.text

    # Read the form fields and tables output from the processor
    for page in document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        print(f"\nFound {len(page.tables)} table(s):")
        for table in page.tables:
            # A table may have no header rows; fall back to the first body row
            # (or 0 columns) instead of raising IndexError on header_rows[0].
            if len(table.header_rows) > 0:
                num_columns = len(table.header_rows[0].cells)
            elif len(table.body_rows) > 0:
                num_columns = len(table.body_rows[0].cells)
            else:
                num_columns = 0
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows
            print_table_rows(table.header_rows, text)
            # Print body rows
            print_table_rows(table.body_rows, text)

        print(f"\nFound {len(page.form_fields)} form field(s):")
        for field in page.form_fields:
            name = layout_to_text(field.field_name, text)
            value = layout_to_text(field.field_value, text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

    return document

def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    """
    Print each table row on one line, with cells rendered as repr strings
    separated by " | ".

    NOTE(review): this name is defined a second time later in the same cell,
    and that later definition is the one in effect at call time. This copy is
    kept behaviourally consistent with it (newlines inside a cell are
    replaced by spaces) so the two cannot drift apart.
    """
    for table_row in table_rows:
        row_text = ""
        for cell in table_row.cells:
            cell_text = layout_to_text(cell.layout, text)
            # Replace newline characters with spaces so a row stays on one line
            cell_text = cell_text.replace('\n', ' ')
            row_text += f"{repr(cell_text.strip())} | "
        print(row_text)

def process_document(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    process_options: Optional[documentai.ProcessOptions] = None,
) -> documentai.Document:
    """
    Send a local file to a Document AI processor version and return the result.

    The file at `file_path` is read fully into memory, wrapped in a
    `RawDocument` with the given `mime_type`, and submitted as a single
    online `ProcessRequest`. The `document` attribute of the response is
    returned.
    """
    # You must set the `api_endpoint` if you use a location other than "us".
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(client_options=endpoint_options)

    # The full resource name of the processor version, e.g.:
    # `projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version_id}`
    # You must create a processor before running this sample.
    resource_name = client.processor_version_path(
        project_id, location, processor_id, processor_version
    )

    # Read the file into memory
    with open(file_path, "rb") as source_file:
        raw_bytes = source_file.read()

    # Configure the process request
    raw_document = documentai.RawDocument(content=raw_bytes, mime_type=mime_type)
    request = documentai.ProcessRequest(
        name=resource_name,
        raw_document=raw_document,
        # Only supported for Document OCR processor
        process_options=process_options,
    )

    response = client.process_document(request=request)

    return response.document

def layout_to_text(layout: documentai.Document.Page.Layout, text: str) -> str:
    """
    Return the text that a layout element refers to.

    The layout's text anchor holds segments, each with a start and end
    offset into the full document text `text`. The slices for all segments
    are concatenated in order.
    """
    # A layout spanning several lines is stored as several text segments,
    # so collect one slice per segment before joining.
    pieces = []
    for segment in layout.text_anchor.text_segments:
        begin = int(segment.start_index)
        end = int(segment.end_index)
        pieces.append(text[begin:end])
    return "".join(pieces)

def process_document_form_sample_and_save(
    project_id: str,
    location: str,
    processor_id: str,
    processor_version: str,
    file_path: str,
    mime_type: str,
    output_file_path: str
) -> documentai.Document:
    """
    Process a document and write a table/form-field report to a text file.

    `process_document_form_sample` is called first (it prints its own report
    to standard output); the resulting document is then rendered again with
    `print_processed_document`, this time with standard output redirected
    into `output_file_path` (UTF-8, overwritten if it exists).

    Returns:
        The processed `documentai.Document`.
    """
    # Function-scope import, matching the local-import style this function
    # already used; redirect_stdout restores sys.stdout even on error.
    from contextlib import redirect_stdout

    processed_document = process_document_form_sample(
        project_id, location, processor_id, processor_version, file_path, mime_type
    )

    # Save the output to a text file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        with redirect_stdout(output_file):
            # Everything printed here lands in the file instead of the console
            print_processed_document(processed_document)

    return processed_document

def print_processed_document(processed_document: documentai.Document) -> None:
    """
    Print a report of a processed document: page count, then per page the
    detected tables (header rows, then body rows) and form fields.

    Output goes to standard output, so callers may redirect it.
    """
    print(f"There are {len(processed_document.pages)} page(s) in this document.")

    # Print the content of each page
    for page in processed_document.pages:
        print(f"\n\n**** Page {page.page_number} ****")

        # Print tables in the document
        print(f"\nFound {len(page.tables)} table(s):")
        for table in page.tables:
            # A table may have no header rows; fall back to the first body row
            # (or 0 columns) instead of raising IndexError on header_rows[0].
            if len(table.header_rows) > 0:
                num_columns = len(table.header_rows[0].cells)
            elif len(table.body_rows) > 0:
                num_columns = len(table.body_rows[0].cells)
            else:
                num_columns = 0
            num_rows = len(table.body_rows)
            print(f"Table with {num_columns} columns and {num_rows} rows:")

            # Print header rows
            print_table_rows(table.header_rows, processed_document.text)
            # Print body rows
            print_table_rows(table.body_rows, processed_document.text)

        # Print form fields in the document
        print(f"\nFound {len(page.form_fields)} form field(s):")
        for field in page.form_fields:
            name = layout_to_text(field.field_name, processed_document.text)
            value = layout_to_text(field.field_value, processed_document.text)
            print(f"    * {repr(name.strip())}: {repr(value.strip())}")

def print_table_rows(
    table_rows: Sequence[documentai.Document.Page.Table.TableRow], text: str
) -> None:
    """
    Print each table row on its own line.

    Every cell's text is looked up via `layout_to_text`, newlines inside the
    cell are turned into spaces, and the stripped text is shown as a repr
    followed by " | ".
    """
    for table_row in table_rows:
        rendered_cells = []
        for cell in table_row.cells:
            # Flatten multi-line cell content so the row stays on one line
            cell_text = layout_to_text(cell.layout, text).replace('\n', ' ')
            rendered_cells.append(f"{repr(cell_text.strip())} | ")
        print("".join(rendered_cells))

In [43]:
# Replace these values with your actual configuration
project_id = 'balmy-outcome-412805' 
location = 'us' 
processor_id = '1c327dd87f42b98b' 
processor_version = 'rc' 
# Input: PDF written by combine_and_convert_to_pdf
local_file_path = "C:/Users/Shreshtha/Downloads/Embedded_files_workbook2/Embedded Bartrack Sample Data1.pdf"  
mime_type = 'application/pdf'
# Output: text report written by process_document_form_sample_and_save
output_file_path = "C:/Users/Shreshtha/Downloads/embedded-bartrack-sample.txt"

# Process the document; the report is printed to the cell output and also
# saved to output_file_path. The processed document is kept for later use.
processed_document = process_document_form_sample_and_save(
    project_id, location, processor_id, processor_version, local_file_path, mime_type, output_file_path
)



**** Page 1 ****

Found 1 table(s):
Table with 2 columns and 44 rows:
"Name Author User 10-Day Green Smoothie C 11/22/63: A NovelStephen King 12 Rules for Life: An Antidote to Ch 1984 (Signet Classics) George Orwell 5,000 Awesome Facts (About Every A Dance with Dragons (A Song A Game of Thrones / A Clash of King A Gentleman in Moscow: A NovAmor Towles of Ice A Higher Loyalty: Truth, Lies, an A Man Called Ove: A NovelFredrik  Backman A Man Called Ove: A NovelFredrik  Backman A Patriot's History of the United St A Stolen Life: A MemoirJaycee  Dugard A Wrinkle in Time (Time Quintet)Madeleine  L'Engle Act Like a Lady, Think Like a Ma Adult Coloring Book Designs: Stress Adult Coloring Book: Stress Relievin" | 'Rating Reviews Price Year 4.7 17350 8 Genre 2016 Non Fiction 4.6 2052 22 2011 Fiction 4.7 18979 15 2018 Non Fiction 4.7 21424 6 2017 Fiction nal G Kids)7665 12 2019 Non Fiction 4.4 12643 11 2011 Fiction f Sw ast of e 30 w 2014 Fiction 4.7 19699 15 2017 Fiction 4.7 5983 3 2018 Non Fi