## Import Required Libraries

In [3]:
# %%
import os
import tempfile
import cloudmersive_convert_api_client
import cloudmersive_ocr_api_client
from cloudmersive_convert_api_client.rest import ApiException as ConvertApiException
from cloudmersive_ocr_api_client.rest import ApiException as OcrApiException
from google.cloud import storage
from dotenv import load_dotenv
import time
import logging



## Load Environment Variables and Initialize Clients

In [4]:
# %%
# Load environment variables from the .env file into os.environ
load_dotenv()

# Read the Cloudmersive API key and the GCP credentials path from the environment
cloudmersive_api_key = os.getenv("CLOUDMERSIVE_API_KEY")
gcp_credentials = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

# Surface a missing key now instead of discovering it on the first API call.
if not cloudmersive_api_key:
    logging.warning(
        "CLOUDMERSIVE_API_KEY is not set; requests to Cloudmersive will likely be rejected."
    )

# os.environ only accepts str values, so writing None back would raise TypeError.
# Re-export the path only when one was found; otherwise leave credential
# resolution to storage.Client() (presumably Application Default Credentials).
if gcp_credentials is not None:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcp_credentials

# Initialize GCP storage client
storage_client = storage.Client()

# Configure the Cloudmersive document-conversion client (PDF -> text / PNG)
convert_configuration = cloudmersive_convert_api_client.Configuration()
convert_configuration.api_key['Apikey'] = cloudmersive_api_key
convert_api_instance = cloudmersive_convert_api_client.ConvertDocumentApi(
    cloudmersive_convert_api_client.ApiClient(convert_configuration)
)

# Configure the Cloudmersive OCR client (image -> text)
ocr_configuration = cloudmersive_ocr_api_client.Configuration()
ocr_configuration.api_key['Apikey'] = cloudmersive_api_key
ocr_api_instance = cloudmersive_ocr_api_client.ImageOcrApi(
    cloudmersive_ocr_api_client.ApiClient(ocr_configuration)
)


## Metrics Tracking Variables

In [5]:
# %%
# Metrics tracking variables.
# These are module-level counters mutated by the processing functions via
# `global`. They are never reset by those functions, so re-run this cell
# before starting a new processing run; otherwise counts accumulate.
total_files_processed = 0  # Tracks the number of PDF files processed
total_pages_processed = 0  # Tracks the total number of images (pages) processed across all files
total_errors = 0  # Tracks the number of errors encountered during processing
# NOTE: the clock starts when this cell executes, not when processing begins,
# so any idle time between running this cell and the processing cell is
# included in the reported total time.
start_time = time.time()  # Records the start time for calculating total processing time

## Extract Text from PDF Using Cloudmersive API

In [6]:
# %%
def extract_text_from_pdf_cloudmersive(pdf_bytes):
    """Extract the text of a PDF through the Cloudmersive convert API.

    The bytes are written to a temporary ``.pdf`` file because the API client
    is called with a file path. The temporary file is always removed.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        The ``text_result`` of the API response, or ``None`` when the API
        call raises ``ConvertApiException`` (the error is printed and
        ``total_errors`` is incremented).
    """
    global total_errors

    # Bound before the try block so the finally clause can never hit an
    # unbound name if creating or writing the temp file fails.
    temp_pdf_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            # Record the path first so a failed write is still cleaned up.
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_bytes)

        # Convert PDF to text using the file path
        result = convert_api_instance.convert_document_pdf_to_txt(temp_pdf_path)

        return result.text_result
    except ConvertApiException as e:
        total_errors += 1  # Increment error count
        print(f"Exception when calling Cloudmersive API for PDF text extraction: {e}\n")
        return None
    finally:
        # delete=False means the file outlives the with-block; remove it here.
        if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)


## Extract Text from Images in PDF Using Cloudmersive OCR API


In [7]:
# %%
def extract_text_from_images_in_pdf(pdf_bytes):
    """Render the PDF pages to PNG and OCR each page via Cloudmersive.

    Args:
        pdf_bytes: Raw content of the PDF document.

    Returns:
        A string with one ``"\\n\\nPage N OCR Text:\\n..."`` section per page
        (empty when the conversion result carries no page images), or
        ``None`` when a Cloudmersive convert/OCR call raises an API
        exception (the error is printed and ``total_errors`` is incremented).
    """
    global total_pages_processed, total_errors

    # Bound before the try block so the finally clause can never hit an
    # unbound name if creating or writing the temp file fails.
    temp_pdf_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
            temp_pdf_path = temp_pdf.name
            temp_pdf.write(pdf_bytes)

        # Convert PDF pages to images
        image_result = convert_api_instance.convert_document_pdf_to_png_array(temp_pdf_path)

        # NOTE(review): confirm that the response model really exposes
        # `png_result` holding raw PNG bytes. If the attribute is absent the
        # loop below is skipped and this function silently returns "".
        # A missing or None attribute is treated as "no pages".
        page_images = getattr(image_result, 'png_result', None) or []
        total_pages_processed += len(page_images)  # Count each page as an image

        page_texts = []
        for page_num, image_data in enumerate(page_images, start=1):
            temp_image_path = None
            try:
                with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_image:
                    temp_image_path = temp_image.name
                    temp_image.write(image_data)

                # Apply OCR on the image to extract text
                ocr_response = ocr_api_instance.image_ocr_post(temp_image_path)
            finally:
                # Remove the page image even when the OCR call raises.
                if temp_image_path is not None and os.path.exists(temp_image_path):
                    os.remove(temp_image_path)

            page_texts.append(f"\n\nPage {page_num} OCR Text:\n{ocr_response.text_result}")

        return "".join(page_texts)
    except (OcrApiException, ConvertApiException) as e:
        total_errors += 1  # Increment error count
        print(f"Exception during OCR processing: {e}")
        return None
    finally:
        # Ensure the temporary PDF is cleaned up
        if temp_pdf_path is not None and os.path.exists(temp_pdf_path):
            os.remove(temp_pdf_path)

## Save Extracted Data to GCP

In [8]:
# %%
def save_extracted_data_to_gcp(bucket_name, folder_name, file_name, extracted_content):
    """Upload the extracted content as a text object to a GCS bucket.

    Args:
        bucket_name: Name of the destination bucket.
        folder_name: Object-name prefix ("folder") to write under.
        file_name: Source file name; a trailing ``.pdf`` extension is
            swapped for ``.txt``. Names without that extension are used
            unchanged.
        extracted_content: Text to upload.
    """
    bucket = storage_client.bucket(bucket_name)

    # Swap only the trailing extension. A plain str.replace would also rewrite
    # ".pdf" occurring earlier in the name (e.g. "a.pdf.v2.pdf").
    stem, extension = os.path.splitext(file_name)
    txt_file_name = f"{stem}.txt" if extension.lower() == '.pdf' else file_name

    # Define the target object inside the folder (e.g., cloudmersive_API_extracted)
    blob = bucket.blob(f"{folder_name}/{txt_file_name}")

    # Upload the extracted content as a text file
    blob.upload_from_string(extracted_content, content_type='text/plain')

    print(f"Extracted data saved to: {folder_name}/{txt_file_name} in bucket {bucket_name}")


## Process PDFs Using Cloudmersive API

In [9]:
# %%
def process_pdfs_in_gcp_cloudmersive(bucket_name, source_folder, target_folder):
    """Extract text from every PDF under a bucket folder and upload the results.

    For each object whose name ends with ``.pdf`` under ``source_folder/``,
    the PDF is downloaded, passed through the text extractor and the OCR
    extractor, and the combined output is saved under ``target_folder``.
    ``total_files_processed`` is incremented once per PDF.

    Args:
        bucket_name: Bucket holding both the source PDFs and the outputs.
        source_folder: Object-name prefix to scan for PDFs.
        target_folder: Object-name prefix to write the ``.txt`` results to.
    """
    global total_files_processed
    bucket = storage_client.bucket(bucket_name)

    # List every object under the source folder; non-PDF objects are skipped below
    blobs = bucket.list_blobs(prefix=f"{source_folder}/")

    for blob in blobs:
        if not blob.name.endswith('.pdf'):
            continue

        file_name = os.path.basename(blob.name)
        print(f"Processing file: {file_name} using Cloudmersive API")

        # Download the PDF as bytes
        pdf_bytes = blob.download_as_bytes()

        # Increment file count
        total_files_processed += 1

        # Both extractors return None on failure (already printed and counted
        # there). Fall back to "" so the literal text "None" is never written
        # into the saved document.
        extracted_text_content = extract_text_from_pdf_cloudmersive(pdf_bytes) or ""
        image_text_content = extract_text_from_images_in_pdf(pdf_bytes) or ""

        # Combine all extracted content
        full_extracted_content = (
            f"Text and Tables:\n{extracted_text_content}"
            f"\n\nText from Images:\n{image_text_content}"
        )

        # Save extracted content back to GCP in the target folder
        save_extracted_data_to_gcp(bucket_name, target_folder, file_name, full_extracted_content)

## Calculating Performance Metrics

In [10]:
# %%
def calculate_performance_metrics():
    """Print summary metrics from the module-level counters.

    Reads ``start_time``, ``total_files_processed``, ``total_pages_processed``
    and ``total_errors``. Ratios fall back to 0 when their denominator is 0.
    Returns nothing; the metrics are only printed.
    """
    end_time = time.time()  # Capture the end time
    total_time = end_time - start_time  # Seconds elapsed since start_time was set

    # Average time per file
    latency = total_time / total_files_processed if total_files_processed > 0 else 0
    # Pages processed per second
    throughput = total_pages_processed / total_time if total_time > 0 else 0
    # Errors per processed file. Text extraction and OCR each record their own
    # error, so one file can contribute more than one and this can exceed 100%.
    error_rate = total_errors / total_files_processed if total_files_processed > 0 else 0

    print(f"Total files processed: {total_files_processed}")
    print(f"Total errors: {total_errors}")
    print(f"Total time: {total_time:.2f} seconds")
    print(f"Latency: {latency:.2f} seconds per file")
    # Previously computed but never reported.
    print(f"Throughput: {throughput:.2f} pages per second")
    print(f"Error rate: {error_rate:.2%}")



## Run Processing and Calculate Metrics

In [11]:
# %%
# Bucket and folder layout for this run
bucket_name = 'gaia_files_pdf'
source_folder = 'gaia_pdfs'  # Folder holding the PDFs to process
target_folder = 'cloudmersive_API_extracted'  # Folder receiving the extracted .txt files

# Extract every PDF in source_folder and store the results in target_folder
process_pdfs_in_gcp_cloudmersive(
    bucket_name=bucket_name,
    source_folder=source_folder,
    target_folder=target_folder,
)

Processing file: 021a5339-744f-42b7-bd9b-9368b3efda7a.pdf using Cloudmersive API
Extracted data saved to: cloudmersive_API_extracted/021a5339-744f-42b7-bd9b-9368b3efda7a.txt in bucket gaia_files_pdf
Processing file: 32f386b9-73ee-4455-b412-ddad508aa979.pdf using Cloudmersive API
Extracted data saved to: cloudmersive_API_extracted/32f386b9-73ee-4455-b412-ddad508aa979.txt in bucket gaia_files_pdf
Processing file: 366e2f2b-8632-4ef2-81eb-bc3877489217.pdf using Cloudmersive API
Extracted data saved to: cloudmersive_API_extracted/366e2f2b-8632-4ef2-81eb-bc3877489217.txt in bucket gaia_files_pdf
Processing file: 4044eab7-1282-42bd-a559-3bf3a4d5858e.pdf using Cloudmersive API
Extracted data saved to: cloudmersive_API_extracted/4044eab7-1282-42bd-a559-3bf3a4d5858e.txt in bucket gaia_files_pdf
Processing file: 634fca59-03b2-4cdf-9ce4-0205df22f256.pdf using Cloudmersive API
Extracted data saved to: cloudmersive_API_extracted/634fca59-03b2-4cdf-9ce4-0205df22f256.txt in bucket gaia_files_pdf
Proce

In [12]:
# Calculate and display performance metrics.
# Reads the module-level counters updated by the processing run above, so
# run this only after that cell has finished.
calculate_performance_metrics()

Total files processed: 13
Total errors: 0
Total time: 283.47 seconds
Latency: 21.81 seconds per file
Error rate: 0.00%


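### Explanation of Metrics

- **Latency**: Average time (in seconds) to process each file. Calculated as `total_time / total_files_processed`.
- **Throughput**: The number of pages processed per second. Calculated as `total_pages_processed / total_time`.
- **Error Rate**: The number of recorded errors divided by the number of files processed, shown as a percentage. Calculated as `total_errors / total_files_processed`. Text extraction and OCR each record their own error, so a single file can contribute more than one error and the rate can exceed 100%.
- **Total Time**: The overall time elapsed, measured from when the metrics-tracking cell was run until the metrics are calculated.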