# Azure - Splitter Container Apps + Extraction Azure Function

### Test Only the Azure Function

In [1]:
# Function Endpoint
# The full URL carries the function key in its `code=` query parameter, which
# makes the URL itself a credential. It is therefore read from the environment
# instead of being committed with the notebook. Export it before starting Jupyter:
#   export PDF_EXTRACTION_FUNCTION_ENDPOINT="https://<app>.azurewebsites.net/api/pdf_extraction?code=<key>"
# NOTE(review): the key that used to be hard-coded in this cell should be rotated.
import os

function_endpoint = os.environ.get("PDF_EXTRACTION_FUNCTION_ENDPOINT", "")
if not function_endpoint:
    # Say so up front rather than letting a later request fail with an obscure URL error.
    print("PDF_EXTRACTION_FUNCTION_ENDPOINT is not set; calls to the Azure Function will fail.")

In [2]:
# Read in PDF as bytes
def load_pdf_as_bytes(file_path):
    """Return the complete contents of the file at ``file_path`` as ``bytes``.

    The file is opened in binary mode, so the data is returned undecoded.
    """
    with open(file_path, mode='rb') as pdf_handle:
        raw_content = pdf_handle.read()
    return raw_content

# Usage example: load the sample PDF once; later cells reuse `pdf_path` and `pdf_bytes`.
pdf_path = '../data/lorem_ipsum.pdf'  # relative path, resolved against the kernel's working directory
pdf_bytes = load_pdf_as_bytes(pdf_path)

In [3]:
import fitz

def split_pdf_bytes(file_bytes, window_size: int = 10):
    """
    Split a PDF file into smaller chunks of specified window size.

    This is a generator: the PDF is not parsed, and ``window_size`` is not
    validated, until the first chunk is requested.

    Args:
        file_bytes (bytes): The PDF file as bytes.
        window_size (int): Number of pages per chunk. Must be at least 1.
            Defaults to 10.

    Yields:
        bytes: Chunks of the PDF file, each containing up to window_size pages.
            Annotations and links are not copied into the chunks.

    Raises:
        ValueError: If window_size is less than 1.
    """
    # Reject bad sizes explicitly: 0 would fail inside range() with an unrelated
    # message, and a negative value would silently produce no chunks at all.
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size!r}")

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        page_count = len(doc)
        for first_page in range(0, page_count, window_size):
            # Inclusive index of the last page in this window, clamped to the document.
            last_page = min(first_page + window_size, page_count) - 1
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(
                    doc,
                    from_page=first_page,
                    to_page=last_page,
                    annots=False,
                    links=False,
                )
                chunk = new_doc.tobytes()
            finally:
                # Release the temporary document as soon as its bytes are captured.
                new_doc.close()
            yield chunk
    finally:
        # Runs when the generator is exhausted, closed, or garbage-collected.
        doc.close()

In [4]:
# Create the chunk generator with the default window size and pull only the
# first chunk; the remaining chunks are never requested in this notebook section.
split_bytes = split_pdf_bytes(pdf_bytes)
pdf_chunk = next(split_bytes)

In [9]:
import requests
# First attempt: upload the chunk via `files=`, i.e. as a multipart/form-data
# field named 'pdf_data'.
# NOTE(review): the recorded output of the following cell shows the function
# replying "No PDF data provided - 1" to this request, so this payload shape
# appears not to be what the function expects.
# files = {'file': ('test.pdf', pdf_file, 'application/pdf')}
response = requests.post(function_endpoint, files={'pdf_data': pdf_chunk})

In [10]:
# Show the raw body returned for the multipart attempt above.
response.text

'{"ValueError": "No PDF data provided - 1"}'

In [12]:
import requests
import base64

# Read PDF file as bytes (the whole file is sent here, not the first chunk)
with open(pdf_path, 'rb') as file:
    pdf_bytes = file.read()

# Convert to base64: a JSON body cannot carry raw bytes, so the PDF travels as text
pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')

# Prepare headers
headers = {
    'Content-Type': 'application/json'
}

# Make request.
# requests applies no timeout by default and would wait indefinitely on an
# unresponsive function, so bound both the connect and the read phase.
response = requests.post(
    function_endpoint,
    json={'pdf_data': pdf_base64},
    headers=headers,
    timeout=(10, 120),  # (connect, read) seconds
)

# Check if request was successful (raises requests.HTTPError on 4xx/5xx)
response.raise_for_status()

In [15]:
# Decode the JSON body and print its 'text' field.
print(response.json()['text'])

Test document PDF 
 
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla est purus, ultrices in porttitor 
in, accumsan non quam. Nam consectetur porttitor rhoncus. Curabitur eu est et leo feugiat 
auctor vel quis lorem. Ut et ligula dolor, sit amet consequat lorem. Aliquam porta eros sed 
velit imperdiet egestas. Maecenas tempus eros ut diam ullamcorper id dictum libero 
tempor. Donec quis augue quis magna condimentum lobortis. Quisque imperdiet ipsum vel 
magna viverra rutrum. Cras viverra molestie urna, vitae vestibulum turpis varius id. 
Vestibulum mollis, arcu iaculis bibendum varius, velit sapien blandit metus, ac posuere lorem 
nulla ac dolor. Maecenas urna elit, tincidunt in dapibus nec, vehicula eu dui. Duis lacinia 
fringilla massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur 
ridiculus mus. Ut consequat ultricies est, non rhoncus mauris congue porta. Vivamus viverra 
suscipit felis eget condimentum. Cum sociis natoque penatibus et magnis

# AWS - Splitter ECS + Extraction Lambda Function


In [4]:
import requests

: 

In [None]:
# Target option: ".test" internal hostname of the splitter service.
# NOTE(review): this cell has no execution count and the later `service_url`
# cells reassign the same name, so on a top-to-bottom run this value is replaced.
service_url="http://ml-pdf-splitter.test.rnwl-backend-services.internal"

In [6]:
# Target option: splitter service reachable on localhost, port 5006.
service_url="http://localhost:5006"

In [43]:
# Target option: this is the URL that appears in the recorded outputs of the cells below.
service_url="http://ml-pdf-splitter.rnwl-backend-services.local:5006" # Run on development server in test environment (development-test)

In [1]:
def load_pdf_as_bytes(file_path):
    """Return the complete contents of the file at ``file_path`` as ``bytes``.

    The file is opened in binary mode, so the data is returned undecoded.
    """
    # NOTE(review): this repeats the definition from the Azure section of the
    # notebook; whichever cell runs last is the one in effect.
    with open(file_path, mode='rb') as pdf_handle:
        raw_content = pdf_handle.read()
    return raw_content

# Usage example: read the sample PDF into memory as raw bytes.
pdf_path = '../data/lorem_ipsum.pdf'  # relative path, resolved against the kernel's working directory
pdf_bytes = load_pdf_as_bytes(pdf_path)

In [2]:
# Inspect the loaded bytes.
# NOTE(review): this echoes the entire file into the cell output; a slice such
# as `pdf_bytes[:100]` would be enough to confirm the "%PDF" header.
pdf_bytes

b'%PDF-1.4\r%\xe2\xe3\xcf\xd3\r\n6 0 obj <</Linearized 1/L 77123/O 8/E 72907/N 1/T 76957/H [ 896 203]>>\rendobj\r                    \r\nxref\r\n6 30\r\n0000000016 00000 n\r\n0000001099 00000 n\r\n0000001175 00000 n\r\n0000001357 00000 n\r\n0000001473 00000 n\r\n0000001607 00000 n\r\n0000001890 00000 n\r\n0000002019 00000 n\r\n0000002395 00000 n\r\n0000003455 00000 n\r\n0000004471 00000 n\r\n0000005351 00000 n\r\n0000006333 00000 n\r\n0000007399 00000 n\r\n0000008384 00000 n\r\n0000009410 00000 n\r\n0000010416 00000 n\r\n0000022648 00000 n\r\n0000022900 00000 n\r\n0000023086 00000 n\r\n0000023370 00000 n\r\n0000037981 00000 n\r\n0000038234 00000 n\r\n0000051556 00000 n\r\n0000051802 00000 n\r\n0000051983 00000 n\r\n0000052268 00000 n\r\n0000072584 00000 n\r\n0000072831 00000 n\r\n0000000896 00000 n\r\ntrailer\r\n<</Size 36/Prev 76947/Root 7 0 R/Info 5 0 R/ID[<6CE05116D1375AAB2A2D81359B471C6A><7A99BD2685370C40B5857A995335A3D7>]>>\r\nstartxref\r\n0\r\n%%EOF\r\n                \r\n35 0 ob

In [46]:
# Health check of the splitter service itself.
# A timeout is set because requests otherwise waits indefinitely when the
# host is unreachable (e.g. when not connected to the internal network).
print(f"Sending GET request to {service_url}/status")
requests.get(f'{service_url}/status', timeout=10).content

Sending GET request to http://ml-pdf-splitter.rnwl-backend-services.local:5006/status


b'{"status":"OK"}'

In [47]:
import json
# Ask the service to report on the PDF extraction API it depends on.
# A timeout is set because requests otherwise waits indefinitely on an
# unresponsive host.
print(f"Sending GET request to {service_url}/lambda_status")
# Note: `response` holds the raw body bytes here, not a Response object;
# the next cell passes it straight to json.loads.
response = requests.get(f'{service_url}/lambda_status', timeout=30).content
json.loads(response)

Sending GET request to http://ml-pdf-splitter.rnwl-backend-services.local:5006/lambda_status


{'status': 'ok',
 'message': 'Service is running and can access the PDF extraction API',
 'pdf_extraction_api': 'accessible',
 'test_response': {'stage': 'test',
  'time_taken': 0.04250216484069824,
  'arn_version': 'arn:aws:lambda:eu-west-2:306232495635:function:ExtractTextFromPDF2:test',
  'function_version': '$LATEST',
  'pdf_id': 0,
  'text': 'Test\xa0document\xa0PDF\xa0\n\xa0\nLorem\xa0ipsum\xa0dolor\xa0sit\xa0amet,\xa0consectetur\xa0adipiscing\xa0elit.\xa0Nulla\xa0est\xa0purus,\xa0ultrices\xa0in\xa0porttitor\xa0\nin,\xa0accumsan\xa0non\xa0quam.\xa0Nam\xa0consectetur\xa0porttitor\xa0rhoncus.\xa0Curabitur\xa0eu\xa0est\xa0et\xa0leo\xa0feugiat\xa0\nauctor\xa0vel\xa0quis\xa0lorem.\xa0Ut\xa0et\xa0ligula\xa0dolor,\xa0sit\xa0amet\xa0consequat\xa0lorem.\xa0Aliquam\xa0porta\xa0eros\xa0sed\xa0\nvelit\xa0imperdiet\xa0egestas.\xa0Maecenas\xa0tempus\xa0eros\xa0ut\xa0diam\xa0ullamcorper\xa0id\xa0dictum\xa0libero\xa0\ntempor.\xa0Donec\xa0quis\xa0augue\xa0quis\xa0magna\xa0condimentum\xa0lobortis.

In [48]:
# `response` is the raw /lambda_status body (bytes) from the previous cell;
# parse it again and print the text nested under 'test_response'.
print(f"EXTRACTED TEXT: \n\n{json.loads(response)['test_response']['text']}")

EXTRACTED TEXT: 

Test document PDF 
 
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla est purus, ultrices in porttitor 
in, accumsan non quam. Nam consectetur porttitor rhoncus. Curabitur eu est et leo feugiat 
auctor vel quis lorem. Ut et ligula dolor, sit amet consequat lorem. Aliquam porta eros sed 
velit imperdiet egestas. Maecenas tempus eros ut diam ullamcorper id dictum libero 
tempor. Donec quis augue quis magna condimentum lobortis. Quisque imperdiet ipsum vel 
magna viverra rutrum. Cras viverra molestie urna, vitae vestibulum turpis varius id. 
Vestibulum mollis, arcu iaculis bibendum varius, velit sapien blandit metus, ac posuere lorem 
nulla ac dolor. Maecenas urna elit, tincidunt in dapibus nec, vehicula eu dui. Duis lacinia 
fringilla massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur 
ridiculus mus. Ut consequat ultricies est, non rhoncus mauris congue porta. Vivamus viverra 
suscipit felis eget condimentum. Cum sociis natoque p

In [54]:
import requests

def test_extract_text_endpoint(url, pdf_path):
    # Endpoint URL
    endpoint = f"{url}/extract-text/"

    # Open the PDF file in binary mode
    with open(pdf_path, 'rb') as pdf_file:
        # Prepare the files for the request
        files = {'file': ('test.pdf', pdf_file, 'application/pdf')}

        # Send POST request to the endpoint
        response = requests.post(endpoint, files=files)

    # Check the response
    if response.status_code == 200:
        print("Request successful!")
        return response
    else:
        print(f"Request failed with status code: {response.status_code}")
        print("Error message:", response.text)

# Usage
# Adjust this to your PDF file's path. The small sample document lives at
# "../data/lorem_ipsum.pdf"; it was previously assigned here and immediately
# overwritten, so only the path that is actually used is kept.
pdf_path = "../data/(1000679622) IPID - Private Car Quote from ERS_120149420.pdf"
print(f"Sending POST request to {service_url}/extract-text/")
response = test_extract_text_endpoint(service_url, pdf_path)

Sending POST request to http://ml-pdf-splitter.rnwl-backend-services.local:5006/extract-text/
Request successful!


In [55]:
# Parse the JSON body of the /extract-text/ response.
# NOTE(review): `response` is None when the request above did not return 200,
# in which case this line fails with AttributeError.
json.loads(response.content)

{'metadata': {},
 'text': 'Private Car Motor Insurance\nInsurance Product Information Document\nThis insurance is underwritten by ERS (Syndicate 218 at Lloyd’s) which is registered in the UK. ERS Syndicate Management Limited is authorised by\nthe Prudential Regulation Authority and regulated by the Financial Conduct Authority and Prudential Regulation Authority. Registered number: 204851.\nThis document provides a summary of the key information. It does not contain the full terms and conditions; these can be found in your policy document.\nYou can find your policy document at www.ers.com.\nWhat is this type of insurance?\nComprehensive cover – You are covered for loss or damage caused by accidental damage, fire, theft and third party liability cover.\nWhat is insured?\nWhilst driving your vehicle you will be covered for any\none claim or claims arising out of one incident following:\nProperty damage up to £20,000,000;\nProperty damage costs/expenses up to £5,000,000;\nLegal costs up to

In [56]:
# Print the 'text' field of the response so its newlines render as line breaks.
print(f"EXTRACTED TEXT: \n\n{json.loads(response.content)['text']}")

EXTRACTED TEXT: 

Private Car Motor Insurance
Insurance Product Information Document
This insurance is underwritten by ERS (Syndicate 218 at Lloyd’s) which is registered in the UK. ERS Syndicate Management Limited is authorised by
the Prudential Regulation Authority and regulated by the Financial Conduct Authority and Prudential Regulation Authority. Registered number: 204851.
This document provides a summary of the key information. It does not contain the full terms and conditions; these can be found in your policy document.
You can find your policy document at www.ers.com.
What is this type of insurance?
Comprehensive cover – You are covered for loss or damage caused by accidental damage, fire, theft and third party liability cover.
What is insured?
Whilst driving your vehicle you will be covered for any
one claim or claims arising out of one incident following:
Property damage up to £20,000,000;
Property damage costs/expenses up to £5,000,000;
Legal costs up to £35,000.
Third Party 