# Azure

## Test Only the Azure Function

In [56]:
import requests
from dotenv import load_dotenv
import os
# Read the settings stored in the project's env file, then look up the
# URL of the PDF-to-text function from the process environment.
load_dotenv(dotenv_path="../src/.env")
function_endpoint = os.environ.get("PDF2TXT_FUNCTION_URL")

In [57]:
import fitz

def split_pdf_bytes(file_bytes, window_size: int = 10):
    """
    Split a PDF file into smaller chunks of at most ``window_size`` pages.

    Args:
        file_bytes (bytes): The PDF file as bytes.
        window_size (int): Number of pages per chunk. Defaults to 10.

    Yields:
        bytes: Chunks of the PDF file, each containing up to window_size pages.
            Annotations and links are not copied into the chunks.

    Raises:
        ValueError: If window_size is smaller than 1. Because this is a
            generator, the error is raised when it is first advanced.
    """
    # Reject non-positive sizes up front: 0 would otherwise fail inside range()
    # with an unrelated message and negative values would silently yield nothing.
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size!r}"
        )

    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        page_count = len(doc)
        for start in range(0, page_count, window_size):
            new_doc = fitz.open()
            try:
                new_doc.insert_pdf(
                    doc,
                    from_page=start,
                    # to_page is inclusive, hence the -1; clamp to the last page
                    to_page=min(start + window_size, page_count) - 1,
                    annots=False,
                    links=False,
                )
                chunk = new_doc.tobytes()
            finally:
                # Release the per-chunk document before handing the bytes out
                new_doc.close()
            yield chunk
    finally:
        # Runs when the generator is exhausted, closed or garbage collected
        doc.close()

In [58]:
import requests
import base64

# Usage example
pdf_path = '../data/lorem_ipsum.pdf'

# Fail early with a clear message if the endpoint was not found in the environment
if not function_endpoint:
    raise RuntimeError(
        "PDF2TXT_FUNCTION_URL is not set; cannot call the Azure Function"
    )

# Read PDF file as bytes
with open(pdf_path, 'rb') as file:
    pdf_bytes = file.read()

# Only the first chunk produced by the splitter is sent in this test
split_bytes = split_pdf_bytes(pdf_bytes)
pdf_chunk = next(split_bytes)

# Convert to base64
pdf_chunk64 = base64.b64encode(pdf_chunk).decode('utf-8')

# Prepare headers
headers = {
    'Content-Type': 'application/json'
}

# Make request; the timeout keeps the kernel from hanging on an unresponsive endpoint
response = requests.post(
    function_endpoint,
    json={'pdf_data': pdf_chunk64},
    headers=headers,
    timeout=60,
)

# Check if request was successful
response.raise_for_status()

In [60]:
# Show the "text" field of the JSON body returned by the function.
print(response.json()["text"])

Test document PDF 
 
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla est purus, ultrices in porttitor 
in, accumsan non quam. Nam consectetur porttitor rhoncus. Curabitur eu est et leo feugiat 
auctor vel quis lorem. Ut et ligula dolor, sit amet consequat lorem. Aliquam porta eros sed 
velit imperdiet egestas. Maecenas tempus eros ut diam ullamcorper id dictum libero 
tempor. Donec quis augue quis magna condimentum lobortis. Quisque imperdiet ipsum vel 
magna viverra rutrum. Cras viverra molestie urna, vitae vestibulum turpis varius id. 
Vestibulum mollis, arcu iaculis bibendum varius, velit sapien blandit metus, ac posuere lorem 
nulla ac dolor. Maecenas urna elit, tincidunt in dapibus nec, vehicula eu dui. Duis lacinia 
fringilla massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur 
ridiculus mus. Ut consequat ultricies est, non rhoncus mauris congue porta. Vivamus viverra 
suscipit felis eget condimentum. Cum sociis natoque penatibus et magnis

## Container Apps Splitter + Extraction Azure Function


In [3]:
import requests
from dotenv import load_dotenv
import os
# Read the repository-level env file, then look up the API key that is sent
# to the splitter app in the X-API-Key header.
load_dotenv(dotenv_path="../.env")
app_api_key = os.environ.get("PDF_SPLITTER_API_KEY")

In [None]:
# Base URL of the splitter app. Set PDF_SPLITTER_SERVICE_URL (for example in
# ../.env) to target another deployment or a local instance such as
# http://localhost:5006; otherwise the deployed container app below is used.
# A trailing slash is removed so that f"{service_url}/path" never doubles it.
service_url = os.getenv(
    "PDF_SPLITTER_SERVICE_URL",
    "https://ml-pdf-splitter-contapp.ashygrass-329b8f02.uksouth.azurecontainerapps.io",
).rstrip("/")

#### APP API STATUS CHECK

In [5]:
# Call the app's /status endpoint, passing the API key in the X-API-Key header.
print("Sending GET request to {}/status".format(service_url))
resp = requests.get(
    url="{}/status".format(service_url),
    headers={"X-API-Key": app_api_key},
    timeout=5,
)

# The captured run printed: 200 {'status': 'OK'}
print(resp.status_code, resp.json())

Sending GET request to https://ml-pdf-splitter-contapp.ashygrass-329b8f02.uksouth.azurecontainerapps.io/status
200 {'status': 'OK'}


#### CHECK STATUS OF FUNCTION VIA APP

In [21]:
import json
# Call the app's /function_status endpoint and show both the extracted test
# text and the complete JSON payload.
print(f"Sending GET request to {service_url}/function_status")
status_reply = requests.get(
    f'{service_url}/function_status',
    headers={
        "X-API-Key": app_api_key
    },
    timeout=30,  # do not let an unresponsive service hang the kernel
)
# Report HTTP errors directly instead of failing later while reading the body
status_reply.raise_for_status()
response = status_reply.content  # raw bytes, kept under the original name
function_status = json.loads(response)  # parse once and reuse below
print("_" * 30 + '\n',"EXTRACTED TEXT: \n", function_status['test_response']['text'])
print("\n" + "_" * 30 + '\n', "JSON RESPONSE: \n")
function_status


Sending GET request to https://ml-pdf-splitter-contapp.ashygrass-329b8f02.uksouth.azurecontainerapps.io/function_status
______________________________
 EXTRACTED TEXT: 
 Test document PDF 
 
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla est purus, ultrices in porttitor 
in, accumsan non quam. Nam consectetur porttitor rhoncus. Curabitur eu est et leo feugiat 
auctor vel quis lorem. Ut et ligula dolor, sit amet consequat lorem. Aliquam porta eros sed 
velit imperdiet egestas. Maecenas tempus eros ut diam ullamcorper id dictum libero 
tempor. Donec quis augue quis magna condimentum lobortis. Quisque imperdiet ipsum vel 
magna viverra rutrum. Cras viverra molestie urna, vitae vestibulum turpis varius id. 
Vestibulum mollis, arcu iaculis bibendum varius, velit sapien blandit metus, ac posuere lorem 
nulla ac dolor. Maecenas urna elit, tincidunt in dapibus nec, vehicula eu dui. Duis lacinia 
fringilla massa. Cum sociis natoque penatibus et magnis dis parturient montes, nasc

{'status': 'ok',
 'message': 'Service is running and can access the PDF extraction API',
 'pdf_extraction_api': 'accessible',
 'test_response': {'stage': 'test',
  'time_taken': 0.010281801223754883,
  'pdf_id': 0,
  'text': 'Test\xa0document\xa0PDF\xa0\n\xa0\nLorem\xa0ipsum\xa0dolor\xa0sit\xa0amet,\xa0consectetur\xa0adipiscing\xa0elit.\xa0Nulla\xa0est\xa0purus,\xa0ultrices\xa0in\xa0porttitor\xa0\nin,\xa0accumsan\xa0non\xa0quam.\xa0Nam\xa0consectetur\xa0porttitor\xa0rhoncus.\xa0Curabitur\xa0eu\xa0est\xa0et\xa0leo\xa0feugiat\xa0\nauctor\xa0vel\xa0quis\xa0lorem.\xa0Ut\xa0et\xa0ligula\xa0dolor,\xa0sit\xa0amet\xa0consequat\xa0lorem.\xa0Aliquam\xa0porta\xa0eros\xa0sed\xa0\nvelit\xa0imperdiet\xa0egestas.\xa0Maecenas\xa0tempus\xa0eros\xa0ut\xa0diam\xa0ullamcorper\xa0id\xa0dictum\xa0libero\xa0\ntempor.\xa0Donec\xa0quis\xa0augue\xa0quis\xa0magna\xa0condimentum\xa0lobortis.\xa0Quisque\xa0imperdiet\xa0ipsum\xa0vel\xa0\nmagna\xa0viverra\xa0rutrum.\xa0Cras\xa0viverra\xa0molestie\xa0urna,\xa0vitae\x

Test document PDF 
 
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla est purus, ultrices in porttitor 
in, accumsan non quam. Nam consectetur porttitor rhoncus. Curabitur eu est et leo feugiat 
auctor vel quis lorem. Ut et ligula dolor, sit amet consequat lorem. Aliquam porta eros sed 
velit imperdiet egestas. Maecenas tempus eros ut diam ullamcorper id dictum libero 
tempor. Donec quis augue quis magna condimentum lobortis. Quisque imperdiet ipsum vel 
magna viverra rutrum. Cras viverra molestie urna, vitae vestibulum turpis varius id. 
Vestibulum mollis, arcu iaculis bibendum varius, velit sapien blandit metus, ac posuere lorem 
nulla ac dolor. Maecenas urna elit, tincidunt in dapibus nec, vehicula eu dui. Duis lacinia 
fringilla massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur 
ridiculus mus. Ut consequat ultricies est, non rhoncus mauris congue porta. Vivamus viverra 
suscipit felis eget condimentum. Cum sociis natoque penatibus et magnis

#### EXTRACT PDF TO TEXT VIA SPLITTER APP (FULL PIPELINE TEST)

In [23]:
def test_extract_text_endpoint(url, pdf_path, app_api_key):
    # Endpoint URL
    endpoint = f"{url}/extract-text/"
    print(f"Sending POST request to {endpoint}")

    # Open the PDF file in binary mode
    with open(pdf_path, 'rb') as pdf_file:
        # Prepare the files for the request
        files = {'file': ('test.pdf', pdf_file, 'application/pdf')}

        # Send POST request to the endpoint
        response = requests.post(
            endpoint,
            headers={"X-API-Key": app_api_key},
            files=files
        )

    # Check the response
    if response.status_code == 200:
        print("Request successful!")
        text = json.loads(response.content)['text']
        print(text)
        return response
    else:
        print(f"Request failed with status code: {response.status_code}")
        print("Error message:", response.text)
        return response

In [24]:
# TEST CASE 1: send the lorem-ipsum sample through the /extract-text/ endpoint.
pdf_path = "../data/lorem_ipsum.pdf"  # Adjust this to your PDF file's path
response = test_extract_text_endpoint(
    url=service_url,
    pdf_path=pdf_path,
    app_api_key=app_api_key,
)


Sending POST request to https://ml-pdf-splitter-contapp.ashygrass-329b8f02.uksouth.azurecontainerapps.io/extract-text/
Request failed with status code: 404
Error message: {"detail":"Not Found"}


In [55]:
# TEST CASE 2
pdf_path = "../data/(1000679622) IPID - Private Car Quote from ERS_120149420.pdf"  # Adjust this to your PDF file's path
# The helper prints the "Sending POST request ..." line itself, so it is not repeated here.
response = test_extract_text_endpoint(service_url, pdf_path, app_api_key)
# Only read the text from a successful reply; a failed one (already reported by
# the helper) has no 'text' field to read.
if response.status_code == 200:
    text = response.json()['text']
    print('_' * 80, '\n', text)

Sending POST request to https://ml-pdf-splitter-ml-env-3.thankfulwater-80ea82bb.uksouth.azurecontainerapps.io/extract-text/
Request successful!
________________________________________________________________________________ 
 Private Car Motor Insurance
Insurance Product Information Document
This insurance is underwritten by ERS (Syndicate 218 at Lloyd’s) which is registered in the UK. ERS Syndicate Management Limited is authorised by
the Prudential Regulation Authority and regulated by the Financial Conduct Authority and Prudential Regulation Authority. Registered number: 204851.
This document provides a summary of the key information. It does not contain the full terms and conditions; these can be found in your policy document.
You can find your policy document at www.ers.com.
What is this type of insurance?
Comprehensive cover – You are covered for loss or damage caused by accidental damage, fire, theft and third party liability cover.
What is insured?
Whilst driving your vehicle y