## Setting Up Azure Document Intelligence

In [None]:
# !pip install azure-ai-documentintelligence==1.0.0b4

In [5]:
import os

# Get Configuration Settings
from dotenv import load_dotenv
load_dotenv()

False

In [6]:
# import libraries
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

In [13]:
# Service endpoint and key are read from the environment (for example, populated from a .env file).
# os.getenv returns None when a variable is unset, so a missing setting only surfaces
# later, when these values are used to build a client.
endpoint = os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT")
key = os.getenv("DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY")

In [8]:
def get_words(page, line):
    """Return the words of ``page`` that lie inside any of ``line``'s spans.

    Args:
        page: Object exposing ``words``; each word has ``span.offset`` and
            ``span.length``.
        line: Object exposing ``spans``; each span has ``offset`` and ``length``.

    Returns:
        A list of the matching word objects, in the order they appear in
        ``page.words``. Empty when the page has no words.
    """
    # NOTE(review): the SDK models ``words`` as optional, so it may be None;
    # treat a missing list as "no words" instead of failing on iteration.
    return [word for word in (page.words or []) if _in_span(word, line.spans)]


# To learn the detailed concept of "span" used in the code below, visit: https://aka.ms/spans
def _in_span(word, spans):
    """Tell whether ``word``'s span is fully contained in at least one of ``spans``.

    Containment is inclusive on both ends: the word must start at or after the
    span's offset and end at or before the span's end.
    """
    word_start = word.span.offset
    word_end = word_start + word.span.length
    return any(
        span.offset <= word_start and word_end <= span.offset + span.length
        for span in spans
    )

In [9]:
def analyze_read(form_url=None):
    """Analyze a document with the "prebuilt-read" model and print the findings.

    Prints the detected languages, then for every page its dimensions, lines
    (with their words and confidences), and finally the detected paragraphs.

    Args:
        form_url: URL of the document to analyze. When omitted, a public
            sample image from the Azure-Samples repository is used.
            If you use the URL of a public website, see https://aka.ms/more-URLs.
            For a document in Blob Storage a public SAS URL is needed, see
            https://aka.ms/create-sas-tokens.

    Raises:
        KeyError: If DOCUMENT_INTELLIGENCE_ENDPOINT or
            DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY is not set in the environment.
    """
    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import DocumentAnalysisFeature, AnalyzeResult, AnalyzeDocumentRequest

    # Endpoint and key come from the environment; indexing os.environ fails fast if either is missing.
    endpoint = os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"]
    key = os.environ["DOCUMENT_INTELLIGENCE_SUBSCRIPTION_KEY"]

    document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

    # Analyze a document at a URL (falls back to the sample image when no URL is given).
    if form_url is None:
        form_url = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/rest-api/read.png"
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read",
        AnalyzeDocumentRequest(url_source=form_url),
        features=[DocumentAnalysisFeature.LANGUAGES],
    )

    # # If analyzing a local document, remove the comment markers (#) from the code lines below.
    # # Delete or comment out the part of "Analyze a document at a URL" above.
    # # Replace <path to your sample file>  with your actual file path.
    # path_to_sample_document = "<path to your sample file>"
    # with open(path_to_sample_document, "rb") as f:
    #     poller = document_intelligence_client.begin_analyze_document(
    #         "prebuilt-read",
    #         analyze_request=f,
    #         features=[DocumentAnalysisFeature.LANGUAGES],
    #         content_type="application/octet-stream",
    #     )
    result: AnalyzeResult = poller.result()

    # [START analyze_read]
    # Detect languages.
    print("----Languages detected in the document----")
    if result.languages is not None:
        for language in result.languages:
            print(f"Language code: '{language.locale}' with confidence {language.confidence}")

    # To learn the detailed concept of "bounding polygon" in the following content, visit: https://aka.ms/bounding-region
    # Analyze pages.
    for page in result.pages:
        print(f"----Analyzing document from page #{page.page_number}----")
        print(f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}")

        # Analyze lines.
        if page.lines:
            for line_idx, line in enumerate(page.lines):
                # Words are stored per page; pick the ones whose span falls within this line.
                words = get_words(page, line)
                print(
                    f"...Line # {line_idx} has {len(words)} words and text '{line.content}' within bounding polygon '{line.polygon}'"
                )

                # Analyze words.
                for word in words:
                    print(f"......Word '{word.content}' has a confidence of {word.confidence}")

    # Analyze paragraphs.
    if result.paragraphs:
        print(f"----Detected #{len(result.paragraphs)} paragraphs in the document----")
        for paragraph in result.paragraphs:
            print(f"Found paragraph within {paragraph.bounding_regions} bounding region")
            print(f"...with content: '{paragraph.content}'")

    print("----------------------------------------")
    # [END analyze_read]

In [11]:

if __name__ == "__main__":
    from azure.core.exceptions import HttpResponseError
    from dotenv import find_dotenv, load_dotenv

    try:
        # Populate the environment from the nearest .env file, then run the sample.
        load_dotenv(find_dotenv())
        analyze_read()
    except HttpResponseError as error:
        # Examples of how to inspect an HttpResponseError before propagating it.
        service_error = error.error
        if service_error is None:
            # No structured error attached: the message text is the only clue available.
            if "Invalid request".casefold() in error.message.casefold():
                print(f"Uh-oh! Seems there was an invalid request: {error}")
        elif service_error.code == "InvalidImage":
            print(f"Received an invalid image error: {error.error}")
        elif service_error.code == "InvalidRequest":
            print(f"Received an invalid request error: {error.error}")
        # Whatever was (or was not) printed, the error is always raised again.
        raise

----Languages detected in the document----
Language code: 'en' with confidence 1
Language code: 'en' with confidence 0.95
Language code: 'en' with confidence 0.8
Language code: 'en' with confidence 0.9
Language code: 'en' with confidence 0.99
Language code: 'en' with confidence 0.7
----Analyzing document from page #1----
Page has width: 915 and height: 1190, measured with unit: LengthUnit.PIXEL
...Line # 0 has 13 words and text 'While healthcare is still in the early stages of its Al journey, we' within bounding polygon '[259, 55, 817, 56, 817, 78, 259, 76]'
......Word 'While' has a confidence of 0.996
......Word 'healthcare' has a confidence of 0.995
......Word 'is' has a confidence of 0.999
......Word 'still' has a confidence of 0.997
......Word 'in' has a confidence of 0.998
......Word 'the' has a confidence of 0.999
......Word 'early' has a confidence of 0.997
......Word 'stages' has a confidence of 0.997
......Word 'of' has a confidence of 0.999
......Word 'its' has a confidence o

## Exploring formula extraction output

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeOutputOption, AnalyzeResult, DocumentAnalysisFeature


# Notebook-level client built from the `endpoint` and `key` variables defined in an earlier cell;
# the extraction functions further down use this same client.
document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))

# Send the local PDF as a raw byte stream to the "prebuilt-read" model with the
# FORMULAS feature enabled and PDF output requested.
with open("./docs/sees_gpt_test.pdf", "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read",
        analyze_request=f,
        features=[DocumentAnalysisFeature.FORMULAS],
        output=[AnalyzeOutputOption.PDF],
        content_type="application/octet-stream",
    )
# Wait for the analysis to complete and keep the result for the following cells.
result: AnalyzeResult = poller.result()
# Operation id of this analysis; the commented-out code below passes it as `result_id`.
operation_id = poller.details["operation_id"]

# Optional: fetch the PDF output of the analysis and save it locally.
# response = document_intelligence_client.get_analyze_result_pdf(model_id=result.model_id, result_id=operation_id)
# with open("analyze_result.pdf", "wb") as writer:
#     writer.writelines(response)

In [16]:
import json

# Save the analysis result (converted to a dictionary) to a JSON file,
# indented by 4 spaces so the response structure can be inspected by hand.
with open('analysis.json', 'w') as json_file:
    json.dump(result.as_dict(), json_file, indent=4)

## Formula extraction from a document

In [17]:
def extract_texts_and_formulas(file_path):
    """Extract a document's text with each formula placeholder replaced by its formula.

    The file is analyzed with the "prebuilt-read" model and the FORMULAS
    feature, using the notebook-level ``document_intelligence_client``. Formula
    values are collected page by page and substituted, in that order, for the
    ``:formula:`` placeholders found in the result content.

    Args:
        file_path: Path of the local document to analyze.

    Returns:
        The result content as a string with every placeholder substituted.

    Raises:
        AssertionError: If the number of collected formulas differs from the
            number of ``:formula:`` placeholders in the content.
    """
    placeholder = ":formula:"

    with open(file_path, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            analyze_request=f,
            features=[DocumentAnalysisFeature.FORMULAS],
            # output=[AnalyzeOutputOption.PDF],
            content_type="application/octet-stream",
        )
    result: AnalyzeResult = poller.result()

    formulas = []
    for page in result.pages:
        print(f"----Formulas detected from page #{page.page_number}----")
        if page.formulas:
            formulas.extend(formula.value for formula in page.formulas)

    print()
    # Splitting on the placeholder yields one more text segment than there are placeholders.
    segments = result.content.split(placeholder)
    assert len(formulas) == len(segments) - 1, (
        f"collected {len(formulas)} formulas but the content has "
        f"{len(segments) - 1} '{placeholder}' placeholders"
    )
    print("Formula extraction complete!")
    print()

    # Interleave text segments and formulas in a single pass. Unlike repeated
    # str.replace(..., 1), this never rescans text that was already processed and
    # cannot substitute inside a formula that was inserted earlier.
    pieces = [segments[0]]
    for formula, following_text in zip(formulas, segments[1:]):
        pieces.append(formula)
        pieces.append(following_text)

    return "".join(pieces)

In [18]:
# Run the extraction on the sample PDF and print the text with the formulas filled in.
print(extract_texts_and_formulas("./docs/sees_gpt_test.pdf"))

----Formulas detected from page #1----
----Formulas detected from page #2----
----Formulas detected from page #3----
----Formulas detected from page #4----
----Formulas detected from page #5----
----Formulas detected from page #6----
----Formulas detected from page #7----
----Formulas detected from page #8----
----Formulas detected from page #9----
----Formulas detected from page #10----

Formula extraction complete!

13 Partial Derivatives
To this point, with the exception of the occasional section in the last chapter, we've been working almost exclusively with functions of a single variable. It is now time to formally start multi-variable Calculus, i.e. Calculus involving functions of two or more variables. We will be covering the same basic topics as we do with single variable Calculus. Namely, limits, derivatives and integrals.
In this chapter we will open up with a quick section discussing taking limits of multi-variable func- tions. We will only be covering limits of multi-variab

In [None]:
# Write as UTF-8 explicitly: the extracted text may contain non-ASCII characters
# that the platform's default encoding cannot represent.
# NOTE(review): other cells read "./docs/sees_gpt_test.pdf" - confirm this input path is intended.
with open("sees_gpt_test.txt", "w", encoding="utf-8") as file:
    file.write(extract_texts_and_formulas("../sees_gpt_test.pdf"))

## Extracting PDF content with Azure Document Intelligence

In [20]:
def extract_texts(file_path):
    """Return the text content of a local document read by the "prebuilt-read" model.

    The file is sent as a raw byte stream through the notebook-level
    ``document_intelligence_client``; no extra analysis features are requested.

    Args:
        file_path: Path of the local document to analyze.

    Returns:
        The ``content`` of the analysis result.
    """
    with open(file_path, "rb") as document_stream:
        analysis_poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-read",
            analyze_request=document_stream,
            content_type="application/octet-stream",
        )

    analysis: AnalyzeResult = analysis_poller.result()
    return analysis.content

In [21]:
# Extract the plain text of the WHO sample PDF and save it next to the notebook.
extracted_content = extract_texts("./docs/WHO-doc-snippet.pdf")
# UTF-8 is set explicitly so the output does not depend on the platform default encoding.
with open("WHO-doc-snippet-extracted.txt", "w", encoding="utf-8") as f:
    f.write(extracted_content)
