In [5]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature
import os
import pandas as pd
import re
from dotenv import load_dotenv
load_dotenv()

True

# Table

In [6]:
key = os.environ.get('OCR_KEY')
endpoint = os.environ.get('LOCAL_ENDPOINT')
credential = AzureKeyCredential(key)
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential)

doc_path ='data/table-test-document.pdf'

with open(doc_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read", 
        analyze_request=f, 
        content_type="application/octet-stream", # generic file type
    )
result = poller.result()

ResourceNotFoundError: Operation returned an invalid status 'Not Found'

In [5]:
import uuid

tabel_selection_method = 'index'
table_output_format = 'reference'
select_table = True
table_idx = 0
table_output_caslib = 'work'

def map_parsing(result) -> pd.DataFrame:
    tables = []

    # extract all table data
    for index, table in enumerate(result.tables):
        if table_output_format.upper() == 'MAP':
            dict = table.as_dict()
            df = pd.DataFrame.from_dict(dict['cells'])

            # extract page_number and polygon coordinates
            df['page'] = df['boundingRegions'].apply(lambda x: x[0]['pageNumber'])
            df['table_index'] = index
            df['polygon'] = df['boundingRegions'].apply(lambda x: x[0]['polygon'])

            # extract polygon coordinates
            df['x1'] = df['polygon'].apply(lambda x: x[0])
            df['y1'] = df['polygon'].apply(lambda x: x[1])
            df['x2'] = df['polygon'].apply(lambda x: x[2])
            df['y2'] = df['polygon'].apply(lambda x: x[3])
            df['x3'] = df['polygon'].apply(lambda x: x[4])
            df['y3'] = df['polygon'].apply(lambda x: x[5])
            df['x4'] = df['polygon'].apply(lambda x: x[6])
            df['y4'] = df['polygon'].apply(lambda x: x[7])

            # extract offset and length
            df['offset'] = df['spans'].apply(lambda x: int(x[0].get('offset')) if x else None)
            df['length'] = df['spans'].apply(lambda x: int(x[0].get('length')) if x else None)

            # drop unnecessary columns
            df.drop(columns=['boundingRegions','spans', 'polygon'], inplace=True)

            table_info = {
                'table_index': index,
                'row_count': table.row_count,
                'column_count': table.column_count,
                'cell_count': table.row_count*table.column_count,
                'table': df
            }

            tables.append(table_info)

    # select specific table (optional)
    if select_table:
        if tabel_selection_method.upper() == 'INDEX':
            parsed_result = tables[table_idx]['table']
        elif tabel_selection_method.upper() == 'SIZE':
            # Find the entry with the highest cell_count using max function
            table_most_cells = max(tables, key=lambda x: x['cell_count'], default=None)
            parsed_result = table_most_cells['table'] if table_most_cells else None

    else:
        # combine all extracted tables (only works for output type 'map')
        parsed_result = pd.concat([table['table'] for table in tables], ignore_index=True)

    return parsed_result

def result_to_dfs(result) -> list:
    tables = []
    for table in result.tables:
        table_df = pd.DataFrame(columns=range(table.column_count), index=range(table.row_count))

        for cell in table.cells:
            table_df.iloc[cell.row_index, cell.column_index] = cell.content

        # use the first row as column names
        table_df.columns = table_df.iloc[0]
        table_df = table_df[1:]
        
        tables.append(table_df)
    return tables

def reference_parsing(result) -> pd.DataFrame: # TODO
    tables = result_to_dfs(result)
    table_info = []

    for table in tables:
        reference = uuid.uuid4()
        reference = re.sub(r'^\w{3}', 'tbl_', str(reference))
        reference = reference.replace('-', '')

        try: 
            print(f'Save table {reference} to caslib {table_output_caslib}')
        except Exception as e:
            print(f'Failed to save table {reference} to caslib {table_output_caslib}')
            raise e
        
        table_info.append({
            'reference': reference,
            'row_count': table.shape[0],
            'column_count': table.shape[1],
            'out_caslib': table_output_caslib
        })

    return pd.DataFrame(table_info)

def table_parsing(result) -> pd.DataFrame: #TODO
    tables = result_to_dfs(result)
    select_table = True

    # select specific table 
    if select_table:
        if tabel_selection_method.upper() == 'INDEX': # Table with index == table_idx
            parsed_result = tables[table_idx]
        elif tabel_selection_method.upper() == 'SIZE': # Table with most cells
            table_most_cells = max(tables, key=lambda x: x.size, default=None)
            parsed_result = table_most_cells if table_most_cells else None

        else:
            raise ValueError(f'Invalid table selection method: {tabel_selection_method}')

    return parsed_result

parsing_methods = {
    'MAP': map_parsing,
    'REFERENCE': reference_parsing,
    'TABLE': table_parsing
}

def parse_ocr_result(result) -> pd.DataFrame:
    # call one of the parsing methods depending on the output format
    parsing_method = table_output_format.upper()
    parsed_result = parsing_methods.get(parsing_method)(result = result)

    return parsed_result


Save table tbl_f2544e783458aa85684e89e8e33a7 to caslib work
Save table tbl_de94472d64528a029e8f817a2b750 to caslib work


Unnamed: 0,reference,row_count,column_count,out_caslib
0,tbl_f2544e783458aa85684e89e8e33a7,5,5,work
1,tbl_de94472d64528a029e8f817a2b750,5,3,work


In [104]:
tables = result_to_dfs(result)

table_most_cells = max(tables, key=lambda x: x.size, default=None)

table_most_cells

Unnamed: 0,name,email,phone,birthdate,married
1,Jens,mail@mail.com,17526736746,12.05.92,no
2,Hans,mail@mail.com,17526736746,13.05.92,no
3,Mario,mail@mail.com,17526736746,14.05.92,no
4,Judith,,17526736746,15.05.92,no
5,Lea,mail@mail.com,17526736746,16.05.92,no


In [100]:
tables = []
for table in result.tables:
    table_df = pd.DataFrame(columns=range(table.column_count), index=range(table.row_count))

    for cell in table.cells:
        table_df.iloc[cell.row_index, cell.column_index] = cell.content

    # use the first row as column names
    table_df.columns = table_df.iloc[0]
    table_df = table_df[1:]
    
    tables.append(table_df)

tables[0].head()


Unnamed: 0,name,email,phone,birthdate,married
1,Jens,mail@mail.com,17526736746,12.05.92,no
2,Hans,mail@mail.com,17526736746,13.05.92,no
3,Mario,mail@mail.com,17526736746,14.05.92,no
4,Judith,,17526736746,15.05.92,no
5,Lea,mail@mail.com,17526736746,16.05.92,no


# KEY VALUE

In [220]:
key = os.environ.get('OCR_KEY')
endpoint = os.environ.get('OCR_ENDPOINT')
credential = AzureKeyCredential(key)
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential, api_version='2023-10-31-preview')

In [224]:
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature

doc_path ='data/patient_intake_form_sample.jpg'

with open(doc_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", 
        analyze_request=f, 
        content_type="application/octet-stream", # generic file type
        #content_type="Image/JPEG",
        #content_type="Image/PNG",
        #content_type='Application/PDF':
        features=['keyValuePairs']#, DocumentAnalysisFeature.QUERY_FIELDS],
        #query_fields=["First_Name", "City", "last_name"],
    )
result = poller.result()

In [262]:
# https://learn.microsoft.com/en-us/python/api/azure-ai-documentintelligence/azure.ai.documentintelligence.models.analyzeresult?view=azure-python-preview
result.key_value_pairs.key[0]


AttributeError: 'list' object has no attribute 'key'

In [283]:
def parse_ocr_result(result):
    key_value_pairs = result.key_value_pairs
    form_data = []
    for pair in key_value_pairs:
        # get key info
        page_number = pair.key.bounding_regions[0].page_number
        key = pair.key.content
        key_x1, key_y1, key_x2, key_y2, key_x3, key_y3, key_x4, key_y4 = pair.key.bounding_regions[0].polygon
        key_offset = pair.key.spans[0].offset
        key_length = pair.key.spans[0].length
        
        # get value info
        value = pair.get('value', None)
        value_x1 = value_y1 = value_x2 = value_y2 = value_x3 = value_y3 = value_x4 = value_y4 = None
        value_offset = value_length = None

        if value is not None:
            value = value.get('content', None)
            value_x1, value_y1, value_x2, value_y2, value_x3, value_y3, value_x4, value_y4 = pair.value.bounding_regions[0].polygon
            value_offset = pair.value.spans[0].offset
            value_length = pair.value.spans[0].length

        key_value = {
            'page_number': page_number,
            'key': key,
            'value': value,
            'key_x1': key_x1,
            'key_y1': key_y1,
            'key_x2': key_x2,
            'key_y2': key_y2,
            'key_x3': key_x3,
            'key_y3': key_y3,
            'key_x4': key_x4,
            'key_y4': key_y4,
            'key_offset': key_offset,
            'key_length': key_length,
            'value_x1': value_x1,
            'value_y1': value_y1,
            'value_x2': value_x2,
            'value_y2': value_y2,
            'value_x3': value_x3,
            'value_y3': value_y3,
            'value_x4': value_x4,
            'value_y4': value_y4,
            'value_offset': value_offset,
            'value_length': value_length,
        }

        form_data.append(key_value)
    
    df = pd.DataFrame(form_data)

    return df

In [285]:
parse_ocr_result(result).head()

Unnamed: 0,page_number,key,value,key_x1,key_y1,key_x2,key_y2,key_x3,key_y3,key_x4,...,value_x1,value_y1,value_x2,value_y2,value_x3,value_y3,value_x4,value_y4,value_offset,value_length
0,1,First Name:,ALEJANDRO,74,421,332,424,332,467,74,...,408.0,340.0,1035.0,370.0,1033.0,468.0,405.0,463.0,32.0,9.0
1,1,Last Name:,ROSALEZ,1111,425,1364,430,1363,472,1110,...,1389.0,370.0,1878.0,370.0,1877.0,471.0,1388.0,469.0,53.0,7.0
2,1,Date of Birth:,10/10/1982,2114,428,2418,428,2418,474,2114,...,2470.0,397.0,2918.0,389.0,2916.0,473.0,2468.0,471.0,76.0,10.0
3,1,Sex:,,81,568,179,570,179,609,80,...,,,,,,,,,,
4,1,Marital Status:,MARRIED,498,569,828,571,828,613,498,...,881.0,519.0,1232.0,520.0,1227.0,610.0,877.0,604.0,108.0,7.0


In [286]:
with open(doc_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", 
        analyze_request=f, 
        content_type="application/octet-stream", # generic file type
        features=[DocumentAnalysisFeature.QUERY_FIELDS],
        query_fields=["First_Name", "City", "last_name"],
    )
result = poller.result()

In [290]:
result.content

'Patient Information\nFirst Name:\nALEJANDRO\nLast Name:\nROSALEZ\nDate of Birth:\n10/10/1982\nSex:\nMarital Status:\nMARRIED\nEmail Address:\nAddress:\n123 ANY STREET\nCity:\nANYTOWN\nState:\nCA\nZip Code:\n12345\nPhone:\n646-555-0111\nEmergency Contact 1:\nFirst Name:\nCARLOS\nLast Name:\nSALAZAR\nPhone:\n212-555-0150\nRelationship to Patient:\nBROTHER\nEmergency Contact 2:\nFirst Name:\nJANE\nPhone:\n650-555-0123\nLast Name:\nDOE\nRelationship to Patient:\nFRIEND\nDid you feel fever or feverish lately? :selected: Yes :unselected: No\nAre you having shortness of breath? :unselected: Yes :selected: No\nDo you have a cough? :unselected: Yes :selected: No\nDid you experience loss of taste or smell? :unselected: Yes :selected: No\nWhere you in contact with any confirmed COVID-19 positive patients? :selected: Yes :unselected: No\nDid you travel in the past 14 days to any regions affected by COVID-19? :unselected: Yes :selected: No'

In [93]:
doc = result.documents[0]

In [94]:
doc.fields

{'City': {'type': 'string', 'valueString': 'ANYTOWN', 'content': 'ANYTOWN', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2122, 693, 2555, 686, 2555, 779, 2120, 778]}], 'confidence': 0.916, 'spans': [{'offset': 161, 'length': 7}]},
 'First_Name': {'type': 'string', 'valueString': 'ALEJANDRO', 'content': 'ALEJANDRO', 'boundingRegions': [{'pageNumber': 1, 'polygon': [408, 340, 1035, 370, 1033, 468, 405, 463]}], 'confidence': 0.854, 'spans': [{'offset': 32, 'length': 9}]},
 'last_name': {'type': 'string', 'valueString': 'ROSALEZ', 'content': 'ROSALEZ', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1389, 370, 1878, 370, 1877, 471, 1388, 469]}], 'confidence': 0.888, 'spans': [{'offset': 53, 'length': 7}]}}

In [88]:
# print all attributes of result
for attr in dir(result):
    print(attr)

_MutableMapping__marker
__abstractmethods__
__annotations__
__class__
__class_getitem__
__contains__
__delattr__
__delitem__
__dict__
__dir__
__doc__
__eq__
__format__
__ge__
__getattribute__
__getitem__
__getstate__
__gt__
__hash__
__init__
__init_subclass__
__iter__
__le__
__len__
__lt__
__module__
__ne__
__new__
__orig_bases__
__reduce__
__reduce_ex__
__repr__
__reversed__
__setattr__
__setitem__
__sizeof__
__slots__
__str__
__subclasshook__
__weakref__
_abc_impl
_as_dict_value
_attr_to_rest_field
_data
_deserialize
_get_discriminator
_is_model
api_version
as_dict
clear
content
content_format
copy
documents
figures
get
items
key_value_pairs
keys
languages
lists
model_id
pages
paragraphs
pop
popitem
sections
setdefault
string_index_type
styles
tables
update
values


# Query Fields

In [291]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature

key = os.environ.get('OCR_KEY')
endpoint = os.environ.get('OCR_ENDPOINT')
url = "https://raw.githubusercontent.com/Azure/azure-sdk-for-python/main/sdk/documentintelligence/azure-ai-documentintelligence/samples/sample_forms/forms/Invoice_1.pdf"

client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key))
poller = client.begin_analyze_document(
    "prebuilt-layout",
    AnalyzeDocumentRequest(url_source=url),
    features=[DocumentAnalysisFeature.QUERY_FIELDS],
    query_fields=["NumberOfGuests", "StoreNumber"],
)
result = poller.result()

KeyboardInterrupt: 

In [293]:
with open(doc_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", 
        analyze_request=f, 
        content_type="application/octet-stream", # generic file type
        features=[DocumentAnalysisFeature.QUERY_FIELDS],
        query_fields=["First_Name", "City", "last_name"],
    )
result = poller.result()

In [307]:
result.documents[0].fields.get('First_Name')#.bounding_regions[0].page_number

{'type': 'string', 'valueString': 'ALEJANDRO', 'content': 'ALEJANDRO', 'boundingRegions': [{'pageNumber': 1, 'polygon': [408, 340, 1035, 370, 1033, 468, 405, 463]}], 'confidence': 0.854, 'spans': [{'offset': 32, 'length': 9}]}

In [355]:
result.documents[0].fields['City'].spans[0].offset

161

In [347]:
result.documents[0].fields.keys()

dict_keys(['City', 'First_Name', 'last_name'])

In [357]:
def parse_query_results(result, query_list):
    query_data = []

    for doc in result.documents:
        for query in query_list:

            x1, y1, x2, y2, x3, y3, x4, y4 = doc.fields.get(query).bounding_regions[0].polygon

            query_info = {
                'key': query,
                'value': doc.fields.get(query).content,
                'page_number': doc.fields.get(query).bounding_regions[0].page_number,
                'confidence': doc.fields.get(query).confidence,
                'type': doc.fields.get(query).type,
                'x1': x1,
                'y1': y1,
                'x2': x2,
                'y2': y2,
                'x3': x3,
                'y3': y3,
                'x4': x4,
                'y4': y4,
                'offset': doc.fields.get(query).spans[0].offset,
                'length': doc.fields.get(query).spans[0].length,
            }

            query_data.append(query_info)

    df = pd.DataFrame(query_data)

    return df

                
               

query_fields=["First_Name", "City", "last_name"]
parse_query_results(result, query_fields)

{'type': 'string', 'valueString': 'ALEJANDRO', 'content': 'ALEJANDRO', 'boundingRegions': [{'pageNumber': 1, 'polygon': [408, 340, 1035, 370, 1033, 468, 405, 463]}], 'confidence': 0.854, 'spans': [{'offset': 32, 'length': 9}]}
{'type': 'string', 'valueString': 'ANYTOWN', 'content': 'ANYTOWN', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2122, 693, 2555, 686, 2555, 779, 2120, 778]}], 'confidence': 0.916, 'spans': [{'offset': 161, 'length': 7}]}
{'type': 'string', 'valueString': 'ROSALEZ', 'content': 'ROSALEZ', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1389, 370, 1878, 370, 1877, 471, 1388, 469]}], 'confidence': 0.888, 'spans': [{'offset': 53, 'length': 7}]}


Unnamed: 0,key,value,page_number,confidence,type,x1,y1,x2,y2,x3,y3,x4,y4,offset,length
0,First_Name,ALEJANDRO,1,0.854,string,408,340,1035,370,1033,468,405,463,32,9
1,City,ANYTOWN,1,0.916,string,2122,693,2555,686,2555,779,2120,778,161,7
2,last_name,ROSALEZ,1,0.888,string,1389,370,1878,370,1877,471,1388,469,53,7


In [324]:
def prepare_query(query_string):
    query_list = query_string.split(',')
    query_list = [q.strip() for q in query_list] # remove leading and trailing whitespace
    query_list = [q.replace(' ', '_') if ' ' in q else q for q in query_list] # replace spaces with underscores
        
    for q in query_list:
        try:
            re.compile(q)
        except re.error:
            raise re.error
        
    return query_list

In [358]:
query_string = "City, First name, last name" 
query_list = prepare_query(query_string)
query_list

['City', 'First_name', 'last_name']

In [360]:
import pandas as pd

# Example DataFrame
df = pd.DataFrame({
    'key': ['A', 'B', 'C'],
    'value': [1, 2, 3]
})

# Transpose DataFrame
df_transposed = df.set_index('key').T

print(df_transposed)



key    A  B  C
value  1  2  3


# Normal Text

In [98]:
key = os.environ.get('OCR_KEY')
endpoint = os.environ.get('OCR_ENDPOINT')
credential = AzureKeyCredential(key)
document_intelligence_client = DocumentIntelligenceClient(endpoint, credential, api_version='2023-10-31-preview')

In [215]:
doc_path ='data/letter-example.pdf'

with open(doc_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read", 
        analyze_request=f, 
        content_type="application/octet-stream", # generic file type
        #content_type="Image/JPEG",
        #content_type="Image/PNG",
        #content_type='Application/PDF':
        #features=['keyValuePairs', DocumentAnalysisFeature.QUERY_FIELDS],
        #query_fields=["First_Name", "City", "last_name"],
    )
result = poller.result()

In [219]:
result.pages[0].lines

[{'content': 'MICROSOFT OFFICE USER', 'polygon': [1.7287, 1.0026, 6.6854, 1.0026, 6.6854, 1.2938, 1.7287, 1.2891], 'spans': [{'offset': 0, 'length': 21}]},
 {'content': 'RECIPIENT NAME', 'polygon': [1.7525, 2.5734, 3.6388, 2.5734, 3.6388, 2.75, 1.7525, 2.7548], 'spans': [{'offset': 22, 'length': 14}]},
 {'content': 'Title | Company | Address | City, ST ZIP', 'polygon': [1.3657, 2.8407, 4.0542, 2.8407, 4.0542, 3.0078, 1.3657, 3.0078], 'spans': [{'offset': 37, 'length': 40}]},
 {'content': 'CONTACT', 'polygon': [5.8354, 2.602, 6.9289, 2.5972, 6.9289, 2.7691, 5.8354, 2.7739], 'spans': [{'offset': 78, 'length': 7}]},
 {'content': 'Address', 'polygon': [6.1267, 2.8837, 6.6758, 2.8837, 6.6758, 3.0078, 6.1267, 3.003], 'spans': [{'offset': 86, 'length': 7}]},
 {'content': 'City, ST ZIP', 'polygon': [6.0455, 3.0603, 6.757, 3.0603, 6.757, 3.2131, 6.0455, 3.2131], 'spans': [{'offset': 94, 'length': 12}]},
 {'content': 'Email', 'polygon': [6.2174, 3.2513, 6.5803, 3.2513, 6.5803, 3.3707, 6.2174, 3.

In [213]:
import io

with open('data/handwritten-form.jpg', 'rb') as document:
        document_file = io.BytesIO(document.read())

#document_file = io.BytesIO(document)


with open(doc_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-read", 
        analyze_request=document_file, 
        content_type="application/octet-stream", # generic file type
        #content_type="Image/JPEG",
        #content_type="Image/PNG",
        #content_type='Application/PDF':
        #features=['keyValuePairs', DocumentAnalysisFeature.QUERY_FIELDS],
        #query_fields=["First_Name", "City", "last_name"],
    )
result = poller.result()

In [193]:
def format_azure_polygon(polygon):
    if not polygon:
        return "N/A"

    if len(polygon) != 4:
        raise ValueError("Polygon should have exactly 4 coordinates.")
    
    x1, y1 = polygon[0]
    x2, y2 = polygon[1]
    x3, y3 = polygon[2]
    x4, y4 = polygon[3]

    return x1, y1, x2, y2, x3, y3, x4, y4


def parse_azure_ocr(result, level):
    # azure doesn't provide results on page level natively
    if (level.upper() == "PAGE"):
        lod = "LINE"
    else:
        lod = level.upper()

    for page in result.pages:
        contains_handwriting = result.styles[0].is_handwritten
        ocr_data = []
        
        # to calculate the average confidence
        if lod != "WORD":
            word_confidences = [word.confidence for word in page.words]
            total_confidence = sum(word_confidences)
            total_words = len(word_confidences)
            average_confidence = total_confidence / total_words if total_words > 0 else 0
            
        # extraction of (natively provided) results 
        if lod == "PARAGRPAH":
            for paragraph_idx, paragraph in enumerate(result.paragraphs):
                x1, y1, x2, y2, x3, y3, x4, y4 = paragraph.bounding_regions[0].polygon

                paragrpah_info = {
                    "page": paragraph.bounding_regions[0].page_number,
                    "paragraph": paragraph_idx,
                    "text": paragraph.content,
                    "Role": paragraph.role,
                    "bb_x1": x1,
                    "bb_y1": y1,
                    "bb_x2": x2,
                    "bb_y2": y2,
                    "bb_x3": x3,
                    "bb_y3": y3,
                    "bb_x4": x4,
                    "bb_y4": y4
                }
                
                ocr_data.append(paragrpah_info)

        elif lod == "LINE":
            for line_idx, line in enumerate(page.lines):
                x1, y1, x2, y2, x3, y3, x4, y4 = line.polygon

                line_info = {
                    "page": page.page_number,
                    "line": line_idx,
                    "text": line.content,
                    "bb_x1": x1,
                    "bb_y1": y1,
                    "bb_x2": x2,
                    "bb_y2": y2,
                    "bb_x3": x3,
                    "bb_y3": y3,
                    "bb_x4": x4,
                    "bb_y4": y4,
                    "offset": 
                }
                
                ocr_data.append(line_info)

        elif lod == "WORD":
            for word in page.words:
                x1, y1, x2, y2, x3, y3, x4, y4 = word.polygon

                word_info = {
                    "page": page.page_number,
                    "text": word.content,
                    "confidence": word.confidence,
                    "bb_x1": x1,
                    "bb_y1": y1,
                    "bb_x2": x2,
                    "bb_y2": y2,
                    "bb_x3": x3,
                    "bb_y3": y3,
                    "bb_x4": x4,
                    "bb_y4": y4,
                    }
                
                ocr_data.append(word_info)
        
        df = pd.DataFrame(ocr_data)

        # in case texts should be aggreagted on page level
        if level.upper() == "PAGE":
            ocr_data = []
            page_info = {
                    "page": page.page_number,
                    "text": "\n ".join(df['text']),
                    "confidence": average_confidence,
                    "contains_handwriting": contains_handwriting,
                    "bb_x1": df["bb_x1"].min(),
                    "bb_y1": df["bb_y1"].min(),
                    "bb_x2": df["bb_x2"].max(),
                    "bb_y2": df["bb_y2"].min(),
                    "bb_x3": df["bb_x3"].max(),
                    "bb_y3": df["bb_y3"].max(),
                    "bb_x4": df["bb_x4"].min(),
                    "bb_x4": df["bb_x4"].max(),
                    }
            ocr_data.append(page_info)
            
            df = pd.DataFrame(ocr_data)

    return df

In [211]:
parse_azure_ocr(result, "page")

Unnamed: 0,page,text,confidence,contains_handwriting,bb_x1,bb_y1,bb_x2,bb_y2,bb_x3,bb_y3,bb_x4
0,1,HANDWRITING SAMPLE FORM\n NAME\n DATE\n 8-3-89...,0.961774,True,31,15,629,16,629,742,551


In [200]:
result.paragraphs[0].spans[0].length

23

In [214]:
for para in result.paragraphs:
        print(para)



{'spans': [{'offset': 0, 'length': 23}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [185, 15, 453, 16, 453, 34, 185, 33]}], 'content': 'HANDWRITING SAMPLE FORM'}
{'spans': [{'offset': 24, 'length': 4}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [50, 60, 89, 60, 89, 70, 50, 70]}], 'content': 'NAME'}
{'spans': [{'offset': 29, 'length': 4}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [235, 61, 271, 61, 271, 74, 235, 73]}], 'content': 'DATE'}
{'spans': [{'offset': 34, 'length': 6}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [222, 81, 298, 81, 298, 98, 222, 98]}], 'content': '8-3-89'}
{'spans': [{'offset': 41, 'length': 4}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [342, 62, 374, 63, 374, 74, 342, 74]}], 'content': 'CITY'}
{'spans': [{'offset': 46, 'length': 9}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [474, 63, 549, 63, 549, 76, 474, 75]}], 'content': 'STATE ZIP'}
{'spans': [{'offset': 56, 'length': 20}], 'boundingRegions': [{'pageNumber': 1, 'pol

In [205]:
result.pages[0].words[0]

{'content': 'HANDWRITING', 'polygon': [187, 15, 313, 15, 313, 34, 187, 33], 'confidence': 0.993, 'span': {'offset': 0, 'length': 11}}

In [163]:
x1, y1, x2, y2, x3, y3, x4, y4 = result.pages[0].lines[0].polygon

In [164]:
y1

15