# This is code to read data from Form Recognizer
## It also includes steps to add additional metadata to the processing output

In [None]:
import io
import logging
import os
import tempfile

from azure.cosmos import CosmosClient, DatabaseProxy, ContainerProxy
from azure.identity import DefaultAzureCredential
from azure.storage.blob import BlobServiceClient

In [None]:
def get_blobservice_client(account_url):
    """Create a BlobServiceClient for a storage account.

    Authentication is delegated to ``DefaultAzureCredential``.

    Args:
        account_url: URL of the storage account.

    Returns:
        A ``BlobServiceClient`` bound to ``account_url``.
    """
    credential = DefaultAzureCredential()
    # The original assigned to a misspelled name and then returned an
    # undefined one, which raised NameError on every call.
    blob_service_client = BlobServiceClient(account_url=account_url, credential=credential)
    return blob_service_client

In [None]:
# Code to read file data stored in azure storage account

def get_blob_contents(fileName, account_url, container):
    """Download a blob and return its contents as a binary stream.

    The blob is read into memory rather than into a temporary file: the
    previous implementation returned a handle to a file inside a
    ``TemporaryDirectory`` that had already been removed, and that handle
    was never closed.

    Args:
        fileName: name of the blob (file) to download.
        account_url: URL of the storage account.
        container: name of the container holding the blob.

    Returns:
        An ``io.BytesIO`` positioned at the start of the blob data, or
        ``None`` if the download failed (the error is logged).
    """
    try:
        blob_service_client = get_blobservice_client(account_url)
        blob_client = blob_service_client.get_blob_client(container=container, blob=fileName)
        buffer = io.BytesIO()
        blob_client.download_blob().readinto(buffer)
        # Rewind so the consumer reads from the first byte.
        buffer.seek(0)
        return buffer
    except Exception as e:
        # Best-effort behaviour kept from the original: log and return None.
        logging.error(f'Exception occured in reading blob: {e}')
        return None

In [None]:
# Fill in the three `<>` placeholders below before running this cell; they are
# left blank on purpose and are not valid Python as written.
filename = <> # name of the blob (file) to download
container_name = <> # name of the container that holds the target blob
stg_account_url = <> # URL of the storage account that holds the data
# Download the blob; `filecontents` is passed to read_file_contents in a later cell.
filecontents = get_blob_contents(fileName = filename, account_url = stg_account_url, container=container_name)

In [None]:
from azure.ai.formrecognizer import FormRecognizerClient,DocumentTable
import json
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.identity import DefaultAzureCredential
import pandas as pd

### Below is code to connect to the Azure Form Recognizer service


In [None]:
DI_endpoint = <> # placeholder: set to the Form Recognizer endpoint before running; read by read_file_contents
def get_document_analysis_client(endpoint):
    """Build a DocumentAnalysisClient for the given endpoint.

    Authentication is delegated to ``DefaultAzureCredential``.

    Args:
        endpoint: Form Recognizer endpoint to connect to.

    Returns:
        A ``DocumentAnalysisClient`` bound to ``endpoint``.
    """
    return DocumentAnalysisClient(
        endpoint=endpoint,
        credential=DefaultAzureCredential(),
    )

Below is code to get the file contents analysed by Form Recognizer (Document Intelligence).

In [None]:
def read_file_contents(filecontents):
    """Analyse a document with the "prebuilt-document" model.

    The previous implementation discarded the analysis result and tried to
    build a ``DocumentTable`` from the client instead; the analysis result
    itself is what exposes ``to_dict()`` for the later cells.

    Args:
        filecontents: document to analyse, as bytes or a binary stream.

    Returns:
        The result of the analysis (the value of ``poller.result()``).

    Raises:
        ValueError: if ``filecontents`` is ``None`` (e.g. the download failed).
    """
    if filecontents is None:
        raise ValueError("filecontents is None; nothing to analyse")
    document_analysis_client = get_document_analysis_client(DI_endpoint)
    poller = document_analysis_client.begin_analyze_document("prebuilt-document", document=filecontents)
    # Blocks until the long-running analysis has finished.
    return poller.result()
# Run the analysis on the downloaded blob, then convert the result with
# to_dict(); the cells below index `contents` by 'paragraphs' and 'tables'.
all_contents = read_file_contents(filecontents)
contents = all_contents.to_dict()

### Now, from the Document Intelligence output, the data is segregated into different parts such as paragraphs, tables, key-value pairs, ...
### If a page contains paragraphs at the beginning, a table in the middle and more paragraphs at the end, the correct position of the table on the page is needed for reconstruction,
### so below is code to fetch those details and then organize them in the proper sequence based on spans and bounding-region details.

In [None]:
def get_paragraphs(contents, page_width=None):
    """Build one metadata entry per analysed paragraph.

    Fixes over the previous version: the input parameter is actually used
    (it read undefined globals), and every paragraph is returned (the
    ``return`` sat inside the loop, so only the first one came back).

    Args:
        contents: either the full analysis result dict (with "paragraphs"
            and "pages" keys) or just the list of paragraph dicts. Each
            paragraph needs "content", "spans" and "bounding_regions",
            where the first region has "page_number" and a "polygon" of
            ``{"x": ..., "y": ...}`` points.
        page_width: width used to split the page into left/right halves.
            Defaults to the width of the first page when the full result
            dict is given. If it cannot be determined, it is estimated as
            the largest x-coordinate found in the paragraph polygons
            (approximation only; pass the width or the full result dict
            for exact placement).

    Returns:
        A list of dicts with the keys "content", "role", "page no",
        "spans", "placement" ("left"/"right"), "y-cordniate" and
        "matching_ordinal".
    """
    if isinstance(contents, dict):
        paragraphs = contents.get("paragraphs") or []
        if page_width is None:
            pages = contents.get("pages") or []
            if pages:
                # Same choice as the original: first page's width for all pages.
                page_width = pages[0]["width"]
    else:
        paragraphs = list(contents or [])

    if page_width is None:
        # No page information available: fall back to the right-most
        # coordinate seen in any paragraph polygon.
        page_width = max(
            (
                point["x"]
                for paragraph in paragraphs
                for region in paragraph["bounding_regions"]
                for point in region["polygon"]
            ),
            default=0,
        )

    midpoint = page_width / 2
    para_data = []
    for paragraph in paragraphs:
        region = paragraph["bounding_regions"][0]
        first_point = region["polygon"][0]
        para_data.append(
            {
                "content": paragraph["content"],
                "role": paragraph.get("role", "NA"),
                "page no": region["page_number"],
                "spans": paragraph["spans"],
                "placement": "right" if first_point["x"] >= midpoint else "left",
                "y-cordniate": first_point["y"],
                "matching_ordinal": [],
            }
        )
    return para_data

In [None]:
# Build the per-paragraph metadata entries from the analysed paragraphs.
paragraphs = get_paragraphs(contents['paragraphs'])

In [None]:
def get_tables(contents):
    """Flatten analysed tables into one metadata entry per cell.

    The previous version iterated an undefined global ``tables`` instead of
    its parameter.

    Args:
        contents: iterable of table dicts; each table has a "cells" list
            whose items carry "content", "spans" and "bounding_regions"
            (the first region providing "page_number"). ``None`` is treated
            as no tables.

    Returns:
        A list of dicts with the keys "content", "role" (always
        "table cell"), "page no", "spans" and "table no" (zero-based index
        of the table within ``contents``).
    """
    table_data = []
    for table_no, table in enumerate(contents or []):
        for cell in table["cells"]:
            table_data.append(
                {
                    "content": cell["content"],
                    "role": "table cell",
                    "page no": cell["bounding_regions"][0]["page_number"],
                    "spans": cell["spans"],
                    "table no": table_no,
                }
            )
    return table_data


In [None]:
# Flatten the analysed tables into per-cell entries tagged with their table index.
table_data = get_tables(contents["tables"])

In [None]:
def filter_duplicates(para_data, table_data):
    """Replace paragraphs that duplicate table cells with one entry per table.

    A paragraph is treated as a table cell when its page number, content
    and spans all equal those of a cell entry. Such paragraphs get an empty
    "content", the role "table" and the cell's "table no"; the resulting
    rows are then de-duplicated on ("table no", "content"), which leaves a
    single placeholder row per table at the position of its first cell.

    NOTE(review): the same de-duplication also drops later non-table
    paragraphs whose text is identical to an earlier one (behaviour kept
    from the original) - confirm this is intended.

    Args:
        para_data: paragraph entries with at least "page no", "content",
            "spans" and "role". The entries are not modified.
        table_data: cell entries with "page no", "content", "spans" and
            "table no".

    Returns:
        A list of row dicts in the original paragraph order. When any
        paragraph matched a cell, every row has a "table no" key (a
        missing value for non-table rows).
    """
    # Shallow copies keep the caller's paragraph entries untouched.
    rows = [dict(para) for para in para_data]
    for cell in table_data:
        for para in rows:
            if (
                para["page no"] == cell["page no"]
                and para["content"] == cell["content"]
                and para["spans"] == cell["spans"]
            ):
                para["content"] = ""
                para["role"] = "table"
                para["table no"] = cell["table no"]

    data = pd.DataFrame(rows)
    if "table no" in data.columns:
        # Nullable integer dtype keeps table numbers integral despite the
        # missing values on non-table rows. The earlier fillna(np.nan) was
        # a no-op that only introduced an unimported dependency on numpy.
        data["table no"] = data["table no"].astype("Int64")
        data = data.drop_duplicates(subset=["table no", "content"], keep="first")
    else:
        print("The 'table no' column does not exist in the DataFrame.")
    return data.to_dict(orient="records")
# Mark paragraphs that repeat table cells and de-duplicate the combined list.
filtered_list = filter_duplicates(paragraphs,table_data)

In [None]:
def _strip_layout_metadata(table):
    """Return a copy of a table dict without "bounding_regions"/"spans" on the table or its cells."""
    layout_keys = ("bounding_regions", "spans")
    stripped = {key: value for key, value in table.items() if key not in layout_keys}
    stripped["cells"] = [
        {key: value for key, value in cell.items() if key not in layout_keys}
        for cell in table["cells"]
    ]
    return stripped


def drop_extra_metadata(final_list, tables=None):
    """Number the entries, normalise roles and attach table contents.

    Each entry gets an "ordinal" equal to its position. Entries with the
    role "table" have their "content" replaced by the corresponding table
    with its layout metadata removed; entries whose role is "NA" or
    ``None`` become "para".

    Fixes over the previous version: the list is returned (the caller
    rebinds its variable to the result and previously got ``None``), the
    table is looked up in the raw tables rather than in the flattened cell
    entries (which have no "cells" key), and the source tables are no
    longer modified, so the cell can be re-run.

    Args:
        final_list: entries produced by ``filter_duplicates``; modified in
            place.
        tables: raw table dicts indexed by "table no". Defaults to the
            module-level ``contents["tables"]``.

    Returns:
        ``final_list`` itself, after the in-place updates.
    """
    for ordinal, item in enumerate(final_list):
        item["ordinal"] = ordinal
        role = item["role"]
        if role == "table":
            if tables is None:
                # Resolved lazily so inputs without tables need no global state.
                tables = contents["tables"]
            item["content"] = _strip_layout_metadata(tables[item["table no"]])
        elif role is None or role == "NA":
            item["role"] = "para"
    return final_list
# Post-process the filtered entries: ordinal numbering, role normalisation and table contents.
filtered_list = drop_extra_metadata(filtered_list)

### The final filtered list contains all paragraphs and tables in the proper sequence with no repeated entries; additionally we get metadata such as role (table/para), page no., contents, table no., ...