In [1]:
# Generic imports
import os
from dotenv import load_dotenv
from tabulate import tabulate
import io
import copy
import re
# Azure imports
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient 
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.storage.blob import BlobServiceClient

from langchain import hub
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
from azure.search.documents import SearchClient, IndexDocumentsBatch
from azure.search.documents.indexes.models import(
    SimpleField,
    ComplexField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
)



In [None]:
"""
This code loads environment variables using the `dotenv` library and sets the necessary environment variables for Azure services.
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""

load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing setting is
            reported here by name instead of surfacing later as an opaque error.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(f"Missing required environment variable: {name}")
    return value


# The AzureOpenAIEmbeddings client created later is given no endpoint/key arguments,
# so these two must be present in the process environment.
os.environ["AZURE_OPENAI_ENDPOINT"] = _require_env("AZURE_OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = _require_env("AZURE_OPENAI_API_KEY")

# SECURITY: the Document Intelligence endpoint and key used to be hardcoded in this cell.
# They are now read from the environment; add AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
# AZURE_DOCUMENT_INTELLIGENCE_KEY to the .env file, and rotate the key that was committed.
doc_intelligence_endpoint = _require_env("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = _require_env("AZURE_DOCUMENT_INTELLIGENCE_KEY")

blob_connection_string = _require_env("AZURE_BLOB_STORAGE_CONNECTION_STRING")
# Not used by the visible cells, so it stays optional (None when unset).
storage_account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")
search_endpoint: str = _require_env("AZURE_SEARCH_ENDPOINT")
search_admin_key: str = _require_env("AZURE_SEARCH_ADMIN_KEY")



In [None]:
# Index name
index_name: str = "index-publications-02"  # Change to the index name that you want to store to

# Connect to document intelligence
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=doc_intelligence_endpoint,
    credential=AzureKeyCredential(doc_intelligence_key),
)

# Connect to blob storage.
# NOTE: despite its name, `blob_client` is the container client for the "publications"
# container (it is the object returned by get_container_client); later cells list and
# download blobs through it. The former `list_blobs = blob_client` alias was removed:
# it was never read before the ingestion cell rebinds `list_blobs` to the real listing.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
blob_client = blob_service_client.get_container_client("publications")

# Connect to search service
search_client = SearchClient(
    endpoint=search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(search_admin_key),
)


In [None]:
# Connect Azure OpenAI embeddings and the Azure AI Search vector store
aoai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-ada-002-32k",
    openai_api_version="2023-12-01-preview",
)

# `index_name` is intentionally NOT redefined here: it comes from the client-setup cell,
# so `search_client` (used for the table merge) and `vector_store` (used for the page
# upload) always point at the same index. Change it in that one place only.

# Index schema -- change these fields to the ones you want to use.
fields = [
    SimpleField(name="id", type='Edm.String', key=True, filterable=True),
    SimpleField(name="pageNumber", type="Edm.Int32", filterable=False, facetable=False, searchable=False, sortable=False),
    SimpleField(name="parent", type='Edm.String', filterable=True),
    SearchableField(name="content", type='Edm.String', searchable=True, filterable=True, facetable=True, sortable=True),
    SearchableField(name="tables", type="Edm.String", collection=True, searchable=True, filterable=False, facetable=False, sortable=False),
    SearchableField(name="metadata", type='Edm.String', searchable=True, filterable=True, facetable=False, sortable=True),
    SimpleField(name='url', type='Edm.String', filterable=False, facetable=False, searchable=False, sortable=False),
    SearchField(
        name="content_vector",
        type=SearchFieldDataType.Collection("Edm.Single"),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="myHnswProfile",
    ),
]

vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=search_endpoint,
    azure_search_key=search_admin_key,
    index_name=index_name,
    embedding_function=aoai_embeddings.embed_query,
    fields=fields,
)

In [None]:
# Ingest every blob of the container: analyze it, upload one search document per page,
# then merge the extracted tables into those page documents.
#
# NOTE(review): this cell calls doc_intelligence, get_table, format_dict,
# pages_tables_dict and prep_to_send, which are defined in the helper-function cell
# further down the notebook. On a fresh kernel that cell must be run first; otherwise
# every blob fails here with a NameError (now shown in the error line below).
processed_names = set()  # names already handled in this run
list_blobs = blob_client.list_blobs()
for blob in list_blobs:
    blob_name = blob.name
    if blob_name in processed_names:
        continue
    processed_names.add(blob_name)

    try:
        # Get blob data and info. Kept inside the try so that one failed download
        # skips this blob instead of aborting the whole run.
        blob_url = blob_client.get_blob_client(blob_name).url
        blob_data = bytes(blob_client.download_blob(blob_name).readall())

        # Pass blob data through document intelligence
        doc_data = doc_intelligence(document=blob_data)
        # Extract tables from the analyzed document
        tables = get_table(result=doc_data)
        # Base id / parent / url shared by every page of this blob
        first_dict = format_dict(name=blob_name, url=blob_url)
        # One dict per page, with that page's tables attached
        doc_pages = pages_tables_dict(first_dict=first_dict, result=doc_data, tables=tables)
        # Documents for the vector store and the per-page table payloads
        docs_to_send, tables_to_merge = prep_to_send(doc_pages=doc_pages)

        # Upload the page documents through the vector store
        vector_store.add_documents(docs_to_send)

        # Merge the tables into the page documents with matching ids
        if tables_to_merge:
            batch = IndexDocumentsBatch()
            batch.add_merge_actions(*tables_to_merge)
            search_client.index_documents(batch)

        print(f"Document {blob_name} uploaded.")

    except Exception as exc:
        # Best effort: report the failure together with its cause and move on to the
        # next blob. `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(f"Error occured, file name: {blob_name} ({exc!r})")

In [None]:
#document intelligence functions
def doc_intelligence(document) -> AnalyzeResult:
    """Run the "prebuilt-layout" model on ``document`` and return the analysis result.

    Args:
        document: payload handed to the service as ``analyze_request``; it is
            submitted with content type "application/pdf".

    Returns:
        The ``AnalyzeResult`` obtained from the poller, with the output content
        format requested as markdown.
    """
    analysis_job = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        analyze_request=document,
        content_type="application/pdf",
        output_content_format="markdown",
    )
    # result() waits for the analysis started above and hands back its outcome.
    return analysis_job.result()

# Get tables from document processed with document intelligence 
def get_table(result: AnalyzeResult):
    """Render every table of an analysis result as plain text.

    Cells whose ``row_index`` is 0 become the column headers; all other cells are
    appended to their row in the order the service returned them. Each table is
    rendered with ``tabulate`` using the "simple" format.

    Args:
        result: analysis result whose ``tables`` attribute is read.

    Returns:
        ``[]`` when the result has no tables; otherwise a one-element list whose
        only item is the list of rendered tables, in the same order as
        ``result.tables``. The extra nesting is kept on purpose because
        ``pages_tables_dict`` reads the tables as ``tables[0][i]``.
    """
    if not result.tables:
        return []

    formatted_tables = []
    for table in result.tables:
        headers = []
        body_rows = []

        for cell in table.cells:
            if cell.row_index == 0:
                headers.append(cell.content)
                continue

            # Pad up to this cell's row. A single append is not enough when a row
            # index is skipped (e.g. a row fully covered by row-spanned cells):
            # indexing would then raise IndexError.
            while len(body_rows) < cell.row_index:
                body_rows.append([])
            body_rows[cell.row_index - 1].append(cell.content)

        formatted_tables.append(tabulate(body_rows, headers=headers, tablefmt="simple"))

    return [formatted_tables]

# Format initial dict
def format_dict(name: str, url: str):
    """Build the base record (id, parent, url) shared by all pages of one blob.

    The id is the blob name without its file extension, with every character that
    Azure AI Search does not accept in a document key (anything other than
    letters, digits, ``_``, ``-`` and ``=``) replaced by ``_``. Only the final
    extension is stripped, so "report.v1.pdf" and "report.v2.pdf" get distinct ids
    instead of both collapsing to "report".

    Args:
        name: blob name, e.g. "folder/paper.v2.pdf".
        url: URL of the blob.

    Returns:
        dict with keys "id" (sanitized stem), "parent" (the unchanged ``name``)
        and "url".
    """
    stem = os.path.splitext(name)[0]
    safe_id = re.sub(r"[^A-Za-z0-9_\-=]", "_", stem)
    return {
        "id": safe_id,
        "parent": name,
        "url": url,
    }

# pages and document dict
def pages_tables_dict(first_dict: dict, result: AnalyzeResult, tables: list):
    """Build one record per page, each carrying the tables located on that page.

    Args:
        first_dict: base record providing "id", "parent" and "url".
        result: analysis result; ``result.tables`` (with their
            ``bounding_regions``) and ``result.pages`` (with their ``lines``) are read.
        tables: rendered tables in the nested shape returned by ``get_table``;
            the text of ``result.tables[i]`` is taken from ``tables[0][i]``.

    Returns:
        A list with one dict per page, in page order, with keys "id"
        ("<base id>_<page number>"), "parent", "pageNumber", "url", "tables"
        (rendered tables that have a bounding region on the page) and "content"
        (the page's line texts joined with single spaces).
    """
    # Group the rendered tables by the page(s) they appear on. A table with regions
    # on several pages is attached to each of them. `tables`/`bounding_regions` may
    # be None, which is treated as "nothing to attach".
    tables_by_page = {}
    for table_index, table in enumerate(result.tables or []):
        for region in table.bounding_regions or []:
            tables_by_page.setdefault(region.page_number, []).append(tables[0][table_index])

    doc_pages = []
    for page in result.pages:
        # `lines` can be None for a page without recognized text lines.
        content = ' '.join(line.content for line in (page.lines or []))
        doc_pages.append({
            'id': f'{first_dict["id"]}_{page.page_number}',
            'parent': first_dict["parent"],
            'pageNumber': page.page_number,
            'url': first_dict["url"],
            'tables': list(tables_by_page.get(page.page_number, [])),
            'content': content,
        })

    return doc_pages

# create documents to send for AI search and tables to merge. 
def prep_to_send(doc_pages: list):
    """Split page records into upload documents and table-merge payloads.

    Args:
        doc_pages: page dicts providing the keys "id", "parent", "pageNumber",
            "url", "content" and "tables".

    Returns:
        A ``(docs_to_send, tables_to_merge)`` tuple. ``docs_to_send`` holds one
        ``Document`` per page whose text is the page content and whose metadata
        carries id, parent, pageNumber and url. ``tables_to_merge`` holds one
        ``{"id", "tables"}`` dict per page, in the same order.
    """
    docs_to_send = [
        Document(
            page_content=entry["content"],
            metadata={
                "id": entry['id'],
                "parent": entry["parent"],
                "pageNumber": entry['pageNumber'],
                "url": entry['url'],
            },
        )
        for entry in doc_pages
    ]

    tables_to_merge = [
        {"id": entry['id'], 'tables': entry['tables']}
        for entry in doc_pages
    ]

    return docs_to_send, tables_to_merge