# Imports

In [1]:
# Generic imports
import os
from dotenv import load_dotenv
from tabulate import tabulate
import io
import copy
import re
# Azure imports
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient 
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.storage.blob import BlobServiceClient

from langchain import hub
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_openai import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores.azuresearch import AzureSearch
from azure.search.documents import SearchClient, IndexDocumentsBatch
from azure.search.documents.indexes.models import(
    SimpleField,
    ComplexField,
    SearchableField,
    SearchField,
    SearchFieldDataType,
)



# .env Values

In [2]:
def reload_env():
    from dotenv import load_dotenv, dotenv_values
    import os

    # Clear the current environment variables
    for key in dotenv_values().keys():
        os.environ.pop(key, None)

    # Reload the .env file
    load_dotenv()

In [5]:
"""
This code loads environment variables using the `dotenv` library and sets the necessary environment variables for Azure services.
The environment variables are loaded from the `.env` file in the same directory as this notebook.
"""

#load_dotenv()
reload_env()

os.environ["AZURE_OPENAI_ENDPOINT"] = os.getenv("OPENAI_ENDPOINT")
os.environ["AZURE_OPENAI_API_KEY"] = os.getenv("OPENAI_KEY")
doc_intelligence_endpoint = os.getenv("DOC_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("DOC_INTELLIGENCE_KEY")
blob_connection_string = os.getenv("BLOB_STORAGE_CONNECTION_STRING")
storage_account_key = os.getenv("STORAGE_ACCOUNT_KEY")
search_endpoint: str = os.getenv("AI_SEARCH_ENDPOINT")
search_admin_key: str = os.getenv("AI_SEARCH_KEY")



# Initializers

In [6]:
# Index name
index_name: str = "docu-layout-01" #Change to index name that you want to store to

# Connect to document intelligence
document_intelligence_client = DocumentIntelligenceClient(endpoint= doc_intelligence_endpoint, credential=AzureKeyCredential(doc_intelligence_key))

# Connect to blob storage 
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)

blob_client = blob_service_client.get_container_client("docu-layout-01")
if not blob_client.exists():
    blob_client.create_container()
    print("Created")
list_blobs = blob_client

# Connect to search service 
search_client = SearchClient(endpoint = search_endpoint, index_name = index_name, credential = AzureKeyCredential(search_admin_key))


# Upload to Blob Storage

In [26]:
path = "D:\\OneDrive\\Semester 7 COEN\\COEN 424\\Project\\Code\\COEN-424-DocuLayout\\Coen424 Fall 2024-Assignment One.pdf"
# Get files 
files = ["D:\\OneDrive\\Semester 7 COEN\\COEN 424\\Project\\Code\\COEN-424-DocuLayout\\Coen424 Fall 2024-Assignment One.pdf"]

# Send files
for index, file_to_send in enumerate(files):
    with open(file_to_send, mode = "rb") as data:
        sent = blob_client.upload_blob(name=os.path.basename(file_to_send), data=data, overwrite=True) #should also add metadata
    print(f"Sent: {os.path.basename(file_to_send)} as {sent}")

Sent: Coen424 Fall 2024-Assignment One.pdf as <azure.storage.blob._blob_client.BlobClient object at 0x00000200FECC3E30>


# Index Creation

In [43]:
# connect openAI and search service 
aoai_embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3-small",
    openai_api_version="2024-05-01-preview",  # e.g., "2023-12-01-preview"
)


index_name = "docu-layout-index-01" #Change to index name that you want to store to


fields = [ #change fields to fields that you want to use
        SimpleField(name= "id", type= 'Edm.String', key= True, filterable = True),
        SimpleField(name = "pageNumber", type = "Edm.Int32", filterable = False, facetable = False, searchable = False, sortable = False),
        SimpleField(name= "parent", type= 'Edm.String', filterable = True),
        SearchableField(name= "content", type= 'Edm.String', searchable= True, filterable= True, facetable= True, sortable= True),
        SearchableField(name= "metadata", type= 'Edm.String', searchable= True, filterable= True, facetable= False, sortable= True),
        SimpleField(name = 'url', type = 'Edm.String', filterable = False, facetable = False, searchable = False, sortable = False), 
        SearchField(name= "content_vector", type= SearchFieldDataType.Collection("Edm.Single"), searchable= True, vector_search_dimensions= 1536, vector_search_profile_name= "myHnswProfile"  ),
    ]


vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint= search_endpoint,
    azure_search_key= search_admin_key,
    index_name=index_name,
    embedding_function=aoai_embeddings.embed_query,
    fields= fields, 
)

# Functions

In [52]:
from PyPDF2 import PdfReader
def doc_intelligence(document) -> AnalyzeResult:
    pdf_reader = PdfReader(io.BytesIO(document))
    num_pages = pdf_reader._get_num_pages()
    all_results = []
    print(num_pages)

    for start_page in range(1, num_pages, 2):
        end_page = min(start_page + 2, num_pages)
    
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout", 
            analyze_request=document, 
            content_type="application/pdf", 
            output_content_format="markdown",
            pages = f"{start_page} - {end_page}" #if end_page != num_pages else f"{start_page}"
        )
        result: AnalyzeResult = poller.result()
        all_results.append(result)
    
    return all_results
    
# Format intial dict
def format_dict(name: str, url: str,):
    
    new_name = name.split(".")[0]
    new_name = new_name.replace(" ", "_")
    result_dict ={
        
        "id": new_name,
        "parent": name,
        "url": url,
    }
    return result_dict

# pages and document dict
def pages_dict(first_dict: dict, result: AnalyzeResult, index: int):
    
    page_index = index + 1
    page_index = page_index*2-1 # we are always passing two pages so to get first is 
    doc_pages = []         # index * 2 - 1. Then increment by one for second page. 
    
    for page in result.pages:
        
        lines=[]
        for line in page.lines:
            lines.append(line.content)

        page_dict = {
            'id': f'{first_dict["id"]}_{page_index}',
            'parent': first_dict["parent"],
            'pageNumber': page.page_number,
            'url': first_dict["url"],
            'content': ' '.join(lines)
        }
        doc_pages.append(page_dict)
        
        page_index += 1

        
    return doc_pages

# create documents to send for AI search and tables to merge. 
def prep_to_send(doc_pages: list):
    
    docs_to_send = []
    
    for page in doc_pages:
        metadata = {
            "id": page['id'],
            "parent": page["parent"],
            "pageNumber": page['pageNumber'],
            "url": page['url'],
        }
        doc = Document(page_content=page["content"])
        doc.metadata = metadata
        docs_to_send.append(doc)

    return docs_to_send

# Process From Blob Storage 

In [53]:
processed_names = ["Coen424 Fall 2024-Assignment One"]
list_blobs = blob_client.list_blobs()
for blob in list_blobs:
   
    # get blob data and info 
    
    blob_name = blob.name
    if blob_name in processed_names: 
        continue
    processed_names.append(blob_name)
    blob_url = (blob_client.get_blob_client(blob_name)).url
    blob_data = bytes(blob_client.download_blob(blob_name).readall())
    
    try:
    # pass blob data through document intelligence
        doc_data = doc_intelligence(document = blob_data)    
        # first dict format
        first_dict = format_dict(name = blob_name, url = blob_url)
        #doc_data = doc_data
        for d, data in enumerate(doc_data):
        # pages dict 
            doc_pages = pages_dict(first_dict = first_dict, result = data, index = d)
            print(doc_pages)
        # docs to send and tables to merge 
            docs_to_send = prep_to_send(doc_pages = doc_pages)
        # vector store document
            #documents_to_vector.append(docs_to_send)

            vector_store.add_documents(docs_to_send)
        # merge tables
        # batch = IndexDocumentsBatch()
        # if tables_to_merge:
        #     batch.add_merge_actions(*tables_to_merge)
        #     search_client.index_documents(batch)
    
            print(f"Document {blob_name} uploaded.")
            #print(doc_pages)
    
    except Exception as e: 
        print(f"An error occurred: {e}")
        print(f"Error occured, file name: {blob_name}")
        continue

4
[{'id': 'Coen424_Fall_2024-Assignment_One_1', 'parent': 'Coen424 Fall 2024-Assignment One.pdf', 'pageNumber': 1, 'url': 'https://coen424storage.blob.core.windows.net/docu-layout-01/Coen424%20Fall%202024-Assignment%20One.pdf', 'content': 'COEN 424/6313 Assignment1 Fall 2024 Individual or Group of 2 or 3 Assignment due by October 26 23:59 @copyright Yan Liu 2024-2025 This assignment is originally developed by Yan Liu @ Concordia University. This assignment is only for the course teaching and education purpose. Any distribution of this document to the Internet that involves any profit-making purpose is not given the consent from the author. This assignment is designed to practise data model design, queries and communication through binary serialization and deserialization (gRPC). The dataset is in the JSON form for Novel Prizes since year 1901. api.nobelprize.org/v1/prize.json To illustrate an data sample in Redis, one prize data sample from the prize.json is added to the Redis database