# Import Libraries 🧑‍💻

LangChain is used here to turn the unstructured document into structured data.

In [None]:
from dotenv import load_dotenv
load_dotenv() 
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
import os
from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd

# Load Microsoft Build Document 📄

Load the Microsoft Build Book of News PDF with Azure AI Document Intelligence.

In [None]:
# Parse the Book of News PDF with Azure AI Document Intelligence.
# The service endpoint and key are read from environment variables.
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=os.getenv('DOCUMENT_INTELLIGENCE_ENDPOINT'),
    api_key=os.getenv('DOCUMENT_INTELLIGENCE_KEY'),
    file_path='C:\\Users\\conne\\development\\repos\\chunking_for_rag\\Book_Of_News.pdf',
)
# The result is iterated later in the notebook to read each item's page_content.
doc = loader.load()

## Print Microsoft Build Document

In [None]:
# Dump the raw result of loader.load() to inspect what was extracted.
print(doc)

## Bring in GPT4o 🤖

Bring in GPT-4o, whose 128K-token context window holds roughly 96,000 words.

In [None]:
# Azure OpenAI chat model used for the extraction step; the endpoint and key
# are read from environment variables.
llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",
    azure_deployment="gpt4o",
    temperature=0,  # zero temperature to keep output as repeatable as possible
)

## Make an LLM Call to Make Unstructured Data Structured 📞

Let's take the entire Microsoft Build document we loaded above and have an LLM turn it into structured data. This doubles as a needle-in-a-haystack test: the model has to pick the relevant announcements out of the full document.

For reference, there are about 21,600 tokens in the Microsoft Build document.

In [None]:
# Prompt template: asks the model to summarize the Azure AI and Data Services
# announcements found in {docs_string} and to answer with a two-column
# markdown table only.
prompt = ChatPromptTemplate.from_template("""
You are an assistant that will summarize all of the Azure AI and Data Services announcements in the below context. Make sure to put the data into the following table. Make sure to only respond with the table and nothing else.

| Service      | Announcement |
|--------------|--------------|
|              |              |             

Context:
{docs_string}
""")

# Concatenate the text of every loaded page into a single context string.
docs_string = "".join(page.page_content for page in doc)

# Pipeline: fill the prompt -> call the model -> reduce the reply to a string.
chain = prompt | llm | StrOutputParser()

# Pass the concatenated page text. Passing the raw `doc` list here would
# format the list's repr into the prompt instead of the clean page text and
# would leave docs_string unused.
table = chain.invoke({"docs_string": docs_string})

## Print LLM Generated Table 🤖

In [None]:
# Show the table text the chain returned.
print(table)

## Minor Data Cleaning 🧼

In [None]:
# Show full cell contents and allow wide output when DataFrames are printed.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)


def _split_table_row(line):
    """Split one markdown table line (starting with '|') into stripped cells."""
    inner = line[1:]  # drop the leading pipe
    if inner.endswith('|'):
        inner = inner[:-1]  # drop the trailing pipe when present
    return [cell.strip() for cell in inner.split('|')]


def _is_separator_row(cells):
    """Return True for the header/body divider row, e.g. ``|---|:--:|``."""
    return all(cell and set(cell) <= set('-:') for cell in cells)


table_columns = ["Service", "Announcement"]

# Only pipe-delimited lines are table rows; this skips code fences, blank
# lines and any stray commentary the model may have added around the table.
rows = [
    _split_table_row(line.strip())
    for line in table.splitlines()
    if line.strip().startswith('|')
]

data_list = []
for cells in rows:
    if len(cells) < len(table_columns):
        continue  # malformed row: not enough cells to fill both columns
    if cells == table_columns or _is_separator_row(cells):
        continue  # header and divider rows carry no data
    if not any(cells):
        continue  # empty template row echoed back from the prompt
    if len(cells) > len(table_columns):
        # A '|' inside the announcement text split it into extra cells;
        # fold them back into the last column instead of failing.
        cells = [cells[0], ' | '.join(cells[1:])]
    data_list.append(cells)

df = pd.DataFrame(data_list, columns=table_columns)

## Query using Pandas 🐼

In [None]:
# Rows whose Service column is exactly 'Azure AI Services'.
azure_ai_df = df.loc[df['Service'].eq('Azure AI Services')]
print(azure_ai_df)

# The same variable is then reused for the 'Developer Tools & DevOps' rows.
azure_ai_df = df.loc[df['Service'].eq('Developer Tools & DevOps')]
print(azure_ai_df)