# Import Libraries

In [None]:
from dotenv import load_dotenv
load_dotenv() 
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
import os
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain import hub
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
import pandas as pd
from io import StringIO

# Load Document

In [None]:
# Run the PDF through Azure AI Document Intelligence. The endpoint and key are
# read from environment variables (None is passed along if one is unset).
loader = AzureAIDocumentIntelligenceLoader(
    api_endpoint=os.environ.get('DOCUMENT_INTELLIGENCE_ENDPOINT'),
    api_key=os.environ.get('DOCUMENT_INTELLIGENCE_KEY'),
    file_path='C:\\Users\\conne\\development\\repos\\converting_unstructured_data_to_structured_data_using_gpt4o\\Book_Of_News.pdf',
)
docs = loader.load()

# Only the first returned document is used from here on.
# NOTE(review): presumably the loader yields the whole PDF as one Markdown
# document, since the text is split on heading markers below - confirm.
docs_string = docs[0].page_content

# Heading markers to split on, each paired with the label handed to the splitter.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
splits = text_splitter.split_text(docs_string)

print(f"Length of splits: {len(splits)}")

## Print Document Splits

In [None]:
# Dump the chunks returned by text_splitter.split_text() so they can be inspected.
print(splits)

## Bring in GPT4o

In [None]:
# Chat model client bound to the Azure OpenAI deployment named "gpt4o".
# The API key and endpoint are read from environment variables; None is passed
# through if a variable is unset.
llm = AzureChatOpenAI(
    azure_deployment="gpt4o",
    # NOTE(review): temperature=0 is presumably chosen to keep the generated
    # table as repeatable as possible - confirm.
    temperature=0,
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_version="2024-02-01"
)

## Make an LLM Call to make unstructured data structured

In [None]:
# Prompt asking the model to condense the Generative AI announcements found in
# the supplied context into a two-column Markdown table, and to reply with the
# table only.
prompt = ChatPromptTemplate.from_template("""
You are an assistant that will summarize all of the Generative AI announcements in the below context. Make sure to put the data into the following table. Make sure to only respond with the table and nothing else.

| Service | Announcement | 
|--------------|--------------|
|              |              |

Context:
{docs_string}
""")

# Pipeline: fill the prompt -> call the chat model -> reduce the reply to a plain string.
chain = prompt | llm | StrOutputParser()

# Send the document text itself as the context. The original passed `splits`,
# a list of Document objects, which is interpolated as the list's string form:
# object wrappers, metadata dicts and escaped newlines end up in the prompt.
# `docs_string` is the same content as clean text, headings included.
table = chain.invoke({"docs_string": docs_string})

In [None]:
# Show the model's raw reply; the prompt asks for a Markdown table and nothing else.
print(table)

## Query using Pandas 🐼

In [None]:
import pandas as pd


def _markdown_table_rows(markdown, n_columns):
    """Return the data rows of a pipe-delimited Markdown table.

    Rows are recognised by content rather than by position, so a reply that
    wraps the table in a code fence or adds a line before it still parses.
    Ignored lines: anything not starting with ``|``, the first table row
    (the header), the ``|---|---|`` separator row, and rows whose cells are
    all empty.

    Args:
        markdown: Text containing the table.
        n_columns: Number of cells every returned row must have (>= 1).

    Returns:
        A list of rows, each a list of ``n_columns`` stripped cell strings.
        Short rows are padded with empty strings; surplus cells (e.g. from an
        unescaped ``|`` inside a cell) are folded into the last column so no
        text is lost and the DataFrame constructor does not fail on ragged
        input.
    """
    rows = []
    header_skipped = False
    for raw_line in markdown.splitlines():
        line = raw_line.strip()
        if not line.startswith('|'):
            # Code fences, blank lines and stray commentary are not table rows.
            continue

        # Drop the empty piece before the leading pipe, and the one after the
        # trailing pipe only when that pipe is actually present.
        parts = line.split('|')[1:]
        if line.endswith('|'):
            parts = parts[:-1]
        cells = [part.strip() for part in parts]

        if not any(cells):
            # Blank placeholder row, such as the one shown in the prompt.
            continue
        if all(cell and set(cell) <= set('-:') for cell in cells):
            # Header/body separator row, optionally with alignment colons.
            continue
        if not header_skipped:
            # The first real table row is the column header.
            header_skipped = True
            continue

        if len(cells) > n_columns:
            cells = cells[:n_columns - 1] + [' | '.join(cells[n_columns - 1:])]
        else:
            cells += [''] * (n_columns - len(cells))
        rows.append(cells)
    return rows


# Extract the table body from the model's reply
data_list = _markdown_table_rows(table, 2)

# Create DataFrame
df = pd.DataFrame(data_list, columns=["Service", "Announcement"])

# Query where Service equals 'Azure Data'
azure_data_df = df[df['Service'] == 'Azure Data']

# Show the resulting DataFrame
print(azure_data_df)

# Query where Service equals 'Azure AI Services'
azure_ai_df = df[df['Service'] == 'Azure AI Services']

print(azure_ai_df)