## Import Libraries 🧑‍💻

In [None]:
import os
from dotenv import load_dotenv
from langchain_text_splitters import CharacterTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import MarkdownHeaderTextSplitter
# Load settings from a .env file (if one is found) into the process environment
# so the os.getenv / os.environ lookups in the cells below can read them.
load_dotenv()

## Bring in Azure OpenAI Embeddings 🔢

In [None]:
# Embedding client for the Azure OpenAI deployment named "embeddings".
# The endpoint and API key are looked up in the environment; each lookup
# yields None when the variable is not set.
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    azure_deployment="embeddings",
)

## Bring in Azure Search 🔎

In [None]:
# Azure AI Search vector store bound to the "product-info-test" index.
# embeddings.embed_query is supplied as the store's embedding function, and
# the service endpoint and key are looked up in the environment.
index_name: str = "product-info-test"
vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_key=os.environ.get("AZURE_SEARCH_KEY"),
    azure_search_endpoint=os.environ.get("AZURE_SEARCH_ENDPOINT"),
)

## Convert PDF to Markdown 🔁

In [None]:
# Location of the PDF to convert. Falls back to the original author's local
# Windows path; set BOOK_OF_NEWS_PDF_PATH (for example in .env) to point at
# the file when running on another machine.
pdf_path = os.getenv("BOOK_OF_NEWS_PDF_PATH") or (
    "C:\\Users\\conne\\development\\repos\\"
    "converting_unstructured_data_to_structured_data_using_gpt4o\\"
    "Book_Of_News.pdf"
)

# Send the PDF to Azure AI Document Intelligence using the "prebuilt-layout"
# model. The key and endpoint are read from the environment with os.getenv,
# matching how the other cells read their settings.
loader = AzureAIDocumentIntelligenceLoader(
    file_path=pdf_path,
    api_key=os.getenv("DOCUMENT_INTELLIGENCE_KEY"),
    api_endpoint=os.getenv("DOCUMENT_INTELLIGENCE_ENDPOINT"),
    api_model="prebuilt-layout",
)
book_of_build = loader.load()

# Build a splitter that chunks markdown based on its headers (levels 1-3).
# It is only constructed here; the actual splitting happens in a later cell.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,  # NOTE(review): expected to keep header text in each chunk - confirm
)

## Chunking Strategy #1 Character Split

In [None]:
# Azure AI Search connection settings, read from the environment.
vector_store_address: str = os.environ.get("AZURE_SEARCH_ENDPOINT")
vector_store_password: str = os.environ.get("AZURE_SEARCH_KEY")

# Chunk the loaded document(s) with a plain character splitter
# (chunk_size=200, chunk_overlap=30).
text_splitter = CharacterTextSplitter(chunk_overlap=30, chunk_size=200)
docs = text_splitter.split_documents(book_of_build)

# Store the chunks in their own "charsplit" index so this strategy can be
# queried separately from the others.
index_name: str = "charsplit"
char_split_vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_key=vector_store_password,
    azure_search_endpoint=vector_store_address,
)
char_split_vector_store.add_documents(documents=docs)

## Chunking Strategy #2 Split on Headers and Chunk

In [None]:
# Azure AI Search connection settings, read from the environment.
vector_store_address: str = os.environ.get("AZURE_SEARCH_ENDPOINT")
vector_store_password: str = os.environ.get("AZURE_SEARCH_KEY")

# Stage 1: split the first loaded document's markdown on its headers, using
# the markdown_splitter built in the PDF-conversion cell.
docs_string = book_of_build[0].page_content
splits = markdown_splitter.split_text(docs_string)

# Stage 2: run the header sections through a recursive character splitter
# (chunk_size=600, chunk_overlap=100).
chunk_size, chunk_overlap = 600, 100
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)
splits = text_splitter.split_documents(splits)

# Store the resulting chunks in the "headerandcharsplit" index.
index_name: str = "headerandcharsplit"
header_and_char_split_vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_key=vector_store_password,
    azure_search_endpoint=vector_store_address,
)
header_and_char_split_vector_store.add_documents(documents=splits)

## Chunking Strategy #3 Split on Headers

In [None]:
# Azure AI Search connection settings, read from the environment.
vector_store_address: str = os.environ.get("AZURE_SEARCH_ENDPOINT")
vector_store_password: str = os.environ.get("AZURE_SEARCH_KEY")

# Split the first loaded document's markdown on level 1-3 headers only; no
# character-based chunking is applied afterwards in this strategy.
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3")]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
docs_string = book_of_build[0].page_content
splits = text_splitter.split_text(docs_string)

# Store the header sections in the "headersplit" index.
index_name: str = "headersplit"
header_split_vector_store: AzureSearch = AzureSearch(
    embedding_function=embeddings.embed_query,
    index_name=index_name,
    azure_search_key=vector_store_password,
    azure_search_endpoint=vector_store_address,
)
header_split_vector_store.add_documents(documents=splits)

## Test Chunking Strategy #1 Character Splitting

In [None]:
# Query the character-split index (search_type="similarity", up to k=3 hits)
# and print the content of the first returned chunk.
docs = char_split_vector_store.similarity_search(
    "Azure AI Services announcements", search_type="similarity", k=3
)
print(docs[0].page_content)

## Test Chunking Strategy #2 Header and Character Splitting

In [None]:
# Query the header-and-character-split index (search_type="similarity", up to
# k=3 hits) and print the content of the first returned chunk.
docs = header_and_char_split_vector_store.similarity_search(
    "Azure AI Services announcements", search_type="similarity", k=3
)
print(docs[0].page_content)

## Test Chunking Strategy #3 Header Splitting

In [None]:
# Query the header-split index (search_type="similarity", up to k=3 hits) and
# print the content of the first returned chunk.
docs = header_split_vector_store.similarity_search(
    "Azure AI Services announcements", search_type="similarity", k=3
)
print(docs[0].page_content)

## 