In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

# Pull settings from a local .env file into the process environment so the
# os.environ lookups in the following cells can find them. The call's return
# value is what the cell displays.
load_dotenv()

True

In [2]:
# Azure OpenAI connection settings, read from the environment.
# Each value is None when its variable is unset (despite the `str` annotation).
azure_endpoint: str = os.getenv('AZURE_ENDPOINT')
azure_openai_api_key: str = os.getenv('AZURE_OPENAI_API_KEY')
azure_openai_api_version: str = os.getenv('AZURE_OPENAI_API_VERSION')
azure_deployment: str = os.getenv('AZURE_DEPLOYMENT')

In [3]:
# Azure AI Search (vector store) endpoint and key, read from the environment.
# Each value is None when its variable is unset (despite the `str` annotation).
vector_store_address: str = os.getenv('VECTOR_STORE_ADDRESS')
vector_store_password: str = os.getenv('VECTOR_STORE_PASSWORD')

In [4]:
# Azure OpenAI client; later cells use it for chat completions.
client = AzureOpenAI(
    azure_endpoint=azure_endpoint,
    api_version=azure_openai_api_version,
    api_key=azure_openai_api_key,
)

In [5]:
# Read the source PDF into a list of documents.
loader = PyPDFLoader("../input/data_engineering_notes.pdf")
pages = loader.load()

# Spot check: show the metadata attached to the second loaded entry.
print(pages[1].metadata)

{'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../input/data_engineering_notes.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}


In [6]:
# Break the loaded pages into smaller chunks (chunk_size=500, with an overlap
# of 75 between neighbouring chunks) ready for embedding.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=75, chunk_size=500)
chunks = splitter.split_documents(pages)

# Report how many chunks were produced.
print(len(chunks))

59


In [7]:
# Embedding model served by the Azure OpenAI deployment; the vector store
# uses it to turn text into vectors.
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    api_key=azure_openai_api_key,
    azure_endpoint=azure_endpoint,
    openai_api_version=azure_openai_api_version,
    azure_deployment=azure_deployment,
)

In [8]:
# Create the handle to the Azure AI Search vector index.
index_name: str = "pdf-vector-idx"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    # Hand over the Embeddings object itself instead of the bound
    # `embeddings.embed_query` callable. With a bare callable AzureSearch embeds
    # documents one text at a time; with an Embeddings instance it can batch
    # them through `embed_documents`, while queries still go through
    # `embed_query`.
    embedding_function=embeddings,
)

In [10]:
#add documents
# Embed the chunks and upload them to the index. The list returned by the call
# (one key per stored chunk) is displayed as the cell output.
# NOTE(review): re-running this cell presumably uploads the same chunks again
# under fresh keys, duplicating them in the index — confirm before re-running.
vector_store.add_documents(documents=chunks)

['MmM0YjAxM2MtZTQyNS00NjA1LWE4NjQtNmYxYjIwZDdjNzQ1',
 'NThmYjkxNzAtZTNhOC00YmJmLTk1YzktZmYxZTk2ZDlmZmRh',
 'NDc2YmI3YTYtMzVmNi00NGQ3LThjMGEtZGVhOTQxNmJmZTAz',
 'OWQ0NWU3OWQtZjUwZi00MTQ4LThhY2MtNmIwZTI1OWM1Y2M1',
 'YmFjNjc0OGYtMjVlMy00MGY2LWIxYjQtMGNlNmRiNGU4NDMz',
 'NzAzYzYxZDctZWQyNi00ZDliLTg3ZDMtYTY0MWE5ZWViYzY3',
 'NmI5OWQyNDEtYjYyMC00ZjMxLWI3NGYtNzAwMjVmOWVkYTRk',
 'Njg2NDEzYzEtNzliNi00ZGJhLWE4YTItNzViZWQ0OTAxNDQ2',
 'ZGE5MTVjYWQtZGI1My00ZDBmLTg5NmUtZWMzMThmZDY2M2Y5',
 'NWU3ZmUwOWMtNDNlOS00NjFlLThkYWUtNTFkMDgxMWU1NDUz',
 'YTY4NmMzYTgtM2Y2ZC00NGQxLTg3YjAtNzI3NzAwYWM1ZjBl',
 'MDk5MzUyY2YtMmRiMC00OWIyLThmOGQtNmM5MzgwYjBjNjM2',
 'ZjEzZTkxODQtZDM3Zi00NzM3LWFjOTktMzU2MjUzOGExNmIx',
 'NzhlZjliMzAtNjU1NS00NWZhLTgxN2ItZDY5YWYyZDRjYzFh',
 'YWZmMWNhZTItZTgxYy00MTcwLWIyZjctOTY0Mzg5ZTc5ZDdj',
 'NDA3NWM1YzctZTMzMy00MjgzLWJiNzgtODU5NGE2ZTQ3YjA2',
 'YjU3OWIxMzMtOWIxOC00Y2VkLTliYzktNDdkY2E1ODRjMjQ5',
 'ZGRkMmUzMTctY2Y5NS00N2Q0LWEwNDgtMjA5ODExZTMxOGUx',
 'MDA4ZjcwYzAtMzBkNi00YjFhLTkwMTctZjBjNmMyZDg0

In [11]:
# Query the index directly: fetch the 3 best matches for the question using
# the "similarity" search type.
docs = vector_store.similarity_search(
    query="how to clone delta lake table",
    search_type="similarity",
    k=3,
)

# Show the hits, then how many came back.
print(docs, len(docs), sep="\n")

[Document(metadata={'id': 'Y2ZkYjFhNGEtMmNjZC00MjZjLWJkYmUtNzZlYmJiYzU2NWIy', 'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../input/data_engineering_notes.pdf', 'total_pages': 15, 'page': 4, 'page_label': '5'}, page_content='● A COMMENT is added to allow for easier discovery of \ntable contents \n● A LOCATION is specified, which will result in an \nexternal (rather than managed) table \n● The table is PARTITIONED BY a date column; this \nmeans that the data from each data will exist within its \nown directory in the target storage location \n4.3.5. Cloning Delta lake Table \n4.3.5.1. DEEP CLONE fully copies data and metadata from a source \ntable to a target. This copy occurs incrementally, so'), Document(metadata={'id': 'Yjk4MmE5NzktYzllZC00ZmNjLTkxMjgtZGVmYmY0MWJjYTBh', 'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../input/data_engineering_notes.pdf', 'total_pages': 15, 'page': 4, 'page_label': '5'}, page_content="table to a targe

In [12]:
# Define a function to interact with gpt
def ask_gpt(query, context, model='gpt-4o-mini'):
    """Ask the chat model to answer ``query`` using only ``context``.

    Builds one user prompt containing the instructions, the query and the
    context, and sends it through the module-level ``client``.

    Args:
        query: The question to answer.
        context: Text the model is instructed to restrict its answer to.
        model: Model / deployment name passed to the chat-completions call.
            Defaults to 'gpt-4o-mini', the value that used to be hard-coded,
            so existing two-argument calls behave as before.

    Returns:
        The message content of the first returned choice, or None when the
        request (or reading its result) raised an exception. The error is
        printed rather than re-raised so the notebook keeps running.
    """
    template = f"""
      You are a information retrieval AI. Format the retrieved information as a table or text
      Use only the context for your answers, do not make up information
      query: {query}

      {context} 
      """
    try:
        # Call the OpenAI API to generate a response
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": template},
            ],
        )
        return completion.choices[0].message.content
    except Exception as e:
        # Deliberate best-effort: report the failure and signal it with None.
        print(f"Error: {e}")
        return None
    
def combine_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line."""
    contents = []
    for document in docs:
        contents.append(document.page_content)
    return "\n\n".join(contents)

In [13]:
# RAG: retrieve chunks relevant to the question, then have the chat model
# answer from them.
question = "how to clone delta lake table"

# Fetch the matching documents through the vector store's retriever interface.
retriever = vector_store.as_retriever()
docs = retriever.invoke(question)

# Merge the retrieved chunks into one context string and query the model.
combined_docs_context = combine_docs(docs)
response = ask_gpt(question, combined_docs_context)
print(response)

Here's the formatted information on cloning Delta Lake tables:

| **Cloning Method**              | **Description**                                                                                                                                                                               |
|----------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Deep Clone**                   | - Fully copies data and metadata from a source table to a target. <br> - This copy occurs incrementally, meaning executing the command again can sync changes from the source to the target. <br> - Copies additional metadata including streaming application transactions. |
| **Shallow Clone**                | - Creates a copy of a table quickly without modifying the current table. <br> - Only copies the Delta transaction logs, meaning data d