In [7]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

load_dotenv()

True

In [None]:
# Azure Keys
azure_endpoint: str = os.environ.get('AZURE_ENDPOINT')
azure_openai_api_key: str = os.environ.get('AZURE_OPENAI_API_KEY')
azure_openai_api_version: str = os.environ.get('AZURE_OPENAI_API_VERSION')
azure_deployment: str = os.environ.get('AZURE_DEPLOYMENT')

Text-embedding-3-large


In [9]:
#Azure opensearch keys
vector_store_address: str = os.environ.get('VECTOR_STORE_ADDRESS')
vector_store_password: str = os.environ.get('VECTOR_STORE_PASSWORD')

In [10]:
client = AzureOpenAI(
    api_key=azure_openai_api_key,
    api_version=azure_openai_api_version,
    azure_endpoint = azure_endpoint
)

In [11]:
#pdf loader
loader = PyPDFLoader("../input/data_engineering_notes.pdf")

pages = loader.load()
print(pages[1].metadata)

{'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': '../input/data_engineering_notes.pdf', 'total_pages': 15, 'page': 1, 'page_label': '2'}


In [12]:
#document splitter

splitter = RecursiveCharacterTextSplitter(
  chunk_size = 500,
  chunk_overlap = 75
)

chunks = splitter.split_documents(pages)

print(len(chunks))

59


In [13]:
#define embedding model
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_deployment,
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_endpoint,
    api_key=azure_openai_api_key,
)

In [15]:
#create a vector index
index_name: str = "pdf-vector-idx"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

In [11]:
#add documents
vector_store.add_documents(documents=chunks)

['MmYxYWQ3ZjEtMTM0My00MWQyLTk5ZjktNjQ0M2E2M2JiMWRm',
 'MjE3OWFhMzMtNTIxYS00YThmLWJlMzgtYzhlMTU1NGE3ZDAz',
 'ZWM5Zjc0MmEtOGQ2OC00NzMwLTk3OGEtZmE5NjUzMDg3ODg5',
 'ZTg3YWJjMDEtOTU5ZC00ZDc5LWJkZjYtODRhNDBhYTQ3ZDVh',
 'YjA0OGIxNzgtZDIxMi00MTQyLTg3NzctYzg0NzE2MjNlMzRh',
 'ZTk5YjMyMDktODgzNS00Y2IxLWIwOGMtNmYyOTVhMWZiNTA4',
 'ZmZkOTg4YzktN2E0MS00MzZhLTliM2UtODk4OGQ3MWE2YTkz',
 'ZmYxY2E5NGEtNzc5ZS00OTMwLThlYzItMGFkYWU0MjY3ZGNi',
 'M2Y2MDRkNmUtZmQ2Yi00YTViLTk3NzAtNzNhMTVlODlhMzNl',
 'ZGMzZDk5ZDMtOTQyNS00ZjdjLTk4MjktNTNmNGQ2ZWM1M2Fl',
 'ZWViZWQwOTgtNDcyOC00YTMzLTg1ODktYThkNGFjNmFiYzkw',
 'MDE4NjA5MGUtNDc3NC00MjRlLTg0YjUtNWRjOWZjOGI3ZjU3',
 'NzBkN2NhNTgtMTNlYy00YWY0LWFhYWYtNWZjOGVmMTY1MjQ0',
 'YWY5NWNkNWQtYTFhNS00YzI2LWEzOGItMzUyNDcwZDJhOGE5',
 'OTM0MmY4NzktOTc4NS00NGMwLWFmZjUtMmYwOTFlNzMzMGM5',
 'M2M3YTEzNDYtNGQ3Mi00NmMxLTk0NjItZmE2MGYxODNlODdl',
 'OWI0YWI0MTgtZDA4MS00ZDQ4LTk0OTEtMDYzN2ZkYzMyMWJj',
 'MTk0OTZlMTgtZWIyYy00MGZkLTkwNzYtNWYwOThhM2Y1NWEy',
 'YmIxNDJhMzMtMWFhYi00MWRiLWE2YjYtMTA0OWQwMzU3

In [16]:
# Perform a similarity search
docs = vector_store.similarity_search(
    query="how to clone delta lake table",
    k=3,
    search_type="similarity",
)
print(docs)
print(len(docs))

[]
0


In [10]:
# Define a function to interact with gpt
def ask_gpt(query,context):
    template = f"""
      You are a information retrieval AI. Format the retrieved information as a table or text
      Use only the context for your answers, do not make up information
      query: {query}

      {context} 
      """
    try:
        # Call the OpenAI API to generate a response
      completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": template
            }
          ]
        )

      return completion.choices[0].message.content

    except Exception as e:
        print(f"Error: {e}")
        return None
    
def combine_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [11]:
#Rag

question = "how to clone delta lake table"

retriever = vector_store.as_retriever()

docs = retriever.invoke(question)

combined_docs_context = combine_docs(docs)

response = ask_gpt(question,combined_docs_context)
print(response)

Here is the information on how to clone a Delta Lake table, formatted in a clear table layout:

| Method           | Description                                                                                                                                                         |
|------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| **Deep Clone**    | Fully copies both data and metadata from a source table to a target. This copy is incremental, allowing for synchronization of changes on subsequent executions.  |
| **Shallow Clone** | Quickly creates a copy of a table without moving data; only the Delta transaction logs are copied. Useful for testing changes without modifying the original table. |
| **CTAS vs Shallow Clone** | Cloning is simpler than Create Table As Select (CTAS), as it faithfully copies the original table's properties (like partition