In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.azuresearch import AzureSearch
from langchain_openai import AzureOpenAIEmbeddings
from openai import AzureOpenAI
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
# Azure Keys
azure_endpoint: str = os.environ.get('AZURE_ENDPOINT')
azure_openai_api_key: str = os.environ.get('AZURE_OPENAI_API_KEY')
azure_openai_api_version: str = os.environ.get('AZURE_OPENAI_API_VERSION')
azure_deployment: str = os.environ.get('AZURE_DEPLOYMENT')

In [4]:
#Azure opensearch keys
vector_store_address: str = os.environ.get('VECTOR_STORE_ADDRESS')
vector_store_password: str = os.environ.get('VECTOR_STORE_PASSWORD')

In [5]:
client = AzureOpenAI(
    api_key=azure_openai_api_key,
    api_version=azure_openai_api_version,
    azure_endpoint = azure_endpoint
)

In [6]:
#pdf loader
loader = PyPDFLoader("../input/Zipse_Speech.pdf")

pages = loader.load()
print(pages[1].metadata)

{'producer': 'Microsoft: Print To PDF', 'creator': 'PyPDF', 'creationdate': '2024-05-14T22:16:56+02:00', 'author': 'BMW AG', 'moddate': '2024-05-17T08:49:01+02:00', 'subject': 'Speech of the CEO Oliver Zipse (full text)', 'title': 'Speech of the CEO Oliver Zipse (full text)', 'source': '../input/Zipse_Speech.pdf', 'total_pages': 17, 'page': 1, 'page_label': '2'}


In [7]:
#document splitter

splitter = RecursiveCharacterTextSplitter(
  chunk_size = 500,
  chunk_overlap = 75
)

chunks = splitter.split_documents(pages)

print(len(chunks))

82


In [8]:
#define embedding model
embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    azure_deployment=azure_deployment,
    openai_api_version=azure_openai_api_version,
    azure_endpoint=azure_endpoint,
    api_key=azure_openai_api_key,
)

In [9]:
#create a vector index
index_name: str = "pdf-vec-idx"
vector_store: AzureSearch = AzureSearch(
    azure_search_endpoint=vector_store_address,
    azure_search_key=vector_store_password,
    index_name=index_name,
    embedding_function=embeddings.embed_query,
)

In [10]:
#add documents
vector_store.add_documents(documents=chunks)

['MTgxYTc5NjQtNjU3Mi00MDQ5LTkzZmEtNzdiZTRjZjYwMDU4',
 'N2E1OThiNmQtNzZhMC00NzQ0LWE4ZTMtMGQwMjU3OGFjZDYw',
 'ZDQyZmVmMGEtYjE5MS00MmVhLTk0MWQtMjQ3MjgxMjAwMTNi',
 'NmNkOTYwNmItODAzZC00Y2FhLTkzZjctYzdiOWEyYzc2MTUy',
 'MDhkMzFhZjQtNmZhMC00NmEwLTk5ODgtOTllN2Q4Njg5ZGQ0',
 'ZWVkMjM0NzUtM2I3NC00NWM2LTlkOGQtYTY1MTMwNjdhY2Y3',
 'ZTgxZmE1NmQtMzNjYy00ZjRmLTg2ZGEtNjRkZjBiMTYzMzA0',
 'YTRhNGRiODYtYTI0YS00ODIzLWE1YjQtY2NiNTQyNTQ0YjBm',
 'NTZhNjI0YzEtNzJiYi00ZGQ4LTlkMjEtNzI4OTQxOTM2NTZh',
 'NGQyMGIwMWEtZDE0MC00YTdkLWE0YjEtMjlkMzVhYWVmYTk1',
 'NDlhNTcwMGUtZmEyZi00ZTAxLTk3YjMtNjczNzRiMjA4NDI2',
 'YTE5MDU0MGItNzVjMC00YTE5LTlkMmYtYzAxZjViMmIzZGYz',
 'MTcxNzAwOGItOTAyZC00YWI1LWI5MjYtMjE3YTNkYTkyNmQ2',
 'ZjA0ZTQwOGItYWI5YS00ODA2LWE5NWYtYTJjZjZlYjMzMmEy',
 'YTMzM2ZjYjAtZDM0Yi00ODU3LTgyN2UtYWM5MjViYWQ3NGUz',
 'ODE2MmQ3MGUtZDQ0Yi00MzQ5LWEyNWMtMWY0YWU4NzEzYjY3',
 'MmQ2Mzc4ZDctZmY4Zi00YjRiLWJjNDEtNThlZjA2NDQ1ZDZh',
 'ZWRkOTc2ZmUtMjljMS00ZmY2LWJhMTUtMzk1YTZiMjNlMzdl',
 'MGRlZTdkMTAtNTMzNi00Y2RjLWE4YTMtNzFkY2VmNGYy

In [11]:
# # Perform a similarity search
# docs = vector_store.similarity_search(
#     query="when nehru speech happened",
#     k=3,
#     search_type="similarity",
# )
# print(docs)
# print(len(docs))

In [12]:
# Define a function to interact with gpt
def ask_gpt(query,context):
    template = f"""
      You are a information retrieval AI. Format the retrieved information as a table or text
      Use only the context for your answers, do not make up information
      query: {query}

      {context} 
      """
    try:
        # Call the OpenAI API to generate a response
      completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": template
            }
          ]
        )

      return completion.choices[0].message.content

    except Exception as e:
        print(f"Error: {e}")
        return None
    
def combine_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [13]:

def ask_question(question):
  retriever = vector_store.as_retriever()

  docs = retriever.invoke(question)

  combined_docs_context = combine_docs(docs)

  response = ask_gpt(question,combined_docs_context)
  return response