In [2]:
import os
import getpass
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_google_vertexai import ChatVertexAI
from langchain_openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.memory import ConversationBufferMemory

In [3]:
# Collect the API keys this notebook needs without hardcoding them.
# - GOOGLE_API_KEY: set here as in the original notebook.
# - OPENAI_API_KEY: read from the environment by OpenAIEmbeddings() further down.
# A key is only requested when it is not already present in the environment,
# so re-running this cell (or Restart & Run All with keys exported) does not
# prompt again, and each prompt names the key being asked for.
for _key_name in ("GOOGLE_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"{_key_name}: ")

In [37]:
# Extract the plain text of every page of the PDF into a single string.
# The context manager closes the document even if text extraction raises,
# and joining the per-page strings avoids repeated string concatenation.
with fitz.open("ABSTRACT.pdf") as pdf_document:
    text = "".join(page.get_text() for page in pdf_document)

# Print or process the extracted text
print(text)

ABSTRACT 
In the future of India lies the future of a sixth of the world’s population. 
As the Artificial Intelligence (AI) revolution sweeps through 
societies and enters daily life, its role in shaping India’s development 
and growth is bound to be substantial. For India, AI holds 
promise as a catalyst to accelerate progress, while providing mechanisms 
to leapfrog traditional hurdles such as poor infrastructure 
and bureaucracy. At the same time, an investment in AI is accompanied 
by risk factors with long-term implications on society: it 
is imperative that risks be vetted at this early stage. In this paper, 
we describe opportunities and challenges for AI in India. We detail 
opportunities that are cross-cutting (bridging India’s linguistic 
divisions, mining public data), and also specific to one particular 
sector (healthcare). We list challenges that originate from existing 
social conditions (such as equations of caste and gender). Thereafter 
we distill out concrete steps a

In [None]:
# Splitter configuration, kept in one place so it is easy to tune.
splitter_options = {
    "chunk_size": 1000,           # upper bound on the length of a chunk
    "chunk_overlap": 200,         # characters shared by neighbouring chunks
    "length_function": len,       # how chunk length is measured
    "is_separator_regex": False,  # separators are plain strings, not regexes
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Turn the extracted text into a list of chunk documents
chunks = text_splitter.create_documents([text])

# Preview the first five chunks as a sanity check
for position, document in enumerate(chunks[:5], start=1):
    print(f"Chunk {position}:\n{document}\n")

Chunk 1:
page_content='ABSTRACT 
In the future of India lies the future of a sixth of the world’s population.'

Chunk 2:
page_content='As the Artificial Intelligence (AI) revolution sweeps through'

Chunk 3:
page_content='societies and enters daily life, its role in shaping India’s development'

Chunk 4:
page_content='and growth is bound to be substantial. For India, AI holds'

Chunk 5:
page_content='promise as a catalyst to accelerate progress, while providing mechanisms'



In [None]:
# Embed the chunks with OpenAI embeddings and index them in a Chroma store
embedding_model = OpenAIEmbeddings()
db = Chroma.from_documents(chunks, embedding_model)

In [None]:
# Set up retriever: expose the vector store `db` through the retriever
# interface (no arguments passed, so the store's default search settings apply).
retriever = db.as_retriever()

In [None]:
# Chat model used to generate the answers.
# NOTE(review): ChatVertexAI presumably authenticates through Google Cloud
# credentials rather than the GOOGLE_API_KEY environment variable set earlier
# in this notebook — confirm which credential is actually used.
llm = ChatVertexAI(model="gemini-1.5-flash")

In [None]:
# Define prompt template
# The template has three placeholders:
#   {context}  - wrapped in <ctx></ctx>
#   {history}  - wrapped in <hs></hs>
#   {question} - placed just before the trailing "Answer:" cue
# NOTE(review): the <s> ... </s> markers are sent to the model as literal text;
# they look like sequence tokens from another model family — confirm they are
# intended for this LLM.
template = """
<s>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.

Use the following context (delimited by <ctx></ctx>) and the chat history (delimited by <hs></hs>) to answer the question:
</s>
------
<ctx>
{context}
</ctx>
------
<hs>
{history}
</hs>
------
{question}
Answer:
"""

# Bind the template to its three declared input variables.
prompt = PromptTemplate(
    input_variables=["history", "context", "question"],
    template=template,
)




In [None]:
# Conversation memory: stored under the prompt's {history} variable and
# keyed on the "question" input.
memory = ConversationBufferMemory(memory_key="history", input_key="question")

# Options forwarded to the inner "stuff" chain.
combine_chain_options = {
    "verbose": True,
    "prompt": prompt,
    "memory": memory,
}

# Retrieval-augmented QA chain built from the LLM, retriever, prompt and memory.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True,
    chain_type_kwargs=combine_chain_options,
)

In [None]:
# Run the QA system
# Chain.run() is deprecated; invoke() takes the chain's input mapping and
# returns a mapping. RetrievalQA reads the question from "query" and puts the
# generated answer under "result".
response = qa.invoke({"query": "Give me the summary"})
print(response["result"])