In [14]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("survpaper.pdf")
data = loader.load()  # entire PDF is loaded as a single Document

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# split data
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200)
docs = text_splitter.split_documents(data)


print("Total number of documents: ",len(docs))

Total number of documents:  1295


In [17]:
docs[7]

Document(metadata={'producer': 'dvipdfm 0.13.2c, Copyright © 1998, by Mark A. Wicks', 'creator': 'TeX output 2003.09.02:1034', 'creationdate': '2003-09-02T10:34:47+01:00', 'source': 'survpaper.pdf', 'total_pages': 42, 'page': 0, 'page_label': '1'}, page_content='survival modelling in actuarial mathematics, are focussed. The following topics\nare concerned: the development from age-discrete to age-continuous modelling,')

In [18]:
from langchain_chroma import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from dotenv import load_dotenv
load_dotenv() 

#Get an API key: 
# Head to https://ai.google.dev/gemini-api/docs/api-key to generate a Google AI API key. Paste in .env file

# Embedding models: https://python.langchain.com/v0.1/docs/integrations/text_embedding/

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
vector = embeddings.embed_query("hello, world!")
vector[:5]
#vector

[0.05636945366859436,
 0.004828543867915869,
 -0.07625909894704819,
 -0.023642510175704956,
 0.053293220698833466]

In [19]:
vectorstore = Chroma.from_documents(documents=docs, embedding=GoogleGenerativeAIEmbeddings(model="models/embedding-001"))

In [20]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 2})

retrieved_docs = retriever.invoke("What is survival analysis?")

In [21]:
len(retrieved_docs)

2

In [22]:
print(retrieved_docs[1].page_content)

ity (inside the group, or inside the population) may be of interest. Frailty-based
survival models can produce this information.


In [31]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash",temperature=0.3, max_tokens=5)

In [32]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [33]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [34]:
response = rag_chain.invoke({"input": "What is survival analysis?"})
print(response["answer"])

Based on the provided text
