In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb langchain_experimental
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv
!pip3 install tabulate



In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Other modules and packages
import os
import tempfile
import streamlit as st  
import pandas as pd
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [5]:
llm = ChatOpenAI(model="gpt-4o", api_key = OPENAI_API_KEY)
# llm.invoke("Tell me something about russian blue")

load PDF document

In [6]:
loader = PyPDFLoader("data/alzheimers.pdf")
doc = loader.load()
doc

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.0 (Macintosh)', 'creationdate': '2025-03-31T17:30:21-05:00', 'author': "Alzheimer's Association", 'moddate': '2025-04-23T12:08:50-05:00', 'title': "Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures", 'trapped': '/False', 'source': 'data/alzheimers.pdf', 'total_pages': 152, 'page': 0, 'page_label': 'a'}, page_content="2025  \nALZHEIMER’S DISEASE  \nFACTS AND FIGURES \nSPECIAL REPORT  \nAmerican Perspectives  \non Early Detection  \nof Alzheimer's Disease in  \nthe Era of Treatment"),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.0 (Macintosh)', 'creationdate': '2025-03-31T17:30:21-05:00', 'author': "Alzheimer's Association", 'moddate': '2025-04-23T12:08:50-05:00', 'title': "Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures", 'trapped': '/False', 'source': 'data/alzheimers.pdf', 'total_pages': 152, 'page': 1, 'page_label': 'b'

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,chunk_overlap=200, length_function=len, separators=["\n","\n\n"," "])

chunks = text_splitter.split_documents(doc)
chunks

[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.0 (Macintosh)', 'creationdate': '2025-03-31T17:30:21-05:00', 'author': "Alzheimer's Association", 'moddate': '2025-04-23T12:08:50-05:00', 'title': "Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures", 'trapped': '/False', 'source': 'data/alzheimers.pdf', 'total_pages': 152, 'page': 0, 'page_label': 'a'}, page_content="2025  \nALZHEIMER’S DISEASE  \nFACTS AND FIGURES \nSPECIAL REPORT  \nAmerican Perspectives  \non Early Detection  \nof Alzheimer's Disease in  \nthe Era of Treatment"),
 Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.0 (Macintosh)', 'creationdate': '2025-03-31T17:30:21-05:00', 'author': "Alzheimer's Association", 'moddate': '2025-04-23T12:08:50-05:00', 'title': "Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures", 'trapped': '/False', 'source': 'data/alzheimers.pdf', 'total_pages': 152, 'page': 1, 'page_label': 'b'

In [8]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-3-small", 
        openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()

create vector db

In [9]:
#load vector store
vectorStore = Chroma.from_documents(
        documents=chunks,
        embedding=embedding_function,
        persist_directory="vectorstore_test"  #writing in to the test databse
)

In [10]:
#create retriever and get relevant chunks
retriever = vectorStore.as_retriever(search_kwargs={"k": 3})
relevant_chunks = retriever.invoke("Who is the author of the paper?")
relevant_chunks

[Document(id='c6a70f78-7db6-4956-a56f-709ace3b0ce4', metadata={'creator': 'Adobe InDesign 19.0 (Macintosh)', 'page_label': 'a', 'title': "Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures", 'source': 'data/alzheimers.pdf', 'total_pages': 152, 'moddate': '2025-04-23T12:08:50-05:00', 'producer': 'Adobe PDF Library 17.0', 'page': 150, 'trapped': '/False', 'creationdate': '2025-03-31T17:30:21-05:00', 'author': "Alzheimer's Association"}, page_content='The Alzheimer’s Association acknowledges the \ncontributions of Joseph Gaugler, Ph.D., Bryan \nJames, Ph.D., Tricia Johnson, Ph.D., Jessica Reimer, \nPh.D., Kezia Scales, Ph.D., Sarah Tom, Ph.D., M.P.H., \nJennifer Weuve, M.P.H., Sc.D., and Jarmin Yeh, \nPh.D., M.P.H., M.S.S.W., in the preparation of  \n2025 Alzheimer’s Disease Facts and Figures. \nSpecial thanks to reviewers Jose Abisambra, Ph.D., \nErin Emery-Tiburcio, Ph.D., Fayron Epps, Ph.D., \nR.N., Andrea Gilmore-Bykovskyi, Ph.D., R.N.,  \nPaola Gilsanz, Sc.D., Pei Jun

In [11]:

PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

In [12]:
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="Who is the auther of this article?")
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

The Alzheimer’s Association acknowledges the 
contributions of Joseph Gaugler, Ph.D., Bryan 
James, Ph.D., Tricia Johnson, Ph.D., Jessica Reimer, 
Ph.D., Kezia Scales, Ph.D., Sarah Tom, Ph.D., M.P.H., 
Jennifer Weuve, M.P.H., Sc.D., and Jarmin Yeh, 
Ph.D., M.P.H., M.S.S.W., in the preparation of  
2025 Alzheimer’s Disease Facts and Figures. 
Special thanks to reviewers Jose Abisambra, Ph.D., 
Erin Emery-Tiburcio, Ph.D., Fayron Epps, Ph.D., 
R.N., Andrea Gilmore-Bykovskyi, Ph.D., R.N.,  
Paola Gilsanz, Sc.D., Pei Jung Lin, Ph.D., and 
Melinda Power, Sc.D.

---

7481(20):30478-30484.
772. Pickering CEZ, Maxwell CD, Yefimova M, Wang D, Puga F, 
Sullivan T. Early Stages of COVID-19 Pandemic Had No 
Discernable Impact on Risk of Elder Abuse and Neglect 
Among Dementia Family Caregivers

In [13]:
llm.invoke(prompt)

AIMessage(content="I don't know who the author of the article is based on the provided context.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 16, 'prompt_tokens': 985, 'total_tokens': 1001, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_65564d8ba5', 'id': 'chatcmpl-CWCUtdWqNjXm45hDPQJilKKZisaaU', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--a00eea91-3871-4383-b492-95d429842932-0', usage_metadata={'input_tokens': 985, 'output_tokens': 16, 'total_tokens': 1001, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

Structured responses

In [14]:
class ExtractInfo(BaseModel):
    title: str = Field(description="Title of the document")
    summary: str = Field(description="Summary of the document")
    year: int = Field(description="Year of the publication of the document")
    document_author: str = Field(description="Names of the auther")

In [15]:
rag_langchain = (
    {
        "context": retriever,
        "question":RunnablePassthrough()
    }
    | prompt_template
    | llm.with_structured_output(ExtractInfo,strict=True)
)

rag_langchain.invoke("Give me the title, summary, year, author of the document.")

ExtractInfo(title="Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures", summary="This document provides comprehensive information and statistics about Alzheimer's disease, including American attitudes towards its early detection, diagnosis, and treatment. It also incorporates guidelines and rights regarding the disclosure of research results to participants in Alzheimer's studies. The appendices outline the sources and methods used to derive statistics in the report, aiming to inform researchers, policymakers, and the public with accurate insights about Alzheimer's or related dementias.", year=2025, document_author="Alzheimer's Association")

In [16]:
structured_response = rag_langchain.invoke("Give me the title, summary, year, author of the document.")

df = pd.DataFrame([structured_response.model_dump()])
df.set_index("title", inplace=True)
pd.set_option('display.max_colwidth', None)
df

Unnamed: 0_level_0,summary,year,document_author
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alzheimer's Association 2025 Alzheimer's Disease Facts and Figures,"The document provides comprehensive statistics and insights on Alzheimer's disease as of 2025. It covers topics like public attitudes towards early detection and treatment, as well as rights for research participants to access their individual research results. Recommendations for primary care involvement in diagnosis and disclosure, and implications of Alzheimer's biomarker disclosure policies are also detailed.",2025,Alzheimer's Association
