In [1]:
%pwd

'/home/subhankhurshid/Documents/personal-projects/medical-app/research'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'/home/subhankhurshid/Documents/personal-projects/medical-app'

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore
from openai import OpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print(torch.version.cuda)   # vs. `nvidia-smi` reported version
print(torch.backends.cudnn.version())

12.6
90501


In [5]:
load_dotenv()

True

In [6]:
def load_pdf_doc(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents


In [7]:
data = load_pdf_doc(data='data')

In [8]:
data

[Document(metadata={'producer': 'iTextSharp™ 5.5.5 ©2000-2014 iText Group NV (AGPL-version); modified using iTextSharp™ 5.5.5 ©2000-2014 iText Group NV (AGPL-version)', 'creator': 'PyPDF', 'creationdate': '2017-10-23T12:02:57+05:30', 'author': 'William Stallings', 'moddate': '2018-08-21T06:36:33+03:00', 'title': 'Computer Security: Principles and Practice, Global Edition, 4/e', 'source': 'data/book_new.pdf', 'total_pages': 986, 'page': 0, 'page_label': 'Front Cover'}, page_content=''),
 Document(metadata={'producer': 'iTextSharp™ 5.5.5 ©2000-2014 iText Group NV (AGPL-version); modified using iTextSharp™ 5.5.5 ©2000-2014 iText Group NV (AGPL-version)', 'creator': 'PyPDF', 'creationdate': '2017-10-23T12:02:57+05:30', 'author': 'William Stallings', 'moddate': '2018-08-21T06:36:33+03:00', 'title': 'Computer Security: Principles and Practice, Global Edition, 4/e', 'source': 'data/book_new.pdf', 'total_pages': 986, 'page': 1, 'page_label': 'IFC'}, page_content='A01_STAL0611_04_GE_FM.indd   1

In [9]:
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [10]:
text_chunks = text_split(extracted_data=data)
print(f"{len(text_chunks)} is the length of the document")

6064 is the length of the document


In [11]:
def download_hugging_face_embedding_model():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device':'cpu'})
    return embeddings

In [12]:
embeddings = download_hugging_face_embedding_model()

  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device':'cpu'})


In [14]:
query_result = embeddings.embed_query("Hello World")
print(len(query_result))

384


In [15]:
pinecone_api = os.getenv('PINECONE_API_KEY')
print(pinecone_api)

pcsk_3HtRed_7v8Lo1bcgNHfQLwHmNA7yWTFrpXBJKLBpynEhhCzKHrkBTqXCS1FN4KVr3Xxk1q


In [16]:
pc = Pinecone(api_key=pinecone_api)

In [18]:
index_name = 'medical-bot'

In [86]:

pc.create_index(
    name=index_name,
    dimension=384,
    metric='cosine',
    spec=ServerlessSpec(
        cloud='aws',
        region='us-east-1'
    )
)


{
    "name": "medical-bot",
    "metric": "cosine",
    "host": "medical-bot-q23k3fb.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [87]:
docs = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

In [19]:
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [20]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x71807aa92870>

In [21]:
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [22]:
retrieved_docs = retriever.invoke("What is Acne?")

In [23]:
retrieved_docs

[Document(id='04f1818c-260b-4cbb-b41f-e9def56e39b3', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data/book.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='a37a9963-7135-48a0-98e2-6db7bd41eccd', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 55.0, 'page_label': '26', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'data/book.pdf', 'total_pages': 4505.0}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturnover (death and replace

In [24]:
gemini_api_key = os.getenv('GEMINI_API_KEY')
print(gemini_api_key)

AIzaSyCrA3e8g2g3HIzA7SdxZnBeqNV0j0EtW24


In [25]:
gemini_base_url = os.getenv("GEMINI_BASE_URL")
gemini_base_url

'https://generativelanguage.googleapis.com/v1beta/openai/'

In [26]:
system_prompt = (
    """
    You are an assistant for question answering tasks. Use the following
    pieces of retrieved context to answer the question. If you don't know the 
    answer, say that you don't know. Use three sentences maximum and keep the 
    answer concise.
    {context}
    """
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}")
    ]
)

In [27]:
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    max_output_tokens=500,
    google_api_key=gemini_api_key 
)

In [28]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)

In [29]:
question_answer_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="\n    You are an assistant for question answering tasks. Use the following\n    pieces of retrieved context to answer the question. If you don't know the \n    answer, say that you don't know. Use three sentences maximum and keep the \n    answer concise.\n    {context}\n    "), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| ChatGoogleGenerativeAI(model='models/gemini-2.0-flash', google_api_key=SecretStr('**********'), temperature=0.0, max_outp

In [None]:
rag_chain = create_retrieval_chain(retriever, question_answer_chain)





In [32]:
response = rag_chain.invoke({"input": "What is Acromegaly?"})
print(response['answer'])

Acromegaly is a rare disorder characterized by the abnormal release of a chemical from the pituitary gland in the brain. This leads to increased growth in bone and soft tissue, along with other disturbances throughout the body. It affects both men and women, with diagnosis often delayed until middle age due to the gradual onset of symptoms.
