In [1]:
import os


def enter_project_root(marker="data"):
    """Move the working directory up one level unless `marker` is already here.

    A bare ``os.chdir("../")`` climbs one more directory every time the cell
    is re-executed, which silently breaks the relative "data" path used by
    later cells. Checking for the marker directory first makes the cell safe
    to run any number of times.

    Args:
        marker: Directory name expected inside the project root.

    Returns:
        The current working directory after the (possible) move.
    """
    if not os.path.isdir(marker):
        os.chdir("../")
    return os.getcwd()


project_root = enter_project_root()

In [2]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_data_files(data):
    """Load the PDF files found directly inside the `data` directory.

    Args:
        data: Path of the directory to scan with the ``*.pdf`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory,
        with each file parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Read every PDF under ./data (path is relative to the working directory).
extracted_data = load_data_files(data="data")

In [5]:
# Sanity check: how many documents the loader returned.
len(extracted_data)

637

In [6]:
from typing import List
from langchain.schema import Document

def filter_to_min_doc(docs: List[Document]) -> List[Document]:
    """Return copies of `docs` that keep only the text and the source path.

    Every other metadata key is dropped. A document without a "source"
    entry gets ``{"source": None}``.

    Args:
        docs: Documents to slim down; the inputs are not modified.

    Returns:
        A new list with one rebuilt ``Document`` per input, in the same order.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [7]:
# Strip the loader metadata down to just the source path.
minimal_docs = filter_to_min_doc(docs=extracted_data)

In [8]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping text chunks.

    The chunk size and overlap used to be fixed inside the function; they
    are now keyword parameters with the same defaults, so existing calls
    behave exactly as before while experiments can tune them.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Maximum size of each chunk handed to the splitter.
        chunk_overlap: Amount of overlap between consecutive chunks.

    Returns:
        The chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [9]:
# Chunk the slimmed-down documents and report how many pieces came out.
text_chunks = text_split(minimal_docs)
chunk_count = len(text_chunks)
print(f"Number of chunks : {chunk_count}")

Number of chunks : 5859


In [10]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Create the HuggingFace embedding model used by the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for
        ``sentence-transformers/all-MiniLM-L6-v2``.
    """
    # Device selection is left to the library default; to pin it, pass
    # model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"}.
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )

# Build the embedding model once; the vector-store cells below reuse it.
embedding = download_embeddings()

  embeddings=HuggingFaceEmbeddings(


In [11]:
from dotenv import load_dotenv
# Pull settings (API keys) from the .env file into the process environment.
# The value displayed below the cell is load_dotenv()'s return value.
load_dotenv()

True

In [12]:
import warnings

# Read the service keys from the environment.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Re-export only the keys that are actually present. Assigning None into
# os.environ raises "TypeError: str expected, not NoneType", which used to
# abort this cell with an opaque message whenever either key was missing,
# even one the rest of the notebook never uses.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _key_value:
        os.environ[_key_name] = _key_value
    else:
        warnings.warn(
            f"{_key_name} is not set; add it to your .env file if you need "
            "the corresponding service."
        )

In [13]:
from pinecone import Pinecone
# Create the Pinecone client with the key read from the environment above.
pinecone_api_key=PINECONE_API_KEY
pc=Pinecone(api_key=pinecone_api_key)

In [14]:
from pinecone import ServerlessSpec

index_name = "medic-bot"

# Create the serverless index on first use only; later runs reuse it.
index_already_exists = pc.has_index(index_name)
if not index_already_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle to the index for direct operations.
index = pc.Index(index_name)

In [15]:
from langchain_pinecone import PineconeVectorStore

# Upload the chunks only while the index is still empty. Calling
# from_documents on every run upserts the whole corpus again under new ids,
# which leaves duplicate vectors in the index and duplicate hits in the
# retrieval results.
# NOTE: index statistics can lag briefly after a write, so this protects
# re-runs of the notebook rather than back-to-back executions of this cell.
index_stats = index.describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embedding,
        index_name=index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )

In [16]:
# Attach to the already-populated index without uploading anything.
docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name,
)

To add more documents to the existing index later, call `docsearch.add_documents(documents=[doc])`, where `doc` is a LangChain `Document` object.

In [17]:
# Similarity search that returns the 3 closest chunks per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [18]:
# Smoke-test the retriever with a sample question and display the hits.
sample_query = "What is Acne?"
retrieved_docs = retriever.invoke(sample_query)
retrieved_docs

[Document(id='e4ef7869-9809-41eb-aec7-4a9dc4cc77d3', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='8e8840e9-7870-45f0-a428-58b1e43c8285', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='7199bd5c-a8e3-40f3-9ab0-baebba262184', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25')]

In [None]:
# from langchain_openai import ChatOpenAI

# chat_model = ChatOpenAI(model="gpt-4o",api_key=OPENAI_API_KEY)

from langchain_google_genai import ChatGoogleGenerativeAI
# SECURITY: the Gemini key used to be pasted here as a string literal. Any key
# that has appeared in a notebook or repository must be treated as leaked:
# revoke it in the Google console and issue a new one. The key is now read
# from the environment (e.g. a GOOGLE_API_KEY entry in the .env file).
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    api_key=GOOGLE_API_KEY,
    convert_system_message_to_human=True
)

In [44]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [45]:
# System instructions for the assistant. Adjacent string literals are joined
# with no separator, so every fragment except the last sentence ends with an
# explicit space; without it the sentences run together
# ("...assistant.Answer questions...").
system_prompt = (
    "You are a helpful medical assistant. "
    "Answer questions about health, medicine, fitness, and diet clearly and safely. "
    "Do not give exact diagnoses or prescriptions. When discussing medicines, only explain their general purpose, usage, and side effects — do not prescribe exact dosages unless explicitly provided by official guidelines. "
    "Always remind users to consult a doctor for serious concerns. "
    "Always prioritize safety and evidence-based information. If you do not know answer, ask to consult an expert. Do not Hallucinate. "
    "Answer concisely"
    "\n\n"
    # Placeholder filled with the retrieved document text by the chain.
    "{context}"
)

# Two-message chat template: the system instructions (with the retrieved
# context), followed by the user's question.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [46]:
# Answering chain: inserts the retrieved documents into the prompt's
# {context} slot and sends the result to the chat model.
qna_chain = create_stuff_documents_chain(llm=chat_model, prompt=prompt)

# Full RAG pipeline: retrieve first, then answer with the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=qna_chain,
)

In [47]:
# Ask one question end-to-end and show only the generated answer.
question = "What is Acromegaly and Gigantism?"
response = rag_chain.invoke({"input": question})
print(response["answer"])

Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland in the brain. This leads to increased growth in bone and soft tissue, along with other bodily disturbances.
