In [1]:
print("Hello, World!")

Hello, World!


In [2]:
%pwd

'e:\\gen ai\\Gen_AI_MediBot\\research'

In [3]:
import os

# Move from the `research/` sub-folder up to the project root so that relative
# paths used later (e.g. the Data directory) resolve correctly.
# The guard keeps the cell idempotent: without it, every re-run of this cell
# would climb one more directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'e:\\gen ai\\Gen_AI_MediBot'

### Extract Text


In [5]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [6]:
def load_pdf_file(data: str):
    """
    Load the PDF files found in a directory.

    Args:
        data: Path of the directory that is searched with the glob ``*.pdf``.

    Returns:
        The documents produced by ``DirectoryLoader.load()``; each matched
        file is parsed with ``PyPDFLoader`` and a progress bar is shown.
    """
    loader_options = {
        "path": data,
        "glob": "*.pdf",
        "loader_cls": PyPDFLoader,
        "show_progress": True,
    }
    return DirectoryLoader(**loader_options).load()


In [7]:
# Forward slashes work on every OS; the previous "Data\medical_books" literal
# relied on "\m" being an invalid escape sequence (a warning on recent Python
# versions) and only made sense on Windows.
extracted_data = load_pdf_file("Data/medical_books")

100%|██████████| 1/1 [04:35<00:00, 275.20s/it]


In [8]:
# Preview only the first few pages; displaying the whole list floods the notebook output
extracted_data[:3]

[Document(metadata={'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+06:00', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428, 'page': 0, 'page_label': '1'}, page_content='24th Edition \n,�. . \nELSLVlER \n-\nEdited by \nIan D. Penman \nStuart H. Ralston \nMark W. J. Strachan \nRichard P. Hobson \nII'),
 Document(metadata={'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+06:00', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428, 'page': 1, 'page_label': '2'}, page_content='Medicine\nDavidson’s\nPrinciples and Practice of\n                    PDF  Collected  By:\n        Dr. Nazmul Alam Faruki'),
 Document(metadata={'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+

### Chunking

In [9]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_docs_into_chunks(documents, chunk_size=1000, chunk_overlap=150):
    """
    Break LangChain Documents into smaller pieces suitable for RAG.

    Args:
        documents: The documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between neighbouring chunks.

    Returns:
        The chunks returned by ``split_documents``; each chunk keeps the
        metadata (source, page) of the document it came from.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph, line, sentence, word, then single characters
        separators=["\n\n", "\n", ".", " ", ""],
        length_function=len,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)


In [10]:
chunked_data = split_docs_into_chunks(extracted_data)
print("Total pages loaded:", len(extracted_data))
print("Total chunks created:", len(chunked_data))

# Only preview a chunk when there is one: indexing [0] on an empty list raises
# IndexError (e.g. when the data directory contained no PDFs).
if chunked_data:
    print("\nSample chunk text:\n", chunked_data[0].page_content[:400])
    print("\nSample chunk metadata:\n", chunked_data[0].metadata)
else:
    print("\nNo chunks were created - check that the PDF directory is not empty.")

Total pages loaded: 1428
Total chunks created: 13012

Sample chunk text:
 24th Edition 
,�. . 
ELSLVlER 
-
Edited by 
Ian D. Penman 
Stuart H. Ralston 
Mark W. J. Strachan 
Richard P. Hobson 
II

Sample chunk metadata:
 {'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+06:00', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428, 'page': 0, 'page_label': '1'}


In [11]:
# Preview only the first few chunks; displaying every chunk floods the notebook output
chunked_data[:3]

[Document(metadata={'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+06:00', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428, 'page': 0, 'page_label': '1'}, page_content='24th Edition \n,�. . \nELSLVlER \n-\nEdited by \nIan D. Penman \nStuart H. Ralston \nMark W. J. Strachan \nRichard P. Hobson \nII'),
 Document(metadata={'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+06:00', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428, 'page': 1, 'page_label': '2'}, page_content='Medicine\nDavidson’s\nPrinciples and Practice of\n                    PDF  Collected  By:\n        Dr. Nazmul Alam Faruki'),
 Document(metadata={'producer': '2.3.4 (4.2.13) d', 'creator': 'PDFsam Basic v4.2.12', 'creationdate': '2022-04-20T12:57:23+06:00', 'moddate': '2022-04-20T17:12:11+

### Embeddings

In [12]:
# Download the embedding model from HuggingFace
from langchain_community.embeddings import HuggingFaceEmbeddings

def load_embedding_model(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
    """
    Load a HuggingFace sentence-embedding model.

    Args:
        model_name: HuggingFace model identifier. The default is the model this
            notebook was developed with; it produced 384-dimensional vectors,
            which is the dimension the Pinecone index is created with. A model
            with a different output size needs an index of matching dimension.

    Returns:
        A ``HuggingFaceEmbeddings`` instance for ``model_name``.

    NOTE(review): the model files are presumably downloaded on first use and
    cached locally by the HuggingFace libraries - confirm for your environment.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [13]:
embedding_model = load_embedding_model()

  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [14]:
# Smoke-test the embedding model on a single question.
# Note: despite its name, `query_text` holds the embedding vector returned by
# embed_query (it is sliced and measured with len() below), not the question text.
query_text = embedding_model.embed_query("What is the primary purpose of clinical trials?")


# Show a short prefix of the vector and its length; the length is the value
# that the Pinecone index dimension has to match.
print("Sample embedding vector (first 10 values):", query_text[:10])
print("Embedding vector length:", len(query_text))

Sample embedding vector (first 10 values): [-0.01971418410539627, 0.09143586456775665, -0.033671844750642776, -0.0478733666241169, 0.02831442281603813, 0.039833202958106995, -0.035822585225105286, 0.1060108095407486, 0.1449873000383377, 0.06740771979093552]
Embedding vector length: 384


In [15]:
#query_text

### PINECONE 

In [16]:
from dotenv import load_dotenv
load_dotenv()

True

In [17]:
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

In [18]:
# Never print the key itself: notebook outputs are saved with the file and
# would leak the secret. Only report whether a value was loaded.
print("PINECONE_API_KEY loaded:", bool(PINECONE_API_KEY))


[REDACTED - API key removed from saved output; rotate this key]


In [21]:
from pinecone import Pinecone

# Name of the Pinecone index used throughout the notebook. Defined here so the
# later vector-store cells work even when the index-creation cell is skipped.
index_name = "medibot-medical"

pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)

**Run the next cell only if the previous cell raised an error (for example, because the index does not exist yet).**

In [20]:
from pinecone import Pinecone, ServerlessSpec
import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medibot-medical"

# Create the index only when it is missing, so this cell can be re-run (or
# included in "Run All") without failing because the index already exists.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # must equal the embedding vector length (384 for the model used above)
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1")
    )

# Display the index description whether it was just created or already existed
pc.describe_index(index_name)


{
    "name": "medibot-medical",
    "metric": "cosine",
    "host": "medibot-medical-bat42bx.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [22]:
import os

# Re-export the keys into the process environment for the client libraries
# used in the following cells. Fail with a clear message when a key is missing:
# assigning None to os.environ would otherwise raise an opaque TypeError.
for _key_name, _key_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("GROQ_API_KEY", GROQ_API_KEY),
):
    if not _key_value:
        raise RuntimeError(
            f"{_key_name} is not set; define it (e.g. in the .env file) and re-run the notebook."
        )
    os.environ[_key_name] = _key_value

**Run only once** (re-running the upsert would store the same chunks again).

In [23]:
## Embed each chunk and upsert the embeddings into the Pinecone index.
## The upsert is guarded so that re-running this cell does not store every
## chunk a second time: it only runs while the index is still empty.
## NOTE(review): index statistics may lag shortly after writes - confirm the
## count is up to date before relying on this guard right after an upsert.

from langchain_pinecone import PineconeVectorStore

if pc.Index(index_name).describe_index_stats().total_vector_count == 0:
    vector_store = PineconeVectorStore.from_documents(
        documents=chunked_data,
        embedding=embedding_model,
        index_name=index_name
    )
else:
    # Index already populated: just connect to it
    vector_store = PineconeVectorStore(
        embedding=embedding_model,
        index_name=index_name
    )


In [24]:
# Load the existing Pinecone index

from langchain_pinecone import PineconeVectorStore
vector_store = PineconeVectorStore(
    embedding=embedding_model,
    index_name=index_name
)

In [25]:
vector_store

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x19b70a93ed0>

In [26]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [27]:
retriever_docs = retriever.invoke("What is RNA?")

In [28]:
retriever_docs

[Document(id='99f6d045-2727-4f5f-974a-cf931b23f3d2', metadata={'creationdate': '2022-04-20T12:57:23+06:00', 'creator': 'PDFsam Basic v4.2.12', 'moddate': '2022-04-20T17:12:11+06:00', 'page': 61.0, 'page_label': '62', 'producer': '2.3.4 (4.2.13) d', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428.0}, page_content='T h e  f u n d a m e n t a l  p r i n c i p l e s  o f  g e n o m i c s   \ue049  39\n3\nRNA dif fers fr om DNA in thr ee main ways:\n\ue049 RNA is single-stranded.\ue049 The sugar r esidue within the nucleotide is ribose, rather than deoxyribose.\ue049 It contains uracil (U) in place of thymine (T).'),
 Document(id='d04cec96-d1f0-4af6-9b00-a227a7cc2471', metadata={'creationdate': '2022-04-20T12:57:23+06:00', 'creator': 'PDFsam Basic v4.2.12', 'moddate': '2022-04-20T17:12:11+06:00', 'page': 62.0, 'page_label': '63', 'producer': '2.3.4 (4.2.13) d', 'source': 'Data\\medical_books\\Medical_Book.pdf', 'total_pages': 1428.0}, page_content='The start of the 5 

In [29]:
from langchain_groq import ChatGroq

# Groq-hosted chat model with a moderate temperature and a 500-token cap on the reply.
# NOTE(review): build_groq_rag_chain creates its own ChatGroq instance with the
# same settings, so this module-level `llm` looks unused in the visible cells - confirm.
llm = ChatGroq(
    model="llama-3.3-70b-versatile",   # fast & cheap
    temperature=0.4,
    max_tokens=500
)


In [30]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# System instructions for the assistant: answer only from the retrieved
# context, fall back to a fixed "I don't know" sentence, give no diagnosis,
# and keep the answer to at most three sentences.
SYSTEM_PROMPT = """
You are a medical-book question answering assistant.
Use ONLY the provided context.
If the answer is not in the context, say "Thank you. I don't know. Please consult a medical professional for accurate information."
Do not provide medical diagnosis.
Use a maximum of three sentences and keep the answer concise.
"""

# Chat prompt with two input variables: {question} (the user's question) and
# {context} (the text built from the retrieved documents).
PROMPT = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_PROMPT),
    ("human", "Question: {question}\n\nContext:\n{context}")
])

def build_groq_rag_chain(retriever, llm=None):
    """
    Build a retrieval-augmented generation (RAG) chain answered by a Groq chat model.

    The chain takes the user's question as input, sends it to `retriever`,
    formats the retrieved documents into a context string, fills `PROMPT`
    with the question and that context, and passes the result to the model.

    Args:
        retriever: Runnable retriever that returns documents for a question.
        llm: Optional chat model to use. When omitted, a ChatGroq model
            ("llama-3.3-70b-versatile", temperature 0.4, max 500 tokens) is created.

    Returns:
        The composed chain; call ``.invoke(question)`` on it to get the model's reply.
    """
    if llm is None:
        llm = ChatGroq(
            model="llama-3.3-70b-versatile",
            temperature=0.4,
            max_tokens=500
        )

    def format_page(page):
        # Retrieved metadata carries the page number as a float (e.g. 61.0);
        # show whole numbers as integers so the context reads "Page: 61".
        if isinstance(page, float) and page.is_integer():
            return int(page)
        return page

    def format_docs(docs):
        # include source/page for traceability
        return "\n\n".join(
            f"Source: {d.metadata.get('source')} | Page: {format_page(d.metadata.get('page'))}\n{d.page_content}"
            for d in docs
        )

    rag_chain = (
        {
            "context": retriever | format_docs,
            "question": RunnablePassthrough()
        }
        | PROMPT
        | llm
    )

    return rag_chain


In [33]:
rag_chain = build_groq_rag_chain(retriever)

resp = rag_chain.invoke("What is RNA?")


In [34]:
resp.content

'RNA (Ribonucleic acid) is a single-stranded molecule that differs from DNA in three main ways: it contains the sugar residue ribose, it contains uracil (U) instead of thymine (T), and it is single-stranded. RNA plays a crucial role in protein synthesis, with different types of RNA, such as messenger RNA (mRNA) and transfer RNA (tRNA), involved in various steps of the process. RNA molecules can have different functions and encode different proteins through a process called alternative splicing.'