## RAG Project : Medical ChatBot

### Change path to project directory

In [1]:
%pwd

'/home/reki/Documents/medical-chatbot/research'

In [2]:
import os

# Move from research/ up to the project root so relative paths such as "data"
# resolve. Guarded on the current directory name so that re-running this cell
# does not climb one level higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

### 1. Load Documents

In [3]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Extract text from PDF files
def load_pdf_files(data):
    """
    Load every file matching "*.pdf" in the directory `data`.

    A DirectoryLoader is configured to read each matching file with
    PyPDFLoader; the result of its `load()` call is returned unchanged.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

> Display document content and metadata

In [5]:
# Load the PDFs found in the "data" directory and report how many entries came back
extracted_data = load_pdf_files("data")
page_total = len(extracted_data)
print("Document page :", page_total)

Document page : 637


In [6]:
import pandas as pd

# Map each table column to the metadata key it is read from
metadata_columns = {
    "Source": "source",
    "producer": "producer",
    "creator": "creator",
    "creationdate": "creationdate",
}
data = {
    column: [doc.metadata.get(key, "") for doc in extracted_data]
    for column, key in metadata_columns.items()
}
data["Page Content"] = [doc.page_content for doc in extracted_data]

# One row per entry in extracted_data; display the first five
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Source,producer,creator,creationdate,Page Content
0,data/Medical_book.pdf,PDFlib+PDI 5.0.0 (SunOS),PyPDF,2004-12-18T17:00:02-05:00,
1,data/Medical_book.pdf,PDFlib+PDI 5.0.0 (SunOS),PyPDF,2004-12-18T17:00:02-05:00,The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND ED...
2,data/Medical_book.pdf,PDFlib+PDI 5.0.0 (SunOS),PyPDF,2004-12-18T17:00:02-05:00,The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND ED...
3,data/Medical_book.pdf,PDFlib+PDI 5.0.0 (SunOS),PyPDF,2004-12-18T17:00:02-05:00,"STAFF\nJacqueline L. Longe, Project Editor\nDe..."
4,data/Medical_book.pdf,PDFlib+PDI 5.0.0 (SunOS),PyPDF,2004-12-18T17:00:02-05:00,Introduction.....................................


In [7]:
# Preview the first 1000 characters of the entry at index 3
page_preview = extracted_data[3].page_content[:1000]
print("Document page content:\n", page_preview)

Document page content:
 STAFF
Jacqueline L. Longe, Project Editor
Deirdre S. Blanchfield, Associate Editor
Christine B. Jeryan, Managing Editor
Donna Olendorf, Senior Editor
Stacey Blachford, Associate Editor
Kate Kretschmann, Melissa C. McDade, Ryan
Thomason, Assistant Editors
Mark Springer, Technical Specialist
Andrea Lopeman, Programmer/Analyst
Barbara J. Yarrow,Manager, Imaging and Multimedia
Content
Robyn V . Young,Project Manager, Imaging and
Multimedia Content
Dean Dauphinais, Senior Editor, Imaging and
Multimedia Content
Kelly A. Quin, Editor, Imaging and Multimedia Content
Leitha Etheridge-Sims, Mary K. Grimes, Dave Oblender,
Image Catalogers
Pamela A. Reed, Imaging Coordinator
Randy Bassett, Imaging Supervisor
Robert Duncan, Senior Imaging Specialist
Dan Newell, Imaging Specialist
Christine O’Bryan,Graphic Specialist
Maria Franklin, Permissions Manager
Margaret A. Chamberlain, Permissions Specialist
Michelle DiMercurio, Senior Art Director
Mike Logusz, Graphic Artist
Mary Bet

> Load minimal document elements for embedding

In [8]:
from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """
    Build a stripped-down copy of each Document in `docs`.

    Every returned Document keeps the original page_content and carries a
    metadata dict with a single 'source' key (None when the input metadata
    has no 'source' entry). New Document objects are created; the inputs are
    only read.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [9]:
# Keep only page_content and the 'source' metadata entry for each loaded document
minimal_docs = filter_to_minimal_docs(extracted_data)

In [10]:
import pandas as pd

# Same preview table, now built from the metadata-trimmed documents
source_column = [doc.metadata.get("source", "") for doc in minimal_docs]
content_column = [doc.page_content for doc in minimal_docs]
data = {"Source": source_column, "Page Content": content_column}

# One row per entry in minimal_docs; display the first five
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Source,Page Content
0,data/Medical_book.pdf,
1,data/Medical_book.pdf,The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND ED...
2,data/Medical_book.pdf,The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND ED...
3,data/Medical_book.pdf,"STAFF\nJacqueline L. Longe, Project Editor\nDe..."
4,data/Medical_book.pdf,Introduction.....................................


### 2. Split document into Chunks

In [11]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """
    Split documents into smaller chunks with RecursiveCharacterTextSplitter.

    Args:
        minimal_docs: Documents to split; passed to `split_documents`.
        chunk_size: Forwarded as the splitter's `chunk_size`. Defaults to
            500, the value that was previously hard-coded.
        chunk_overlap: Forwarded as the splitter's `chunk_overlap`. Defaults
            to 20, the value that was previously hard-coded.

    Returns:
        Whatever `split_documents` returns for `minimal_docs`.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [12]:
import random

def count_words(text):
    """Return how many whitespace-separated tokens `text` contains."""
    words = text.split()
    return len(words)
texts_chunk = text_split(minimal_docs)
# Select a random chunk index.
# randrange(n) draws from 0..n-1, so every chunk (including the first) can be
# picked and the index is always valid. random.randint(1, n) is inclusive on
# both ends: it could return n (IndexError) and never returned 0.
random_chunk = random.randrange(len(texts_chunk))
# Inspect the selected chunk: word count, character count and content
sample_text = texts_chunk[random_chunk].page_content
word_count = count_words(sample_text)
character_count = len(sample_text)
print(f"Number of chunks: {len(texts_chunk)}")
print(f"Random Numbern chunks Selected : {random_chunk}")
print("Random Chunk Word count:", word_count)
print("Random Chunk Character count:", character_count)
print("Random Chunk Content:\n\n", texts_chunk[random_chunk].page_content)



Number of chunks: 5859
Random Numbern chunks Selected : 4265
Random Chunk Word count: 69
Random Chunk Character count: 447
Random Chunk Content:

 disease is limited to a small area of the Andes Mountains
in western South America; nearly all cases have been in
Peru, Colombia, and Ecuador. A large outbreak involving
thousands of people occurred in 1940–41, but bartonel-
losis has since occurred sporadically. Control of sand-
flies, the only known disease carrier (vector), has been
credited with managing the disease.
Causes and symptoms
Bartonellosis is transmitted by the nocturnal sandfly


### 3. Text Embedding

In [13]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """
    Create the HuggingFace embeddings object used by this notebook.

    Returns a HuggingFaceEmbeddings instance configured with the model name
    "sentence-transformers/all-MiniLM-L6-v2".
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Instantiate once; reused for the sample embedding and the Pinecone vector store below
embedding = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [14]:
# Embed the sampled chunk and show the vector length followed by the vector itself
vector = embedding.embed_query(sample_text)
embedding_size = len(vector)
print("Embedding size : ", embedding_size)
print("Embedding vector : ", vector)

Embedding size :  384
Embedding vector :  [0.06012215092778206, -0.01892496645450592, -0.002912055002525449, 0.055078666657209396, 0.060785096138715744, -0.032075341790914536, 0.03570769354701042, -0.02875601500272751, -0.04718735069036484, 0.07564855366945267, 0.08060812205076218, -0.003895913017913699, 0.004809449892491102, 0.08964019268751144, -0.04438679292798042, 0.028742648661136627, -0.060592878609895706, -0.09547184407711029, 0.0721321702003479, 0.06929626315832138, -0.015975212678313255, 0.10873381048440933, -0.0020235429983586073, 0.021238096058368683, -0.12038499861955643, -0.023920513689517975, 0.027192866429686546, -0.02826279029250145, -0.03883209079504013, -0.058319319039583206, -0.06295912712812424, 0.0715273842215538, -0.03102743998169899, -0.024688255041837692, -0.020313717424869537, -0.03470931202173233, -0.011097008362412453, -0.0509861521422863, 0.007796239107847214, 0.04377606511116028, -0.018627390265464783, -0.052076976746320724, 0.06433876603841782, -0.03081890

### 4. Vector Database

In [15]:
# Access environment variables
from dotenv import load_dotenv
import os
# Load variables from a local .env file (if present) into the environment
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# os.environ only accepts strings: assigning None raises an opaque TypeError,
# so fail early with a message that says what is actually missing.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set. Define it in the environment or in a .env file."
    )
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

In [17]:
# Check Pinecone API connection
from pinecone import Pinecone 
# Build the Pinecone client from the API key loaded earlier
pc = Pinecone(api_key=PINECONE_API_KEY)
# Bare expression: the notebook displays the client's repr as the cell output
pc

<pinecone.pinecone.Pinecone at 0x73bbfd06efe0>

In [18]:
# Create Pinecone index or connect to existing one 
from pinecone import ServerlessSpec
index_name = "medi-bot"

# Create the index only when it does not exist yet, so re-running is harmless
index_exists = pc.has_index(index_name)
if not index_exists:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the embeddings
        metric="cosine",  # Cosine similarity
        spec=serverless_spec,
    )

# Handle to the index
index = pc.Index(index_name)

In [19]:
# Check that the Pinecone API answers and show the index description
try:
    # Per the recorded output, describe_index returns the index configuration
    # and status (dimension, host, metric, spec, status, ...), not vector counts.
    stats = pc.describe_index(index_name)
    print("Pinecone API connection successful.")
    print("Index stats:", stats)
except Exception as e:
    # Broad catch: any failure is printed rather than stopping the notebook
    print("Pinecone API connection failed:", e)

Pinecone API connection successful.
Index stats: {'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'medi-bot-8g8vd2x.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medi-bot',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'},
 'tags': None,
 'vector_type': 'dense'}


In [None]:
# Create Pinecone vector store from documents
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the text chunks, using `embedding` and the
# index named by `index_name`.
# NOTE(review): re-running this cell presumably upserts the same chunks again;
# confirm whether that creates duplicate vectors before including it in Run All.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

### 5. Retrieve data from Knowledge Base

In [20]:
# Load Existing index 

from langchain_pinecone import PineconeVectorStore
# Attach to the index that already exists instead of rebuilding it from the
# documents; `embedding` is supplied so the store can embed text passed to it.
# NOTE(review): presumably no documents are upserted by this call; confirm.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

In [21]:
# Similarity-search retriever configured with k=3
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)

In [22]:
# Sanity-check retrieval with a sample question before wiring in the LLM
retrieved_docs = retriever.invoke("What is Acne?")
# Bare expression: display the retrieved Document objects as the cell output
retrieved_docs

[Document(id='0c6b6a02-dc4c-49d6-84a9-f44adf7b031d', metadata={'source': 'data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='a089d50c-b605-4e50-a810-591a3feeade9', metadata={'source': 'data/Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='3f8e0a96-78e2-49f9-9995-a325cbcc68a1', metadata={'source': 'data/Medical_book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with 

### 6. Retrieve Knowledge Intelligently with an LLM

In [None]:
# Use Groq LLM Provider
import os
# Read the Groq key from the environment (None when it is not set)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Re-export only when a key is present: os.environ rejects None with a
# TypeError, which would stop the notebook before the getpass fallback in the
# following cell gets a chance to prompt for the key.
if GROQ_API_KEY:
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [None]:
import getpass
import os
# Fall back to a hidden interactive prompt when the key is absent or empty
groq_key_present = bool(os.environ.get("GROQ_API_KEY"))
if not groq_key_present:
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

# Get LLM model
from langchain.chat_models import init_chat_model
model = init_chat_model("llama-3.3-70b-versatile", model_provider="groq")

In [26]:
# Smoke test: send one message to the chat model and display the returned message
model.invoke("Hello, world!")

AIMessage(content="Hello! It's nice to meet you. Is there something I can help you with or would you like to chat?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 25, 'prompt_tokens': 39, 'total_tokens': 64, 'completion_time': 0.034802591, 'prompt_time': 0.011298004, 'queue_time': 0.084394172, 'total_time': 0.046100595}, 'model_name': 'llama-3.3-70b-versatile', 'system_fingerprint': 'fp_3f3b593e33', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None}, id='run--4dc5d7da-71e6-44ba-87eb-7aceea5ed364-0', usage_metadata={'input_tokens': 39, 'output_tokens': 25, 'total_tokens': 64})

In [27]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [31]:
# System message for the RAG chain. Adjacent string literals are concatenated
# verbatim, so every sentence fragment must carry its own trailing space;
# without it two sentences run together (previously "concise.Check").
# "{context}" is a template variable left for the chain to fill in.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise. "
    "Check question's language and answer in the same language."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions, then the user's question
system_message = ("system", system_prompt)
human_message = ("human", "{input}")
prompt = ChatPromptTemplate.from_messages([system_message, human_message])

In [32]:
# Chain that answers from documents using the chat model and the prompt above
# (presumably the documents are placed into the prompt's {context}; confirm)
question_answer_chain = create_stuff_documents_chain(model, prompt)
# Full RAG chain: the retriever is combined with the question-answering chain
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [33]:
# Ask an English question and print only the generated answer
question = "what is Acromegaly and gigantism?"
response = rag_chain.invoke({"input": question})
print(response["answer"])


Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue. Gigantism occurs when this abnormality happens before bone growth stops, resulting in unusual height. Acromegaly occurs when the abnormality happens after bone growth stops, causing a range of other disturbances throughout the body.


In [34]:
# Ask the same question in French to check that the answer follows the question's language
question = "c'est quoi l'acromégalie et le gigantisme ?"
response = rag_chain.invoke({"input": question})
print(response["answer"])

L'acromégalie est un trouble caractérisé par une croissance anormale des os et des tissus mous en raison d'une libération excessive d'hormone de croissance par la glande pituitaire. Le gigantisme est une condition similaire qui se produit chez les enfants et les adolescents avant la fermeture des plaques de croissance. Les deux conditions entraînent des troubles variés dans l'organisme.
