In [38]:
import os
import sys
# Put the parent of the current working directory at the front of sys.path.
# NOTE(review): presumably this is what lets `from utils import get_settings`
# below resolve to a project-level module — confirm the notebook is always
# started from a direct subfolder of the project root.
sys.path.insert(0, os.path.dirname(os.path.abspath(os.getcwd())))
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import ChatGoogleGenerativeAI  # NOTE(review): exact duplicate of the previous line; redundant
from langchain_core.prompts import ChatPromptTemplate
from typing import List
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.runnables import RunnableParallel
from pinecone import Pinecone, ServerlessSpec
from utils import get_settings
import warnings
# Globally ignore every warning issued through the `warnings` module from this
# point on; remove this line when debugging library deprecations.
warnings.filterwarnings("ignore")

In [2]:
# Project root is taken to be the parent of the kernel's working directory;
# the PDFs are expected in its "data" subfolder.
abs_path = os.path.split(os.path.abspath(os.getcwd()))[0]
data_path = os.path.join(abs_path, "data")

In [3]:
# Read the PDF files found in the data directory.
def load_pdf_file(data_directory):
    """Load the PDF files matched by ``*.pdf`` in ``data_directory``.

    A ``DirectoryLoader`` is pointed at the directory with ``PyPDFLoader`` as
    the per-file loader class, and its ``load()`` result is returned as is.

    Args:
        data_directory: Path of the directory to scan.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for that directory.
    """
    pdf_loader = DirectoryLoader(data_directory, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [4]:
# Load the PDFs from the data directory; later cells work from this list.
extracted_documents = load_pdf_file(data_path)

In [5]:
# Sanity check: how many items the loader returned.
len(extracted_documents)

637

In [6]:
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Reduce each document to its text plus its ``source`` metadata entry.

    Args:
        docs: Documents to reduce. They are read, not modified.

    Returns:
        A new list, in the same order as ``docs``, of ``Document`` objects
        whose ``page_content`` is taken from the input and whose metadata is
        ``{'source': <value>}``. The value is ``None`` when the input
        document's metadata has no ``'source'`` key.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [7]:
# Keep only the page text and the 'source' metadata entry of each loaded document.
minimal_docs = filter_to_minimal_docs(extracted_documents)

In [8]:
# Break the documents into smaller, overlapping chunks.
def split_documents(docs: List[Document], chunk_size: int = 500, chunk_overlap: int = 100) -> List[Document]:
    """Split ``docs`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        docs: Documents to split.
        chunk_size: Passed to the splitter as ``chunk_size`` (default 500).
        chunk_overlap: Passed to the splitter as ``chunk_overlap`` (default 100).

    Returns:
        The list returned by the splitter's ``split_documents(docs)``.
    """
    # Lengths are measured with the built-in len; the separator list is fixed.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "separators": ["\n\n", "\n", " ", ""],
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

In [9]:
# Chunk with the function defaults (chunk_size=500, chunk_overlap=100) and
# report how many chunks came out.
chunked_documents = split_documents(minimal_docs)
print(f"Number of chunked documents: {len(chunked_documents)}")

Number of chunked documents: 6600


In [10]:
def download_embeddings():
    """Create the HuggingFace embeddings object used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Build the embeddings object once; the dimension probe and the vector store
# cells below both reuse it.
embeddings_model = download_embeddings()

In [11]:
# Take the Pinecone API key from the object returned by get_settings() instead
# of writing it into the notebook.
pinecone_api_key = get_settings().PINECONE_API_KEY


In [12]:
# Pinecone client, authenticated with the API key only. No `environment`
# argument is given: the index this notebook uses is serverless and its
# cloud/region are chosen through ServerlessSpec when the index is created.
pc = Pinecone(api_key=pinecone_api_key)

In [13]:
# Find the vector size by embedding a throwaway string and taking the length
# of the result; this value is passed as `dimension` when the index is created.
sample_embedding = embeddings_model.embed_query("sample text")
embedding_dimension = len(sample_embedding)
print(f"Embedding dimension: {embedding_dimension}")

Embedding dimension: 384


In [14]:
index_name = "medical-chatbot"

# Reuse the index when its name is already listed for this account; otherwise
# create a serverless cosine index sized to the embedding dimension.
if index_name in pc.list_indexes().names():
    print(f"Index {index_name} already exists")
else:
    pc.create_index(
        name=index_name,
        dimension=embedding_dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Created index: {index_name}")

Created index: medical-chatbot


In [15]:
# Handle to the index named above, obtained from the Pinecone client.
index = pc.Index(index_name)

In [16]:
# Build the vector store without duplicating data on re-runs.
# from_documents() embeds every chunk and upserts it; running it again against
# an index that already holds the chunks would store each of them a second
# time. So: upload only when the index reports no vectors, otherwise just
# attach to what is already there.
# NOTE(review): an index left partially filled by an interrupted upload is
# treated as populated — delete and recreate the index in that case.
if not index.describe_index_stats().total_vector_count:
    docsearch = PineconeVectorStore.from_documents(
        documents=chunked_documents,
        embedding=embeddings_model,
        index_name=index_name,
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings_model,
    )

In [29]:
# Similarity-search retriever over the vector store; search_kwargs k=3 asks
# for three documents per query.
retriever  = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [30]:
# Smoke test: send one query through the retriever and display what it returns.
query = "what is acne"
retrieved_docs = retriever.invoke(query)
retrieved_docs

[Document(id='afb08e86-e221-47b4-aac6-e135d94dbc39', metadata={'source': 'c:\\Users\\omarj\\Documents\\AI\\NLP\\Project\\Medical-Chatbot\\data\\Medical_book.pdf'}, page_content='Journal of Urology (Mar. 1998): 935-940.\nNancy J. Nordenson\nAcid reflux see Heartburn\nAcidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million'),
 Document(id='32e70a08-01f1-4cc7-89f1-7bcafe0546bb', metadata={'source': 'c:\\Users\\omarj\\Documents\\AI\\NLP\\Project\\Medical-Chatbot\\data\\Medical_book.pdf'}, page_content='creams containing benzoyl peroxide or tretinoin may be\nused to clear up mild to moderately severe acne.\nIsotretinoin (Accutane) is prescribed onl

In [23]:
# Gemini chat model used to write the answers. The API key is read from the
# object returned by get_settings(); temperature is set to 0.
gemini_api_key = get_settings().GEMINI_API_KEY
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    google_api_key=gemini_api_key
)

In [26]:
# System message template. The adjacent string literals are concatenated into
# a single string; "{context}" is a placeholder that the chain fills with the
# text of the retrieved documents.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat prompt: the system instructions above, followed by a human
# message whose "{input}" placeholder receives the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [35]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line (``"\\n\\n"``). An empty input gives ``""``.
    """
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)

In [39]:
# Prompt piped into the chat model: expects a dict with "context" and "input"
# and yields the model's reply (its text is read via `.content` below).
question_answer_chain = prompt | chat_model   

# Full retrieval-augmented pipeline. Given a dict with an "input" key,
# RunnableParallel builds the dict the prompt needs:
#   "context": documents retrieved for the question, joined by format_docs
#   "input":   the question itself, passed through unchanged
# and that dict is then piped into question_answer_chain.
rag_chain = (
    RunnableParallel(
        {
            "context": lambda x: format_docs(retriever.invoke(x["input"])),
            "input": lambda x: x["input"]
        }
    )
    | question_answer_chain
)

In [40]:
# End-to-end check: ask a question through the full chain and print the reply text.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response.content)

Acromegaly is a disorder where an abnormal release of a chemical from the pituitary gland causes increased growth in bone and soft tissue, occurring after bone growth has stopped. Gigantism is a variant of this disorder that occurs in children whose bony growth plates have not closed, leading to exceptional growth of long bones and unusual height. Both conditions involve abnormal growth due to pituitary gland dysfunction.


In [41]:
# Second end-to-end check with a different question; `response` is overwritten.
response = rag_chain.invoke({"input": "what is acne?"})
print(response.content)

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores or hair follicles of the skin become clogged with oil, dead skin cells, and bacteria. This blockage allows sebum, a waxy material, to collect inside the pores or follicles.
