In [2]:
%pwd

'd:\\MediBot-Medical_Chatbot\\research'

In [3]:
import os
os.chdir("../")

In [4]:
%pwd

'd:\\MediBot-Medical_Chatbot'

In [17]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [18]:
# Extract data from the pdf file
def load_pdf_file(data):
    loader= DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

In [21]:
extracted_data = load_pdf_file(data='Data/')

In [23]:
len(extracted_data)

637

In [24]:
# split te data into text chunks
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [25]:
text_chunks=text_split(extracted_data)
print('Length of Text Chunks', len(text_chunks))

Length of Text Chunks 5860


In [29]:
from langchain_huggingface import HuggingFaceEmbeddings

In [32]:
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [35]:
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [36]:
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [39]:
from langchain_community.vectorstores import Pinecone

In [43]:
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(
        api_key="PINECONE_API_KEY"
    )

In [53]:
from dotenv import load_dotenv
import os

load_dotenv()



True

In [44]:
index_name = 'medical'

In [54]:
from langchain_pinecone import PineconeVectorStore 
vectorstore_from_texts = PineconeVectorStore.from_texts(
        [t.page_content for t in text_chunks],
        index_name=index_name,
        embedding=embeddings
    )

In [55]:
retriever = vectorstore_from_texts.as_retriever(search_type='similarity', seach_kwargs={"k":3})

In [57]:
retriever_docs = retriever.invoke("What is Acne")

In [58]:
retriever_docs

[Document(id='a54c8047-75b0-4c2e-a97a-8ce0d3eed103', metadata={}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='7269970b-fca5-4423-991c-cf9d3fb9d9bb', metadata={}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells, and bacteria.\nDescription\nAcne vulgaris, the medical term for common acne, is\nthe most common skin disease. It affects nearly 17 million\npeople in the United States. While acne can arise at any'),
 Document(id='3ab86954-31a3-431e-bee2-fcfdd1e550e3', metadata={}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a womanâ€™s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Bi

In [62]:
from langchain_ollama import ChatOllama

llm = ChatOllama(
    model="gemma:2b",
    temperature=0,
    # other params...
)

In [78]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

system_prompt = """
You must decide FIRST whether the answer exists in the context.

Rules:
- If the answer is NOT fully present in the context, reply ONLY with: "I don't know."
- If the answer IS present, answer using ONLY the context.
- Do NOT add explanations, apologies, or extra text.
- Do NOT mix answers.

Context:
{context}

Question:
{input}
"""


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

rag_chain = (
    {
        "context": retriever,
        "input": RunnablePassthrough()
    }
    | prompt
    | llm
    | StrOutputParser()
)


In [79]:
response = rag_chain.invoke("What is Acromegaly and gigantism")

In [80]:
response

'Acromegaly and gigantism are disorders in which the abnormal release of a particular chemical from the pituitary gland in the brain causes increased growth in bone and soft tissue, as well as a variety of other disturbances throughout the body.'

In [68]:
def clean_answer(text: str) -> str:
    if "\n\n" in text:
        return text.split("\n\n", 1)[1].strip()
    return text.strip()


In [77]:
clean_answer(response)

'Acromegaly is a disorder in which the abnormal release of a particular chemical from the pituitary gland in the brain causes increased growth in bone and soft tissue, as well as a variety of other disturbances throughout the body.\n\nAcromegaly and gigantism are not mentioned in the context, so I cannot answer this question from the context.'