In [20]:
import os
os.chdir("../")
%pwd

'd:\\MY PROJECTS\\LLM\\Medical-Diagnosis-Chatbot'

In [128]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
#Extract data from the PDF File

def load_pdf_file(data):
    """Load the PDF files of a directory into documents.

    Args:
        data: Directory path handed to ``DirectoryLoader``; entries matching
            the ``*.pdf`` glob are loaded.

    Returns:
        The value returned by ``DirectoryLoader.load()``, with each matched
        file parsed by ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [130]:
# Load the PDFs from the Data/ folder (path is relative to the working directory).
extracted_data = load_pdf_file('Data/')

In [133]:
# Report how many documents the loader produced (avoid dumping the full list).
print(f"Number of documents extracted:  {len(extracted_data)}")

Number of documents extracted:  6356


In [134]:
#Split the data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller text chunks.

    Args:
        extracted_data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. Defaults to
            500, the value that used to be hard-coded here.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 20, the value that used to be hard-coded here.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

In [135]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(f"Length of text chunks:  {len(text_chunks)}")

Length of text chunks:  55056


In [136]:
#IMPORT
from langchain_huggingface import HuggingFaceEmbeddings


In [137]:

def download_higghing_face_embeddings():
    """Build the HuggingFace embedding model used by this notebook.

    The (misspelled) function name is kept as-is so existing callers keep
    working.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [138]:
# Instantiate the embedding model once; the cells below reuse `embeddings`.
embeddings = download_higghing_face_embeddings()

In [139]:
# Sanity check: embed a short string and report the length of the resulting vector.
query_result = embeddings.embed_query("Hello world")
print(f"Length {len(query_result)}")

Length 384


In [145]:
from dotenv import load_dotenv
import os
# Load variables from a .env file into the environment (python-dotenv);
# the keys are read back with os.environ.get in the next cell.
load_dotenv()

True

In [146]:
# Read the service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [142]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec



# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-index"
existing_indexes = pc.list_indexes().names()

# Only create the index when it is missing, so re-running this cell is harmless.
if index_name in existing_indexes:
    print(f"Index '{index_name}' already exists.")
else:
    pc.create_index(
        name=index_name,
        dimension=384,  # same as the embedding length reported by the sanity check (384)
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


Index 'medical-index' already exists.


In [143]:
import os

# Re-export the keys for libraries that read them from the environment.
# os.environ only accepts strings: assigning None (a key that is missing from
# the environment/.env) raises TypeError, so only export values that are set.
if PINECONE_API_KEY is not None:
    os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [144]:
#Embed each chunk of text and store it in Pinecone
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and store the result in the Pinecone index.
# NOTE(review): presumably re-running this cell uploads all chunks again — confirm
# before re-executing against a populated index.
docsearch = PineconeVectorStore.from_documents(
    text_chunks,
    embeddings,
    index_name=index_name,
)



In [152]:
#existing index

from langchain_pinecone import PineconeVectorStore

# Connect to the existing Pinecone index by name instead of embedding and
# uploading the chunks again; `embeddings` is passed alongside the index name
# (presumably so queries are embedded with the same model — confirm).
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)


In [153]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1fa4bd38950>

In [154]:
# Similarity-search retriever over the vector store; k=3 is forwarded as a search keyword argument.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs=dict(k=3))

In [None]:
import os

from langchain_openai import ChatOpenAI

# SECURITY: the OpenRouter key used to be hard-coded in this cell. A key that
# has been committed or shared in a notebook must be treated as leaked: revoke
# it and issue a new one. The key is now read from the environment (e.g. the
# .env file loaded by load_dotenv) so it never appears in the notebook.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set; add it to your .env file or environment."
    )

# Chat model served through OpenRouter's OpenAI-compatible endpoint.
llm = ChatOpenAI(
    model="google/gemma-3-1b-it:free",
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
    temperature=0.4,
    max_tokens=500,
)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System prompt for the question-answering chain.
# Adjacent string literals are concatenated with no separator, so every
# sentence carries its own trailing space; without it the model receives
# run-together text such as "tasks.Use". The answer-length instruction is
# stated once (three sentences at most) instead of two conflicting limits.
# "{context}" is the placeholder filled with the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say 'I don't know'. "
    "Use three sentences at most and keep the answer concise. "
    "Make sure to provide relevant information based on the context."
    "\n\n"
    "Context: {context}"
)

# Two-message chat template: the system instructions, then the user's question
# substituted for the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])


In [162]:
# Chain that answers from the supplied documents using `llm` and `prompt`.
question_answering_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: the retriever feeds the question-answering chain.
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [None]:
# Ask a sample question and show only the generated answer text.
response = rag_chain.invoke({"input": "Best medicine for headache?"})

print(response["answer"])





The best medicine for headache depends on the type of headache and its severity. For tension-type headaches, over-the-counter pain relievers like aspirin, acetaminophen, ibuprofen, or naproxen can be effective. For migraines, a combination of acetaminophen, aspirin, and caffeine, such as Exedrin Migraine, can help relieve symptoms. In more severe cases, drugs that act on serotonin receptors may be prescribed. It's important to consult with a healthcare professional for personalized advice and treatment options.
