In [20]:
import os

# Move from the notebook's own folder up to the project root so that relative
# paths such as 'Data/' resolve. The move is guarded: an unconditional
# os.chdir("../") climbs one directory higher every time the cell is re-run,
# which silently breaks every relative path used afterwards.
if not os.path.isdir("Data"):
    os.chdir("../")

# Show the resulting working directory as the cell output.
os.getcwd()

'd:\\MY PROJECTS\\LLM\\Medical-Diagnosis-Chatbot'

In [21]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [22]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files are selected with the
            glob pattern ``*.pdf`` and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [23]:
# Load the PDFs from the Data/ directory (relative to the current working directory).
extracted_data = load_pdf_file(data='Data/')

In [None]:
#extracted_data

In [24]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [25]:
# Chunk the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print("Length of text chunks: ", len(text_chunks))

Length of text chunks:  8706


In [26]:
#IMPORT
from langchain_huggingface import HuggingFaceEmbeddings


In [27]:

def download_higghing_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Build the HuggingFace embedding model used by this notebook.

    The function name contains a typo ("higghing"); it is kept as-is so that
    existing callers continue to work.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model that used to be hard-coded. In this
            notebook that model yields vectors of length 384 and the Pinecone
            index is created with ``dimension=384``, so a different model
            needs an index with a matching dimension.

    Returns:
        The configured ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [28]:
# Instantiate the embedding model once; later cells reuse this object.
embeddings = download_higghing_face_embeddings()

In [35]:
# Sanity check: embed a short string and print the vector length.
# The Pinecone index further down is created with dimension=384.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [51]:
from dotenv import load_dotenv
import os
# Read key/value pairs from a .env file into the process environment so the
# API keys can be looked up through os.environ in the following cells.
load_dotenv()

True

In [52]:
# Pull the service credentials from the environment; each is None when unset.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [38]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-index"
existing_indexes = pc.list_indexes().names()

# Create the serverless index only when it is missing; otherwise just report it.
if index_name in existing_indexes:
    print(f"Index '{index_name}' already exists.")
else:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # same length as the vector printed by the embed_query check
        metric="cosine",
        spec=serverless_spec,
    )


Index 'medical-index' already exists.


In [53]:
import os

# Re-export the keys so libraries that read credentials from the environment
# can find them. A key that was never configured is None here, and assigning
# None to os.environ raises an opaque "str expected, not NoneType" TypeError,
# so unset keys are skipped and reported instead of crashing the cell.
for _env_name, _env_value in (
    ("PINECONE_API_KEY", PINECONE_API_KEY),
    ("OPENAI_API_KEY", OPENAI_API_KEY),
):
    if _env_value is None:
        print(f"Warning: {_env_name} is not set; add it to your .env file.")
    else:
        os.environ[_env_name] = _env_value

In [42]:
# Embed each chunk of text and store it in the Pinecone index named by index_name.
from langchain_pinecone import PineconeVectorStore

# NOTE(review): this runs for all of text_chunks every time the cell is
# executed. If the index already holds these chunks, re-running presumably
# stores them again — confirm, or skip this cell and use the
# from_existing_index cell instead.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
)



In [43]:
# Connect to the existing index

from langchain_pinecone import PineconeVectorStore

# Build the vector store on top of the index that already exists. No documents
# are passed in this call; `embeddings` is supplied alongside the index name
# (presumably so that queries can be embedded — confirm).
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)


In [44]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1fa5dd2eff0>

In [45]:
# Expose the vector store as a retriever: similarity search, k=3 results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [None]:
# Smoke-test the retriever with a sample question.
retrieved_docs = retriever.invoke("What is the best way to treat a headache?")

In [47]:
# Inspect the documents returned for the sample question.
retrieved_docs

[Document(id='ec1eacb9-f83f-4225-bf52-6c2f5b78262c', metadata={'creationdate': '2004-12-18T17:52:16-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T18:38:56-06:00', 'page': 169.0, 'page_label': '170', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data\\Medical_book.pdf', 'total_pages': 940.0}, page_content='Other prescription drugs are used to treat specific\ntypes of pain or specific pain syndromes. For example,\ncorticosteroidsare very effective against pain caused by\ninflammation and swelling, and sumatriptan (Imitrex)\nwas developed to treat migraine headaches.\nDrug administration depends on the drug type and\nthe required dose. Some drugs are not absorbed very well\nfrom the stomach and must be injected or administered\nintravenously. Injections and intravenous administration'),
 Document(id='38416bf6-4e08-4c79-8045-0f5002701d67', metadata={'creationdate': '2004-12-18T17:52:16-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T18:38:56-06:00', 'page': 762.0, 'page_label'

In [66]:
import os

from langchain_openai import ChatOpenAI

# SECURITY: an OpenRouter API key used to be hard-coded in this cell. Treat that
# key as compromised and revoke it. The key is now read from the environment,
# e.g. from an OPENROUTER_API_KEY entry in the .env file loaded by load_dotenv().
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")
if not openrouter_api_key:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your .env file or environment "
        "before creating the chat model."
    )

# ChatOpenAI is pointed at OpenRouter's endpoint through base_url.
llm = ChatOpenAI(
    model="arliai/qwq-32b-arliai-rpr-v1:free",
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key,
    temperature=0.4,
    max_tokens=500,
)

In [67]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# Adjacent string literals are joined verbatim, so every sentence carries its
# own trailing space; without it the model receives "tasks.Use the ...".
# NOTE(review): the earlier text asked for both "a single sentence" and "three
# sentences at most". Only the three-sentence limit is kept here; tighten it
# if one-sentence answers are what you want.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following piece of retrieved context to answer the question. "
    "If you don't know the answer, say 'I don't know'. "
    "Use three sentences at most and keep the answer concise. "
    "Make sure to provide relevant information based on the context."
    "\n\n"
    "Context: {context}"
)

# The system message carries the instructions (with the {context} slot); the
# human message carries the user's question through the {input} placeholder.
prompt_messages = [("system", system_prompt), ("human", "{input}")]
prompt = ChatPromptTemplate.from_messages(prompt_messages)


In [68]:
# Chain that sends the prompt (with documents supplied to it) to the LLM.
question_answering_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: the retriever supplies documents, then the
# question-answering chain produces the answer.
rag_chain = create_retrieval_chain(
    retriever,
    question_answering_chain,
)

In [80]:
# Ask a sample question; the user's text goes in under the "input" key,
# matching the {input} placeholder of the prompt.
response = rag_chain.invoke({
    "input": "Best medicine for skin rashes"
})

# The generated answer is read from the "answer" key of the response.
print(response["answer"])


The most effective treatments for skin rashes include cool baths with cornstarch and oatmeal, antipruritic products containing ingredients like aloe, menthol, and camphor, and topical steroids for severe cases.
