In [20]:
import os
# Step up from the notebook's folder to the project root, but only when we are
# not there already: a bare chdir("../") climbs one more level every time this
# cell is re-run, after which relative paths such as 'Data/' stop resolving.
# The root is recognised by the 'Data' folder that the PDF loader reads from.
if not os.path.isdir("Data"):
    os.chdir("../")
%pwd

'd:\\MY PROJECTS\\LLM\\Medical-Diagnosis-Chatbot'

In [128]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [129]:
#Extract data from the PDF File

def load_pdf_file(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; files are
            selected with the ``"*.pdf"`` glob.

    Returns:
        The result of ``DirectoryLoader.load()``, with every matched file
        read through ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [130]:
# Load the PDFs from the relative 'Data/' folder (resolved against the current
# working directory, so the chdir cell at the top must have run first).
extracted_data = load_pdf_file(data='Data/')

In [133]:
# Sanity check: report how many documents were loaded rather than dumping them all.
print("Number of documents extracted: ", len(extracted_data))

Number of documents extracted:  6356


In [134]:
#Split the data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
            500, the value this notebook originally hard-coded.
        chunk_overlap: Forwarded as the splitter's ``chunk_overlap``.
            Defaults to 20, the value this notebook originally hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [135]:
# Chunk the loaded documents and report the chunk count as a sanity check.
text_chunks = text_split(extracted_data)
print("Length of text chunks: ", len(text_chunks))

Length of text chunks:  55056


In [136]:
#IMPORT
from langchain_huggingface import HuggingFaceEmbeddings


In [137]:

def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used by this notebook.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.
            Defaults to the MiniLM model this notebook originally hard-coded.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


# Backward-compatible alias: the function was first published under this
# misspelled name, so existing cells that call it keep working.
download_higghing_face_embeddings = download_hugging_face_embeddings


In [138]:
# Build the embedding model once; the same object is reused for indexing and for querying.
embeddings = download_higghing_face_embeddings()

In [139]:
# Smoke-test the model. The vector length printed here (384 in the recorded
# output) has to match the `dimension` used when the Pinecone index is created.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [145]:
from dotenv import load_dotenv
import os
# Read the local .env file so the API keys become available through os.environ.
load_dotenv()

True

In [146]:
# Pull both API keys from the environment; each one is None when its variable is unset.
PINECONE_API_KEY, OPENAI_API_KEY = (
    os.getenv(name) for name in ("PINECONE_API_KEY", "OPENAI_API_KEY")
)

In [142]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec



# Client for the Pinecone control plane.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medical-index"
existing_indexes = pc.list_indexes().names()

# Create the index only on first use so that re-running the cell is harmless.
if index_name in existing_indexes:
    print(f"Index '{index_name}' already exists.")
else:
    pc.create_index(
        name=index_name,
        # Has to equal the embedding length (the embed_query check reports 384).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


Index 'medical-index' already exists.


In [143]:
import os
# os.environ only accepts strings: assigning a key that was never loaded (None)
# fails with an opaque "str expected, not NoneType". Check first and name the
# missing variables so the fix is obvious.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("OPENAI_API_KEY", OPENAI_API_KEY),
    )
    if value is None
]
if _missing_keys:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Add them to your environment or .env file."
    )

# Re-export the keys for libraries that read them from the process environment.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [144]:
#Embed each chunk of text and store it in Pinecone
import hashlib

from langchain_pinecone import PineconeVectorStore

# Derive each vector id from the chunk itself (source file, page and text).
# Without explicit ids every run stores the chunks under fresh random ids, so
# re-running this cell would duplicate the whole corpus in the index; with
# stable ids a re-run overwrites the same vectors instead.
chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source')}|{chunk.metadata.get('page')}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for chunk in text_chunks
]

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    embedding=embeddings,
    index_name=index_name,
    ids=chunk_ids,
)



In [152]:
#existing index

from langchain_pinecone import PineconeVectorStore

# Connect to the index that was populated earlier. Nothing is embedded or
# uploaded by this call; `embeddings` is passed so the store can encode queries.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)


In [153]:
# Display the vector store object to confirm it was created.
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1fa4bd38950>

In [154]:
# Similarity-search retriever; "k": 3 asks for three results per query.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [155]:
from langchain_openai import ChatOpenAI

# SECURITY: this cell used to embed a live OpenRouter key in plain text. Treat
# that key as leaked (revoke/rotate it) and keep its replacement out of the
# notebook: export OPENROUTER_API_KEY or add it to the .env file read by
# load_dotenv().
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Add it to your environment or .env file "
        "before running this cell."
    )

# ChatOpenAI is pointed at OpenRouter's endpoint through base_url.
llm = ChatOpenAI(
    model="google/gemini-2.5-pro-preview-03-25",
    base_url="https://openrouter.ai/api/v1",
    api_key=OPENROUTER_API_KEY,
    temperature=0.4,
    max_tokens=500,
)

In [156]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System message for the RAG chain; {context} is filled with the retrieved chunks.
# Adjacent string literals are concatenated with nothing in between, so every
# sentence carries its own trailing space. The earlier "answer in a single
# sentence" line contradicted the three-sentence limit and was dropped in
# favour of the limit, which a one-sentence answer also satisfies.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say 'I don't know'. "
    "Use three sentences at most and keep the answer concise. "
    "Make sure to provide relevant information based on the context."
    "\n\n"
    "Context: {context}"
)

# Two-message chat template: the system instructions (with their {context}
# placeholder) followed by the user's question in {input}.
prompt = ChatPromptTemplate.from_messages(
    [
    ("system", system_prompt),
    ("human", "{input}")
    ]
)


In [157]:
# Build the answering chain from the LLM and the prompt, then pair it with the
# retriever to form the full retrieval-augmented pipeline.
question_answering_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(
    retriever,
    question_answering_chain,
)

In [159]:
# Ask one sample question end to end. The printed dict echoes the 'input' and
# the retrieved 'context' documents (see the recorded output).
# NOTE(review): the model's reply is presumably under response["answer"] — confirm,
# and consider printing only that, since the full dict is very long.
response = rag_chain.invoke({
    "input": "Best medicine for skin rashes"
})

print(response)



{'input': 'Best medicine for skin rashes', 'context': [Document(id='63f4062e-02f9-448d-baf1-fd519a3f185c', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 2122.0, 'page_label': '2093', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': 'Data\\Medical_Book_one.pdf', 'total_pages': 4505.0}, page_content='use the anti-fungal for a full two-week course in order to\nprevent recurrence of the infection.\nAlternative treatment\nTopical treatments include poultices of pepper-\nmint, oregano, or lavender. Tea tree oil, diluted with\na carrier oil of almond oil, can be applied to the rash\nseveral times per day. Cedarwood and jasmine oils can\nrelieve itching when applied in the same manner.\nGrapefruit seed extract can be taken as a strong solu-\ntion of 15 drops in 1 oz of water.'), Document(id='d3988544-aefe-4362-b79f-5bcfd7951b51', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acr