In [1]:
print("Hello")

Hello


In [2]:
%pwd

'/home/vicron/Desktop/healthcare_chat1/healthcare_chatbot/research'

In [3]:
import os
os.chdir("../")

In [4]:
%pwd

'/home/vicron/Desktop/healthcare_chat1/healthcare_chatbot'

In [5]:
%pip install -U langchain-huggingface sentence-transformers

Collecting langchain-huggingface
  Using cached langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Using cached langchain_huggingface-0.3.0-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
#imports
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

In [7]:
#Check data

data_dir = "Data/"
if not os.path.exists(data_dir):
    raise FileNotFoundError(f"Directory '{data_dir}' does not exist. Please create it and add PDF files.")

In [8]:
#Load Data

def load_pdf(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [9]:
extracted_data = load_pdf(data=data_dir)

In [11]:
#extracted_data

In [12]:
def split_text(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [13]:
text_chunks =  split_text(extracted_data)
print(f"Length of text chunks : {len(text_chunks)}")

Length of text chunks : 5859


In [14]:
#downloading  Huggingface embeddings
def download_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    return embeddings

In [15]:
embeddings = download_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
query_result = embeddings.embed_query("Hello Victor")
print ("length", len(query_result))

length 384


In [18]:
#query_result

In [41]:
from dotenv import load_dotenv
load_dotenv()

True

In [42]:
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY')

In [22]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "healthcare-chatbot1"

pc.create_index(
    name=index_name,
    dimension=384,  
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

{
    "name": "healthcare-chatbot1",
    "metric": "cosine",
    "host": "healthcare-chatbot1-ccj89ta.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [50]:
import os
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = DEEPSEEK_API_KEY

In [51]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,  
)

In [52]:
#Load Existing Index
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name = index_name,
    embedding = embeddings
)


In [35]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7ff811b13a90>

In [27]:
retriever = docsearch.as_retriever(search_type="similarity",search_kwargs={"k":3})

In [28]:
retrieved_docs = retriever.invoke("What is acne")

In [29]:
retrieved_docs

[Document(id='6e9d8134-0cc7-4a64-bca2-d1be1ea37193', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='98857843-b5c3-4911-8a21-78aa6fe8b38f', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'Data/Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'total_pages': 637.0}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the s

In [53]:
from langchain.chat_models import ChatOpenAI

llm = ChatOpenAI(
    temperature=0.4,
    max_tokens=500,
    model_name="deepseek/deepseek-r1-0528:free",  # or "deepseek-coder"
    openai_api_key=os.environ["DEEPSEEK_API_KEY"],  # ✅ your DeepSeek key
    openai_api_base="https://openrouter.ai/api/v1"   # ✅ override base to DeepSeek
)


In [62]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


system_prompt = (
    "You are a strict and concise AI assistant for medical question-answering.\n"
    "ONLY use the provided context below to answer the user's question.\n"
    "If the context does not contain enough information to answer the question,\n"
    "you MUST reply with exactly: \"I don't know.\"\n"
    "Do NOT try to guess, infer, or use outside knowledge.\n"
    "Do NOT rephrase irrelevant content. Say: \"I don't know.\"\n"
    "NEVER assume. NEVER fabricate.\n\n"
    "{context}"
)



prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])


In [55]:
question_answer_chain = create_stuff_documents_chain(llm,prompt)
rag_chain = create_retrieval_chain(retriever,question_answer_chain)

In [56]:
response = rag_chain.invoke({"input":"what is Acne"})
print(response["answer"])

Based on the provided context:

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. The medical term for common acne is Acne vulgaris.


In [64]:
response = rag_chain.invoke({"input":"what is malaria"})
print(response["answer"])

Malaria is a disease caused by *Plasmodium* parasites (single-celled protozoa) in red blood cells. It is transmitted exclusively through bites from infected anopheline mosquitoes and characterized by recurring severe chills and fever. While eliminated in some regions through mosquito control, it remains prevalent in tropical and subtropical areas.
