In [85]:
from tqdm.auto import tqdm
from langchain_community.document_loaders import PyPDFLoader
import os
import glob

def load_pdf(data_dir):
    """Load every PDF found under ``data_dir`` into one flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    read with ``PyPDFLoader`` and whatever ``loader.load()`` returns is
    appended to the result.

    Args:
        data_dir: Directory to search (sub-directories included).

    Returns:
        list: The loaded documents, grouped by file in sorted path order.
        Empty when no PDF is found.
    """
    # glob yields paths in arbitrary, filesystem-dependent order. Sorting keeps
    # the document order (and any positional indexing on it) stable across runs.
    pdf_files = sorted(
        glob.glob(os.path.join(data_dir, "**/*.pdf"), recursive=True)
    )

    documents = []
    for pdf_file in tqdm(pdf_files, desc="Loading PDFs"):
        documents.extend(PyPDFLoader(pdf_file).load())

    return documents


# Directory searched for PDFs; the path is relative to the notebook's working directory.
DATA_DIR = "../data"
# Load all PDFs under DATA_DIR; later cells read `documents`.
documents = load_pdf(DATA_DIR)

# Report how many document objects were loaded (not how many PDF files).
print(f"Loaded {len(documents)} documents.")

Loading PDFs: 100%|██████████| 1/1 [00:21<00:00, 21.48s/it]

Loaded 637 documents.





In [None]:
from langchain_core.documents import Document
def split_into_mindocus(docs:list[Document]) -> list[Document]:
    """Return slimmed-down copies of ``docs`` with unnecessary metadata removed.

    Each returned Document keeps the original ``page_content`` and carries a
    metadata dict with a single ``"src"`` key holding the input's ``"source"``
    metadata value (``None`` when the input has no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"src": original.metadata.get("source")},
        )
        for original in docs
    ]

In [53]:
# Strip the loaded documents down to page_content plus a single "src" metadata key.
mindocus = split_into_mindocus(documents)
# Spot-check one entry; index 56 is an arbitrary sample and requires at least 57 documents.
print(f'example mindocus: {mindocus[56]}')

example mindocus: page_content='imbalanced. Depending on the problem, the acupunctur-
ist will insert needles to manipulate chi on one or more
of the twelve organ meridians. On these twelve meridi-
ans, there are nearly 2,000 points which can be used in
acupuncture, with around 200 points being most fre-
quently used by traditional acupuncturists. During an
individual treatment, one to twenty needles may be used,
depending on which meridian points are chosen.
Acupuncture needles are always sterilized and
acupuncture is a very safe procedure. The depth of inser-
tion of needles varies, depending on which chi channels are
being treated. Some points barely go beyond superficial
layers of skin, while some acupuncture points require a
depth of 1-3 in (2.5-7.5 cm) of needle. The needles general-
ly do not cause pain. Patients sometimes report pinching
sensations and often pleasant sensations, as the body expe-
riences healing. Depending on the problem, the acupunc-
turist might spin or move 

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
def split_into_chunks(docs:list[Document], chunk_size=500, chunk_overlap=20) -> list[Document]:
    """Split every document's text into chunks.

    Args:
        docs: Documents whose ``page_content`` is to be split.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        A flat list of chunk documents in input order. Each input document's
        metadata is handed to the splitter alongside its text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    pieces = []
    for source_doc in tqdm(docs, desc="Splitting into chunks"):
        # One create_documents call per input document, so the progress bar
        # advances per document.
        pieces += splitter.create_documents(
            [source_doc.page_content],
            metadatas=[source_doc.metadata],
        )
    return pieces
# Chunk the slimmed documents using the function's default chunk_size/chunk_overlap.
chunks = split_into_chunks(mindocus)
print(f"Total chunks created: {len(chunks)}")

Splitting into chunks: 100%|██████████| 637/637 [00:00<00:00, 6782.52it/s]

Total chunks created: 5859





In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

def get_embedding_model(model_name="all-MiniLM-L6-v2"):
    """Build a ``HuggingFaceEmbeddings`` instance for ``model_name``."""
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    return embeddings

In [56]:
# Instantiate the embedding model with the default model name; later cells reuse `embedding_model`.
embedding_model = get_embedding_model()

  return HuggingFaceEmbeddings(model_name=model_name)


In [57]:
# Length of one embedded query vector; the index-creation cell hard-codes this value as `dimension`.
len(embedding_model.embed_query("Hello world"))

384

In [86]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Read the Pinecone key under its correct name. Fall back to the misspelled
# "PIENCONE_API_KEY" so an existing .env that uses that spelling keeps working.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") or os.getenv("PIENCONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Fail early with a readable message: assigning None into os.environ would
# otherwise raise an opaque "str expected, not NoneType" TypeError.
_missing_keys = [
    key_name
    for key_name, key_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GROQ_API_KEY", GROQ_API_KEY),
    )
    if not key_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in a .env file."
    )

# Re-export under the canonical names so code that reads the environment
# directly sees the same values.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GROQ_API_KEY"] = GROQ_API_KEY



In [59]:
from pinecone import Pinecone
# NOTE(review): `Pinecone_api_key` is not referenced again in the visible cells;
# it looks like a leftover alias — confirm before removing.
Pinecone_api_key=PINECONE_API_KEY
# Create the Pinecone client from the key held in PINECONE_API_KEY.
pc = Pinecone(api_key=PINECONE_API_KEY)



In [60]:
# Show the client's repr as a quick check that it was constructed.
pc

<pinecone.pinecone.Pinecone at 0x20ac6feaba0>

In [61]:
from pinecone import ServerlessSpec 

# Name of the Pinecone index used by the vector-store cells below.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second create_index call.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Must equal the embedding vector length (the embed_query check returned 384)
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


# Handle to the index; not used again in the visible cells.
index = pc.Index(index_name)

In [62]:
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the chunk documents using the embedding model.
# NOTE(review): this presumably embeds and uploads every chunk on each run, so
# re-running the cell may add duplicate vectors to the index — confirm, and
# skip this cell once the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents=chunks,
    embedding=embedding_model,
    index_name=index_name
)

In [63]:
# Create a vector-store object from the existing Pinecone index.
# This rebinds `docsearch`, replacing the object returned by from_documents.
docsearch=PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding_model
    
)


In [64]:
# Similarity-search retriever over the vector store; k=3 is presumably the
# number of chunks returned per query — confirm against the retriever docs.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [66]:
# Smoke-test the retriever on a sample question and print each hit.
answers=retriever.invoke("What is the most common symptom of diabetes?")
for i, answer in enumerate(answers):
    print(f"Answer {i+1}:")
    print(f"Content: {answer.page_content}")
    # "src" is the metadata key written by split_into_mindocus.
    print(f"Source: {answer.metadata['src']}")
    print("-" * 50)

Answer 1:
Content: • Type I diabetes mellitus. Characterized by fatigue and
an abnormally high level of glucose in the blood
(hyperglycemia).
• Amyotrophic lateral schlerosis. First signs are stum-
bling and difficulty climbing stairs. Later, muscle
cramps and twitching may be observed as well as
weakness in the hands making fastening buttons or
turning a key difficult. Speech may become slowed or
slurred. There may also be difficluty swallowing. As
respiratory muscles atrophy, there is increased danger
Source: ../data\Medical_book.pdf
--------------------------------------------------
Answer 2:
Content: begin to fall. A person with diabetes mellitus either does
not make enough insulin, or makes insulin that does not
work properly. The result is blood sugar that remains
high, a condition called hyperglycemia.
Diabetes must be diagnosed as early as possible. If
left untreated, it can damage or cause failure of the eyes,
kidneys, nerves, heart, blood vessels, and other body
organs. Hypog

In [79]:
from langchain_groq import ChatGroq
# Groq-hosted chat model used as the answer generator in the chain cells below.
chatModel = ChatGroq(api_key=GROQ_API_KEY, model="llama-3.3-70b-versatile", temperature=0.1)


In [80]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [81]:
# System instructions for the answer model. The adjacent string literals are
# concatenated into a single string ending with the "{context}" placeholder.
# NOTE(review): "an Medical assistant" reads like a grammar slip; the string
# is left unchanged here because it is text sent to the model.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)



# Two-message chat template: the system instructions above, then the user's
# question supplied through the "{input}" variable.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [82]:
# Chain that feeds documents plus the prompt to the chat model.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
# Full RAG chain: the retriever supplies documents to the question-answer chain.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [83]:
# Ask a sample question; the chain takes the question under the "input" key.
response = rag_chain.invoke({"input": "what is Acne?"})
# Print only the generated answer from the response mapping.
print(response["answer"])

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne vulgaris, also known as common acne, is the most common skin disease, affecting nearly 17 million people in the United States.
