Medical Chatbot 

In [1]:
from langchain import PromptTemplate 
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

  from tqdm.autonotebook import tqdm


Extract Data from PDF file

In [2]:
#Extract data from PDF 
def load_pdf(data):
    loader = DirectoryLoader(data, 
                        glob="*.pdf",
                        loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [3]:
extracted_data = load_pdf("C:\\Users\\TIFFANY MUN\\Medical-Chatbot-2\\data")

Convert PDF to text chunks 

In [4]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [5]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 7020


Create Index in Pinecone

In [6]:
PINECONE_API_KEY="ec7f1a2d-e8f6-4843-841f-a0d789cc86b9"

In [7]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

In [8]:
import time

index_name = "medical-chatbot3"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

Create Vector Embeddings

In [9]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  warn_deprecated(


In [10]:
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


Store Vector Embeddings in Pinecone

In [11]:
from langchain_pinecone import PineconeVectorStore

vector_store = PineconeVectorStore(index=index, embedding=embeddings)

In [12]:
from uuid import uuid4

from langchain_core.documents import Document


In [13]:
uuids = [str(uuid4()) for _ in range(len(text_chunks))]

vector_store.add_documents(documents=text_chunks, ids=uuids)

['4a10f456-0781-47d7-8dd6-4e689df7f81e',
 'c2d8c412-1211-4874-9d0f-f8ca35079252',
 '4354db27-8fa8-4840-93fa-526f467dd791',
 'af3131ad-10ff-4603-bacb-4bc0b17284e0',
 'a4e28d18-0ad2-40ef-a976-36ed5f40f262',
 '792bb496-aaab-4465-9aca-a7e58a6e72fb',
 '1fbe6fd7-712a-4d9a-8192-691ed05cbca1',
 'bb3561e5-da1a-4172-9efe-7107dbd1d035',
 '3c940e2d-bb88-4135-ae73-62b2dc6a1976',
 'b39e8c82-85d7-4fa6-9ecc-5c4d214eee7f',
 '8b666055-bc9d-4cd5-8b46-7727f963ac7f',
 '2e7f2917-f87c-4355-a594-1a6a41a44554',
 '33301982-d1ff-4dbb-b74a-4f9a0241ffef',
 'a437c339-c6e4-490a-b7b7-d7c5a5aaffe3',
 '59bbc049-a56d-4124-a706-0a9acb2c7004',
 '75e141d5-551c-4505-b112-d21048a5beeb',
 '2b4ff2ad-8480-4f4c-b0f0-ba54b18c6f6d',
 'a9917db9-37f1-4945-bd43-25a7e2d742b6',
 '706822bd-a080-4f30-8e24-f7c49bb1f207',
 '0e292134-8475-47cc-b18a-82be5d34d1f0',
 '1dbdd748-6114-40ff-b672-991852b80d2f',
 '0ee759f9-6ede-4835-80f4-ed814357b233',
 'b8670e24-f25e-4126-a104-58a5d7f0854b',
 'a1639af9-54dd-4978-b4e9-7ecf31990712',
 '0875c8c7-241d-

In [14]:
index_stats = index.describe_index_stats()
print("Index Statistics:", index_stats)

Index Statistics: {'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 7340}},
 'total_vector_count': 7340}


Document Retrieval and Contextual Similarity Search

In [15]:
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 3})

retrieved_docs = retriever.invoke("What are Allergies?")

len(retrieved_docs)

3

In [16]:
retrieved_docs

[Document(metadata={'page': 130.0, 'source': 'C:\\Users\\TIFFANY MUN\\Medical-Chatbot-2\\data\\Medical_book.pdf'}, page_content="GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histamine.HistaminePollen grains\nLymphocyte\nFIRST EXPOSURE"),
 Document(metadata={'page': 135.0, 'source': 'C:\\Users\\TIFFANY MUN\\Medical-Chatbot-2\\data\\Medical_book.pdf'}, page_content='the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .\nThe particular allergens to which a person is sensi-'),
 Document(metadata={'page': 129.0, 'source': 'C:\\Users\\TIFFANY MUN\\Medical-Chatbot-2\\data\\Medical_bo

In [17]:
print(retrieved_docs[0].page_content)

GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies
Allergic rhinitis is commonly triggered by
exposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.
The presence of an allergen causes the
body's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.
IgE molecules attach to mast
cells, which contain histamine.HistaminePollen grains
Lymphocyte
FIRST EXPOSURE


In [18]:
print(retrieved_docs[1].page_content)

the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .
The particular allergens to which a person is sensi-


In [19]:
print(retrieved_docs[2].page_content)

allergens are the following:
• plant pollens
• animal fur and dander
• body parts from house mites (microscopic creatures
found in all houses)
• house dust• mold spores• cigarette smoke• solvents• cleaners
Common food allergens include the following:
• nuts, especially peanuts, walnuts, and brazil nuts
• fish, mollusks, and shellfish• eggs• wheat• milk• food additives and preservatives
The following types of drugs commonly cause aller-
gic reactions:
• penicillin or other antibiotics


LLM Model: Hugging Face

In [20]:
# from huggingface_hub import login
# login(token = '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')


In [21]:
# llm = CTransformers(model=r"C:\Users\TIFFANY MUN\Medical-Chatbot-2\model\llama-2-7b-chat.ggmlv3.q4_0.bin",
#                     model_type="llama",
#                     config={'max_new_tokens': 150, 'temperature': 0.5})

LLM Model: Open AI

In [29]:
# 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

In [22]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo")

Retrieval Augmented Generation

In [23]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""


custom_rag_prompt = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

result = rag_chain.invoke("What are Allergens?")
print(result)

Allergens are substances that can trigger allergic reactions in the body, such as plant pollens, animal fur, dust, mold spores, and certain foods like nuts and shellfish. Thanks for asking!


In [24]:
while True:
    user_input = input("Input Prompt: ")

    if user_input.lower() in ['exit', 'quit']:
        print("Exiting the loop.")
        break

    try:
        # Directly pass the user input string
        print(f"Querying with: {user_input}")
        
        result = rag_chain.invoke(user_input)
        
        # Print the result
        print("Response: ", result)
    except Exception as e:
        # Print any exceptions that occur
        print(f"An error occurred: {e}")


Querying with: what is acne?
Response:  Acne is a common skin disease characterized by pimples on the face, chest, and back due to clogged pores with oil, dead skin cells, and bacteria. Treatments include topical products like benzoyl peroxide or tretinoin, as well as isotretinoin for severe cases. Thanks for asking!
Querying with: what is allergens?
Response:  Allergens are substances that provoke an allergic response, such as plant pollens, animal fur, and certain foods like nuts and shellfish. Thanks for asking!
Querying with: what is athlete's foot?
Response:  Athlete's foot is a common fungal infection that affects the skin, causing itching, soreness, and peeling, typically found between the toes. It is often contracted in moist environments such as swimming pools, showers, and locker rooms. Thanks for asking!
Exiting the loop.


Retrieval Augmented Generation Check

In [25]:
formatted_docs = format_docs(retrieved_docs)
print(formatted_docs)  # Print formatted documents to verify


GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies
Allergic rhinitis is commonly triggered by
exposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.
The presence of an allergen causes the
body's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.
IgE molecules attach to mast
cells, which contain histamine.HistaminePollen grains
Lymphocyte
FIRST EXPOSURE

the itchy, scratchy nose, eyes, and throat common inallergic rhinitis .
The particular allergens to which a person is sensi-

allergens are the following:
• plant pollens
• animal fur and dander
• body parts from house mites (microscopic creatures
found in all houses)
• house dust• mold spores• cigarette smoke• solvents• cleaners
Common food allergens include the following:
• nuts, especially peanuts, walnuts, and brazil nuts
• fish, mollusks, and shellfish• eggs• wheat• milk• food additives and p

In [26]:
query = "What are Allergens?"
retrieved_docs = retriever.invoke(query)
formatted_docs = format_docs(retrieved_docs)
prompt = custom_rag_prompt.format(context=formatted_docs, question=query)
print("Prompt:", prompt)  # Print the prompt to see if it includes the context

response = llm(prompt)  # Get the LLM response
print("LLM Response:", response)


Prompt: Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

allergens are the following:
• plant pollens
• animal fur and dander
• body parts from house mites (microscopic creatures
found in all houses)
• house dust• mold spores• cigarette smoke• solvents• cleaners
Common food allergens include the following:
• nuts, especially peanuts, walnuts, and brazil nuts
• fish, mollusks, and shellfish• eggs• wheat• milk• food additives and preservatives
The following types of drugs commonly cause aller-
gic reactions:
• penicillin or other antibiotics

and the offend-ing substance is called an allergen. Common inhaledallergens include pollen, dust, and insect parts from tinyhouse mites. Common food allergens include nuts, fish,and milk.

When thisoccurs, an

  warn_deprecated(


LLM Response: content='Allergens are substances that can cause allergic reactions in individuals. They can include plant pollens, animal fur, certain foods like nuts and fish, and medications such as penicillin. Thanks for asking!' response_metadata={'token_usage': {'completion_tokens': 42, 'prompt_tokens': 263, 'total_tokens': 305}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-ecf2f328-b90e-45d0-8390-ed9ded52049e-0' usage_metadata={'input_tokens': 263, 'output_tokens': 42, 'total_tokens': 305}


In [27]:
while True:
    user_input = input("Input Prompt: ")

    if user_input.lower() in ['exit', 'quit']:
        print("Exiting the loop.")
        break

    try:
        # Retrieve relevant documents based on the user input
        retrieved_docs = retriever.invoke(user_input)
        
        # Format the retrieved documents
        formatted_docs = format_docs(retrieved_docs)
        
        # Create the prompt using the formatted context
        prompt = custom_rag_prompt.format(context=formatted_docs, question=user_input)
        print("Prompt:", prompt)  # Print the prompt to see if it includes the context
        
        # Get the LLM response
        response = llm(prompt)
        
        # Print the result
        print("Response:", response)
        
    except Exception as e:
        # Print any exceptions that occur
        print(f"An error occurred: {e}")


Prompt: Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.

Acidosis seeRespiratory acidosis; Renal
tubular acidosis; Metabolic acidosis
Acne
Definition
Acne is a common skin disease characterized by
pimples on the face, chest, and back. It occurs when thepores of the skin become clogged with oil, dead skincells, and bacteria.
Description
Acne vulgaris, the medical term for common acne, is

GALE ENCYCLOPEDIA OF MEDICINE 2 25Acne
Acne vulgaris affecting a woman’s face. Acne is the general
name given to a skin disorder in which the sebaceousglands become inflamed. (Photograph by Biophoto Associ-
ates, Photo Researchers, Inc. Reproduced by permission.)GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25

ent purposes. For example, lotions, soaps, gels, a

Draft

In [28]:
# from langchain_core.prompts import ChatPromptTemplate

# system_prompt = (
#     "You are an assistant for question-answering tasks. "
#     "Use the following pieces of retrieved context to answer "
#     "the question. If you don't know the answer, say that you "
#     "don't know. Use three sentences maximum and keep the "
#     "answer concise."
#     "\n\n"
#     "{context}"
# )

# PROMPT = ChatPromptTemplate.from_messages(
#     [
#         ("system", system_prompt),
#         ("human", "{input}"),
#     ]
# )

# from langchain.chains import create_retrieval_chain
# from langchain.chains.combine_documents import create_stuff_documents_chain
# question_answer_chain = create_stuff_documents_chain(llm, PROMPT)
# rag_chain = create_retrieval_chain(retriever, question_answer_chain)
# response = rag_chain.invoke({"input": "What is Acne?"})
# response
# print(response["answer"])