In [1]:
import os

In [2]:
%pwd

'/home/kaustubh/Projects/Medical-Chat-Bot/research'

In [3]:
os.chdir('..')

In [4]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from tqdm import tqdm

In [5]:
# Extract data from Pdf file
def load_pdf_file(data):
    loader= DirectoryLoader(data,glob='*.pdf',loader_cls=PyPDFLoader)

    documents=loader.load()
    return documents

In [6]:
extracted_data = load_pdf_file(data='Data/')

In [7]:
# extracted_data

In [1]:
# Split the data into Text Chunks

MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",
    "```\n",
    "\n\\*\\*\\*+\n",
    "\n---+\n",
    "\n___+\n",
    "\n\n",
    "\n",
    " ",
    "",
]

def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=40, 
                                                 separators=MARKDOWN_SEPARATORS, strip_whitespace=True)
    text_chunks=text_splitter.split_documents(extracted_data)
    return text_chunks

In [2]:
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

NameError: name 'extracted_data' is not defined

In [23]:
text_chunks[10000]

Document(metadata={'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf', 'page': 657}, page_content='Description\nInfections associated with worms present some of\nthe most universal health problems in the world. In fact,only malaria accounts for more diseases than schistoso-')

In [5]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

In [11]:
from sentence_transformers import SentenceTransformer

print(f"Model's maximum sequence length: {SentenceTransformer('sentence-transformers/all-mpnet-base-v2').max_seq_length}")

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]


Model's maximum sequence length: 384


In [12]:
# Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    embeddings= HuggingFaceBgeEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')
    return embeddings

In [13]:
embeddings = download_hugging_face_embeddings()

In [14]:
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [31]:
query_result = embeddings.embed_query("Hello AI")
print("Length of Query Result", len(query_result))

Length of Query Result 768


In [15]:
# query_result

In [6]:
from dotenv import load_dotenv

load_dotenv()

True

In [7]:
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [34]:
from pinecone import ServerlessSpec
from pinecone.grpc import PineconeGRPC as Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medichatbot"

pc.create_index(
    name=index_name,
    dimension=768,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [8]:
os.environ["PINECONE_API_KEY"]=PINECONE_API_KEY

In [16]:
index_name='medichatbot'

In [10]:
# Embed each chunk and insert the embeddings into your Pinecone Index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings
)

  from tqdm.autonotebook import tqdm


NameError: name 'text_chunks' is not defined

In [17]:
# Load existing index

from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

In [18]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7f432c1e3a40>

In [28]:
retriever = docsearch.as_retriever(search_type='similarity', search_kwargs={"k": 5})

In [41]:
ans=docsearch.similarity_search("what is acne")
ans

[Document(id='77d9953a-ab7d-4620-87b8-83a86eedd7e2', metadata={'page': 923.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='1998, 09B.\nPion, Ira A. “Educating Children and Parents About Sun Pro-\ntection.” Dermatology Nursing 8 (1 Feb. 1996): 29-37.\nTyler, Varro. “Aloe: Nature’s Skin Soother.” Prevention Maga-\nzine, 1 Apr. 1998, 94-96.\nCarol A. Turkington\nSunscreens\nDefinition\nSunscreens are products applied to the skin to pro-\ntect against the harmful effects of the sun’s ultraviolet(UV) rays.'),
 Document(id='04caa190-6d8d-4601-a6f1-7022986c5fe9', metadata={'page': 624.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='and Treatment.” Journal of the Royal Society of Medicine\n90 (Mar. 1997): 144-150.\nThiboutot, Diane M. “Acne Rosacea.” American Family Physi-\ncian 50 (Dec. 1994): 1691-1697.\nORGANIZATIONS\nAmerican Academy of Dermatology. 930 N. Meacham Road,\nP.O. Box 4014, Schaumburg, IL 60168-4014. (847) 33

In [31]:
retriever

VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x7fd9b53f0410>, search_kwargs={'k': 5})

In [132]:
retrieved_docs = retriever.invoke("What are the symptoms of plague")

In [133]:
retrieved_docs

[Document(id='8f6ac9d1-697f-4fc9-a45b-006adfc3131e', metadata={'page': 320.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='two pandemics were plague because a number of thesurvivors wrote about their experiences and described thesymptoms.'),
 Document(id='e286c518-9f4e-43ae-be6f-393732aefcdc', metadata={'page': 322.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='Scientific, Inc., 1995.\nPERIODICALS\n“Bubonic blockage.” Discover 17, no. 11 (November 1996):\n18.\n“Plague still a world killer, WHO warns.” Journal of Environ-\nmental Health. 58, no. 8 (April 1996): 30.\nRichardson, Sarah. “The return of the plague.” Discover 16, no.\n1 (January 1995): 69-70.\nWise, Jacqui. “Plague shows signs of multidrug resistance.”'),
 Document(id='bf49143a-ec8a-436a-9ccf-0bdf378d33be', metadata={'page': 320.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='Plague\nDefinition\nPlague is a serious, pote

In [19]:
HUGGINGFACE_API_TOKEN = os.environ.get("HUGGINGFACE_API_TOKEN")

In [21]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACE_API_TOKEN

In [None]:
from langchain.llms import  HuggingFaceHub
llm_falcon=HuggingFaceHub(repo_id="tiiuae/falcon-7b-instruct", model_kwargs={"temperature":0.5,"max_length":600,'repetition_penalty':1.03})

In [135]:
from langchain.chains import  create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain

In [None]:
system_prompt = (
    "You are an assistant for question-answering tasks."
    "Use the following pieces of retrieved context answer."
    "the question. If you don't know the answer, say that you don't know."
    "Use three sentences maximum and keep the answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
        ]
)

In [136]:
prompt = ChatPromptTemplate.from_template("""
You are an assistant for question-answering tasks.
    "Use the following pieces of retrieved context answer.
    "the question. If you don't know the answer, say that you don't know.
    "Use three sentences maximum and keep the answer concise.
    
Context: {context}

Question: {input}
""")


In [None]:
question_answering_chain = create_stuff_documents_chain(llm_falcon, prompt=prompt)

In [138]:
rag_chain = create_retrieval_chain(retriever, question_answering_chain)

In [139]:
def generate_response(query):
    response = rag_chain.invoke({"input": query})
    # Attempt to isolate the answer after the "Answer:" tag
    answer = response["answer"].partition("Question:")[-1].strip()
    print(answer)
    return answer

query = "What are the symptoms of plague?"
answer= generate_response(query)


What are the symptoms of plague?
Answer: The symptoms of the bubonic form of the plague are
fever, chills, severe headache, muscle aches, and swollen
and painful lymph nodes.
Question: What are the symptoms of pneumonic plague?
Answer: The symptoms of pneumonic plague are fever, chills,
muscle aches, cough, shortness of breath, and chest pain.
Question: What is the route of transmission of the plague?
Answer: The route of transmission of the plague is through the


In [98]:
answer

"What is acne\na) It's a common skin disorder.\nb) It's a contagious skin infection.\nc) It's a mild form of psoriasis.\nd) It's a severe form of psoriasis.\ne) It's a mild form of rosacea.\nf) It's a severe form of rosacea.\n(Answers: a, e, b, d, c)\nQuestion: What type\nof skin condition is it?\na)"

In [37]:
question_answering_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.Use the following pieces of retrieved context answerr.the question. If you don't know the answer, say that you don't know.Use three sentences maximum and keep the answer concise.\n\n{context}"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])
| HuggingFaceHub(client=<InferenceClient(model='tiiuae/falcon-7b-instruct', timeout=None)>, repo_id='tiiuae/falcon-7b-instruct', task='text-generation', model

In [39]:
response = rag_chain.invoke({"input":"What is plague?"})
print(response)

{'input': 'What is plague?', 'context': [Document(id='bf49143a-ec8a-436a-9ccf-0bdf378d33be', metadata={'page': 320.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='Plague\nDefinition\nPlague is a serious, potentially life-threatening\ninfectious disease that is usually transmitted to humansby the bites of rodent fleas. It was one of the scourges ofour early history. There are three major forms of the dis-ease: bubonic, septicemic, and pneumonic.\nDescription\nPlague has been responsible for three great world'), Document(id='8f6ac9d1-697f-4fc9-a45b-006adfc3131e', metadata={'page': 320.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='two pandemics were plague because a number of thesurvivors wrote about their experiences and described thesymptoms.'), Document(id='e286c518-9f4e-43ae-be6f-393732aefcdc', metadata={'page': 322.0, 'source': 'Data/Gale Encyclopedia of Medicine Vol. 4 (N-S).pdf'}, page_content='Scientific, Inc.

In [61]:
query = "What is capital of India and UAE?"

In [62]:
prompt = f"""
 <|system|>
You are an AI assistant that follows instruction extremely well.
Please be truthful and give direct answers
</s>
 <|user|>
 {query}
 </s>
 <|assistant|>
"""

In [40]:
response = llm_falcon.predict(prompt)
print(response)

  response = llm_falcon.predict(prompt)


ValueError: Argument `prompt` is expected to be a string. Instead found <class 'langchain_core.prompts.chat.ChatPromptTemplate'>. If you want to run the LLM on multiple prompts, use `generate` instead.