In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers
from langchain.vectorstores import Pinecone as LangChainPinecone
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [2]:
PINECONE_API_KEY= "3a8bb10b-95ec-4eed-9342-081c011a706d"
PINECONE_API_ENV='us-east-1'

In [3]:
def load_pdf(data):
    loader=DirectoryLoader(data,glob="*.pdf",loader_cls=PyPDFLoader)

    documents=loader.load()

    return documents

d:\Medical_Chatbot-Llama2\research


In [14]:
import os
extracted_data=load_pdf("../Data/")

In [15]:
extracted_data

[Document(metadata={'source': '..\\Data\\Medical_book.pdf', 'page': 0}, page_content=''),
 Document(metadata={'source': '..\\Data\\Medical_book.pdf', 'page': 1}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'),
 Document(metadata={'source': '..\\Data\\Medical_book.pdf', 'page': 2}, page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION\nJACQUELINE L. LONGE, EDITOR\nDEIRDRE S. BLANCHFIELD, ASSOCIATE EDITOR\nVOLUME\nA-B1'),
 Document(metadata={'source': '..\\Data\\Medical_book.pdf', 'page': 3}, page_content='STAFF\nJacqueline L. Longe, Project Editor\nDeirdre S. Blanchfield, Associate Editor\nChristine B. Jeryan, Managing Editor\nDonna Olendorf, Senior Editor\nStacey Blachford, Associate Editor\nKate Kretschmann, Melissa C. McDade, Ryan\nThomason, Assistant Editors\nMark Springer, Technical Specialist\nAndrea Lopeman, Programmer/Analyst\nBarbara J. Yarrow, Manager, Imaging and Multimedia\nContent\nRobyn V . Young, Project Manager, Imaging and\nMultimedia Conten

In [16]:
def text_split(extracted_data):
    text_splitter=RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=20)
    text_chunks=text_splitter.split_documents(extracted_data)

    return text_chunks

In [17]:
text_chunks=text_split(extracted_data)
print("length of my chunk: ",len(text_chunks))

length of my chunk:  7020


In [18]:
def download_hugging_face_embeddings():
    embeddings=HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')

    return embeddings

In [19]:
embeddings=download_hugging_face_embeddings()

  warn_deprecated(
  from .autonotebook import tqdm as notebook_tqdm


In [20]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [21]:
query_result=embeddings.embed_query("Hello world")
print(len(query_result))

384


In [12]:
query_result

[-0.03447723761200905,
 0.031023213639855385,
 0.006734990980476141,
 0.02610895223915577,
 -0.03936200216412544,
 -0.16030248999595642,
 0.06692393124103546,
 -0.006441502831876278,
 -0.04745049029588699,
 0.014758865348994732,
 0.07087529450654984,
 0.05552753433585167,
 0.019193345680832863,
 -0.026251327246427536,
 -0.010109513066709042,
 -0.026940496638417244,
 0.022307435050606728,
 -0.022226642817258835,
 -0.1496925801038742,
 -0.01749304123222828,
 0.007676258217543364,
 0.05435232073068619,
 0.0032544711139053106,
 0.031725890934467316,
 -0.0846213549375534,
 -0.029405983164906502,
 0.05159558728337288,
 0.04812406003475189,
 -0.0033148040529340506,
 -0.05827920883893967,
 0.04196924716234207,
 0.022210638970136642,
 0.1281888484954834,
 -0.022338991984725,
 -0.011656233109533787,
 0.06292837113142014,
 -0.03287629410624504,
 -0.09122603386640549,
 -0.031175388023257256,
 0.052699580788612366,
 0.0470348484814167,
 -0.08420310914516449,
 -0.030056176707148552,
 -0.020744847133

In [22]:
from pinecone import Pinecone

# Initialize Pinecone
pinecone = Pinecone(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

index_name='medical--chatbot'

index = pinecone.Index(index_name)


In [14]:
vectors = {}


for i, chunk in enumerate(text_chunks):
    embedding = embeddings.embed_query(chunk.page_content)
    vector_id = str(i)
    vectors[vector_id] = embedding
    index.upsert(vectors=[(vector_id, embedding, {"text": chunk.page_content})])

In [18]:
query = "What are Allergies"
query_embedding = embeddings.embed_query(query)

In [19]:
results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True
)

In [20]:
print("Query Results:")
for match in results['matches']:
    print(f"Score: {match['score']}, Text: {match['metadata']['text']}")

Query Results:
Score: 0.682538807, Text: GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies
Allergic rhinitis is commonly triggered by
exposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.
The presence of an allergen causes the
body's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.
IgE molecules attach to mast
cells, which contain histamine.HistaminePollen grains
Lymphocyte
FIRST EXPOSURE
Score: 0.678439617, Text: allergens are the following:
• plant pollens
• animal fur and dander
• body parts from house mites (microscopic creatures
found in all houses)
• house dust• mold spores• cigarette smoke• solvents• cleaners
Common food allergens include the following:
• nuts, especially peanuts, walnuts, and brazil nuts
• fish, mollusks, and shellfish• eggs• wheat• milk• food additives and preservatives
The following types of drugs commonly cause all

In [68]:
prompt_template = """
Answer the question based solely on the following context. If the information isn't in the context, say "I don't have enough information to answer that question."

Context: {context}
Question: {question}

Answer:
"""

In [69]:
PROMPT=PromptTemplate(template=prompt_template,input_variables=['context','question'])
chain_type_kwargs={"prompt":PROMPT}

In [72]:
llm = CTransformers(
    model="Model/llama-2-7b-chat.ggmlv3.q4_0.bin",
    model_type="llama",
    config={
        'max_new_tokens': 256, 
        'temperature': 0.1,
        'top_p': 0.9,
        'top_k': 40,
    }
)

In [80]:
docsearch = LangChainPinecone(index, embeddings.embed_query, "text")



In [81]:
base_retriever = docsearch.as_retriever(search_kwargs={"k": 5})

In [83]:
compressor = LLMChainExtractor.from_llm(llm)
retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=base_retriever)

In [84]:
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=chain_type_kwargs
)

In [87]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [88]:
user_input = input('Input Prompt: ')
result = qa({"query": user_input})
print("Response:", result["result"])
print("\nProgram completed. Press Enter to exit.")
input()  # This line waits for the user to press Enter before closing

Response: I don't have enough information to answer that question. The context provided does not contain any information about which tablet is recommended for fever in India. It only mentions the possible side effects of antimalarial drugs and pre-existing medical conditions that may interact with them. To determine which medication is appropriate for a specific condition, it is important to consult a qualified healthcare professional who can assess the individual's symptoms and medical history.

Program completed. Press Enter to exit.


''