In [1]:
from langchain.document_loaders import PyPDFDirectoryLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
'''
def load_pdf_file(data):
    loader = PyPDFDirectoryLoader(data)
    documents = loader.load()

    return documents
    '''

In [52]:
#Extract data from the PDF
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Files matching the ``*.pdf``
            glob are read with ``PyPDFLoader`` via ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns (presumably a list of
        LangChain ``Document`` objects, one per PDF page -- TODO confirm).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [53]:

# Load the PDFs from the relative 'Data/' folder (resolved against the
# kernel's current working directory).
extracted_data=load_pdf(data='Data/')

In [None]:
# Preview only the first few documents: echoing the whole list floods the
# notebook output and bloats the saved file.
extracted_data[:3]


In [54]:

#Split the Data into Text Chunks
def text_split(extracted_data):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by a splitter configured with
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(extracted_data)


In [55]:

# Chunk the loaded documents and report how many chunks were produced.
text_chunks=text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 6970


In [56]:
from langchain.embeddings import HuggingFaceEmbeddings

In [57]:
#Download the Embeddings from Hugging Face

def download_hugging_face_embeddings():
    """Build the sentence-embedding model used by the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
    

In [58]:
# Instantiate the embedding model once; the vector store and the sanity
# check below both reuse this object.
embeddings = download_hugging_face_embeddings()

In [59]:
# Sanity check: embed a short string and print the vector length. The
# recorded value (384) is the same number passed as `dimension` when the
# Pinecone index is created.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [60]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [61]:
# Read the Pinecone key from the process environment (presumably populated
# from a .env file by load_dotenv -- verify). `.get` yields None when the
# variable is unset, so a missing key only surfaces when the client is used.
PINECONE_API_KEY=os.environ.get('PINECONE_API_KEY')

In [62]:
from pinecone import Pinecone, ServerlessSpec


import os

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Creating an index whose name already exists is rejected by Pinecone, so
# only create it when it is missing. This keeps the cell safe to re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding vector length (384 for the MiniLM model,
        # as printed by the sanity-check cell).
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )


In [63]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Embed every chunk with `embeddings` and write the vectors to `index_name`.
# NOTE(review): re-running this cell sends all chunks again; presumably that
# adds duplicate vectors rather than overwriting -- verify, and prefer
# `PineconeVectorStore.from_existing_index` once the index is populated.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)


In [None]:
# Load Existing index 
'''
from langchain_pinecone import PineconeVectorStore
index_name = "medicalbot"
from langchain_pinecone import PineconeVectorStore
# Embed each chunk and upsert the embeddings into your Pinecone index.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)
'''

In [64]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x23ff77053d0>

In [65]:
# Similarity-search retriever over the vector store; k=3 asks for the three
# closest chunks per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})


In [66]:
retriever

VectorStoreRetriever(tags=['PineconeVectorStore', 'HuggingFaceEmbeddings'], vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x0000023FF77053D0>, search_kwargs={'k': 3})

In [None]:
#retrieved_docs2 = retriever.invoke("what is acne?")


In [None]:
#retrieved_docs2

[Document(metadata={'page': 425.0, 'source': 'Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf'}, page_content='Corticosteriod—A group of synthetic hormones\nthat are used to prevent or reduce inflammation.\nToxic effects may result from rapid withdrawal after\nprolonged use or from continued use of large doses.\nPatch test—A skin test that is done to identify aller-\ngens. A suspected substance is applied to the skin.\nAfter 24–48 hours, if the area is red and swollen,\nthe test is positive for that substance. If no reaction\noccurs, another substance is applied. This is con-'),
 Document(metadata={'page': 298.0, 'source': 'Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf'}, page_content='Corticosteroids—A group of anti-inflammatory\nsubstances often used to treat skin conditions.\nImmune response—The protective reaction by the\nimmune system against foreign antigens (sub-\nstances that the body perceives as potentially dan-\ngerous). The immune system combats disease by\nneutrali

In [67]:
from langchain_community.llms import CTransformers

# Local quantized Llama-2 chat model, run through ctransformers.
llm = CTransformers(model='model/llama-2-7b-chat.ggmlv3.q4_0.bin',
                    model_type="llama",
                    config={'max_new_tokens':512,
                            'temperature':0.8,
                            # The prompt (system text + 3 retrieved chunks +
                            # question) plus up to 512 generated tokens must
                            # fit in the context window. Set it explicitly
                            # instead of relying on the library default, which
                            # is presumably too small here: several recorded
                            # answers collapse into repeated tokens
                            # -- TODO confirm against the ctransformers docs.
                            'context_length':2048})

In [74]:
from langchain.chains.retrieval import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate


# System message for the RAG prompt. Adjacent string literals are joined by
# Python, so each fragment must carry its own trailing space or punctuation.
# `{context}` is the template slot that receives the retrieved documents.
system_prompt = (
    "You are an assistant for question-answering tasks related to a medical chatbot. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Chat template with two messages: the system instructions (which contain
# the {context} slot) and the user's question, supplied through {input}.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [75]:
# Chain that sends `prompt` to the LLM with the supplied documents in it.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG pipeline: `retriever` fetches documents for the input, then the
# question-answer chain produces the reply.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [76]:
# End-to-end smoke test; the chain returns a dict and the generated text is
# stored under the "answer" key.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])



System: Acne is a common skin condition characterized by the appearance of pimples, including whiteheads, blackheads, and inflamed red pimples on the face, neck, chest, and back. It occurs when the pores on the skin become clogged with dead skin cells, oil, and bacteria, leading to inflammation and infection. Acne can cause physical discomfort, emotional distress, and scarring if left untreated. Treatment options include topical creams and gels, oral medications, and blue light therapy.





In [132]:
# NOTE(review): the recorded output for this query degenerates into repeated
# tokens ("overreacts overreacts ..."); possibly a context-window or sampling
# problem in the LLM configuration -- TODO investigate.
response = rag_chain.invoke({"input": "what are allergies?"})
print(response["answer"])


Assistant: Allergies occur when a person's immune system mistakenly overreacts overreacts reacts responds overreacts overreacts overreacts overreacts overreacts overreacts overreacts reacts overreacts overreacts overreacts overreacts misinteracts incorrectly identifies foreign substance to aversusually overreacts overreacts overreacts overreacts overreacts overreacts overreacts responds mistakes aversusually overreacts overreacts overreacts overreacts overreacts overreacts mistakenly overreacts reacts overreacts mistakes a- overreacts incorrectly identifies substances substance of perceives to aversus reacts misinteracts overreacts mistakes a reacts overreacts responds overreacts overreacts overreacts overreacts mistakenly reacts overreacts overreacts overreacts overreacts overreacts overreacts overreacts overreacts overreacts incorrectly identifies angrieves to aversusually reacts mistakes a
overreacts responds responds mistakes a
reacts overreacts overreacts overreacts reacts overre

In [133]:
# NOTE(review): the recorded output starts coherently and then breaks down
# into garbled repetitions; same suspected cause as the allergies query
# -- TODO investigate.
response = rag_chain.invoke({"input": "what are Causes and symptoms of cancer?"})
print(response["answer"])


Assistant: Cancer is caused by a variety of factors, including lifestyle choices such as smoking, excessive alcohol consumption, and poor diet, as well as exposure to infectious agents and genetic predisposition. The symptoms of cancer caner vary depending onset outward signs of cancer cancers include changes in the disease depend ongoing through which may include untreat cance ranswersn cancer caner vary depending onset forthcoming from cancer cancers will vary depending ongoing tob cancer caner vary depending onset out include unusual bleeding, cancer canercancer canericancer cance caner vary depending oncolore cancer caner cancea cancer cance changes in include persistent c an early- cancer caner a possible cancer caner vary depending onc an individual types of cancer cancers and canerung eternalizing, including untreaticancer cancer canercancer cancer cancertan individual types of cancer caner vary depending onset forthcoming from cancer cance cancer cancancer cance changes in the

In [None]:
# Not executed in the saved notebook (no execution count or output recorded).
response = rag_chain.invoke({"input": "what is treatment or cure for color blindness?"})
print(response["answer"])

In [130]:
# NOTE(review): the question looks truncated; the recorded output begins by
# completing it (" disease?") before answering -- confirm whether this
# probe of vague input is intentional.
response = rag_chain.invoke({"input": "What is mountain"})
print(response["answer"])

 disease?
Assistant: I don't know. Mountain disease is not a recognized medical condition. It is possible that you may be referring to a specific type of muscle wasting disease or disorder, but without more context it is difficult for me to provide a definitive answer. Can you please provide more information or clarify your question?


### Evaluation

In [77]:
# Evaluation questions, each wrapped in the {"input": ...} payload that the
# RAG chain is invoked with.
questions = [
    {"input": question_text}
    for question_text in (
        "What is Acne?",
        "What is treatment or cure for color blindness?",
        "What are causes and symptoms of cancer?",
    )
]

# Reference answers, index-aligned with `questions`; each entry is a
# one-element list holding the reference text.
ground_truths = [
    [reference_text]
    for reference_text in (
        "Acne—A chronic inflammatory skin disorder characterized by comedones (blackheads and whiteheads), papules, pustules, nodules, and sometimes elevations. It occurs when the pores become clogged with oil, debris, and bacteria.",
        "There is no treatment or cure for color blindness. Most color vision deficient persons compensate well for their abnormality and usually rely on color cues and details that are not consciously evident to persons with typical color vision. Inherited color blindness cannot be prevented. In the case of some types of acquired color deficiency, if the cause of the problem is removed, the condition may improve with time. But for most people with acquired color blindness, the damage is usually permanent.",
        "The major risk factors for cancer are: tobacco, alcohol, diet, sexual and reproductive behavior, infectious agents, family history, occupation, environment, and pollution.",
    )
]

In [78]:

# Run every evaluation question through the RAG chain, collecting
#   answers  -- the full chain response dict for each question
#   contexts -- for each question, the text of the documents that were
#               retrieved for it (a list of strings)
answers = []
contexts = []

for query in questions:
    rag_output = rag_chain.invoke(query)
    answers.append(rag_output)
    # The retrieval chain returns the documents it used under "context".
    # Store their text as a concrete list: a lazy generator over the
    # retriever object is neither the retrieved text nor serializable.
    contexts.append([doc.page_content for doc in rag_output["context"]])
    

In [None]:
from datasets import Dataset
from ragas.metrics import Faithfulness, context_precision, context_recall, answer_relevancy
from ragas import evaluate
# Build the evaluation table. ragas expects plain strings for the question,
# answer and reference columns, and a list of strings per row for contexts,
# so unwrap the chain payloads and response dicts here.
data = {
    "question": [query["input"] for query in questions],
    "answer": [rag_output["answer"] for rag_output in answers],
    "contexts": [list(row_contexts) for row_contexts in contexts],
    # One reference string per row; the list-valued "ground_truths" column is
    # the older, deprecated spelling.
    "ground_truth": [references[0] for references in ground_truths],
}


dataset = Dataset.from_dict(data)
# NOTE(review): no `llm=` / `embeddings=` is passed, so ragas falls back to
# its default evaluator models (presumably OpenAI, which needs an API key)
# -- TODO confirm.
result = evaluate(
    dataset=dataset,  # the parameter is named `dataset`, not `data`
    metrics=[
        context_precision,
        context_recall,
        answer_relevancy,
        Faithfulness(),  # metrics must be instances; `Faithfulness` is the class
    ]
)


### Hallucination

In [None]:
'''
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

from ragas.llms import llm_factory

evaluator_llm = llm_factory("gpt-4o")

from ragas.metrics import AspectCritic

# you can init the metric with the evaluator llm
hallucinations_binary = AspectCritic(
    name="hallucinations_binary",
    definition="Did the model hallucinate or add any information that was not present in the retrieved context?",
    llm=evaluator_llm,
)

await hallucinations_binary.single_turn_ascore(eval_dataset[0])
'''