In [1]:
import os
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
## Load every PDF found in a directory
def extract_data_from_pdf(file):
    """Load the PDF files under a directory into LangChain documents.

    Args:
        file: Path of the directory to scan. Despite the parameter name this
            is a directory, not a single file: it is handed to
            ``DirectoryLoader`` together with the ``*.pdf`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched PDFs,
        each one parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(file, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


## Directory holding the source PDFs. extract_data_from_pdf globs "*.pdf" inside it,
## so this must be a directory path, not a single file.
## Set MEDICAL_PDF_DIR in the process environment to run the notebook on another
## machine; the original location stays the default so existing runs are unchanged.
pdf_source_dir = os.getenv(
    "MEDICAL_PDF_DIR",
    "D:/Generative AI Projects/AI-Medical-Assistant/research/source",
)
docs = extract_data_from_pdf(pdf_source_dir)

In [3]:
## Number of documents returned by the loader
len(docs)

637

In [4]:
## Splitting the texts
def text_split(documents, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        documents: Documents to split, e.g. the output of
            ``extract_data_from_pdf``.
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            this notebook was built with.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 20.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Returned directly: the original bound the result to a local named `docs`,
    # the same name as the notebook-level variable holding the unsplit documents.
    return splitter.split_documents(documents)


## Split the loaded documents into chunks (text_split uses chunk_size=500, chunk_overlap=20)
splitted_docs = text_split(docs)

In [5]:
## Number of chunks produced by the splitter
len(splitted_docs)

5859

In [7]:
import warnings
# NOTE(review): with no category/module filter this silences every warning raised
# after this point, not only the ones emitted while loading the embedding model.
warnings.filterwarnings("ignore")
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

def download_huggingface_embeddings():
    """Build the sentence-embedding model used by the rest of the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(model='all-MiniLM-L6-v2')

## Instantiate the embedding model once; later cells pass it to the Pinecone vector store
embeddings = download_huggingface_embeddings()

In [8]:
## Display the embedding model's configuration
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_encode_kwargs={}, multi_process=False, show_progress=False)

In [9]:
## Sanity check: length of the vector produced for a sample query.
## The result (384) is the value used as `dimension` when the Pinecone index is created.
len(embeddings.embed_query("Hello world"))

384

In [13]:
from pinecone import Pinecone, ServerlessSpec

load_dotenv()

## Load the Pinecone API key. PINECONE_API is this notebook's own variable name;
## PINECONE_API_KEY (the name the key is exported under further down) is accepted
## as a fallback so a .env that only defines that name still works.
pinecone_api = os.getenv("PINECONE_API") or os.getenv("PINECONE_API_KEY")
if not pinecone_api:
    # Fail here with an actionable message instead of an opaque TypeError later,
    # when the (missing) key is written into os.environ.
    raise RuntimeError(
        "Pinecone API key not found: set PINECONE_API (or PINECONE_API_KEY) "
        "in the environment or in the .env file."
    )

## Initialize the Pinecone client with the API key
pc = Pinecone(api_key=pinecone_api)

## Index name
index_name = "med-chat-bot"

## Create the index only when it does not exist yet, so re-running this cell is safe.
## `dimension` must equal the embedding length (384 for the model used above).
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )


In [21]:
from langchain_pinecone import PineconeVectorStore
import pinecone

## Export the key under PINECONE_API_KEY, since no key is passed to the vector store directly
os.environ["PINECONE_API_KEY"] = pinecone_api

## Embed each chunk and upsert it into the Pinecone index -- but only while the index
## is still empty. Without this guard every re-run of the cell uploads all chunks
## again under new ids (the retrieved documents shown later carry random UUID ids),
## so the index fills up with duplicates and retrieval returns repeated passages.
index_stats = pc.Index(index_name).describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=splitted_docs,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    # NOTE: to force a fresh ingestion (changed PDFs, different chunk settings, or an
    # upload that was interrupted half-way) delete or clear the index first.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [22]:
## Load existing index

from langchain_pinecone import PineconeVectorStore

## Connect to the index that was populated earlier instead of ingesting the chunks again.
## NOTE(review): `embeddings` is presumably used here to embed queries at search time -- confirm.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)


In [23]:
## Display the vector store object to confirm it was created
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1bfb1a42990>

In [25]:
## Similarity-search retriever over the vector store, configured with k=3
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [26]:
## Smoke-test the retriever on a sample question and inspect the returned documents
retriever.invoke("What is acne")

[Document(id='74929251-67be-481b-a253-4ba1f86f2495', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'D:\\Generative AI Projects\\AI-Medical-Assistant\\research\\source\\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='4371e7a0-a0d6-472a-852c-4ab25b74a476', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 37.0, 'page_label': '38', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'D:\\Generative AI Projects\\AI-Medical-Assistant\\research\\source\\Gale Encyclopedia of Medicine Vol. 1 (A-B).pdf', 'total_pages': 637.0}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefini

In [None]:
from langchain_groq import ChatGroq
load_dotenv()

## Load the Groq API key. GROQ_API is this notebook's own variable name;
## GROQ_API_KEY is accepted as a fallback.
groq_api = os.getenv("GROQ_API") or os.getenv("GROQ_API_KEY")
if not groq_api:
    # Fail here with an actionable message rather than inside the Groq client.
    raise RuntimeError(
        "Groq API key not found: set GROQ_API (or GROQ_API_KEY) "
        "in the environment or in the .env file."
    )

## Chat model used to generate the answers.
## NOTE(review): hosted model ids get retired over time -- confirm "gemma2-9b-it"
## is still listed by Groq before relying on this cell.
llm = ChatGroq(
    groq_api_key=groq_api,
    model="gemma2-9b-it",
)
llm

ChatGroq(client=<groq.resources.chat.completions.Completions object at 0x000001BFACC52270>, async_client=<groq.resources.chat.completions.AsyncCompletions object at 0x000001BFACC52E40>, model_name='gemma2-9b-it', model_kwargs={}, groq_api_key=SecretStr('**********'))

In [42]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

## System prompt for the RAG chain. "{context}" is the template placeholder that
## receives the retrieved chunks. The typos ("cotext", "anwer") are fixed and the
## fallback instruction now ends with a period, so it no longer runs into the
## length instruction that follows it.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you do not know the answer or if it is "
    "not present in the given context or you find it is out of the context "
    "say 'you do not know'. "
    "Answer the question in minimum 10 lines."
    "\n\n"
    "{context}"
)

## Chat prompt: the system instructions first, then the user's question as the human turn
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [43]:
## Answering chain built from the LLM and the chat prompt
question_answer_chain = create_stuff_documents_chain(llm, prompt)
## Full RAG chain: combines the retriever with the answering chain above
rag_chain = create_retrieval_chain(retriever, question_answer_chain)


In [44]:
## Ask a question through the full RAG chain and print only the generated answer
response = rag_chain.invoke({"input": "What is Acne"})
print(response["answer"])

Acne is a common skin disease that primarily manifests as pimples on the face, chest, and back. 

The development of acne occurs when the skin's pores become obstructed. This blockage is typically caused by a combination of excess oil production, dead skin cells, and the presence of bacteria. 

The medical term for common acne is acne vulgaris. This condition is highly prevalent, affecting nearly 17 million people in the United States.  Acne can develop at any age, although it is most common during adolescence. 

The provided text describes acne as a skin disorder characterized by inflammation of the sebaceous glands, which are responsible for producing oil (sebum) in the skin.  

The text also includes a photograph depicting acne vulgaris affecting a woman's face.




In [46]:
## Second sample question through the RAG chain; print only the generated answer
response = rag_chain.invoke({"input": "What is Atherosclerosis"})
print(response["answer"])

Atherosclerosis is a progressive disease that causes the build-up of plaque on the inside of blood vessels. 

Here's a breakdown based on the provided text:

1. **Definition:** Atherosclerosis is defined as the accumulation of a waxy substance called plaque within the walls of arteries. 

2. **Origin of the Term:** The name "atherosclerosis" comes from the Greek words "athere," meaning gruel, and "skleros," meaning hard. This reflects the characteristic hardened, plaque-filled nature of the affected arteries.

3. **Relationship to Arteriosclerosis:** Atherosclerosis is a specific type of arteriosclerosis, a broader term referring to the hardening of arteries.  

4. **Progression:** Atherosclerosis is a progressive disease, meaning it worsens over time.  This gradual build-up of plaque restricts blood flow through the arteries, leading to various cardiovascular complications.

5. **Impact on Heart Health:** Atherosclerosis is a major cause of heart disease.  The restricted blood flow ca