In [7]:
import os
os.chdir('../')

In [10]:
# pip install langchain==0.3.27 langchain_community==0.3.27 langchain_core==0.3.27

In [None]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
def load_pdf_files(data):
    loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    document = loader.load()
    return document

In [13]:
extracted_data = load_pdf_files('data')

In [None]:
len(extracted_data)
extracted_data

In [15]:
from typing import List
from langchain_core.documents import Document

In [16]:
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    '''Given a List of Document objects, return a new List of Documents objects
    Containing only 'Source' in metadata and the original page_content'''

    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get('source')
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={'source': src}
            )
        )
    return minimal_docs

In [None]:
minimal_docs = filter_to_minimal_docs(extracted_data)
minimal_docs

In [18]:
# Split the documents in to smaller chunks
def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=20
    )
    text_chunk = text_splitter.split_documents(minimal_docs)
    return text_chunk

In [19]:
text_chunk = text_split(minimal_docs)
print(f'Number of the chunks: {len(text_chunk)}')

Number of the chunks: 5859


In [20]:
from langchain_community.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    '''Download and return the HuggingFace embedding model'''
    model_name = 'sentence-transformers/all-MiniLM-L6-v2'
    embeddings = HuggingFaceEmbeddings(
        model_name = model_name,
    )
    return embeddings

embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(


In [21]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [22]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [23]:
PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')
GROQ_API_KEY=os.getenv('GROQ_API_KEY')

In [24]:
os.environ['PINECONE_API_KEY']=PINECONE_API_KEY
os.environ['GROQ_API_KEY']=GROQ_API_KEY

In [25]:
from pinecone import Pinecone
pinecone_api_key = PINECONE_API_KEY

pinecone_client = Pinecone(api_key=pinecone_api_key)

In [26]:
pinecone_client

<pinecone.pinecone.Pinecone at 0x1df31def230>

In [27]:
from pinecone import ServerlessSpec

index_name = 'medial-chatbot'

if not pinecone_client.has_index(index_name):
    pinecone_client.create_index(
        name = index_name,
        dimension=384,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )

In [28]:
index = pinecone_client.Index(index_name)

In [29]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunk,
    embedding=embeddings,
    index_name=index_name
)

### How to add more data in existing PineCone Index

In [30]:
dummy_document = Document(
    page_content='This is a dummy document initialized to add into our existing pinecone db',
    metadata={'source': 'Self'}
)

In [31]:
docsearch.add_documents(documents=[dummy_document])

['8ade2664-7f81-4cf9-a710-9f069f45707a']

In [32]:
retriever = docsearch.as_retriever(search_type='similarity', search_kwargs={'k':3})

In [33]:
retrieved_docs = retriever.invoke('What is acne?')
retrieved_docs

[Document(id='214e567a-f5f1-4b56-81be-51e40dd1aad6', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='bbfe497d-b17f-4f88-835e-97ea45244979', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='c90e2d8c-9dd2-47fb-b934-3c08f8ab461b', metadata={'source': 'data\\Medical_book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26')]

In [None]:
from langchain_groq import ChatGroq

llm = ChatGroq(
    groq_api_key=GROQ_API_KEY,
    model_name="llama-3.1-8b-instant"
)

In [63]:
from langchain_classic.chains import create_retrieval_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [64]:
system_prompt = (
    'You are an Medical Assistant for the question answering tasks'
    'Use the following pieces of retrieved context to answer'
    'the question. If you do not know say that you do not know.'
    'Use three sentences maximum and keep the answer concise'
    '\n\n'
    '{context}'
)

prompt = ChatPromptTemplate.from_messages(
    [
        ('system', system_prompt),
        ('human', '{input}')
    ]
)

In [65]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [72]:
response=rag_chain.invoke({'input':'what is aids? what are the cuases of aids and its treatment'})
print(response['answer'])

AIDS (Acquired Immune Deficiency Syndrome) is a life-threatening condition caused by the human immunodeficiency virus (HIV) that attacks the body's immune system.

Causes of AIDS: 
The primary cause of AIDS is the HIV virus, which is transmitted through bodily fluids such as blood, semen, vaginal fluids, and breast milk.

Treatment of AIDS:
There is no cure for AIDS, but antiretroviral therapy (ART) can slow down the progression of the disease, improve the quality of life, and increase life expectancy. Treatment typically involves a combination of medications known as HIV medications or antiretrovirals.
