In [8]:
import os
os.chdir("../")
%pwd

'C:\\Users\\LENOVO\\OneDrive\\Desktop\\Remedy-Relay'

In [9]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
def load_pdf_files(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    
    document = loader.load()
    return document

In [11]:
extracted_data = load_pdf_files("data")

In [13]:
len(extracted_data)

637

In [14]:
#filtering metadata

from typing import List
from langchain.schema import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(Document(page_content=doc.page_content, metadata={"source": src}))
    return minimal_docs

In [15]:
minimal_docs = filter_to_minimal_docs(extracted_data)

In [17]:
#split data into chunks

def text_split(minimal_docs):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    texts_chunk = text_splitter.split_documents(minimal_docs)
    return texts_chunk

In [18]:
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

Number of chunks: 5859


In [19]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

def download_embeddings():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)
    return embeddings

In [20]:
embeddings = download_embeddings()

  embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)
  from .autonotebook import tqdm as notebook_tqdm


In [21]:
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

In [None]:
vector = embeddings.embed_query("Hello World")
len(vector)

384

In [55]:
from dotenv import load_dotenv
load_dotenv()

True

In [56]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [27]:
from pinecone import Pinecone

pinecone_api_key = PINECONE_API_KEY

pc = Pinecone(api_key=pinecone_api_key)

In [29]:
pc

<pinecone.pinecone.Pinecone at 0x16e0c3506e0>

In [34]:
from pinecone import ServerlessSpec

index_name = "remedy-relay"

if not pc.has_index(index_name):
    pc.create_index(name = index_name, dimension=384, metric="cosine", 
    spec=ServerlessSpec(cloud="aws", region="us-east-1"))

index = pc.Index(index_name)

In [35]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(documents = texts_chunk, embedding=embeddings, index_name=index_name)

In [36]:
#load index
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [38]:
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [40]:
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

[Document(id='42edad28-e96a-4ddd-957d-a7474260a8f3', metadata={'source': 'data\\book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='4a478c5f-81b8-40af-a4fd-8326fe51e6b4', metadata={'source': 'data\\book.pdf'}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 25'),
 Document(id='6261a8ef-d916-4b89-a9fd-5acdaf34c49e', metadata={'source': 'data\\book.pdf'}, page_content='Acidosis see Respiratory acidosis; Renal\ntubular acidosis; Metabolic acidosis\nAcne\nDefinition\nAcne is a common skin disease characterized by\npimples on the face, chest, and back. It occurs when the\npores of the skin become clogged with oil, dead skin\ncells

In [63]:
from langchain_google_genai import ChatGoogleGenerativeAI
chatModel = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

In [64]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate

In [65]:
system_prompt = (
    "You are a Medical assistant for question-answering tasks."
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the"
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}")
])

In [66]:
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [68]:
response = rag_chain.invoke({"input": "What is treatment for Acne?"})
print(response["answer"])

To treat acne, shampoo often and keep hair off the face. Eat a well-balanced diet and avoid foods that trigger flare-ups. Unless told otherwise, give dry pimples a limited amount of sun exposure, and do not pick or squeeze blemishes.
