# Importing some packages and setting params

In [53]:
import os
from dotenv import load_dotenv

from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import ChatOpenAI, OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

from pinecone import Pinecone, ServerlessSpec

# Model identifier and API endpoint handed to ChatOpenAI further down.
MODEL = "meta-llama/llama-3.3-70b-instruct:free"
BASE_URL = "https://openrouter.ai/api/v1"

# Load variables from a .env file into the process environment, then read the
# two secrets. Each one is None when its variable is not set.
_ = load_dotenv()
LLM_API = os.environ.get("LLM_API")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Loading data

In [9]:
def load_pdf_file(data):
    """Load the PDF files matched by ``*.pdf`` under a directory.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matched file is
        read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [11]:
# Load the PDFs from the data directory one level above the working directory.
extracted_data = load_pdf_file('../data/')

In [12]:
# Number of items returned by the loader.
len(extracted_data)

4505

In [13]:
def get_text_chunks(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller, slightly overlapping chunks.

    Args:
        extracted_data: Documents to split, e.g. the result of
            ``load_pdf_file``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value that used to be hard-coded here.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents`` for
        ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [14]:
# Split the loaded documents with the default chunk settings and show how many chunks result.
text_chunks = get_text_chunks(extracted_data)
len(text_chunks)

40000

# Vectorizing text

In [16]:
def download_hf_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding object used to vectorize text.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

In [17]:
# Instantiate the embedding model with the default model name.
embeddings = download_hf_embeddings()

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


In [18]:
# Sanity check: embed a short string and inspect the vector length.
# This length must equal the `dimension` of the Pinecone index created below.
query_result = embeddings.embed_query('Hello world!')
len(query_result)

384

# Sending data to Pinecone

In [None]:
index_name = "medical-bot"

pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet, so re-running the cell
# does not attempt to create it a second time.
if not pc.has_index(index_name):
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,  # length of the vectors produced by `embeddings` (see the check above)
        metric="cosine",
        spec=serverless_spec,
    )

In [27]:
# Upload the chunks only while the index is still empty. Calling
# `from_documents` unconditionally embeds and upserts every chunk again on
# each run of this cell, which is slow and (with auto-generated ids) appears
# to add duplicate vectors to the index.
# NOTE(review): a partially populated index is treated as "already loaded";
# clear the index manually if an earlier upload was interrupted.
index_stats = pc.Index(index_name).describe_index_stats()

if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        index_name=index_name,
        embedding=embeddings,
    )
else:
    # Vectors are already stored: attach to the existing index instead.
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embeddings,
    )

In [None]:
# docsearch = PineconeVectorStore.from_existing_index(
#     index_name=index_name,
#     embedding=embeddings
# )

In [28]:
# Similarity retriever returning the 3 closest chunks, followed by a
# smoke-test query whose results are displayed as the cell output.
retriever = docsearch.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 3},
)
retrieved_docs = retriever.invoke('What is Acne?')
retrieved_docs

[Document(id='4cc333f0-4759-46d5-9c5c-95b22e26757f', metadata={'creationdate': 'D:20251225102511', 'creator': 'PDFium', 'page': 55.0, 'page_label': '56', 'producer': 'PDFium', 'source': '..\\data\\The_Gale_Encyclopedia_of_Medicine.pdf', 'total_pages': 4505.0}, page_content='Researchers, Inc. Reproduced by permission.)\n26 GALE ENCYCLOPEDIA OF MEDICINE\nAcne'),
 Document(id='d809e57e-6c10-4ca6-8248-e06968553c74', metadata={'creationdate': 'D:20251225102511', 'creator': 'PDFium', 'page': 55.0, 'page_label': '56', 'producer': 'PDFium', 'source': '..\\data\\The_Gale_Encyclopedia_of_Medicine.pdf', 'total_pages': 4505.0}, page_content='Sebaceous follicles— A structure found within the\nskin that houses the oil-producing glands and hair\nfollicles, where pimples form.\nSebum— An oily skin moisturizer produced by\nsebaceous glands.\nTretinoin— A drug that works by increasing the\nturnover (death and replacement) of skin cells.\nAcne vulgaris affecting a woman’s face. Acne is the general\nname 

# Applying LLM

In [57]:
# Chat model reached through the endpoint in BASE_URL, authenticated with
# LLM_API; replies are limited to 500 tokens.
llm = ChatOpenAI(
    model=MODEL,
    base_url=BASE_URL,
    api_key=LLM_API,
    max_tokens=500,
    temperature=0.4,
)

# llm = OpenAI(
#     base_url=BASE_URL,
#     api_key=LLM_API,
#     model=MODEL,
#     temperature=0.4,
#     max_tokens=500
# )

In [58]:
# Adjacent string literals are concatenated with no separator, so every
# fragment except the last ends with an explicit space; without it the words
# at the seams run together (e.g. "tasks.Use", "youdon't").
# `{context}` is a template placeholder filled in when the prompt is formatted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "a question, if you don't know the answer, say that you "
    "don't know. Use 3 sentences at max to answer and be as "
    "concise as possible.\n\n{context}"
)

# Two-message chat template: the system instructions (which contain the
# {context} placeholder) followed by the user's question as {input}.
chat_messages = [
    ('system', system_prompt),
    ('human', '{input}'),
]
prompt = ChatPromptTemplate.from_messages(chat_messages)

In [59]:
def _format_docs(docs):
    """Join the page text of the retrieved documents, separated by blank lines."""
    return "\n\n".join(doc.page_content for doc in docs)


# The retriever returns Document objects. Piping them through `_format_docs`
# puts only their text into {context}; interpolating the raw list would insert
# its Python repr (ids and metadata included) into the system prompt.
rag_chain = (
    {
        "context": retriever | _format_docs,  # Step 1a: retrieve chunks and flatten them to text
        "input": RunnablePassthrough(),       # Step 1b: pass the question through unchanged
    }
    | prompt             # Step 2: fill the template with context + input
    | llm                # Step 3: call the LLM
    | StrOutputParser()  # Step 4: extract the answer string from the LLM response
)

In [61]:
# Run the whole chain (retrieve -> prompt -> LLM -> parse) on a sample question.
response = rag_chain.invoke('What is Acne?')
print(response)  # the chain ends in StrOutputParser, so this is the answer text

Acne is a common skin disease characterized by pimples on the face, chest, and back. It occurs when the pores of the skin become clogged with oil, dead skin cells, and bacteria. Acne vulgaris is the medical term for common acne.
