In [None]:
# Emit a fixed marker string; presumably a quick check that the kernel runs cells.
print("OK")

In [None]:
# IPython magic: show the kernel's current working directory.
%pwd

In [None]:
import os

# Move the kernel's working directory up one level.
# The starting directory is remembered the first time this cell runs, so that
# re-running the cell resolves "../" against the same base instead of climbing
# one more level on every execution.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../"))

In [None]:
# Show the working directory again to confirm the chdir took effect.
%pwd

In [None]:
# %pip install langchain-text-splitters
# %pip install langchain-community
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# %pip install pypdf
# Extract text from PDF files
def load_pdf_files(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory to scan; files are selected with the
            glob pattern ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for the matched files,
        each file being read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(
        data, glob="*.pdf", loader_cls=PyPDFLoader  # type: ignore[arg-type]
    )
    return pdf_loader.load()

In [None]:
# Load the PDFs from the "data" directory, resolved against the current working directory.
extracted_data = load_pdf_files("data")

In [None]:
# Preview only the first few loaded documents; echoing the whole list floods
# the cell output and bloats the saved notebook.
extracted_data[:3]

In [None]:
# Number of documents returned by the loader.
len(extracted_data)

In [None]:
from typing import List
from langchain_core.documents import Document
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Strip each document's metadata down to its ``source`` entry.

    Args:
        docs: Documents to copy.

    Returns:
        A new list, in the same order, of freshly built ``Document`` objects
        that keep the original ``page_content`` and carry a metadata dict
        holding only ``"source"`` (``None`` when the input had no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]

In [None]:
# Keep only page_content and the "source" metadata entry of every loaded document.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
# Preview only the first few trimmed documents; echoing the whole list floods
# the cell output and bloats the saved notebook.
minimal_docs[:3]

In [None]:
# Split the documents into smaller chunks
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into smaller, slightly overlapping chunks.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Chunk size handed to the splitter. Defaults to 500, the
            value that used to be hard-coded here.
        chunk_overlap: Overlap between neighbouring chunks handed to the
            splitter. Defaults to 20, the value that used to be hard-coded
            here.

    Returns:
        The list produced by
        ``RecursiveCharacterTextSplitter.split_documents`` for the input.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the trimmed documents and report how many chunks were produced.
texts_chunk = text_split(minimal_docs)
print(f"Number of chunks: {len(texts_chunk)}")

In [None]:
# Preview only the first few chunks; echoing the whole list floods the cell
# output and bloats the saved notebook.
texts_chunk[:3]

In [None]:
# %pip uninstall -y huggingface-hub transformers sentence-transformers
# %pip install -U --no-cache-dir "huggingface-hub>=0.34.0,<1.0" "transformers>=4.45.0" "sentence-transformers>=3.0.0"

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Use LangChain's wrapper for compatibility with PineconeVectorStore.
# NOTE(review): the Pinecone index in this notebook is created with
# dimension=384, so this model's output size presumably has to be 384 —
# confirm before swapping models.
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

In [None]:
# Display the embedding wrapper's repr to inspect its configuration.
embedding

In [None]:
# Embed two sample sentences as a smoke test and display the result.
# NOTE(review): the result presumably holds one embedding per input text — confirm.
vector = embedding.embed_documents(["Hello world", "How are you?"])
vector

In [None]:
# `vector` holds one embedding per input text, so len(vector) is the number of
# texts (2 here); the embedding size is the length of a single entry.
print("Vector length:", len(vector[0]))

In [None]:
from dotenv import load_dotenv
import os
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

In [None]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")

# os.environ only accepts str values, so a missing key would otherwise surface
# as an opaque TypeError on the assignments below. Fail early and name the
# variables that are missing instead.
_missing_keys = [
    name
    for name, value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("GOOGLE_API_KEY", GEMINI_API_KEY),
    )
    if not value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_keys)
        + ". Define them in the environment or in the .env file."
    )

# Re-export the keys under the names used by the rest of the notebook.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["GEMINI_API_KEY"] = GEMINI_API_KEY

# %pip install pinecone

In [None]:
from pinecone import Pinecone 
# Reuse the key read from the environment earlier in the notebook.
pinecone_api_key = PINECONE_API_KEY

# Client handle used by the following cells for index management.
pc = Pinecone(api_key=pinecone_api_key)

In [None]:
# Display the client object's repr.
pc

In [None]:
from pinecone import ServerlessSpec 

# Name shared by index creation and the vector-store cells that follow.
index_name = "medical-chatbot"

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt a second creation.
if not pc.has_index(index_name):
    pc.create_index(
        name = index_name,
        dimension=384,  # Dimension of the embeddings
        metric= "cosine",  # Cosine similarity
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )

# Handle to the index, whether it already existed or was just created.
index = pc.Index(index_name)
# install langchain-pinecone
# %pip install langchain-pinecone

In [None]:
from langchain_pinecone import PineconeVectorStore

# Embed the chunks and store them in the Pinecone index named above.
# NOTE(review): no ids are passed, so re-running this cell presumably stores
# the same chunks again under new ids — confirm, and run it once per corpus.
docsearch = PineconeVectorStore.from_documents(
    documents=texts_chunk,
    embedding=embedding,
    index_name=index_name
)

In [None]:
# Load the existing index

from langchain_pinecone import PineconeVectorStore
# Attach to the index that already exists instead of building it again.
# NOTE(review): presumably nothing is embedded or upserted by this call — confirm.
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embedding
)

# Add more data to the existing Pinecone index

In [None]:
# Hand-written extra document, used to demonstrate adding data to the index
# beyond what came from the PDFs.
bibhu = Document(
    page_content="BIbhu is student from NIT Rourkela. He is very enthusiastic to learn new things like programming, AI, and mlops.",
    metadata={"source": "Github"}
)

In [None]:
# Upsert under a fixed id so that re-running this cell overwrites the same
# record instead of adding another copy of the document to the index.
docsearch.add_documents(documents=[bibhu], ids=["bibhu-profile"])

In [None]:
# Similarity-search retriever over the vector store, configured with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
# Sanity-check retrieval on a sample question and display what comes back.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model used to generate the answers.
# NOTE(review): no API key is passed here; presumably the client picks it up
# from the environment — confirm which variable name it actually reads.
chatModel = ChatGoogleGenerativeAI(model="gemini-2.5-flash")

In [None]:
import importlib.util
import sys

# Environment diagnostic: where (if anywhere) `langchain` would be imported
# from, and which interpreter this kernel runs on.
# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded, so it is imported explicitly.
spec = importlib.util.find_spec("langchain")
print("langchain spec:", spec)
print("sys.executable:", sys.executable)

In [None]:
# Dependency repair, kept commented out like the other install lines in this
# notebook: reinstalling packages in the middle of a run leaves the modules
# that are already imported stale until the kernel is restarted. If the
# environment needs fixing, uncomment, run once, then restart the kernel.
# %pip uninstall -y langchain langchain-core langchain-community
# %pip install -U langchain langchain-core langchain-community

In [None]:
# Environment diagnostic: print the version and on-disk location of each
# LangChain package the kernel actually imports.
import langchain, langchain_core, langchain_community
print("langchain:", langchain.__version__, langchain.__file__)
print("langchain_core:", langchain_core.__version__, langchain_core.__file__)
print("langchain_community:", langchain_community.__version__, langchain_community.__file__)

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System instructions for the model, assembled from adjacent string literals.
# "{context}" is a template placeholder; presumably the chain fills it with
# the retrieved documents — confirm against the chain's documentation.
system_prompt = (
    "You are an Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions, then the human turn
# carrying the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:
# Build the answer-generation chain from the chat model and the prompt, then
# combine it with the retriever into the chain that is queried below.
question_answer_chain = create_stuff_documents_chain(chatModel, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Ask the chain a question and print only the "answer" field of the response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

In [None]:
# Same query pattern with a different question; `response` is overwritten.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

In [None]:
# Follow-up question on the same topic; `response` is overwritten again.
response = rag_chain.invoke({"input": "what is the Treatment of Acne?"})
print(response["answer"])

In [None]:
# NOTE(review): "dswithbappy" does not look like a medical term — presumably an
# out-of-corpus probe for the prompt's "say that you don't know" instruction;
# confirm the intent.
response = rag_chain.invoke({"input": "what dswithbappy?"})
print(response["answer"])