In [None]:
import langchain
import langchain_core

# Report the installed versions of both packages, one per line.
for _module in (langchain, langchain_core):
    print(_module.__version__)

# Environment smoke test: these imports are here only to confirm that the
# packages resolve; if any of them fails, the final message is never printed.
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.documents import Document

print("✅ All imports working")


In [None]:
# Show the kernel's current working directory.
%pwd

In [None]:
import os

# Move the working directory one level up from where the notebook started.
# The starting directory is recorded only the first time this cell runs, so
# re-running the cell always lands in the same place instead of climbing one
# more level on every execution.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [None]:
# Show the working directory again to confirm the directory change.
%pwd

In [None]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan; files matching ``*.pdf`` are
            read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory
        (the loaded documents).
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [None]:
# Load every PDF from the project's data folder.
# The Windows path is a raw string so its backslashes are taken literally;
# in a normal string "\M" and "\d" are invalid escape sequences, which newer
# Python versions warn about.
# NOTE(review): this absolute path is machine-specific — consider a path
# relative to the project root instead.
extracted_data = load_pdf_files(r"E:\Medical Chatbot\Medical-Chatbot\data")


In [None]:
# Preview only the first few loaded documents; displaying the whole list
# would write every loaded document into the notebook output.
extracted_data[:3]

In [None]:
# Number of documents returned by the loader.
len(extracted_data)

In [None]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Strip each document down to its text and its 'source' metadata.

    Args:
        docs: LangChain ``Document`` objects to reduce.

    Returns:
        A new list with one fresh ``Document`` per input, carrying the
        original ``page_content`` and a metadata dict whose only key is
        ``"source"`` (``None`` when the input had no such key).
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={"source": original.metadata.get("source")},
        )
        for original in docs
    ]


In [None]:
# Reduce the loaded documents to page text plus their 'source' metadata.
minimal_docs = filter_to_minimal_docs(extracted_data)

In [None]:
# Preview only the first few reduced documents rather than the whole list,
# which would write every document into the notebook output.
minimal_docs[:3]

In [None]:
# Split the documents into smaller chunks.

def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        minimal_docs: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (defaults to 500, the value previously hard-coded here).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (defaults to 20, the value previously hard-coded here).

    Returns:
        The chunks produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [None]:
# Chunk the reduced documents and report how many chunks were produced.
text_chunks = text_split(minimal_docs)
print(f"Number of chunks: {len(text_chunks)}")

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Build the embedding model used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; later cells reuse this object.
embedding = download_embeddings()

In [None]:
# Display the embedding object's representation.
embedding

In [None]:
# Embed a sample sentence as a quick check that the model works.
vector = embedding.embed_query("Hello world")

In [None]:
# Display the sample embedding.
vector

In [None]:
# The embedding length should agree with the `dimension` used when the
# Pinecone index is created (384 in this notebook).
print("Vector length: ", len(vector))

In [None]:
from dotenv import load_dotenv
import os

# Pull variables from the .env file into the process environment.
load_dotenv()

# Read both keys; either may be None if the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast when the Pinecone key is missing or empty.
if PINECONE_API_KEY is None or PINECONE_API_KEY == "":
    raise ValueError("PINECONE_API_KEY not found in .env")


In [None]:
from pinecone import Pinecone, ServerlessSpec

# Create the Pinecone client with the key loaded from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)


In [None]:
# Display the client object's representation.
pc

In [None]:
# Name of the Pinecone index that backs the chatbot.
index_name = "medical-chatbot"

# Build the client straight from the environment variable.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Names of the indexes that already exist for this client.
existing_indexes = [idx.name for idx in pc.list_indexes()]

# Create the index only when it is missing, so re-running the cell is safe.
index_is_missing = index_name not in existing_indexes
if index_is_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=serverless_spec,
    )

# Handle used by later cells to read from and write to the index.
index = pc.Index(index_name)
print("Index ready:", index_name)


In [None]:
from langchain_pinecone import PineconeVectorStore

# Wrap the Pinecone index in a LangChain vector store that embeds with
# `embedding` and uses "text" as the text key.
docsearch = PineconeVectorStore(
    index=index,
    embedding=embedding,
    text_key="text"
)

print("✅ Vector store initialized")


In [None]:
import hashlib
import time
from tenacity import retry, stop_after_attempt, wait_exponential


@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=2, min=4, max=10),
    reraise=True
)
def upload_batch(batch, docsearch, ids=None):
    """Upload one batch of documents, retrying up to three attempts.

    Args:
        batch: Documents to add to the vector store.
        docsearch: Vector store exposing ``add_documents``.
        ids: Optional list of ids, one per document in ``batch``. When
            given, it is forwarded to ``add_documents`` so that a retry
            (or a later re-run) writes to the same ids instead of creating
            additional vectors.

    Raises:
        The last exception from ``add_documents`` once all attempts have
        failed (``reraise=True``).
    """
    if ids is None:
        docsearch.add_documents(batch)
    else:
        docsearch.add_documents(batch, ids=ids)


def make_chunk_id(position, chunk):
    """Return a deterministic id built from a chunk's position, source and text."""
    fingerprint = f"{position}|{chunk.metadata.get('source')}|{chunk.page_content}"
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


batch_size = 20
failed_batches = []  # (start, end) chunk ranges that could not be uploaded

for i in range(0, len(text_chunks), batch_size):
    batch = text_chunks[i:i + batch_size]
    batch_ids = [
        make_chunk_id(i + offset, chunk) for offset, chunk in enumerate(batch)
    ]

    try:
        upload_batch(batch, docsearch, ids=batch_ids)
        print(f"Uploaded {i + len(batch)} / {len(text_chunks)}")
    except Exception as e:
        # Best effort: keep going, but remember exactly which range failed.
        failed_batches.append((i, i + len(batch)))
        print(f"❌ Failed batch {i}-{i + len(batch)}:", e)
        continue

    # Pause between batches to prevent rate limits; no need after the last one.
    if i + batch_size < len(text_chunks):
        time.sleep(2)

if failed_batches:
    print(f"⚠️ {len(failed_batches)} batch(es) were not uploaded: {failed_batches}")


In [None]:
# Build one extra document by hand to add to the index alongside the PDF
# chunks.
# NOTE(review): the content is about programming tutorials, not medicine —
# presumably a demo of adding data; confirm it belongs in this index.
tech_doc = Document(
    page_content="CodeMastery offers step-by-step guides on Python, JavaScript, and web development.",
    metadata={"source": "Website"}
)


In [None]:
# Upsert under a fixed id so that re-running this cell overwrites the same
# vector instead of adding another copy of the document to the index.
docsearch.add_documents(documents=[tech_doc], ids=["codemastery-website"])

In [None]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [None]:
# Retrieval sanity check: run one query and print the text of every hit.
retrieved_docs = retriever.invoke("What is Acne?")
print(f"Found {len(retrieved_docs)} documents")
for d in retrieved_docs:
    print(d.page_content)


In [None]:
# Same query again, this time displaying the returned objects themselves
# rather than only their text.
retrieved_docs = retriever.invoke("What is Acne?")
retrieved_docs

In [None]:
from dotenv import load_dotenv
import os
from langchain_openai import AzureChatOpenAI

load_dotenv()

# Azure OpenAI connection details are read from the environment.
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
azure_api_key = os.getenv("AZURE_OPENAI_API_KEY")

# Chat model used for answer generation in the cells below.
chat_model = AzureChatOpenAI(
    azure_deployment="gpt-5.2-chat",
    api_version="2024-02-15-preview",
    azure_endpoint=azure_endpoint,
    api_key=azure_api_key,
    temperature=1,
)

# Smoke test: send one prompt and print the text of the reply.
response = chat_model.invoke("Explain LangChain in simple words")
print(response.content)


In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# System instructions for the assistant: the sentences are joined with single
# spaces, followed by a blank line and the {context} placeholder.
_instruction_sentences = [
    "You are an Medical assistant for question-answering tasks.",
    "Use the following pieces of retrieved context to answer the question.",
    "If you don't know the answer, say that you don't know.",
    "Use three sentences maximum and keep the answer concise.",
]
system_prompt = " ".join(_instruction_sentences) + "\n\n" + "{context}"


# Two-message chat template: the system instructions (containing the
# {context} placeholder) followed by the user's question as {input}.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [None]:

# Build the answer-generation chain from the chat model and prompt, then
# combine it with the retriever to form the full RAG chain.
question_answer_chain = create_stuff_documents_chain(chat_model, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
# Inspect what the retriever returns for this question before asking the
# full chain. `invoke` is used, as in the other retrieval cells of this
# notebook, in place of the deprecated `get_relevant_documents`.
docs = retriever.invoke("what is Acromegaly and gigantism?")
print(f"Found {len(docs)} documents")
for d in docs:
    print(d.page_content)

In [None]:
# Ask the RAG chain a question and print the "answer" field of its response.
response = rag_chain.invoke({"input": "what is Acromegaly and gigantism?"})
print(response["answer"])

In [None]:
# Ask the RAG chain about acne and print the "answer" field of its response.
response = rag_chain.invoke({"input": "what is Acne?"})
print(response["answer"])

In [None]:
# Ask the RAG chain about acne treatment and print the "answer" field.
response = rag_chain.invoke({"input": "what is the Treatment of Acne?"})
print(response["answer"])

In [None]:
# Ask a question that looks unrelated to the medical material.
# NOTE(review): presumably this checks the prompt's "say that you don't
# know" instruction — confirm the intent.
response = rag_chain.invoke({"input": "what is *The Secret* book?"})
print(response["answer"])