# Medical Chatbot Development Notebook
## Using LangChain 1.0 + Groq + Pinecone

In [3]:
print("ok")

ok


In [4]:
%pwd

'd:\\code\\1-Github\\AI Medical Chatbot Pro\\research'

In [5]:
import os
from pathlib import Path

# The kernel starts in <project>/research, while later cells address the
# 'data' folder relative to the project root. Step up one level only while the
# working directory is still research/, so re-running this cell is a no-op
# instead of climbing one directory higher on every execution.
if Path.cwd().name == "research":
    os.chdir(Path.cwd().parent)

In [6]:
%pwd

'd:\\code\\1-Github\\AI Medical Chatbot Pro'

In [7]:
# Modern imports - Compatible with LangChain 1.0+
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
def load_pdf_files(data):
    """Load the PDF files found in `data` and return the resulting documents.

    `data` is the directory given to DirectoryLoader; files matching the
    glob "*.pdf" are parsed with PyPDFLoader.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [9]:
# Load the PDFs from the 'data' folder (a relative path, so it is resolved
# against the current working directory) and report how many documents the
# loader returned.
extracted_docs = load_pdf_files('data')
print(f"Loaded {len(extracted_docs)} documents")

Loaded 637 documents


In [10]:
from typing import List
from langchain_core.documents import Document

def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    """Return new documents whose metadata holds only the 'source' entry.

    Page content is carried over unchanged. A document without a 'source'
    key ends up with ``{'source': None}``.
    """
    return [
        Document(
            page_content=original.page_content,
            metadata={'source': original.metadata.get('source')},
        )
        for original in docs
    ]

In [11]:
# Strip every metadata field except 'source' from the loaded documents and
# report how many documents remain (the count is unchanged by the filter).
minimal_docs = filter_to_minimal_docs(extracted_docs)
print(f"Filtered to {len(minimal_docs)} minimal documents")

Filtered to 637 minimal documents


In [12]:
def text_split(minimal_docs, chunk_size=500, chunk_overlap=20):
    """Split documents into overlapping text chunks.

    Args:
        minimal_docs: Documents to split; passed to ``split_documents``.
        chunk_size: Chunk size handed to RecursiveCharacterTextSplitter.
            Defaults to 500, the value that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 20,
            the value that used to be hard-coded here.

    Returns:
        The chunk documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(minimal_docs)

In [13]:
# Break the metadata-trimmed documents into chunks and report the chunk count.
texts_chunk = text_split(minimal_docs)
print(f"Split into {len(texts_chunk)} chunks")

Split into 5859 chunks


In [14]:
# Initialize embeddings model
from langchain_community.embeddings import HuggingFaceEmbeddings

def download_embeddings():
    """Build and return the HuggingFace embeddings wrapper for all-MiniLM-L6-v2."""
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Instantiate the embedding model once; later cells reuse `embedding` for the
# dimension check and for the Pinecone vector store.
embedding = download_embeddings()
print("✅ Embeddings model loaded")

  embeddings = HuggingFaceEmbeddings(model_name=model_name)


✅ Embeddings model loaded


In [15]:
# Test embeddings: embed a short string and print the vector length.
# The Pinecone index below is declared with dimension=384, so the value
# printed here should match it.
vectors = embedding.embed_query("Hello world")
print(f"Vector dimension: {len(vectors)}")

Vector dimension: 384


In [16]:
from dotenv import load_dotenv
import os

# Load variables from a .env file into the process environment so the API keys
# can be read with os.getenv in the next cell.
load_dotenv()
print("✅ Environment variables loaded")

✅ Environment variables loaded


In [17]:
# Read both API keys from the environment instead of hard-coding them
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

# Report, per key, whether a non-empty value was found
if PINECONE_API_KEY:
    print("✅ Pinecone API key loaded")
else:
    print("❌ PINECONE_API_KEY not found")

if GROQ_API_KEY:
    print("✅ Groq API key loaded")
else:
    print("❌ GROQ_API_KEY not found")

✅ Pinecone API key loaded
✅ Groq API key loaded


In [18]:
from pinecone import Pinecone

# Create the Pinecone client with the API key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)
print("✅ Connected to Pinecone")

✅ Connected to Pinecone


In [19]:
from pinecone import ServerlessSpec

index_name = "medical-chatbot"

# Reuse the index when it is already there; otherwise create a serverless
# cosine index sized for the 384-dimensional embedding vectors.
if pc.has_index(index_name):
    print(f"✅ Index '{index_name}' already exists")
else:
    print(f"Creating index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("✅ Index created")

# Handle to the index, whichever branch ran
index = pc.Index(index_name)

✅ Index 'medical-chatbot' already exists


## Check if Documents Already Exist

In [20]:
from langchain_pinecone import PineconeVectorStore

# Ask Pinecone how many vectors the index currently holds
index_stats = pc.Index(index_name).describe_index_stats()
current_count = index_stats.get('total_vector_count', 0)

print(f"Current vectors in index: {current_count}")

if current_count != 0:
    # Index is already populated: attach to it without embedding again
    print("Vector store exists. Loading...")
    docsearch = PineconeVectorStore.from_existing_index(
        index_name=index_name,
        embedding=embedding,
    )
    print(f"✅ Loaded existing store with {current_count} vectors")
else:
    # Empty index: embed every chunk and upload it
    print("Creating vector store (first time)...")
    docsearch = PineconeVectorStore.from_documents(
        documents=texts_chunk,
        embedding=embedding,
        index_name=index_name,
    )
    print(f"✅ Created {len(texts_chunk)} vectors")

Creating vector store...
✅ Vector store created


In [21]:
# OR load existing vector store
# Alternative to the previous cell: attach to the index without checking its
# vector count. When run after the previous cell it simply rebinds `docsearch`,
# repeating that cell's "already has data" branch.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_existing_index(
    embedding=embedding,
    index_name=index_name
)
print("✅ Loaded existing vector store")

✅ Loaded existing vector store


In [22]:
# Expose the vector store as a retriever: similarity search that returns
# 3 results per query (k=3).
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)
print("✅ Retriever created")

✅ Retriever created


In [23]:
# Test retrieval: fetch the chunks for a sample question and preview the first
# 200 characters of each one.
# NOTE(review): the recorded output shows the same preview for all three
# results, which suggests the index may hold duplicate vectors (e.g. the same
# chunks uploaded more than once) — confirm and de-duplicate the index.
retrieved_docs = retriever.invoke("What is Acne?")
print(f"Retrieved {len(retrieved_docs)} documents")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"\nDoc {i}: {doc.page_content[:200]}...")

Retrieved 3 documents

Doc 1: GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26...

Doc 2: GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26...

Doc 3: GALE ENCYCLOPEDIA OF MEDICINE 226
Acne
GEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26...


In [None]:
import os
from dotenv import load_dotenv

# Reload the .env file and re-read the Groq key. This repeats what the earlier
# dotenv and API-key cells already did, so GROQ_API_KEY is simply rebound.
load_dotenv()
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

In [None]:
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# The previous value, "x", is a placeholder rather than a model id served by
# Groq, so requests sent through the chain would be rejected. Default to a
# current Groq chat model and allow an override through the GROQ_MODEL
# environment variable, so the notebook does not need editing when Groq
# retires a model.
chatModel = ChatGroq(
    model_name=os.getenv("GROQ_MODEL", "llama-3.3-70b-versatile"),
    groq_api_key=GROQ_API_KEY,
    temperature=0.7
)

print("✅ Groq LLM initialized")

In [None]:




# First-pass prompt: instructs the model to answer from the supplied context
# only. Placeholders: {context} (retrieved text) and {question} (user query).
# A later cell rebinds `prompt` to the system/human chat prompt.
prompt = ChatPromptTemplate.from_template("""
Use ONLY the following context to answer the question.

Context:
{context}

Question: {question}

Answer in a helpful and clear way.
""")

def format_docs(docs):
    """Concatenate the page contents of `docs`, separated by a blank line."""
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)

# LCEL pipeline: the incoming question is sent to the retriever, whose
# documents are flattened by format_docs into {context}; the same question is
# passed through unchanged as {question}. The filled prompt goes to the chat
# model and the reply is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chatModel
    | StrOutputParser()
)

# Smoke test of the chain with a generic question.
response = rag_chain.invoke("What is mentioned in the PDF?")
print(response)


✅ Groq LLM initialized
The provided context appears to be a table of contents from the "GALE ENCYCLOPEDIA OF MEDICINE 2V". It mentions the following:

1. **Volume 5: T-Z**: This suggests that the encyclopedia covers topics starting from the letter T to Z, and this section starts on page 3237.
2. **Organizations**: A section dedicated to listing organizations, which can be found starting on page 3603.
3. **General Index**: A comprehensive index of the encyclopedia, starting on page 3625.

These are the main items mentioned in the given context.


In [36]:
# System message for the chatbot: answer from the retrieved context, admit when
# the answer is unknown, and stay within three sentences. {context} is the
# placeholder that receives the retrieved text.
system_prompt = (
    "You are a Medical assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

# Chat prompt made of the system message plus the user's text as {input}.
# This rebinds `prompt`, replacing the template from the earlier cell.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

print("✅ Prompt created")

✅ Prompt created


In [45]:

def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Redefinition of the helper from the earlier prompt cell, with the same
    behavior: page contents joined by a blank line.
    """
    separator = "\n\n"
    return separator.join([doc.page_content for doc in docs])


# Same pipeline shape as the earlier chain, but the user's text is passed
# through as {input} (the placeholder used by the system/human chat prompt)
# instead of {question}.
question_answer_chain = (
    {
        "context": retriever | format_docs,
        "input": RunnablePassthrough()
    }
    | prompt
    | chatModel
    | StrOutputParser()
)


# Rebind rag_chain so the cells below use the system-prompt version.
rag_chain = question_answer_chain

print("✅ RAG chain ready!")


✅ RAG chain ready!


In [46]:
# Test the chatbot: send one medical question through the chain and print the
# returned answer string.
response = rag_chain.invoke("what is Acromegaly and gigantism?")
print("\nAnswer:")
print(response)



Answer:
Acromegaly is a disorder caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue. It results in various disturbances throughout the body. The specific chemical is not mentioned in the context, but it implies a hormonal imbalance affecting growth and development.


In [47]:
def ask_question(question):
    """Send `question` through `rag_chain`, print the Q/A pair, and return the answer."""
    response = rag_chain.invoke(question)
    # A single print with a newline separator emits the same text as two calls.
    print(f"\nQ: {question}", f"A: {response}\n", sep="\n")
    return response

# Try it! ask_question prints the exchange itself; because the call is the
# cell's last expression, the returned answer string is echoed a second time
# as the cell output.
ask_question("What are the symptoms of diabetes?")



Q: What are the symptoms of diabetes?
A: The symptoms of diabetes include fatigue and an abnormally high level of glucose in the blood, also known as hyperglycemia. Additionally, if left untreated, diabetes can cause damage or failure to various body organs such as the eyes, kidneys, nerves, heart, and blood vessels. Early diagnosis is crucial to prevent these complications.



'The symptoms of diabetes include fatigue and an abnormally high level of glucose in the blood, also known as hyperglycemia. Additionally, if left untreated, diabetes can cause damage or failure to various body organs such as the eyes, kidneys, nerves, heart, and blood vessels. Early diagnosis is crucial to prevent these complications.'