In [21]:
from pinecone import Pinecone, ServerlessSpec, Index
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as LangchainPinecone
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate

In [22]:
%pwd

'c:\\Users\\Pardhu\\OneDrive\\Desktop\\New folder'

In [23]:
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [24]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [25]:

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return text_splitter.split_documents(extracted_data)

In [26]:
extracted_data = load_pdf(data="data/")
text_chunks = text_split(extracted_data)
print(f"✅ Loaded and split {len(text_chunks)} chunks")

✅ Loaded and split 58889 chunks


In [27]:
def download_hugging_face_embeddings():
    return HuggingFaceEmbeddings(model_name="pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb")

embeddings = download_hugging_face_embeddings()

In [30]:
# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "medibot-index"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )
    index = pc.Index(index_name)


In [35]:
from langchain_pinecone import PineconeVectorStore

# Initialize PineconeVectorStore wrapper
docsearch = PineconeVectorStore(
    index=pc.Index(index_name),  # ✅ uses new v3+ Index object
    embedding=embeddings
)

# Upload documents in smaller batches to avoid payload size errors
batch_size = 25
for i in range(0, len(text_chunks), batch_size):
    batch = text_chunks[i:i + batch_size]
    try:
        docsearch.add_documents(batch)
        print(f"✅ Uploaded batch {i // batch_size + 1}")
    except Exception as e:
        print(f"❌ Failed batch {i // batch_size + 1}: {e}")


✅ Uploaded batch 1
✅ Uploaded batch 2
✅ Uploaded batch 3
✅ Uploaded batch 4
✅ Uploaded batch 5
✅ Uploaded batch 6
✅ Uploaded batch 7
✅ Uploaded batch 8
✅ Uploaded batch 9
✅ Uploaded batch 10
✅ Uploaded batch 11
✅ Uploaded batch 12
✅ Uploaded batch 13
✅ Uploaded batch 14
✅ Uploaded batch 15
✅ Uploaded batch 16
✅ Uploaded batch 17
✅ Uploaded batch 18
✅ Uploaded batch 19
✅ Uploaded batch 20
✅ Uploaded batch 21
✅ Uploaded batch 22
✅ Uploaded batch 23
✅ Uploaded batch 24
✅ Uploaded batch 25
✅ Uploaded batch 26
✅ Uploaded batch 27
✅ Uploaded batch 28
✅ Uploaded batch 29
✅ Uploaded batch 30
✅ Uploaded batch 31
✅ Uploaded batch 32
✅ Uploaded batch 33
✅ Uploaded batch 34
✅ Uploaded batch 35
✅ Uploaded batch 36
✅ Uploaded batch 37
✅ Uploaded batch 38
✅ Uploaded batch 39
✅ Uploaded batch 40
✅ Uploaded batch 41
✅ Uploaded batch 42
✅ Uploaded batch 43
✅ Uploaded batch 44
✅ Uploaded batch 45
✅ Uploaded batch 46
✅ Uploaded batch 47
✅ Uploaded batch 48
✅ Uploaded batch 49
✅ Uploaded batch 50
✅ Uploade

In [36]:
retriever = docsearch.as_retriever(search_type='similarity', search_kwargs={"k": 5})


In [37]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain.llms import HuggingFacePipeline

# Load tokenizer and model (no need for from_flax=True for PyTorch models)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up the text2text generation pipeline
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_length=512)

# Wrap in LangChain LLM interface
llm = HuggingFacePipeline(pipeline=qa_pipeline)


Device set to use cpu
  llm = HuggingFacePipeline(pipeline=qa_pipeline)


In [63]:
# Prompt template with clearer medical instructions
MAX_TOKENS = 512
from langchain.prompts import PromptTemplate

system_prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are a helpful and professional AI medical assistant. Use the following context to answer the user's question.
If you don't know the answer, say you don't know. Do not make up facts.

Context: {context}

Question: {question}

Answer:"""
)




In [73]:
# --- Ask user input interactively in Jupyter ---
user_input = input("Ask a medical question: ")

# --- Normalize input and handle greetings ---
normalized_input = user_input.strip().lower()
GREETINGS = {
    "hi": "Hello! How can I assist you today?",
    "hello": "Hi there! What medical question can I help with?",
    "how are you": "I'm just a helpful bot, always ready to assist you with medical info!",
    "thank you": "You're welcome! Let me know if you have more questions.",
    "thanks": "Glad I could help!"
}

for greeting in GREETINGS:
    if greeting in normalized_input:
        print("📋 Final Answer:\n", GREETINGS[greeting])
        exit()

# --- Retrieve relevant documents ---
retrieved_docs = retriever.get_relevant_documents(user_input)
context = "\n".join([doc.page_content for doc in retrieved_docs])

# --- Fallback if no context found ---
if not context.strip():
    print("📋 Final Answer:\n Sorry, I don't have enough information to answer this.")
else:
    # --- Chunk context if too long ---
    words = context.split()
    MAX_TOKENS = 512  # Token length already defined in your prompt/model
    context_chunks = [" ".join(words[i:i + MAX_TOKENS]) for i in range(0, len(words), MAX_TOKENS)]

    # --- Format prompt and generate answer ---
    responses = []
    for chunk in context_chunks:
        formatted_prompt = prompt.format(input=user_input, context=chunk)
        response = llm(formatted_prompt)
        responses.append(response)

    # --- Final Answer Formatting ---
    final_response = " ".join(responses).replace("Human:", "").strip()
    print(final_response)


Indigestion encompasses a range of complaints including nausea, vomiting, heartburn, regurgitation, and dyspepsia (symptoms 293 Nausea, Vomiting, and IndigestionCHAPTER 45 of ulcer, malignancy, or Mallory-Weiss tear.
