<a href="https://colab.research.google.com/github/Shashwat-Manglam-Jain/medical-chatbot-and-document-analyser/blob/main/medical_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install langchain-community
!pip install langchain-huggingface
!pip install pypdf
!pip install faiss-cpu



In [7]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from transformers import AutoTokenizer
import spacy
import re

# Load spaCy model for NLP preprocessing
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Preprocess the text by removing extra spaces, lemmatizing, and removing stopwords."""
    text = re.sub(r'\s+', ' ', text).strip()
    doc = nlp(text)
    processed_text = ' '.join([token.lemma_ for token in doc if not token.is_stop])
    return processed_text.lower()

def safe_chunk_size(text, max_tokens=512):
    """Trim text so that its token count doesn't exceed the model's max token limit."""
    tokens = tokenizer.encode(text, truncation=False)
    if len(tokens) > max_tokens:
        text = tokenizer.decode(tokens[:max_tokens])
    return text

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# Step 1: Load both PDFs
loader1 = PyPDFLoader("/content/The_Merck_Manual_of_Diagnosis_and_Therapy_2011 - 19th Edn........pdf")
loader2 = PyPDFLoader("/content/c46528ba033a8197e32c40887c398198.pdf")
docs = loader1.load_and_split() + loader2.load_and_split()

# Step 2: Preprocess and split documents
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

chunks = []
for doc in docs:
    cleaned_text = preprocess_text(doc.page_content)
    temp_doc = Document(page_content=cleaned_text, metadata=doc.metadata)
    split_chunks = splitter.split_documents([temp_doc])
    chunks.extend(split_chunks)

# Step 3: Ensure each chunk respects token limits
adjusted_chunks = []
for chunk in chunks:
    trimmed_text = safe_chunk_size(chunk.page_content)
    adjusted_chunks.append(Document(page_content=trimmed_text, metadata=chunk.metadata))

# Step 4: Create embeddings and FAISS index
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
db = FAISS.from_documents(adjusted_chunks, embeddings)

# Step 5: Save FAISS index locally
db.save_local("faiss_index")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
import shutil

# Compress the folder into a zip file
shutil.make_archive('faiss_index', 'zip', 'faiss_index')


'/content/faiss_index.zip'

In [9]:
# Step 1: Imports
from langchain_community.vectorstores import FAISS
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFacePipeline

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline



In [10]:
# Load embeddings (same model used during index creation)
emb = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load FAISS vector store
db = FAISS.load_local("/content/faiss_index", emb, allow_dangerous_deserialization=True)


In [15]:
# Load tokenizer and model for FLAN-T5
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

# Use the correct task for FLAN models
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,  # increased token count
    min_length=250,
    temperature=0.7,     # higher temperature for more creativity
    do_sample=True
)



# Wrap it for LangChain
llm = HuggingFacePipeline(pipeline=pipe)

Device set to use cuda:0


In [16]:
retriever = db.as_retriever(search_kwargs={"k": 4})  # tweak `k` if needed
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)


In [13]:
docs = retriever.get_relevant_documents("What is breast cancer?")
for i, doc in enumerate(docs):
    print(f"--- Chunk {i+1} ---\n{doc.page_content}\n")


--- Chunk 1 ---
breast cell duct lobule . patient present asymptomatic lump discover examination screen mammography . diagnosis confirm biopsy . treatment usually include surgical excision , radiation therapy , adjuvant chemotherapy , hormonal therapy , . 213,000 new case identify 2006 . 2nd lead cause cancer death woman ( lung cancer ) , 41,000 death 2006 . male breast cancer account < 1 % total case ; manifestation , diagnosis , management , man tend present later . risk factors , cumulative risk develop

--- Chunk 2 ---
diagnosis breast cancer , history radiation therapy chest area age 30 ( eg , hodgkin lymphoma ) . family history note breast cancer 1st - degree relative ( mother , sister , daughter ) , family history positive , relative carry 2 know breast cancer gene , brca1 brca2 . physical examination : examination focus breast adjacent tissue . breast inspect skin change area lump presence nipple discharge . skin change include erythema , exaggeration normal skin marking , trac

  docs = retriever.get_relevant_documents("What is breast cancer?")


In [17]:
question = "Answer the following medical question in detail: What is breast cancer?"

response = qa_chain.invoke(question)
print(response)


Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors


{'query': 'Answer the following medical question in detail: What is breast cancer?', 'result': "fibrocystic disease , descriptive term small fluid-fill cyst modest diagnosis breast cancer , history radiation therapy chest area age 30 ( eg , hodgkin lymphoma ) , breast cancer gene , brca1 brca2 . physical examination : examination focus breast adjacent tissue . breast inspect skin change area lump presence nipple discharge . skin change include erythema , exaggeration normal skin marking , trace edema term peau d'orange ( ).  fibrocystic disease '' , descriptive term small fluid - fill cyst modest diagnosis breast cancer , history radiation therapy chest area age 30 ( eg , hodgkin lymphoma ) , benign breast masses 1 5 10 breast biopsy lead diagnosis cancer , history radiation therapy chest area age 30 ( eg , hodgkin lymphoma ) , breast cancer gene , brca1 brca2 . physical examination : examination focus breast adjacent tissue . breast inspect skin change area lump presence nipple discha