## Medical FAQ Chatbot

In [None]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
import faiss
import warnings
warnings.filterwarnings('ignore')

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pavit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pavit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pavit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pavit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavit\AppData\Roaming\nltk_data...
[nltk_data] 

True

In [2]:
# Load the dataset
df = pd.read_csv('medquad.csv')

# Display basic information
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst few rows:")
print(df.head())
print(f"\nDataset Info:")
print(df.info())
print(f"\nMissing Values:")
print(df.isnull().sum())
print(f"\nFocus Areas Distribution:")
print(df['focus_area'].value_counts())

Dataset Shape: (16412, 4)

Columns: ['question', 'answer', 'source', 'focus_area']

First few rows:
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area  
0   Glaucoma  
1   Glaucoma  
2   Glaucoma  
3   Glaucoma  
4   Glaucoma  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16412 entries, 0 to 16411
Data colu

In [3]:
#  Data Preprocessing 

# Remove duplicates and handle missing values
df = df.drop_duplicates(subset=['question', 'answer'])
df = df.dropna(subset=['question', 'answer'])

# Clean text function
def clean_text(text):
    """Clean and preprocess text data"""
    if pd.isna(text):
        return ""
    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and extra whitespace
    text = re.sub(r'[^a-zA-Z0-9\s\?\.]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Apply cleaning
df['question_clean'] = df['question'].apply(clean_text)
df['answer_clean'] = df['answer'].apply(clean_text)

# Remove very short questions or answers
df = df[(df['question_clean'].str.len() > 10) & (df['answer_clean'].str.len() > 20)]

print(f"Dataset shape after cleaning: {df.shape}")
print(df.head())

Dataset shape after cleaning: (16358, 6)
                                 question  \
0                What is (are) Glaucoma ?   
1                  What causes Glaucoma ?   
2     What are the symptoms of Glaucoma ?   
3  What are the treatments for Glaucoma ?   
4                What is (are) Glaucoma ?   

                                              answer           source  \
0  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   
1  Nearly 2.7 million people have glaucoma, a lea...  NIHSeniorHealth   
2  Symptoms of Glaucoma  Glaucoma can develop in ...  NIHSeniorHealth   
3  Although open-angle glaucoma cannot be cured, ...  NIHSeniorHealth   
4  Glaucoma is a group of diseases that can damag...  NIHSeniorHealth   

  focus_area                          question_clean  \
0   Glaucoma                  what is are glaucoma ?   
1   Glaucoma                  what causes glaucoma ?   
2   Glaucoma     what are the symptoms of glaucoma ?   
3   Glaucoma  what are th

In [4]:
#  Text Processing Functions 
class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        # Keep medical question words
        medical_keep_words = {'what', 'when', 'where', 'who', 'how', 'why', 'which'}
        self.stop_words = self.stop_words - medical_keep_words
    
    def preprocess(self, text):
        """Advanced text preprocessing"""
        # Tokenize
        tokens = word_tokenize(text.lower())
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens 
                 if token.isalnum() and token not in self.stop_words]
        return ' '.join(tokens)

# Initialize preprocessor
preprocessor = TextPreprocessor()

# Apply preprocessing
df['question_processed'] = df['question_clean'].apply(preprocessor.preprocess)

print("Text preprocessing completed!")
print(f"Sample processed question: {df['question_processed'].iloc[0]}")


Text preprocessing completed!
Sample processed question: what glaucoma


In [5]:
# Create TF-IDF Model
print("Creating TF-IDF vectorizer...")

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.8
)

# Fit and transform questions
tfidf_matrix = tfidf_vectorizer.fit_transform(df['question_processed'])

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print("TF-IDF model created successfully!")


Creating TF-IDF vectorizer...
TF-IDF matrix shape: (16358, 5000)
TF-IDF model created successfully!


In [6]:
#  Create Sentence Transformer Model 
print("Loading Sentence Transformer model")

# Load pre-trained model
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all questions
print("Generating embeddings for questions...")
question_embeddings = sentence_model.encode(
    df['question_clean'].tolist(),
    show_progress_bar=True,
    batch_size=32
)

print(f"Embeddings shape: {question_embeddings.shape}")
print("Sentence embeddings created successfully!")


Loading Sentence Transformer model
Generating embeddings for questions...
Generating embeddings for questions...


Batches: 100%|██████████| 512/512 [00:48<00:00, 10.48it/s]

Embeddings shape: (16358, 384)
Sentence embeddings created successfully!





In [7]:
# Create FAISS Index 
print("Creating FAISS index for fast similarity search...")

# Normalize embeddings for cosine similarity
faiss.normalize_L2(question_embeddings)

# Create FAISS index
dimension = question_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product for cosine similarity
index.add(question_embeddings)

print(f"FAISS index created with {index.ntotal} vectors")

Creating FAISS index for fast similarity search...
FAISS index created with 16358 vectors


In [8]:
# Chatbot Class Definition
class MedicalChatbot:
    def __init__(self, df, tfidf_vectorizer, tfidf_matrix, 
                 sentence_model, faiss_index, preprocessor):
        self.df = df.reset_index(drop=True)
        self.tfidf_vectorizer = tfidf_vectorizer
        self.tfidf_matrix = tfidf_matrix
        self.sentence_model = sentence_model
        self.faiss_index = faiss_index
        self.preprocessor = preprocessor
    
    def get_answer_tfidf(self, question, top_k=3):
        """Get answer using TF-IDF similarity"""
        # Preprocess question
        processed_q = self.preprocessor.preprocess(clean_text(question))
        
        # Vectorize question
        question_vec = self.tfidf_vectorizer.transform([processed_q])
        
        # Calculate similarity
        similarities = cosine_similarity(question_vec, self.tfidf_matrix)[0]
        
        # Get top matches
        top_indices = similarities.argsort()[-top_k:][::-1]
        
        results = []
        for idx in top_indices:
            results.append({
                'question': self.df.iloc[idx]['question'],
                'answer': self.df.iloc[idx]['answer'],
                'focus_area': self.df.iloc[idx]['focus_area'],
                'similarity': similarities[idx]
            })
        
        return results
    
    def get_answer_semantic(self, question, top_k=3):
        """Get answer using Semantic similarity (Sentence Transformers + FAISS)"""
        # Encode question
        question_embedding = self.sentence_model.encode([clean_text(question)])
        faiss.normalize_L2(question_embedding)
        
        # Search in FAISS index
        similarities, indices = self.faiss_index.search(question_embedding, top_k)
        
        results = []
        for idx, sim in zip(indices[0], similarities[0]):
            results.append({
                'question': self.df.iloc[idx]['question'],
                'answer': self.df.iloc[idx]['answer'],
                'focus_area': self.df.iloc[idx]['focus_area'],
                'similarity': float(sim)
            })
        
        return results
    
    def get_answer_hybrid(self, question, top_k=3, weights=(0.4, 0.6)):
        """Hybrid approach combining TF-IDF and Semantic similarity"""
        # Get results from both methods
        tfidf_results = self.get_answer_tfidf(question, top_k=10)
        semantic_results = self.get_answer_semantic(question, top_k=10)
        
        # Combine scores
        combined_scores = {}
        
        for result in tfidf_results:
            q = result['question']
            combined_scores[q] = {
                'tfidf': result['similarity'],
                'semantic': 0,
                'answer': result['answer'],
                'focus_area': result['focus_area']
            }
        
        for result in semantic_results:
            q = result['question']
            if q in combined_scores:
                combined_scores[q]['semantic'] = result['similarity']
            else:
                combined_scores[q] = {
                    'tfidf': 0,
                    'semantic': result['similarity'],
                    'answer': result['answer'],
                    'focus_area': result['focus_area']
                }
        
        # Calculate weighted scores
        for q in combined_scores:
            tfidf_score = combined_scores[q]['tfidf']
            semantic_score = combined_scores[q]['semantic']
            combined_scores[q]['final_score'] = (
                weights[0] * tfidf_score + weights[1] * semantic_score
            )
        
        # Sort by final score
        sorted_results = sorted(
            combined_scores.items(),
            key=lambda x: x[1]['final_score'],
            reverse=True
        )[:top_k]
        
        # Format results
        results = []
        for q, data in sorted_results:
            results.append({
                'question': q,
                'answer': data['answer'],
                'focus_area': data['focus_area'],
                'similarity': data['final_score']
            })
        
        return results
    
    def answer(self, question, method='hybrid', top_k=3):
        """Main function to get answer"""
        if method == 'tfidf':
            return self.get_answer_tfidf(question, top_k)
        elif method == 'semantic':
            return self.get_answer_semantic(question, top_k)
        else:
            return self.get_answer_hybrid(question, top_k)

print("Chatbot class defined successfully!")

chatbot = MedicalChatbot(
    df=df,
    tfidf_vectorizer=tfidf_vectorizer,
    tfidf_matrix=tfidf_matrix,
    sentence_model=sentence_model,
    faiss_index=index,
    preprocessor=preprocessor
)

print("Medical FAQ Chatbot initialized successfully!")


Chatbot class defined successfully!
Medical FAQ Chatbot initialized successfully!


In [9]:
def test_chatbot(question, method='hybrid'):
    """Test function to display results nicely"""
    print(f"\n{'='*80}")
    print(f"QUESTION: {question}")
    print(f"METHOD: {method.upper()}")
    print(f"{'='*80}\n")
    
    results = chatbot.answer(question, method=method, top_k=3)
    
    for i, result in enumerate(results, 1):
        print(f"[Result {i}] (Similarity: {result['similarity']:.4f})")
        print(f"Focus Area: {result['focus_area']}")
        print(f"Q: {result['question']}")
        print(f"A: {result['answer'][:300]}...")
        print(f"{'-'*80}\n")

# Test with sample questions
test_questions = [
    "What is glaucoma?",
    "How to prevent high blood pressure?",
    "What are the symptoms of diabetes?",
    "What causes heart disease?"
]

for question in test_questions:
    test_chatbot(question, method='hybrid')



QUESTION: What is glaucoma?
METHOD: HYBRID

[Result 1] (Similarity: 0.9938)
Focus Area: Glaucoma
Q: What is (are) Glaucoma ?
A: Glaucoma is a group of diseases that can damage the eye's optic nerve. It is a leading cause of blindness in the United States. It usually happens when the fluid pressure inside the eyes slowly rises, damaging the optic nerve. Often there are no symptoms at first. Without treatment, people with glau...
--------------------------------------------------------------------------------

[Result 2] (Similarity: 0.6742)
Focus Area: Glaucoma
Q: What are the treatments for Glaucoma ?
A: Yes. Immediate treatment for early stage, open-angle glaucoma can delay progression of the disease. That's why early diagnosis is very important. Glaucoma treatments include medicines, laser surgery, conventional surgery, or a combination of any of these. While these treatments may save remaining vi...
--------------------------------------------------------------------------------

[

In [10]:
# Save Models 
print("Saving models and data...")

# Save TF-IDF model
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(tfidf_matrix, f)

# Save preprocessor
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

# Save FAISS index
faiss.write_index(index, 'faiss_index.bin')

# Save embeddings
np.save('question_embeddings.npy', question_embeddings)

# Save processed dataframe
df.to_csv('processed_medquad.csv', index=False)

# Save chatbot configuration
config = {
    'model_name': 'all-MiniLM-L6-v2',
    'tfidf_max_features': 5000,
    'embedding_dimension': dimension
}

with open('config.pkl', 'wb') as f:
    pickle.dump(config, f)

print(" All models and data saved successfully!")
print("\nSaved files:")
print("- tfidf_vectorizer.pkl")
print("- tfidf_matrix.pkl")
print("- preprocessor.pkl")
print("- faiss_index.bin")
print("- question_embeddings.npy")
print("- processed_medquad.csv")
print("- config.pkl")

Saving models and data...
 All models and data saved successfully!

Saved files:
- tfidf_vectorizer.pkl
- tfidf_matrix.pkl
- preprocessor.pkl
- faiss_index.bin
- question_embeddings.npy
- processed_medquad.csv
- config.pkl
 All models and data saved successfully!

Saved files:
- tfidf_vectorizer.pkl
- tfidf_matrix.pkl
- preprocessor.pkl
- faiss_index.bin
- question_embeddings.npy
- processed_medquad.csv
- config.pkl


In [11]:
# Evaluation Metrics
from sklearn.metrics import accuracy_score
import random

def evaluate_chatbot(sample_size=50):
    """Evaluate chatbot performance"""
    print("Evaluating chatbot performance...\n")
    
    # Sample random questions
    sample_indices = random.sample(range(len(df)), min(sample_size, len(df)))
    
    correct_top1 = 0
    correct_top3 = 0
    avg_similarity = []
    
    for idx in sample_indices:
        original_question = df.iloc[idx]['question']
        correct_answer = df.iloc[idx]['answer']
        
        results = chatbot.answer(original_question, method='hybrid', top_k=3)
        
        # Check if correct answer is in top 1
        if results[0]['answer'] == correct_answer:
            correct_top1 += 1
        
        # Check if correct answer is in top 3
        for result in results:
            if result['answer'] == correct_answer:
                correct_top3 += 1
                avg_similarity.append(result['similarity'])
                break
    
    print(f"Evaluation Results (Sample Size: {sample_size})")
    print(f"{'='*50}")
    print(f"Top-1 Accuracy: {correct_top1/sample_size*100:.2f}%")
    print(f"Top-3 Accuracy: {correct_top3/sample_size*100:.2f}%")
    print(f"Average Similarity Score: {np.mean(avg_similarity):.4f}")

evaluate_chatbot(sample_size=50)


Evaluating chatbot performance...

Evaluation Results (Sample Size: 50)
Top-1 Accuracy: 86.00%
Top-3 Accuracy: 92.00%
Average Similarity Score: 0.9739
Evaluation Results (Sample Size: 50)
Top-1 Accuracy: 86.00%
Top-3 Accuracy: 92.00%
Average Similarity Score: 0.9739


In [None]:
#  Interactive Testing

def interactive_test():
    """Interactive testing interface
    Notes:
    - In notebooks, the Esc key doesn't reach Python input(); it toggles command mode.
    - To exit: type 'quit'/'exit'/'q'/'esc', or press Ctrl+C (Interrupt) to stop.
    """
    print("\n" + "="*80)
    print("MEDICAL FAQ CHATBOT - INTERACTIVE MODE")
    print("="*80)
    print("Type your medical question or 'quit'/'exit'/'q'/'esc' to exit")
    print("Tip: Press Ctrl+C or click 'Interrupt/Stop' to exit anytime.\n")

    while True:
        try:
            question = input("Your Question: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting interactive mode. Goodbye!")
            break

        if not question:
            print("Please enter a valid question.\n")
            continue

        if question.lower() in ['quit', 'exit', 'q', 'esc']:
            print("Thank you for using Medical FAQ Chatbot!")
            break

        try:
            results = chatbot.answer(question, method='hybrid', top_k=3) or []
        except Exception as e:
            print(f"Sorry, something went wrong: {e}")
            continue

        if not results:
            print("No answer found. Try rephrasing your question.\n")
            continue

        print(f"\n{'='*80}")
        print("TOP ANSWER:")
        print(f"{'='*80}")
        print(f"Focus Area: {results[0]['focus_area']}")
        print(f"Confidence: {results[0]['similarity']:.4f}")
        print(f"\nAnswer: {results[0]['answer']}\n")

        try:
            show_more = input("Show more results? (y/n): ").strip().lower()
        except (KeyboardInterrupt, EOFError):
            print("\nExiting interactive mode. Goodbye!")
            break
        if show_more.startswith('y'):
            for i, result in enumerate(results[1:], 2):
                print(f"\n[Alternative Answer {i}]")
                print(f"Q: {result['question']}")
                print(f"A: {result['answer']}")
                print(f"Confidence: {result['similarity']:.4f}\n")

        print(f"{'='*80}\n")

# Uncomment/comment to run/stop interactive mode
interactive_test()


MEDICAL FAQ CHATBOT - INTERACTIVE MODE
Type your medical question or 'quit'/'exit'/'q'/'esc' to exit
Tip: Press Ctrl+C or click 'Interrupt/Stop' to exit anytime.


TOP ANSWER:
Focus Area: Glaucoma
Confidence: 0.9712

Answer: Glaucoma is a group of diseases that can damage the eye's optic nerve. It is a leading cause of blindness in the United States. It usually happens when the fluid pressure inside the eyes slowly rises, damaging the optic nerve. Often there are no symptoms at first. Without treatment, people with glaucoma will slowly lose their peripheral, or side vision. They seem to be looking through a tunnel. Over time, straight-ahead vision may decrease until no vision remains.    A comprehensive eye exam can tell if you have glaucoma. People at risk should get eye exams at least every two years. They include       -  African Americans over age 40    -  People over age 60, especially Mexican Americans    -  People with a family history of glaucoma       There is no cure, but gl