# 🤖 Document Chatbot Demo

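This notebook demonstrates the key features of the enhanced document chatbot system. Run the cells top to bottom with the project root as the working directory, so that the `app` package and the `data/` folder can be found.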

## Features Demonstrated:
1. Document ingestion from multiple file formats
2. Intelligent retrieval with confidence scoring
3. Answer generation with citations
4. Answer refusal when information is not available
5. Performance monitoring

In [None]:
# Import required libraries (standard library first, then pandas)
import sys
from pathlib import Path
import time
import pandas as pd

# Add project root to path.
# Path('.') is resolved against the kernel's current working directory, which
# is not necessarily the folder the notebook file lives in. Later cells also
# build the data folder path from project_root.
project_root = Path('.').absolute()
sys.path.insert(0, str(project_root))

# Project-local imports; keep them below the sys.path tweak above so the
# project root is already on the import path when they run.
# NOTE(review): get_config is not referenced in any cell visible in this
# notebook - confirm whether the import is still needed.
from app.chatbot import DocumentChatbot
from app.config import get_config

print("✅ Libraries imported successfully")

## 1. Initialize the Chatbot

In [None]:
# Build the chatbot instance that the later cells reuse.
print("🤖 Initializing Document Chatbot...")
chatbot = DocumentChatbot()

# Snapshot the statistics before this notebook ingests anything.
stats = chatbot.get_stats()
print("📊 Initial Statistics:")

# (label, section, key) triples, reported in exactly this order.
initial_report = (
    ("Total documents", "vector_store", "total_documents"),
    ("Collection", "vector_store", "collection_name"),
    ("Chunk size", "config", "chunk_size"),
    ("Retrieval K", "config", "retrieval_k"),
)
for label, section, key in initial_report:
    print(f"   - {label}: {stats[section][key]}")

print("✅ Chatbot initialized")

## 2. Document Ingestion Demo

In [None]:
# Locate the data folder (relative to project_root) and preview its contents.
data_folder = project_root / "data"
print(f"📁 Checking data folder: {data_folder}")

if not data_folder.exists():
    print("❌ Data folder not found. Creating it...")
    data_folder.mkdir(exist_ok=True)
    print("⚠️  Please add documents (PDF, TXT, MD, DOCX) to the 'data' folder and re-run this cell")
else:
    # List files in data folder.
    # glob("*") also yields sub-directories, which would inflate the "files"
    # count, so keep regular files only. Sorting makes the preview stable,
    # because glob order depends on the filesystem.
    files = sorted(entry for entry in data_folder.glob("*") if entry.is_file())
    print(f"📄 Found {len(files)} files in data folder:")
    for file in files[:5]:  # Show first 5 files
        print(f"   - {file.name}")
    if len(files) > 5:
        print(f"   ... and {len(files) - 5} more files")

In [None]:
# Reset knowledge base and ingest documents
print("🗑️  Resetting knowledge base...")
reset_result = chatbot.reset_knowledge_base()
print(f"Reset result: {reset_result['success']}")

# any() stops at the first directory entry instead of materialising the whole
# listing just to test for emptiness.
if data_folder.exists() and any(data_folder.glob("*")):
    print(f"📚 Ingesting documents from: {data_folder}")
    start_time = time.time()

    result = chatbot.ingest_documents(str(data_folder))

    # Wall-clock duration of the ingest call as measured by this cell.
    elapsed = time.time() - start_time

    if result['success']:
        print(f"✅ {result['message']}")
        stats = result['stats']
        print("📊 Ingestion Statistics:")
        print(f"   - New chunks: {stats.get('new_chunks', 0)}")
        print(f"   - Total documents: {stats.get('total_documents', 0)}")
        # Prefer the time reported in the ingestion stats. When it is absent,
        # fall back to the duration measured above rather than printing a
        # misleading 0.00s (start_time used to be recorded but never read).
        print(f"   - Processing time: {stats.get('processing_time', elapsed):.2f}s")
    else:
        print(f"❌ {result['message']}")
else:
    print("❌ No documents found to ingest")

## 3. Question Answering Demo

In [None]:
# Sample prompts for the question answering demo below.
example_questions = [
    "What is machine learning?",
    "What are the key principles mentioned in the document?",
    "How should I approach model evaluation?",
    "What is deep learning?",  # May not be covered by the documents
    "Can you summarize the main topics covered?",
]

# Announce the run, followed by a 60-character rule.
print(
    f"📝 Testing {len(example_questions)} example questions...",
    "=" * 60,
    sep="\n",
)

In [None]:
# Function to display results nicely
def display_answer(question, result, question_num, preview_chars=100):
    """Pretty-print one question/answer exchange to stdout.

    Args:
        question: The question text that was asked.
        result: Dict describing the answer. The keys 'answer', 'confidence'
            and 'total_time' are required. 'citations' (list of strings) and
            'retrieval_results' (list of dicts with 'source', 'page', 'score'
            and 'content') are optional; missing or None is treated as empty.
        question_num: Number shown in the heading line.
        preview_chars: Maximum number of characters of the top source's
            content to show. Defaults to 100.

    Returns:
        None. Everything is written with print().
    """
    print(f"\n🔹 Question {question_num}: {question}")
    print("-" * 50)
    print("🤖 Answer:")
    print(result['answer'])

    # Optional section: tolerate a result that carries no citations key.
    citations = result.get('citations') or []
    if citations:
        print(f"\n📚 Citations: {', '.join(citations)}")

    print(f"\n📊 Confidence: {result['confidence']:.3f}")
    print(f"⏱️  Response time: {result['total_time']:.2f}s")

    # Optional section: only the first retrieved entry is shown.
    retrieval_results = result.get('retrieval_results') or []
    if retrieval_results:
        print("\n🔍 Top retrieved source:")
        top_result = retrieval_results[0]
        print(f"   Source: {top_result['source']} (page {top_result['page']})")
        # NOTE(review): this reads 'score', while the document search demo in
        # this notebook reads 'similarity_score' - confirm both keys exist.
        print(f"   Similarity score: {top_result['score']}")

        # Add the ellipsis only when text was actually cut off, so a short
        # passage is not presented as if it had been truncated.
        content = top_result['content']
        preview = content[:preview_chars]
        if len(content) > preview_chars:
            preview += "..."
        print(f"   Content preview: {preview}")

    print("\n" + "." * 60)

# Ask every example question, show the answer, and keep one summary row per
# question for the performance analysis.
results = []
for question_num, question in enumerate(example_questions, start=1):
    result = chatbot.ask_question(question)
    display_answer(question, result, question_num)

    # Metrics recorded for this question.
    summary_row = dict(
        question=question,
        answer_length=len(result['answer']),
        confidence=result['confidence'],
        response_time=result['total_time'],
        has_citations=len(result['citations']) > 0,
        num_sources=len(result['retrieval_results']),
    )
    results.append(summary_row)

    time.sleep(1)  # Brief pause between questions

## 4. Performance Analysis

In [None]:
# Create a DataFrame for analysis (one row per answered question)
df = pd.DataFrame(results)

# Questions longer than this are shortened in the results table.
MAX_QUESTION_CHARS = 50

print("📈 Performance Analysis")
print("=" * 30)

if df.empty:
    # An empty frame has no metric columns, so the lookups below would raise
    # KeyError; report the situation instead.
    print("⚠️  No question results to analyze.")
else:
    print(f"Average response time: {df['response_time'].mean():.2f}s")
    print(f"Max response time: {df['response_time'].max():.2f}s")
    print(f"Min response time: {df['response_time'].min():.2f}s")
    print(f"Average confidence: {df['confidence'].mean():.3f}")
    print(f"Questions with citations: {df['has_citations'].sum()}/{len(df)}")
    print(f"Average answer length: {df['answer_length'].mean():.0f} characters")

    # Display the results table
    print("\n📊 Detailed Results:")
    display_df = df[['question', 'confidence', 'response_time', 'has_citations']].copy()

    # Truncate for display, adding '...' only to questions that were actually
    # cut; short questions are shown unchanged.
    questions = display_df['question']
    display_df['question'] = questions.where(
        questions.str.len() <= MAX_QUESTION_CHARS,
        questions.str[:MAX_QUESTION_CHARS] + '...',
    )

    display_df.columns = ['Question', 'Confidence', 'Time (s)', 'Has Citations']
    print(display_df.to_string(index=False))

## 5. Interactive Question Answering

In [None]:
# Interactive question answering
def ask_interactive_question():
    """Prompt for one question, print the chatbot's answer, and say whether to continue.

    Reads a line with input(). An exit keyword ('quit', 'exit' or 'q',
    case-insensitive) ends the session; blank input prints a reminder; any
    other text is passed to chatbot.ask_question and the answer, sources,
    confidence and timing are printed.

    Returns:
        False when an exit keyword was entered, True in every other case.
    """
    exit_words = ('quit', 'exit', 'q')
    question = input("🔹 Your question (or 'quit' to exit): ").strip()

    # Guard clauses: leave early on exit keywords and on blank input.
    if question.lower() in exit_words:
        return False

    if question == "":
        print("Please enter a question.")
        return True

    print("\n🔍 Searching for relevant information...")
    result = chatbot.ask_question(question)

    print("\n🤖 Answer:")
    print(result['answer'])

    # The sources line is skipped when there are no citations.
    sources = result['citations']
    if sources:
        print("\n📚 Sources: " + ', '.join(sources))

    confidence_part = f"📊 Confidence: {result['confidence']:.3f}"
    timing_part = f"⏱️ Time: {result['total_time']:.2f}s"
    print(f"\n{confidence_part} | {timing_part}")
    print("\n" + "-" * 60)

    return True

# Banner for interactive mode. ask_interactive_question() is not called in
# this cell; for demo purposes the lines below only explain how to start an
# interactive session.
interactive_banner = (
    "💬 Interactive Mode",
    "You can now ask questions about your documents!",
    "Type 'quit' to exit.",
    "=" * 40,
    "Note: In Jupyter notebooks, interactive input may not work properly.",
    "To try interactive mode, run: python main.py interactive",
)
for banner_line in interactive_banner:
    print(banner_line)

## 6. Document Search Demo

In [None]:
# Retrieval-only demo: list the matching entries without generating an answer.
search_query = "machine learning"
print(f"🔍 Searching for: '{search_query}'")

search_results = chatbot.search_documents(search_query, k=5)

print(f"\n📄 Found {len(search_results)} relevant documents:")
print("=" * 50)

# One numbered entry per hit, each followed by a blank line.
for rank, hit in enumerate(search_results, start=1):
    print(f"\n{rank}. {hit['source']} (Page {hit['page']})")
    print(f"   Similarity Score: {hit['similarity_score']:.3f}")
    print("   Citations: " + ', '.join(hit['citations']))
    print(f"   Content: {hit['content'][:200]}...")
    print()

## 7. System Information

In [None]:
# Collect the final statistics and the list of available sources.
final_stats = chatbot.get_stats()
available_sources = chatbot.get_available_sources()

print("📊 Final System Statistics")
print("=" * 30)
vector_store_info = final_stats['vector_store']
print(f"Total documents in knowledge base: {vector_store_info['total_documents']}")
print(f"Collection name: {vector_store_info['collection_name']}")
print(f"Persist directory: {vector_store_info['persist_directory']}")

print("\n⚙️ Configuration:")
settings = final_stats['config']
# (label, key) pairs, reported in this order.
for label, key in (
    ("Chunk size", "chunk_size"),
    ("Chunk overlap", "chunk_overlap"),
    ("Retrieval K", "retrieval_k"),
    ("Confidence threshold", "confidence_threshold"),
):
    print(f"{label}: {settings[key]}")

print(f"\n📚 Available Sources ({len(available_sources)}):")
for source_name in available_sources:
    print(f"   - {source_name}")

## 8. Next Steps

This notebook demonstrated the core functionality of the Document Chatbot system. Here are some next steps you can try:

### Using the System
1. **Add more documents**: Place additional PDF, TXT, MD, or DOCX files in the `data/` folder
2. **Try the CLI**: Run `python main.py --help` to see all available commands
3. **Use the web interface**: Run `streamlit run streamlit_app.py` for a web UI
4. **Try the REST API**: Run `python fastapi_app.py` to start the FastAPI server

### Customization
1. **Adjust configuration**: Edit `app/config.py` to change chunk sizes, thresholds, etc.
2. **Add an LLM model**: Provide a model path for better answer generation
3. **Tune performance**: Adjust retrieval parameters based on your documents

### Advanced Features
1. **Citation formatting**: Customize how citations are displayed
2. **Confidence tuning**: Adjust thresholds for answer refusal
3. **Performance monitoring**: Track response times and accuracy

**Happy chatting with your documents! 🎉**