# Lab 4.4.6: Gradio Demo - SOLUTION

**Module:** 4.4 - Containerization & Cloud Deployment  
**This is the complete solution notebook with all exercises solved.**

---

## Exercise 1 Solution: Multi-Model Comparison Interface

In [None]:
import gradio as gr

# Multi-model comparison interface
#
# The app source is stored in a raw string (r'''...''') so that escape
# sequences inside the embedded code (e.g. the "\n\n" in the mock responses)
# are preserved verbatim. With a normal string they would be expanded into
# real line breaks and the printed source would no longer be valid Python.

model_comparison_code = r'''
"""Multi-model comparison interface with Gradio."""

import gradio as gr
import time
from typing import List, Tuple


# Mock inference functions (replace with real models)
def model_a_inference(prompt: str) -> Tuple[str, float]:
    """Model A (e.g., Llama-7B).

    Returns:
        Tuple of (response text, latency in milliseconds).
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments.
    start = time.perf_counter()
    # Simulate inference
    time.sleep(0.5)
    response = f"[Model A] Response to: {prompt}\n\nThis is a simulated response from Llama-7B."
    latency = (time.perf_counter() - start) * 1000
    return response, latency


def model_b_inference(prompt: str) -> Tuple[str, float]:
    """Model B (e.g., Mistral-7B).

    Returns:
        Tuple of (response text, latency in milliseconds).
    """
    start = time.perf_counter()
    # Simulate inference
    time.sleep(0.3)
    response = f"[Model B] Response to: {prompt}\n\nThis is a simulated response from Mistral-7B."
    latency = (time.perf_counter() - start) * 1000
    return response, latency


def compare_models(prompt: str, model_a_name: str, model_b_name: str):
    """Compare two models side by side.

    Args:
        prompt: Text sent to both models.
        model_a_name: Dropdown selection for model A. The mock backend
            ignores it; use it to route to a real model.
        model_b_name: Dropdown selection for model B (same caveat).

    Returns:
        Tuple of (response A, response B, latency A, latency B), with the
        latencies formatted as whole milliseconds, e.g. "500ms".
    """
    # Get responses from both models
    response_a, latency_a = model_a_inference(prompt)
    response_b, latency_b = model_b_inference(prompt)

    return (
        response_a,
        response_b,
        f"{latency_a:.0f}ms",
        f"{latency_b:.0f}ms",
    )


# Build the interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# Model Comparison Tool")
    gr.Markdown("Compare responses from two LLMs side by side")

    with gr.Row():
        with gr.Column():
            model_a_select = gr.Dropdown(
                choices=["llama-7b", "llama-13b", "llama-70b"],
                value="llama-7b",
                label="Model A",
            )
        with gr.Column():
            model_b_select = gr.Dropdown(
                choices=["mistral-7b", "mixtral-8x7b", "codellama-7b"],
                value="mistral-7b",
                label="Model B",
            )

    prompt_input = gr.Textbox(
        label="Prompt",
        placeholder="Enter your prompt here...",
        lines=3,
    )

    compare_btn = gr.Button("Compare", variant="primary")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Model A Response")
            output_a = gr.Textbox(label="Response", lines=10)
            latency_a = gr.Textbox(label="Latency")
        with gr.Column():
            gr.Markdown("### Model B Response")
            output_b = gr.Textbox(label="Response", lines=10)
            latency_b = gr.Textbox(label="Latency")

    # Connect the button: output order must match compare_models' return tuple
    compare_btn.click(
        fn=compare_models,
        inputs=[prompt_input, model_a_select, model_b_select],
        outputs=[output_a, output_b, latency_a, latency_b],
    )

    # Examples
    gr.Examples(
        examples=[
            ["Explain quantum computing in simple terms"],
            ["Write a Python function to reverse a string"],
            ["What are the benefits of using Docker?"],
        ],
        inputs=[prompt_input],
    )

if __name__ == "__main__":
    demo.launch()
'''

print("MODEL COMPARISON INTERFACE:")
print("=" * 60)
print(model_comparison_code)

## Exercise 2 Solution: RAG Chat Interface with Sources

In [None]:
# RAG interface with source display
#
# The app source is stored in a raw string (r'''...''') so that the "\n"
# escapes inside the embedded code are kept verbatim. With a normal string
# they would turn into real line breaks inside the embedded string literals
# and the printed source would not be valid Python.

rag_interface_code = r'''
"""RAG Chat Interface with Source Display."""

import os
import gradio as gr
from typing import List, Tuple, Dict
import json


# Simulated RAG functions
def search_documents(query: str) -> List[Dict]:
    """Search vector database for relevant documents.

    This mock returns three fixed hits; each hit has "content", "source"
    and "score" keys.
    """
    # Mock search results
    return [
        {
            "content": f"Document 1 relevant to: {query}",
            "source": "docs/guide.md",
            "score": 0.92,
        },
        {
            "content": f"Document 2 with more context about: {query}",
            "source": "docs/tutorial.md",
            "score": 0.87,
        },
        {
            "content": f"Document 3 providing details on: {query}",
            "source": "docs/reference.md",
            "score": 0.81,
        },
    ]


def generate_with_context(query: str, context: List[Dict]) -> str:
    """Generate response using retrieved context."""
    # Mock LLM generation
    context_text = "\n".join(d["content"] for d in context)
    return f"Based on the provided documents:\n\n{context_text}\n\nHere is my answer to: {query}"


def rag_chat(message: str, history: List, show_sources: bool = True):
    """RAG chat function with optional source display.

    Args:
        message: The user's question.
        history: Previous chat turns (not used by the mock pipeline).
        show_sources: Append a "Sources" section listing each hit.

    Returns:
        The response text, optionally followed by the source list.
    """
    # Step 1: Retrieve relevant documents
    sources = search_documents(message)

    # Step 2: Generate response with context
    response = generate_with_context(message, sources)

    # Step 3: Format sources if requested
    if show_sources:
        source_lines = "".join(
            f"- [{s['source']}] (score: {s['score']:.2f})\n" for s in sources
        )
        response += "\n\n---\n**Sources:**\n" + source_lines

    return response


# Document store
uploaded_documents = []


def upload_document(file):
    """Read an uploaded text document and add it to the in-memory store.

    Args:
        file: Value delivered by the gr.File component, or None when the
            upload is cleared. Either an object exposing the temp path as
            ".name" or a plain path string is accepted.

    Returns:
        A status message describing the outcome.
    """
    if file is None:
        return "No file uploaded"

    # Accept both a file-like object with .name and a bare path string.
    path = getattr(file, "name", file)
    # basename copes with the platform's path separator, not just "/".
    name = os.path.basename(path)

    # Read and store document
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
    except (OSError, ValueError) as e:
        # ValueError covers UnicodeDecodeError: binary uploads (for example
        # a PDF) are not UTF-8 text and are reported instead of stored.
        return f"Error: {str(e)}"

    uploaded_documents.append({
        "name": name,
        "content": content,
    })

    return f"Uploaded: {name} ({len(content)} chars)\nTotal documents: {len(uploaded_documents)}"


# Build interface
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# RAG Chat Demo")
    gr.Markdown("Chat with your documents using Retrieval-Augmented Generation")

    with gr.Row():
        # Left panel: Document upload
        with gr.Column(scale=1):
            gr.Markdown("### Documents")
            file_upload = gr.File(
                label="Upload Document",
                file_types=[".txt", ".md", ".pdf"],
            )
            upload_status = gr.Textbox(
                label="Status",
                interactive=False,
            )
            file_upload.change(upload_document, file_upload, upload_status)

            show_sources = gr.Checkbox(
                label="Show Sources",
                value=True,
            )

        # Right panel: Chat
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(
                height=500,
                show_label=False,
            )
            msg = gr.Textbox(
                label="Message",
                placeholder="Ask a question about your documents...",
            )

            def respond(message, history, show_src):
                """Answer one message and return (cleared textbox, history)."""
                history = history or []
                # Ignore blank submissions instead of adding an empty turn.
                if not message or not message.strip():
                    return "", history
                response = rag_chat(message, history, show_src)
                history.append((message, response))
                return "", history

            msg.submit(respond, [msg, chatbot, show_sources], [msg, chatbot])

            clear_btn = gr.Button("Clear Chat")
            clear_btn.click(lambda: [], outputs=[chatbot])

    # Examples
    gr.Examples(
        examples=[
            ["What are the main topics covered?"],
            ["Summarize the key points"],
            ["How do I get started?"],
        ],
        inputs=[msg],
    )

if __name__ == "__main__":
    demo.launch()
'''

print("RAG CHAT INTERFACE:")
print("=" * 60)
print(rag_interface_code)

## Exercise 3 Solution: Production-Ready Hugging Face Space

In [None]:
# Complete Hugging Face Space app.py
#
# Stored as a raw string so the embedded source is emitted exactly as
# written, even if escape sequences are added to it later.

hf_space_app = r'''
"""Production-ready Gradio App for Hugging Face Spaces.

Features:
- Streaming responses
- Token counting
- Error handling
- Performance metrics
- Rate limiting
"""

import os
import time
import threading
from collections import deque
from datetime import datetime
from typing import List, Iterator, Tuple

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import torch

# ============================================
# Configuration
# ============================================

MODEL_ID = os.environ.get("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
MAX_TOKENS = int(os.environ.get("MAX_TOKENS", 512))
RATE_LIMIT_PER_MINUTE = int(os.environ.get("RATE_LIMIT", 10))

# Rate limiting: timestamps of the requests accepted in the last 60 seconds.
# The deque is intentionally unbounded. check_rate_limit() only appends while
# below the limit, so it never grows past RATE_LIMIT_PER_MINUTE, whereas a
# fixed maximum length smaller than the limit would silently evict entries
# and disable the limiter.
request_times = deque()
request_lock = threading.Lock()

# ============================================
# Model Loading
# ============================================

print(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
print("Model loaded!")

# ============================================
# Helper Functions
# ============================================

def check_rate_limit() -> bool:
    """Record a request and report whether it is within the rate limit.

    Returns:
        True if the request was accepted (and recorded), False if
        RATE_LIMIT_PER_MINUTE requests were already accepted in the last
        60 seconds.
    """
    with request_lock:
        now = time.time()
        # Remove old requests
        while request_times and now - request_times[0] > 60:
            request_times.popleft()

        if len(request_times) >= RATE_LIMIT_PER_MINUTE:
            return False

        request_times.append(now)
        return True


def count_tokens(text: str) -> int:
    """Count tokens in text."""
    return len(tokenizer.encode(text))


def format_chat_prompt(message: str, history: List[Tuple[str, str]]) -> str:
    """Format chat history into a prompt.

    Args:
        message: The new user message.
        history: Earlier (user, assistant) turns, oldest first.

    Returns:
        The full prompt, ending with an open assistant tag.
    """
    prompt = "<|system|>You are a helpful AI assistant.</s>"

    for user_msg, assistant_msg in history:
        prompt += f"<|user|>{user_msg}</s>"
        prompt += f"<|assistant|>{assistant_msg}</s>"

    prompt += f"<|user|>{message}</s><|assistant|>"
    return prompt


# ============================================
# Inference Function
# ============================================

def chat(message: str, history: List[Tuple[str, str]]) -> Iterator[str]:
    """Chat function with streaming.

    Yields the response generated so far after every streamed chunk, or a
    single explanatory message when the request is rejected or fails.
    """

    # Reject blank input first so it does not consume a rate-limit slot
    if not message.strip():
        yield "Please enter a message."
        return

    # Rate limiting
    if not check_rate_limit():
        yield "Rate limit exceeded. Please wait a moment and try again."
        return

    # Input validation
    if count_tokens(message) > 1000:
        yield "Message too long. Please keep it under 1000 tokens."
        return

    try:
        # Format prompt
        prompt = format_chat_prompt(message, history)

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # Generate with streaming
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
        )

        generation_kwargs = {
            **inputs,
            "streamer": streamer,
            "max_new_tokens": MAX_TOKENS,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
        }

        # Start generation in background so this generator can consume the
        # streamer while the model is still producing tokens
        thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()

        # Stream output
        generated = ""
        for token in streamer:
            generated += token
            yield generated

        thread.join()

    except Exception as e:
        # Top-level UI boundary: show the failure in the chat window
        yield f"Error: {str(e)}. Please try again."


# ============================================
# Gradio Interface
# ============================================

# Custom CSS for NVIDIA theme
custom_css = """
.gradio-container {
    max-width: 900px !important;
}
#component-0 {
    background: linear-gradient(135deg, #76b900 0%, #1a1a2e 100%);
}
"""

# Build interface
# NOTE(review): retry_btn / undo_btn / clear_btn look like Gradio 4.x
# ChatInterface arguments - confirm against the pinned gradio version.
demo = gr.ChatInterface(
    fn=chat,
    title="DGX Spark AI Assistant",
    description=f"""
    Powered by {MODEL_ID} running on DGX Spark.
    
    - Max tokens: {MAX_TOKENS}
    - Rate limit: {RATE_LIMIT_PER_MINUTE} requests/minute
    """,
    examples=[
        "What is the DGX Spark?",
        "Explain GPU memory management in simple terms",
        "Write a Python function to calculate factorial",
        "What are the benefits of containerization?",
    ],
    theme=gr.themes.Soft(
        primary_hue="green",
        neutral_hue="slate",
    ),
    css=custom_css,
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
)

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )
'''

print("PRODUCTION HUGGING FACE SPACE APP:")
print("=" * 60)
print(hf_space_app)

# Write the generated Space app next to the other Docker examples,
# creating the target directory on first run.
import os

space_dir = "../docker-examples/gradio-space"
app_path = "../docker-examples/gradio-space/app.py"

os.makedirs(space_dir, exist_ok=True)
with open(app_path, "w") as app_file:
    app_file.write(hf_space_app)

print("\nSaved to: ../docker-examples/gradio-space/app.py")

---

## Summary

This solution demonstrated:

1. **Model Comparison Interface**
   - Side-by-side model outputs
   - Latency measurement
   - Dropdown model selection

2. **RAG Chat Interface**
   - Document upload and processing
   - Source attribution display
   - Toggle for showing sources

3. **Production Hugging Face Space**
   - Streaming responses
   - Rate limiting
   - Error handling
   - Custom theming