# Interactive Demo for NeMo QA Chatbot

This notebook provides an interactive demo of the NeMo QA Chatbot.

In [None]:
import os
import sys
import json
import torch
import gradio as gr

# Add parent directory to path
sys.path.append(os.path.abspath('..'))

# Import NeMo QA modules
from nemo_qa.modeling.model import load_model
from nemo_qa.recipes.inference_recipe import format_prompt

## Load Model

Let's load the fine-tuned model.

In [None]:
# Location of the fine-tuned checkpoint -- point this at your own model
model_path = '../models/finetuned/final_model'

# Hand the path to the project's loader; `model` is used by the cells below
model = load_model(model_path)
print(f"Model loaded from {model_path}")

## Create Chat Function

Let's create a chat function to interact with the model.

In [None]:
def chat(question, context="", history=None, temperature=0.7):
    """Ask the loaded model a single question.

    Args:
        question: User question
        context: Optional context passed through to the prompt formatter
        history: Conversation history passed through to the prompt formatter;
            an empty list is used when this is None
        temperature: Sampling temperature forwarded to ``model.generate``

    Returns:
        A ``(response, attention)`` tuple. ``response`` is the first generated
        text with every occurrence of the prompt removed and surrounding
        whitespace stripped. ``attention`` is whatever the generation output
        holds under ``'attention_weights'`` (None when that key is absent).
    """
    turns = [] if history is None else history

    # Build the prompt from the question, context and prior turns
    prompt_text = format_prompt(question, context, turns)

    # Fixed decoding settings; only the temperature is caller-controlled
    sampling_options = dict(
        max_length=512,
        temperature=temperature,
        top_p=0.9,
        top_k=50,
        return_attention=True,
    )

    # Run generation under inference mode
    with torch.inference_mode():
        generated = model.generate(prompt_text, **sampling_options)

    # The generated text may echo the prompt, so remove it before returning
    first_text = generated['text'][0]
    answer = first_text.replace(prompt_text, "").strip()

    return answer, generated.get('attention_weights')

## Test Chat Function

Let's test the chat function.

In [None]:
# Smoke test: a single question with no context and no history
question = "What is LoRA fine-tuning?"
response, _ = chat(question)  # the attention weights are not needed here

print(f"Question: {question}", f"Response: {response}", sep="\n")

## Create Gradio Interface

Now let's create a Gradio interface for the chatbot.

In [None]:
def gradio_chat(message, history, context):
    """Chat function for Gradio interface.

    The return value is wired to the ``Chatbot`` component, so it must be the
    whole conversation (a list of turns), not just the latest reply.

    Args:
        message: User message
        history: Conversation history as supplied by the Chatbot component.
            Each entry is indexed as ``entry[0]`` (user text) and ``entry[1]``
            (bot text), i.e. the pair-style history format is assumed.
            ``None`` is treated as an empty conversation.
        context: Optional context

    Returns:
        Updated history: the prior turns as ``(user, bot)`` tuples followed by
        the new ``(message, response)`` turn. The incoming ``history`` object
        is not modified.
    """
    # Convert Gradio history to the format expected by the chat function;
    # `history` can be None/empty before the first message is sent.
    prior_turns = [(turn[0], turn[1]) for turn in (history or [])]

    # Chat with the model (attention weights are not shown in the UI)
    response, _ = chat(message, context, prior_turns)

    # Return the updated history so the Chatbot renders the full conversation
    return prior_turns + [(message, response)]

# Questions offered as one-click examples below the chat area
example_questions = [
    "What is LoRA fine-tuning?",
    "How does NeMo Curator improve data quality?",
    "Explain the advantages of LLAMA3 over LLAMA2.",
    "What are the key hyperparameters for LoRA?",
    "How can I implement efficient LoRA fine-tuning in NeMo?",
]

# Assemble the UI: conversation + input on the left, optional context on the right
with gr.Blocks(title="NeMo QA Chatbot") as demo:
    gr.Markdown("# NeMo QA Chatbot Demo")
    gr.Markdown("Ask questions about LoRA fine-tuning, NeMo, and related topics.")

    with gr.Row():
        # Wider column: the conversation display and the question input row
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=600)

            with gr.Row():
                message = gr.Textbox(
                    show_label=False,
                    label="Question",
                    placeholder="Type your question here...",
                )
                submit_btn = gr.Button("Send")

        # Narrower column: free-text context forwarded with each question
        with gr.Column(scale=2):
            context = gr.Textbox(
                lines=10,
                label="Context (Optional)",
                placeholder="Add optional context here...",
            )

    gr.Markdown("## Example Questions")
    gr.Examples(examples=example_questions, inputs=message)

    # Clicking "Send" and submitting the textbox both go through gradio_chat
    for register in (submit_btn.click, message.submit):
        register(
            gradio_chat,
            inputs=[message, chatbot, context],
            outputs=chatbot,
        )

# Start the app (share=True is passed through to Gradio's launch)
demo.launch(share=True)

## Advanced Testing

Let's test the model with some more advanced scenarios.

In [None]:
# Test with context
# The passage is stored in `lora_context` rather than `context` so this cell
# does not rebind the `context` name that the interface cell uses for its
# Gradio Textbox component.
question = "What are the key hyperparameters?"
lora_context = """
LoRA (Low-Rank Adaptation) is a parameter-efficient fine-tuning technique that uses low-rank matrices
to adapt pre-trained models. The key hyperparameters in LoRA are:
- Rank (r): Controls the rank of the low-rank matrices
- Alpha (α): Scaling factor for LoRA updates
- Target modules: Which layers to apply LoRA to
- Dropout: Regularization parameter
"""

# Pass the passage as the second positional argument (chat's `context`)
response, _ = chat(question, lora_context)

print(f"Question: {question}")
print(f"Context: {lora_context}")
print(f"Response: {response}")

In [None]:
# Follow-up question that depends on an earlier exchange
history = [
    (
        "What is NeMo?",
        "NeMo (Neural Modules) is NVIDIA's framework for building, training, and fine-tuning neural networks for various AI tasks, including natural language processing.",
    ),
]
question = "How does it support LoRA fine-tuning?"

# No context here; only the prior (user, bot) turn is supplied
response, _ = chat(question, history=history)

print(
    f"History: {history}",
    f"Question: {question}",
    f"Response: {response}",
    sep="\n",
)