# Tasman Agentic Analytics - Phase 1 Demo

This notebook demonstrates the **local-first agentic analytics system** with minimal LLM usage.

**Key Features:**
- Rule-based triage (search vs analysis)
- Template-based SQL generation
- LLM fallback only when needed
- Automatic visualization
- Full observability of each step

---

## 1. Setup & Validation

Load configuration, connect to DuckDB, and validate schema.

In [None]:
import os
import sys
import json
from pathlib import Path
from pprint import pprint

import pandas as pd
import yaml
from IPython.display import display, Image, Markdown

# Add parent directory to path for imports
# NOTE(review): assumes the notebook is started from a direct subdirectory of
# the project root (hence Path.cwd().parent) - confirm; otherwise the `core`
# and `agents` imports that follow will not resolve.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

# Import our modules
from core.duckdb_connector import DuckDBConnector
from core.triage_local import LocalTriage
from core.local_text_to_sql import LocalTextToSQL
from core.llm_clients import LLMClient
from agents.agent_triage import TriageAgent
from agents.agent_text_to_sql import TextToSQLAgent
from agents.agent_search import SearchAgent

print("‚úÖ Imports successful")

In [None]:
# Load environment variables (best-effort: the notebook also runs without a .env)
_env_path = PROJECT_ROOT / ".env"
try:
    from dotenv import load_dotenv
    _env_loaded = load_dotenv(_env_path)
except Exception as e:
    # Deliberately broad: a missing python-dotenv package or an unreadable
    # file must not stop the notebook.
    print(f"‚ÑπÔ∏è  No .env file or dotenv not available: {e}")
else:
    # load_dotenv() does not raise when the file is absent; it returns False
    # when the file is missing or defines no variables. Only report success
    # when something was actually read.
    if _env_loaded:
        print("‚úÖ Environment loaded")
    else:
        print(f"‚ÑπÔ∏è  No environment variables loaded from {_env_path}")

# Project directory layout: config, data and the LLM cache hang off the
# project root, while outputs are written next to the current working directory.
CONFIG_DIR = PROJECT_ROOT / "config"
DATA_DIR = PROJECT_ROOT / "data"
CACHE_DIR = PROJECT_ROOT / ".cache" / "llm"
OUTPUT_DIR = Path.cwd() / "outputs"

# Only the cache and output directories are created on demand.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Project root: {PROJECT_ROOT}")
print(f"üìÅ Output dir: {OUTPUT_DIR}")

In [None]:
# Load configuration files
def load_yaml(path: Path):
    """Parse the YAML file at *path* and return the resulting Python object.

    Args:
        path: Location of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # default encoding.
    with open(path, 'r', encoding="utf-8") as f:
        return yaml.safe_load(f)

# YAML-based configuration
db_config = load_yaml(CONFIG_DIR / "db.yaml")
business_context = load_yaml(CONFIG_DIR / "business_context.yaml")
templates = load_yaml(CONFIG_DIR / "sql_templates.yaml")

# JSON is UTF-8 by specification; pass the encoding explicitly so the read
# does not depend on the platform's locale default.
with open(CONFIG_DIR / "schema.json", 'r', encoding="utf-8") as f:
    schema = json.load(f)

print("‚úÖ Configuration loaded")
print(f"   - {len(schema)} tables in schema")
print(f"   - {len(templates)} SQL templates")
print(f"   - {len(business_context['roles'])} roles configured")

In [None]:
# Read the database location and the row limit from db.yaml
DUCKDB_PATH = Path(db_config['duckdb_path']).expanduser().resolve()
DEFAULT_LIMIT = db_config.get('default_limit', 1000)

print(f"ü¶Ü DuckDB path: {DUCKDB_PATH}")
print(f"üî¢ Default limit: {DEFAULT_LIMIT}")

# A missing database file only produces a warning; execution continues
if DUCKDB_PATH.exists():
    print("‚úÖ DuckDB file exists")
else:
    print(f"\n‚ö†Ô∏è  WARNING: DuckDB file not found at {DUCKDB_PATH}")
    print("   You'll need to create this database before running queries.")
    print("   The notebook will continue but queries will fail.")

In [None]:
# Build the connector outside the try block so `db_connector` is bound even
# when the connection attempt below fails.
db_connector = DuckDBConnector(str(DUCKDB_PATH), default_limit=DEFAULT_LIMIT)

try:
    db_connector.connect()
    print("‚úÖ Connected to DuckDB")

    # Enumerate the tables the connector reports
    tables = db_connector.list_tables()
    print(f"\nüì¶ Tables in database ({len(tables)}):")
    for table_name in tables:
        print(f"   - {table_name}")

    # Check the database against the schema loaded from schema.json
    is_valid, errors = db_connector.validate_schema(schema)

    if not is_valid:
        print("\n‚ùå Schema validation failed:")
        for problem in errors:
            print(f"   - {problem}")
    else:
        print("\n‚úÖ Schema validation passed")

except Exception as e:
    # Any failure in connect/list/validate is reported here; nothing is re-raised
    print(f"‚ùå Database connection failed: {e}")
    print("   Notebook will continue but queries will fail.")

## 2. Configure LLM (Optional)

Set up the LLM client used as a fallback. The system works without an LLM for known query patterns.

In [None]:
# Provider selection and credentials all come from environment variables
MODEL_PROVIDER = os.getenv("MODEL_PROVIDER", "openai").lower()
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
ANTHROPIC_MODEL = os.getenv("ANTHROPIC_MODEL", "claude-3-5-haiku-20241022")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY")

print(f"ü§ñ LLM Provider: {MODEL_PROVIDER}")

# The client stays None unless a supported provider AND its API key are present
llm_client = None

# NOTE(review): OPENAI_MODEL / ANTHROPIC_MODEL are only printed in this cell
# and never passed to LLMClient - presumably the client reads them from the
# environment itself; confirm.
try:
    # Settings shared by both providers
    _llm_settings = {
        "cache_dir": CACHE_DIR,
        "temperature": 0.2,
        "max_tokens": 512,
    }

    if MODEL_PROVIDER == "openai" and OPENAI_API_KEY:
        llm_client = LLMClient(provider="openai", **_llm_settings)
        print(f"‚úÖ OpenAI client initialized (model: {OPENAI_MODEL})")

    elif MODEL_PROVIDER == "anthropic" and ANTHROPIC_API_KEY:
        llm_client = LLMClient(provider="anthropic", **_llm_settings)
        print(f"‚úÖ Anthropic client initialized (model: {ANTHROPIC_MODEL})")

    else:
        print("‚ÑπÔ∏è  No API key found - running in local-only mode")
        print("   Templates will handle known queries; unknown queries will fail gracefully.")

except Exception as e:
    # Client construction is optional: report the failure and fall back to None
    print(f"‚ö†Ô∏è  LLM initialization failed: {e}")
    print("   Continuing in local-only mode.")
    llm_client = None

## 3. Initialize Agents

Set up triage, text-to-SQL, and search agents.

In [None]:
# Keyword arguments that the triage and text-to-SQL agents have in common
_shared_agent_kwargs = {
    "business_context": business_context,
    "llm_client": llm_client,
    "llm_threshold": 0.6,
}

triage_agent = TriageAgent(**_shared_agent_kwargs)

text_to_sql_agent = TextToSQLAgent(
    templates=templates,
    schema=schema,
    default_limit=DEFAULT_LIMIT,
    **_shared_agent_kwargs,
)

# The search agent is handed the two agents above plus the database connector
search_agent = SearchAgent(
    triage_agent=triage_agent,
    text_to_sql_agent=text_to_sql_agent,
    db_connector=db_connector,
    output_dir=OUTPUT_DIR,
)

print("‚úÖ Agents initialized")
print("   - TriageAgent: Rule-based + optional LLM fallback")
print("   - TextToSQLAgent: Template matching + optional LLM")
print("   - SearchAgent: End-to-end orchestration")

## 4. Business Context & Templates

Explore available roles, KPIs, and SQL templates.

In [None]:
# For every configured role, print its KPIs, its first five dimensions and
# its default time window
print("üë• Available Roles:\n")

for name, cfg in business_context['roles'].items():
    heading = name.upper()
    print(f"**{heading}**")
    kpi_text = ', '.join(cfg['kpis'])
    print(f"  KPIs: {kpi_text}")
    dim_text = ', '.join(cfg['dims'][:5])
    print(f"  Dimensions: {dim_text}...")
    window_days = cfg['defaults']['time_window_days']
    print(f"  Time window: {window_days} days")
    print()

In [None]:
# Numbered listing of the SQL templates with up to two sample utterances each
print("üìã Available SQL Templates:\n")

for position, tpl in enumerate(templates, start=1):
    print(f"{position}. **{tpl['id']}**")
    role_hint = tpl.get('role_hint', 'any')
    print(f"   Role: {role_hint}")
    print("   Example questions:")
    for sample in tpl['utterances'][:2]:
        print(f'     - "{sample}"')
    print()

## 5. Ask Questions

Now let's ask questions and see the system in action!

In [None]:
# Helper function to display results
def display_result(result):
    """Print a search result: header, pipeline steps, status, then the payload.

    Args:
        result: Dict describing one query run (the notebook passes the return
            value of ``search_agent.search``). Keys read here: ``question``,
            ``role``, ``steps``, ``status``, ``message``, ``errors``, ``sql``,
            ``data``, ``row_count``, ``chart_path``, ``chart_type`` and
            ``summary``. All of them are optional; a missing or ``None``
            value is skipped or replaced by a placeholder instead of raising.

    Returns:
        None. Text goes to stdout; the data preview and the chart are shown
        through IPython's ``display``.
    """
    print("="*80)
    print(f"QUESTION: {result.get('question', 'unknown')}")
    print(f"ROLE: {result.get('role', 'auto-detected')}")
    print("="*80)

    # Show steps (a failed run may carry no step list at all)
    print("\nüìã STEPS:")
    for step in result.get('steps') or []:
        step_name = str(step.get('step', 'unknown')).replace('_', ' ').title()
        llm_indicator = "ü§ñ (LLM)" if step.get('used_llm', False) else "‚ö° (Local)"
        print(f"  {step_name}: {llm_indicator}")
        # A None confidence cannot be formatted with :.2f, so skip it
        if step.get('confidence') is not None:
            print(f"    Confidence: {step['confidence']:.2f}")
        if 'method' in step:
            print(f"    Method: {step['method']}")

    # Show status
    status = result.get('status', 'unknown')
    print(f"\nüìä STATUS: {status}")

    # Anything other than success: print the message and errors, then stop
    if status != 'success':
        print(f"\n‚ö†Ô∏è  {result.get('message', 'Unknown error')}")
        for error in result.get('errors') or []:
            print(f"   - {error}")
        return

    # Show SQL
    if 'sql' in result:
        print("\nüíæ SQL QUERY:")
        print("```sql")
        print(result['sql'])
        print("```")

    # Show data preview (first 10 rows); tolerate a missing or None frame
    data = result.get('data')
    if data is not None and not data.empty:
        row_count = result.get('row_count', len(data))
        print(f"\nüìä RESULTS ({row_count} rows):")
        display(data.head(10))

    # Show visualization
    if result.get('chart_path'):
        print(f"\nüìà VISUALIZATION ({result.get('chart_type', 'chart')}):")
        display(Image(filename=str(result['chart_path'])))

    # Show summary
    if 'summary' in result:
        print("\nüìù SUMMARY:")
        print(f"   {result['summary']}")

    print("\n" + "="*80 + "\n")

### Example 1: Template-matched query (local, no LLM)

In [None]:
# This should match a template and NOT call LLM
# (the STEPS section printed by display_result shows per step whether the LLM
# was used, so the expectation can be checked in the output)
result = search_agent.search(
    question="show ad spend per channel over time",
    role="marketer"
)

display_result(result)

### Example 2: Another template-matched query

In [None]:
# Second query with the role passed explicitly ("marketer") rather than
# left to auto-detection
result = search_agent.search(
    question="which campaigns have the best click-through rate",
    role="marketer"
)

display_result(result)

### Example 3: CEO perspective query

In [None]:
# Same flow as the previous examples, but asked with role="ceo"
result = search_agent.search(
    question="revenue by product category",
    role="ceo"
)

display_result(result)

### Example 4: Novel query (may trigger LLM if available)

In [None]:
# This is a novel query not in templates - will try LLM if available
# (when no LLM client was configured earlier, expect a non-success status,
# which display_result reports together with its message)
result = search_agent.search(
    question="what are the top 10 products by revenue in the last 30 days?",
    role="cpo"
)

display_result(result)

### Example 5: Auto role detection

In [None]:
# No role specified - system will infer from question content
# (display_result prints "auto-detected" when the result carries no role)
result = search_agent.search(
    question="conversion rate by device"
)

display_result(result)

## 6. Observability & Debugging

Inspect the internal workings of the system.

In [None]:
# Detailed view of triage result
# `test_question` is kept as a module-level name so other cells can reuse it
test_question = "show me the top spending campaigns"

# Call the triage agent directly (bypassing search_agent) and dump its raw output
triage_result = triage_agent.triage(test_question, role="marketer")
print("üîç TRIAGE RESULT:\n")
pprint(triage_result)

In [None]:
# Detailed view of SQL generation
# Requires `test_question` to be defined already (it is set in the triage
# inspection cell), so run that cell first
sql_result = text_to_sql_agent.generate_sql(test_question, role="marketer")
print("üîç SQL GENERATION RESULT:\n")
pprint(sql_result)

## 7. Test Cells

Quick smoke tests to ensure everything works.

In [None]:
# Smoke tests: each section asserts and stops the cell at the first failure.
# They rebind the module-level names `is_valid`, `errors` and `result`.

# Test 1: Schema validation
print("TEST 1: Schema Validation")
is_valid, errors = db_connector.validate_schema(schema)
assert is_valid, f"Schema validation failed: {errors}"
print("‚úÖ PASSED\n")

# Test 2: Template coverage
# Each (question, role) pair must yield SQL via the 'template_match' method
print("TEST 2: Template Coverage")
test_questions = [
    ("spend by channel", "marketer"),
    ("ctr by campaign", "marketer"),
    ("orders by category", "ceo"),
]

for question, role in test_questions:
    result = text_to_sql_agent.generate_sql(question, role)
    assert result['sql'] is not None, f"Failed to generate SQL for: {question}"
    assert result['method'] == 'template_match', f"Expected template match for: {question}"
    print(f"  ‚úì {question}")

print("‚úÖ PASSED\n")

# Test 3: Triage classification
# NOTE(review): triage_agent was built with an optional llm_client, so these
# checks may go through the LLM fallback when one is configured - confirm the
# expected modes are stable in that case.
print("TEST 3: Triage Classification")
search_questions = ["show me revenue", "how many orders", "plot spend by channel"]
analysis_questions = ["why did revenue drop", "what drives conversion", "segment customers"]

# Questions expected to be classified as mode 'search'
for q in search_questions:
    result = triage_agent.triage(q)
    assert result['mode'] == 'search', f"Expected 'search' for: {q}"
    print(f"  ‚úì '{q}' ‚Üí search")

# Questions expected to be classified as mode 'analysis'
for q in analysis_questions:
    result = triage_agent.triage(q)
    assert result['mode'] == 'analysis', f"Expected 'analysis' for: {q}"
    print(f"  ‚úì '{q}' ‚Üí analysis")

print("‚úÖ PASSED\n")

print("üéâ ALL TESTS PASSED")

## 8. Interactive Query Interface

Try your own questions!

In [None]:
# Interactive query: edit the two values below, then re-run the cell
YOUR_QUESTION = "show me conversion rate by device"  # <-- change this
# Options: marketer, ceo, cpo, coo, or None for auto-detect
# (the string "None" is accepted as well)
YOUR_ROLE = "marketer"  # <-- change this

# Turn the literal string "None" into a real None before calling the agent
_role = None if YOUR_ROLE == "None" else YOUR_ROLE

result = search_agent.search(question=YOUR_QUESTION, role=_role)

display_result(result)

## 9. Cache Statistics

Inspect the on-disk LLM cache: count the cached responses and list the most recent entries.

In [None]:
# Count cache files and preview the five most recently modified entries
if CACHE_DIR.exists():
    cache_files = list(CACHE_DIR.glob("*.json"))
    print("üì¶ LLM Cache Statistics:")
    print(f"   Total cached responses: {len(cache_files)}")
    print(f"   Cache directory: {CACHE_DIR}")

    if cache_files:
        print("\n   Recent cache entries:")
        newest_first = sorted(cache_files, key=lambda f: f.stat().st_mtime, reverse=True)
        for cache_file in newest_first[:5]:
            try:
                # NOTE(review): assumes cache files are UTF-8 (or plain ASCII)
                # JSON - confirm against the code that writes them.
                with open(cache_file, 'r', encoding="utf-8") as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
                # one truncated or unreadable entry must not abort the listing.
                print(f"     - {cache_file.name}: unreadable ({e})")
                continue

            # Only dict payloads can carry 'key_parts'; anything else is skipped
            key_parts = data.get('key_parts', []) if isinstance(data, dict) else []
            if len(key_parts) > 2:
                # str() guards the slice against a non-string last element
                print(f"     - {key_parts[0]} call: {str(key_parts[-1])[:50]}...")
else:
    print("‚ÑπÔ∏è  No cache directory found")

## 10. Cleanup

Close database connection.

In [None]:
# Close database connection
# NOTE(review): this runs even if the earlier connect() call failed -
# presumably DuckDBConnector.close() tolerates a never-opened connection; confirm.
db_connector.close()
print("‚úÖ Database connection closed")