In [1]:
# %%
# If running in a fresh environment, uncomment the next cell once to install deps:
# !pip install -qU langchain langchain-community langchain-text-splitters langchain-huggingface \
#                 transformers accelerate sentence-transformers faiss-cpu sqlalchemy pandas tabulate sqlparse

import os
import re
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple

import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine
import sqlparse

# LangChain bits
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_huggingface import HuggingFacePipeline
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings

# HF Transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# %%
# Setup Ollama model (make sure Ollama is running locally)
try:
    # Try the new recommended import first
    from langchain_ollama import OllamaLLM
    print("Using new langchain_ollama.OllamaLLM")
except ImportError:
    # Fallback to the old import if new package not available
    from langchain_community.llms import Ollama as OllamaLLM
    print("Using legacy langchain_community.llms.Ollama")

# Configure Ollama model
MODEL_NAME = "qwen2.5-coder:7b"  # or whatever model you have installed in Ollama
# Common alternatives: "codellama", "deepseek-coder", "qwen2.5-coder", "llama3.1"

print(f"Connecting to Ollama model: {MODEL_NAME}")
print("Make sure Ollama is running locally (ollama serve)")

# Use only valid parameters for Ollama
llm = OllamaLLM(
    model=MODEL_NAME,
    temperature=0.2,
    # Increased timeout to handle complex queries
    timeout=300,  # 5 minutes timeout (was default 60 seconds)
    # Valid parameters for Ollama
    num_predict=1024,  # max tokens - increased for complex queries
    top_p=0.9,
    repeat_penalty=1.05,
    # Additional valid parameters
    num_ctx=4096,  # context window
)

# Test the connection with a simple query
print("Testing Ollama connection...")
try:
    test_response = llm.invoke("Hello, respond with just 'OK'")
    print("Ollama connection successful!")
    print(f"Test response: {test_response[:50]}...")
except Exception as e:
    print(f"Ollama connection failed: {e}")
    print("Please ensure Ollama is running: ollama serve")
    raise

print("Setup complete!")

Using new langchain_ollama.OllamaLLM
Connecting to Ollama model: qwen2.5-coder:7b
Make sure Ollama is running locally (ollama serve)
Testing Ollama connection...
Ollama connection successful!
Test response: OK...
Setup complete!


In [3]:

# Use the existing documentation file or create path to our database documentation
SCHEMA_DOC_PATH = "../DATABASE_DOCUMENTATION.txt"

# Check if our documentation exists, if not use the existing one
import os
if not os.path.exists(SCHEMA_DOC_PATH):
    SCHEMA_DOC_PATH = "../data/data_documentation.txt"
    print(f"Using existing documentation: {SCHEMA_DOC_PATH}")
else:
    print(f"Using database documentation: {SCHEMA_DOC_PATH}")

# Load text doc
loader = TextLoader(SCHEMA_DOC_PATH, encoding="utf-8")
docs = loader.load()

# Chunk for retrieval
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)
chunks = splitter.split_documents(docs)

# Embeddings
embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Vector store
vectorstore = FAISS.from_documents(chunks, embed)
retriever = vectorstore.as_retriever(search_kwargs={"k": 6})

print(f"Loaded {len(docs)} documents, split into {len(chunks)} chunks")
print("RAG system ready!")

Using existing documentation: ../data/data_documentation.txt
Loaded 1 documents, split into 15 chunks
RAG system ready!


In [4]:
# %%
# Connect to the Employee Information database
SQLALCHEMY_DATABASE_URI = "sqlite:///../data/Employee_Information.db"

engine: Engine = create_engine(SQLALCHEMY_DATABASE_URI, future=True)

# Test the connection
with engine.connect() as conn:
    result = conn.execute(text("SELECT name FROM sqlite_master WHERE type='table'"))
    tables = [row[0] for row in result.fetchall()]
    print(f"Connected successfully! Found tables: {tables}")

Connected successfully! Found tables: ['users', 'departments', 'employee_records']


In [5]:
# %%
PLANNER_SYSTEM = """You are a precise data analysis planner.
Break a user's request into the smallest possible, sequential SQL sub-tasks that can be executed one-by-one.
Each sub-task should have a clear purpose and a short description.
Return a numbered list. Avoid overlap; each step should build on prior outputs if needed.
If a sub-task needs a temporary view/CTE, specify it succinctly.
"""

PLANNER_USER_TEMPLATE = """User request:
{user_request}

Schema context (selected excerpts):
{schema_context}

Return a list like:
1) <title> — <what this step does>
2) ...
Keep it under {max_steps} steps if possible.
"""

SQL_SYSTEM = """You are a senior analytics engineer who writes correct, dialect-appropriate SQL.
You will generate SQL for a single sub-task at a time, using the provided schema context and any prior step outputs.
Constraints:
- Prefer ANSI SQL; avoid vendor-specific features unless necessary.
- Use explicit column lists rather than SELECT * when possible.
- If building on previous outputs, reference them via CTEs using the `WITH` clause and include minimal necessary columns.
- DO NOT guess columns/tables that do not exist in the schema snippets provided.
- Keep queries idempotent and safe to run.
Return ONLY the SQL code; no commentary.
"""

SQL_USER_TEMPLATE = """Database dialect: auto-detect (assume SQLite unless otherwise specified).
Current sub-task: {subtask_text}

Relevant schema snippets:
{schema_context}

If there are prior steps, here are the prior SQL snippets that produced intermediate outputs (if any):
{prior_sql}

Now produce a single SQL statement that accomplishes ONLY this sub-task.
If you must build on previous results, reuse them via CTEs inline (restate them minimally).
Return only SQL.
"""


In [6]:
# %%
@dataclass
class StepResult:
    step_id: int
    title: str
    description: str
    sql: Optional[str] = None
    result_df: Optional[pd.DataFrame] = None
    error: Optional[str] = None


def call_llm(system_prompt: str, user_prompt: str, max_retries: int = 3) -> str:
    """Small helper to call the LC LLM with system+human messages, with retry logic for timeouts."""
    msgs = [SystemMessage(content=system_prompt), HumanMessage(content=user_prompt)]
    
    for attempt in range(max_retries):
        try:
            print(f"[LLM] Calling Ollama (attempt {attempt + 1}/{max_retries})...")
            out = llm.invoke(msgs)
            if hasattr(out, "content"):
                print(f"[LLM] Response received successfully")
                return out.content.strip()
            # Fallback for HF text
            print(f"[LLM] Response received successfully (fallback format)")
            return str(out).strip()
            
        except Exception as e:
            error_msg = str(e)
            if "Read timed out" in error_msg or "timeout" in error_msg.lower():
                print(f"[LLM] Timeout on attempt {attempt + 1}/{max_retries}: {error_msg}")
                if attempt < max_retries - 1:
                    print(f"[LLM] Retrying in 5 seconds...")
                    import time
                    time.sleep(5)
                    continue
                else:
                    print(f"[LLM] All retry attempts failed due to timeout")
                    raise TimeoutError(f"Ollama failed to respond after {max_retries} attempts due to timeout")
            else:
                print(f"[LLM] Non-timeout error: {error_msg}")
                raise
    
    # Should not reach here
    raise Exception("Unexpected error in call_llm")


def retrieve_schema_context(query_text: str, k: int = 6) -> str:
    """Grab top-k schema chunks for the planning or generation step."""
    # Updated to use invoke instead of get_relevant_documents
    rel_docs = retriever.invoke(query_text)
    return "\n---\n".join(d.page_content for d in rel_docs[:k])


def parse_numbered_list(text: str) -> List[Tuple[str, str]]:
    """
    Parse "1) Title — description" style list into [(title, desc), ...].
    Quite forgiving to variations.
    """
    lines = [ln.strip() for ln in text.splitlines() if ln.strip()]
    steps = []
    for ln in lines:
        m = re.match(r"^\s*\d+\)?[.)]?\s*(.+)$", ln)
        if not m:
            continue
        body = m.group(1)
        # split on em dash or hyphen dash types
        parts = re.split(r"\s+—\s+|\s+-\s+|:\s+", body, maxsplit=1)
        if len(parts) == 2:
            steps.append((parts[0].strip(), parts[1].strip()))
        else:
            steps.append((body.strip(), ""))  # only title
    return steps


def sql_is_plausible(sql: str) -> bool:
    """Very light static check using sqlparse (ensures at least one statement, no obvious empties)."""
    parsed = sqlparse.parse(sql)
    return len(parsed) >= 1 and any(tok.ttype is not None or tok.is_group for tok in parsed[0].tokens)


def execute_sql(engine: Engine, sql: str, max_rows: int = 500, timeout_seconds: int = 60) -> pd.DataFrame:
    """
    Execute SQL directly with timeout protection; return DataFrame. Limits rows to avoid runaway results.
    Any errors will be raised as exceptions.
    """
    import signal
    from contextlib import contextmanager
    
    @contextmanager
    def sql_timeout(duration):
        """Context manager to timeout SQL operations"""
        def timeout_handler(signum, frame):
            raise TimeoutError(f"SQL execution timed out after {duration} seconds")
        
        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(duration)
        try:
            yield
        finally:
            signal.alarm(0)
    
    # Clean the SQL - remove trailing semicolons and whitespace
    clean_sql = sql.strip().rstrip(';').strip()
    
    # Check if query already has LIMIT clause
    sql_upper = clean_sql.upper()
    if 'LIMIT' not in sql_upper:
        # Add LIMIT clause safely
        limited_sql = f"SELECT * FROM ({clean_sql}) AS limited_query LIMIT {max_rows}"
    else:
        # Query already has LIMIT, use as-is
        limited_sql = clean_sql
    
    # Execute with timeout protection
    with sql_timeout(timeout_seconds):
        with engine.connect() as conn:
            df = pd.read_sql(text(limited_sql), conn)
    return df


def generate_sql_for_subtask(subtask_text: str, prior_sql: List[str], user_request: str) -> str:
    schema_ctx = retrieve_schema_context(user_request + "\n\n" + subtask_text)
    prior = "\n\n".join([f"-- Step {i+1}\n{q}" for i, q in enumerate(prior_sql)]) if prior_sql else "(none)"
    user_prompt = SQL_USER_TEMPLATE.format(
        subtask_text=subtask_text,
        schema_context=schema_ctx,
        prior_sql=prior
    )
    raw = call_llm(SQL_SYSTEM, user_prompt)
    # Extract fenced code if LLM returned it with backticks
    code_block = re.search(r"```(?:sql)?\s*(.*?)```", raw, re.DOTALL | re.IGNORECASE)
    sql = code_block.group(1).strip() if code_block else raw.strip()
    return sql

In [7]:
# %%
def plan_subtasks(user_request: str, max_steps: int = 4) -> List[Tuple[str, str]]:
    print(f"[PLANNING] Starting task planning for: {user_request}")
    schema_ctx = retrieve_schema_context(user_request)
    print(f"[PLANNING] Retrieved schema context ({len(schema_ctx)} characters)")
    
    user_prompt = PLANNER_USER_TEMPLATE.format(
        user_request=user_request,
        schema_context=schema_ctx,
        max_steps=max_steps
    )
    print(f"[PLANNING] Calling LLM for task breakdown...")
    plan_text = call_llm(PLANNER_SYSTEM, user_prompt)
    
    steps = parse_numbered_list(plan_text)
    if not steps:
        print(f"[PLANNING] No steps parsed, using default single-step approach")
        steps = [("Single-step query", "Directly produce the final answer in one query.")]
    
    print(f"[PLANNING] Generated {len(steps)} steps:")
    for i, (title, desc) in enumerate(steps, 1):
        print(f"[PLANNING]   Step {i}: {title}")
    
    return steps[:max_steps]


def run_agent(user_request: str,
              engine: Engine,
              max_steps: int = 4,
              per_step_retries: int = 2) -> List[StepResult]:
    print(f"[AGENT] Starting agent execution")
    print(f"[AGENT] Configuration: max_steps={max_steps}, retries={per_step_retries}")
    
    steps = plan_subtasks(user_request, max_steps=max_steps)
    results: List[StepResult] = []
    prior_sql: List[str] = []

    for i, (title, desc) in enumerate(steps, start=1):
        print(f"\n[STEP {i}] Processing: {title}")
        print(f"[STEP {i}] Description: {desc}")
        
        sr = StepResult(step_id=i, title=title, description=desc)

        # Generate initial SQL
        print(f"[STEP {i}] Generating initial SQL...")
        sql = generate_sql_for_subtask(f"{title} — {desc}", prior_sql, user_request)
        sr.sql = sql
        print(f"[STEP {i}] Generated SQL: {sql[:100]}...")

        # Track all attempts for debugging
        attempted_queries = [sql]
        attempt = 0
        success = False
        final_error = None

        while attempt < per_step_retries and not success:
            attempt += 1
            current_sql = attempted_queries[-1]
            print(f"[STEP {i}] Attempt {attempt}/{per_step_retries}")

            # Check SQL plausibility first
            print(f"[STEP {i}] Validating SQL syntax...")
            if not sql_is_plausible(current_sql):
                print(f"[STEP {i}] SQL syntax validation FAILED")
                syntax_error = "SQL syntax validation failed. Please check for proper SQL syntax, missing keywords, or malformed statements."
                
                if attempt < per_step_retries:
                    print(f"[STEP {i}] Generating corrected SQL for syntax error...")
                    corrected_sql = generate_sql_for_subtask(
                        f"(Fix syntax error - Attempt {attempt}) {title} — {desc}. "
                        f"Previous failed query: {current_sql}. "
                        f"Error: {syntax_error}", 
                        prior_sql, user_request
                    )
                    attempted_queries.append(corrected_sql)
                    sr.sql = corrected_sql
                    print(f"[STEP {i}] Generated corrected SQL: {corrected_sql[:100]}...")
                    continue
                else:
                    print(f"[STEP {i}] Max retries reached for syntax errors")
                    final_error = f"SQL syntax validation failed after {per_step_retries} attempts. "
                    final_error += f"Attempted queries: {'; '.join(attempted_queries)}"
                    break

            # Try to execute SQL
            print(f"[STEP {i}] SQL syntax validation PASSED")
            print(f"[STEP {i}] Executing SQL query...")
            
            try:
                df_result = execute_sql(engine, current_sql, max_rows=500)
                print(f"[STEP {i}] SQL execution SUCCESSFUL - {len(df_result)} rows returned")
                success = True
                sr.result_df = df_result
            except Exception as e:
                execution_error = str(e)
                print(f"[STEP {i}] SQL execution FAILED: {execution_error}")
                
                if attempt < per_step_retries:
                    print(f"[STEP {i}] Generating corrected SQL for execution error...")
                    corrected_sql = generate_sql_for_subtask(
                        f"(Fix execution error - Attempt {attempt}) {title} — {desc}. "
                        f"Previous failed query: {current_sql}. "
                        f"Execution error: {execution_error}", 
                        prior_sql, user_request
                    )
                    attempted_queries.append(corrected_sql)
                    sr.sql = corrected_sql
                    print(f"[STEP {i}] Generated corrected SQL: {corrected_sql[:100]}...")
                else:
                    print(f"[STEP {i}] Max retries reached for execution errors")
                    final_error = f"SQL execution failed after {per_step_retries} attempts. "
                    final_error += f"Last error: {execution_error}. "
                    final_error += f"Attempted queries: {'; '.join(attempted_queries)}"

        # Set error if not successful
        if not success:
            print(f"[STEP {i}] Step FAILED after all retries")
            sr.error = final_error
        else:
            print(f"[STEP {i}] Step completed SUCCESSFULLY")

        results.append(sr)

        # Only append prior SQL if success
        if success and sr.sql:
            prior_sql.append(sr.sql)
            print(f"[STEP {i}] Added successful SQL to context for next steps")
        else:
            print(f"[AGENT] Pipeline stopped at step {i} due to repeated failures")
            print(f"[AGENT] Failed queries attempted: {attempted_queries}")
            break

    print(f"\n[AGENT] Execution completed - {len([r for r in results if r.error is None])} successful steps")
    return results

In [8]:
# %%
from IPython.display import display, Markdown

def display_pipeline(outputs: List[StepResult]):
    for sr in outputs:
        display(Markdown(f"### Step {sr.step_id}: {sr.title}"))
        if sr.description:
            display(Markdown(f"> {sr.description}"))
        if sr.sql:
            display(Markdown("**SQL used:**"))
            display(Markdown(f"```sql\n{sr.sql}\n```"))
        if sr.error:
            display(Markdown(f"**Error:** `{sr.error}`"))
        elif sr.result_df is not None:
            display(Markdown("**Result Preview (up to 500 rows):**"))
            display(sr.result_df.head(25))  # show first rows nicely
        display(Markdown("---"))

def display_final(outputs: List[StepResult]):
    # Final successful step
    successful = [o for o in outputs if o.error is None and o.result_df is not None]
    if not successful:
        display(Markdown("## Final Result\nNo successful results to display."))
        return
    last = successful[-1]
    display(Markdown("## Final Result"))
    display(Markdown("**Query that produced the final result:**"))
    display(Markdown(f"```sql\n{last.sql}\n```"))
    display(last.result_df)

In [9]:
# %%
# Example requests (customize these based on your schema):
# user_request = "Show average salary by department with employee count."
# user_request = "Top 10 highest paid employees with their department and performance score."
# user_request = "Department budget vs actual salary costs with variance analysis."

user_request = "Top 10 highest paid employees with their department, performance score, and hire date."

outputs = run_agent(
    user_request=user_request,
    engine=engine,
    max_steps=4,
    per_step_retries=2,
)

display_pipeline(outputs)
display_final(outputs)

[AGENT] Starting agent execution
[AGENT] Configuration: max_steps=4, retries=2
[PLANNING] Starting task planning for: Top 10 highest paid employees with their department, performance score, and hire date.
[PLANNING] Retrieved schema context (3322 characters)
[PLANNING] Calling LLM for task breakdown...
[LLM] Calling Ollama (attempt 1/3)...
[LLM] Response received successfully (fallback format)
[PLANNING] Generated 4 steps:
[PLANNING]   Step 1: **Filter Employees by Performance Score:**
[PLANNING]   Step 2: **Order Employees by Salary:**
[PLANNING]   Step 3: **Limit to Top 10 Employees:**
[PLANNING]   Step 4: **Retrieve Final Result:**

[STEP 1] Processing: **Filter Employees by Performance Score:**
[STEP 1] Description: 
[STEP 1] Generating initial SQL...
[LLM] Calling Ollama (attempt 1/3)...
[LLM] Response received successfully (fallback format)
[STEP 1] Generated SQL: WITH high_performance_employees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name...
[STEP 1] Atte

### Step 1: **Filter Employees by Performance Score:**

**SQL used:**

```sql
WITH high_performance_employees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name
    FROM employee_records er
    JOIN users u ON er.user_id = u.id
    JOIN departments d ON er.dept_id = d.dept_id
    WHERE er.performance_score >= 4.0
)
SELECT * FROM high_performance_employees;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,name,performance_score,salary,dept_name
0,John Jackson,4.32,62655,Marketing
1,Justin Clark,4.65,74797,Engineering
2,Matthew Garcia,4.55,43761,Marketing
3,Justin Ramirez,4.89,48567,Sales
4,Jason Anderson,4.96,54947,Human Resources
5,Brandon Lewis,4.74,78630,Operations
6,Robert Garcia,4.44,44182,Marketing
7,Andrew Robinson,4.83,93091,Finance
8,David Miller,4.09,86853,Engineering
9,Sarah Robinson,4.79,42745,Marketing


---

### Step 2: **Order Employees by Salary:**

**SQL used:**

```sql
WITH high_performance_employees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name
    FROM employee_records er
    JOIN users u ON er.user_id = u.id
    JOIN departments d ON er.dept_id = d.dept_id
    WHERE er.performance_score >= 4.0
)
SELECT name, performance_score, salary, dept_name
FROM high_performance_employees
ORDER BY salary DESC;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,name,performance_score,salary,dept_name
0,Andrew Robinson,4.26,108054,Engineering
1,Robert Robinson,4.67,106400,Research & Development
2,Michael Davis,4.71,106388,Engineering
3,Kimberly Smith,4.37,104944,Engineering
4,Brandon Smith,4.75,103009,Research & Development
5,Tyler King,4.13,101771,Research & Development
6,Jane Lopez,4.34,101484,Research & Development
7,John Thompson,4.86,99538,Research & Development
8,Brandon Rodriguez,4.73,98275,Research & Development
9,Michelle Wilson,4.4,97570,Research & Development


---

### Step 3: **Limit to Top 10 Employees:**

**SQL used:**

```sql
WITH high_performance_employees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name
    FROM employee_records er
    JOIN users u ON er.user_id = u.id
    JOIN departments d ON er.dept_id = d.dept_id
    WHERE er.performance_score >= 4.0
)
SELECT name, performance_score, salary, dept_name
FROM high_performance_employees
ORDER BY salary DESC
LIMIT 10;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,name,performance_score,salary,dept_name
0,Andrew Robinson,4.26,108054,Engineering
1,Robert Robinson,4.67,106400,Research & Development
2,Michael Davis,4.71,106388,Engineering
3,Kimberly Smith,4.37,104944,Engineering
4,Brandon Smith,4.75,103009,Research & Development
5,Tyler King,4.13,101771,Research & Development
6,Jane Lopez,4.34,101484,Research & Development
7,John Thompson,4.86,99538,Research & Development
8,Brandon Rodriguez,4.73,98275,Research & Development
9,Michelle Wilson,4.4,97570,Research & Development


---

### Step 4: **Retrieve Final Result:**

**SQL used:**

```sql
WITH high_performance_employees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name
    FROM employee_records er
    JOIN users u ON er.user_id = u.id
    JOIN departments d ON er.dept_id = d.dept_id
    WHERE er.performance_score >= 4.0
)
SELECT name, performance_score, salary, dept_name
FROM high_performance_employees
ORDER BY performance_score DESC;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,name,performance_score,salary,dept_name
0,Michael Thompson,4.98,74892,Human Resources
1,Samantha White,4.98,51166,Customer Support
2,Justin Ramirez,4.97,58402,Finance
3,Jason Anderson,4.96,54947,Human Resources
4,Joshua Robinson,4.95,61344,Operations
5,Jason Lee,4.94,63365,Marketing
6,Jonathan Taylor,4.93,68162,Marketing
7,Emily Williams,4.92,56778,Marketing
8,Amanda Robinson,4.9,46321,Sales
9,Nicole Martinez,4.9,73401,Engineering


---

## Final Result

**Query that produced the final result:**

```sql
WITH high_performance_employees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name
    FROM employee_records er
    JOIN users u ON er.user_id = u.id
    JOIN departments d ON er.dept_id = d.dept_id
    WHERE er.performance_score >= 4.0
)
SELECT name, performance_score, salary, dept_name
FROM high_performance_employees
ORDER BY performance_score DESC;
```

Unnamed: 0,name,performance_score,salary,dept_name
0,Michael Thompson,4.98,74892,Human Resources
1,Samantha White,4.98,51166,Customer Support
2,Justin Ramirez,4.97,58402,Finance
3,Jason Anderson,4.96,54947,Human Resources
4,Joshua Robinson,4.95,61344,Operations
...,...,...,...,...
102,Jason King,4.05,80733,Engineering
103,Megan Flores,4.02,77332,Human Resources
104,Daniel Clark,4.02,83532,Customer Support
105,Brian White,4.01,64834,Marketing


In [10]:
# %%
import signal
import logging
import sys
from contextlib import contextmanager
from datetime import datetime
import os

# Setup logging to file
log_file_path = os.path.join(os.path.dirname(os.path.abspath("")), "notebooks", "query_automation_logs.txt")

# Create a custom logger
logger = logging.getLogger('query_automation')
logger.setLevel(logging.INFO)

# Remove any existing handlers to avoid duplicates
for handler in logger.handlers[:]:
    logger.removeHandler(handler)

# Create file handler that appends to the log file
file_handler = logging.FileHandler(log_file_path, mode='a')
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)

# Create console handler for stdout
console_handler = logging.StreamHandler(sys.stdout)
console_formatter = logging.Formatter('%(message)s')
console_handler.setFormatter(console_formatter)

# Add handlers to logger
logger.addHandler(file_handler)
logger.addHandler(console_handler)

def log_and_print(message, level='info'):
    """Log message to both file and console"""
    if level.lower() == 'info':
        logger.info(message)
    elif level.lower() == 'error':
        logger.error(message)
    elif level.lower() == 'warning':
        logger.warning(message)

@contextmanager
def timeout(duration):
    """Context manager to timeout operations"""
    def timeout_handler(signum, frame):
        raise TimeoutError(f"Operation timed out after {duration} seconds")
    
    signal.signal(signal.SIGALRM, timeout_handler)
    signal.alarm(duration)
    try:
        yield
    finally:
        signal.alarm(0)

# Test with different queries to see the improved error handling
def test_query(query_text: str):
    """Test function to run a query and show detailed results"""
    log_and_print(f"Testing query: {query_text}")
    log_and_print("=" * 50)
    
    try:
        with timeout(120):  # 2 minute timeout
            outputs = run_agent(
                user_request=query_text,
                engine=engine,
                max_steps=3,
                per_step_retries=2,
            )
        
        # Display results
        display_pipeline(outputs)
        display_final(outputs)
        
        # Summary of what happened
        log_and_print("\nSUMMARY:")
        success_count = 0
        for output in outputs:
            status = "SUCCESS" if output.error is None else "FAILED"
            log_and_print(f"Step {output.step_id}: {status}")
            if output.error:
                log_and_print(f"  Error: {output.error}")
            else:
                success_count += 1
        
        log_and_print(f"Test completed: {success_count}/{len(outputs)} steps successful")
        return True
        
    except TimeoutError as e:
        error_msg = f"TIMEOUT ERROR: {e}"
        log_and_print(error_msg, 'error')
        return False
    except Exception as e:
        error_msg = f"UNEXPECTED ERROR: {e}"
        log_and_print(error_msg, 'error')
        return False

# Test Case 1: Current - Basic query with joins
def test_case_1():
    """Test Case 1: Basic query requiring joins between tables"""
    log_and_print("\n" + "="*80)
    log_and_print("TEST CASE 1: Basic Query with Joins")
    log_and_print("="*80)
    return test_query("Show average salary by department with employee count and department head.")

# Test Case 2: Complex query with multiple joins and CTEs
def test_case_2():
    """Test Case 2: Complex query requiring multiple joins and CTEs"""
    log_and_print("\n" + "="*80)
    log_and_print("TEST CASE 2: Complex Query with Multiple Joins and CTEs")
    log_and_print("="*80)
    return test_query(
        "Create a comprehensive employee performance report showing: "
        "1) Employee name, department, and salary "
        "2) Performance ranking within their department "
        "3) Salary comparison to department average "
        "4) Years of service since hire date "
        "5) Only include employees with above-average performance (>3.5) "
        "6) Sort by department and performance score descending"
    )

# Test Case 3: Advanced query with multiple/recursive CTEs
def test_case_3():
    """Test Case 3: Advanced query with multiple CTEs and complex logic"""
    log_and_print("\n" + "="*80)
    log_and_print("TEST CASE 3: Advanced Query with Multiple CTEs and Recursive Logic")
    log_and_print("="*80)
    return test_query(
        "Generate an organizational hierarchy analysis: "
        "1) Create department hierarchy showing budget allocation "
        "2) Calculate cumulative salary costs by department "
        "3) Identify departments with budget variance (over/under spending) "
        "4) Show department efficiency metrics (avg performance per salary dollar) "
        "5) Create recursive calculation of department cost centers "
        "6) Include employee growth projections based on performance trends"
    )

# Test Case 4: Advanced query with window functions and partitions
def test_case_4():
    """Test Case 4: Advanced analytics with window functions and partitions"""
    log_and_print("\n" + "="*80)
    log_and_print("TEST CASE 4: Advanced Analytics with Window Functions and Partitions")
    log_and_print("="*80)
    return test_query(
        "Create advanced workforce analytics dashboard: "
        "1) Calculate running totals of salary costs by department and hire date "
        "2) Rank employees by performance within each department using DENSE_RANK "
        "3) Calculate moving averages of performance scores over time "
        "4) Identify salary percentiles (25th, 50th, 75th, 95th) by department "
        "5) Calculate year-over-year growth in department headcount "
        "6) Use LAG/LEAD functions to compare performance trends "
        "7) Partition data by location and calculate location-based metrics"
    )

# Run all test cases function with improved error handling
def run_all_tests():
    """Execute all test cases sequentially, continuing even if individual tests fail"""
    start_time = datetime.now()
    log_and_print(f"\n{'='*80}")
    log_and_print(f"STARTING COMPREHENSIVE QUERY AUTOMATION TESTING")
    log_and_print(f"Test session started at: {start_time.strftime('%Y-%m-%d %H:%M:%S')}")
    log_and_print(f"Log file: {log_file_path}")
    log_and_print("=" * 80)
    
    test_cases = [
        ("Basic Joins", test_case_1),
        ("Complex CTEs", test_case_2), 
        ("Recursive CTEs", test_case_3),
        ("Window Functions", test_case_4)
    ]
    
    results = {}
    
    for name, test_func in test_cases:
        log_and_print(f"\n{'-'*50}")
        log_and_print(f"Executing: {name}")
        log_and_print(f"{'-'*50}")
        
        try:
            success = test_func()
            results[name] = "SUCCESS" if success else "FAILED"
            log_and_print(f"Completed: {name} - {results[name]}")
        except Exception as e:
            results[name] = "ERROR"
            error_msg = f"CRITICAL ERROR in {name}: {str(e)}"
            log_and_print(error_msg, 'error')
        
        # Always continue to next test case
        log_and_print(f"Moving to next test case...")
    
    # Final summary
    end_time = datetime.now()
    duration = end_time - start_time
    
    log_and_print(f"\n{'='*80}")
    log_and_print(f"TEST SESSION COMPLETED")
    log_and_print(f"{'='*80}")
    log_and_print(f"Session ended at: {end_time.strftime('%Y-%m-%d %H:%M:%S')}")
    log_and_print(f"Total duration: {duration}")
    log_and_print(f"\nFINAL RESULTS:")
    
    success_count = 0
    for test_name, result in results.items():
        status_symbol = "[SUCCESS]" if result == "SUCCESS" else "[FAILED]"
        log_and_print(f"  {status_symbol} {test_name}: {result}")
        if result == "SUCCESS":
            success_count += 1
    
    log_and_print(f"\nOverall: {success_count}/{len(test_cases)} tests successful")
    log_and_print(f"Success rate: {(success_count/len(test_cases)*100):.1f}%")
    log_and_print(f"{'='*80}\n")
    
    return results

# Individual test execution (uncomment to run specific tests)
# test_case_1()  # Basic joins
# test_case_2()  # Complex CTEs  
# test_case_3()  # Recursive CTEs
# test_case_4()  # Window functions

# Run all tests with improved error handling
run_all_tests()


STARTING COMPREHENSIVE QUERY AUTOMATION TESTING
Test session started at: 2025-11-06 06:24:18
Log file: /Users/pulkit/Desktop/test/query_automation/notebooks/query_automation_logs.txt

--------------------------------------------------
Executing: Basic Joins
--------------------------------------------------

TEST CASE 1: Basic Query with Joins
Testing query: Show average salary by department with employee count and department head.
[AGENT] Starting agent execution
[AGENT] Configuration: max_steps=3, retries=2
[PLANNING] Starting task planning for: Show average salary by department with employee count and department head.
[PLANNING] Retrieved schema context (3163 characters)
[PLANNING] Calling LLM for task breakdown...
[LLM] Calling Ollama (attempt 1/3)...
[LLM] Response received successfully (fallback format)
[PLANNING] Generated 2 steps:
[PLANNING]   Step 1: **Create a Temporary View for Average Salary by Department:**
[PLANNING]   Step 2: **Display the Result:**

[STEP 1] Processing

### Step 1: **Create a Temporary View for Average Salary by Department:**

**SQL used:**

```sql
WITH avg_salary_by_dept AS (
    SELECT 
        d.dept_name,
        AVG(er.salary) AS avg_salary
    FROM 
        departments d
    LEFT JOIN 
        employee_records er ON d.dept_id = er.dept_id
    GROUP BY 
        d.dept_name
)
SELECT * FROM avg_salary_by_dept;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,dept_name,avg_salary
0,Customer Support,55937.885714
1,Engineering,86333.186047
2,Finance,71794.366667
3,Human Resources,69821.097561
4,Marketing,59764.424242
5,Operations,63160.809524
6,Research & Development,86376.285714
7,Sales,63473.0


---

### Step 2: **Display the Result:**

**SQL used:**

```sql
WITH avg_salary_by_dept AS (
    SELECT 
        d.dept_name,
        AVG(er.salary) AS avg_salary
    FROM 
        departments d
    LEFT JOIN 
        employee_records er ON d.dept_id = er.dept_id
    GROUP BY 
        d.dept_name
)
SELECT dept_name, avg_salary FROM avg_salary_by_dept;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,dept_name,avg_salary
0,Customer Support,55937.885714
1,Engineering,86333.186047
2,Finance,71794.366667
3,Human Resources,69821.097561
4,Marketing,59764.424242
5,Operations,63160.809524
6,Research & Development,86376.285714
7,Sales,63473.0


---

## Final Result

**Query that produced the final result:**

```sql
WITH avg_salary_by_dept AS (
    SELECT 
        d.dept_name,
        AVG(er.salary) AS avg_salary
    FROM 
        departments d
    LEFT JOIN 
        employee_records er ON d.dept_id = er.dept_id
    GROUP BY 
        d.dept_name
)
SELECT dept_name, avg_salary FROM avg_salary_by_dept;
```

Unnamed: 0,dept_name,avg_salary
0,Customer Support,55937.885714
1,Engineering,86333.186047
2,Finance,71794.366667
3,Human Resources,69821.097561
4,Marketing,59764.424242
5,Operations,63160.809524
6,Research & Development,86376.285714
7,Sales,63473.0



SUMMARY:
Step 1: SUCCESS
Step 2: SUCCESS
Test completed: 2/2 steps successful
Completed: Basic Joins - SUCCESS
Moving to next test case...

--------------------------------------------------
Executing: Complex CTEs
--------------------------------------------------

TEST CASE 2: Complex Query with Multiple Joins and CTEs
Testing query: Create a comprehensive employee performance report showing: 1) Employee name, department, and salary 2) Performance ranking within their department 3) Salary comparison to department average 4) Years of service since hire date 5) Only include employees with above-average performance (>3.5) 6) Sort by department and performance score descending
[AGENT] Starting agent execution
[AGENT] Configuration: max_steps=3, retries=2
[PLANNING] Starting task planning for: Create a comprehensive employee performance report showing: 1) Employee name, department, and salary 2) Performance ranking within their department 3) Salary comparison to department average 4) Yea

### Step 1: **Create a Temporary View for Department Averages**

**SQL used:**

```sql
WITH department_averages AS (
    SELECT d.dept_name,
           AVG(er.salary) AS avg_salary,
           AVG(er.performance_score) AS avg_performance
    FROM departments d
    LEFT JOIN employee_records er ON d.dept_id = er.dept_id
    GROUP BY d.dept_name
)
SELECT * FROM department_averages;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,dept_name,avg_salary,avg_performance
0,Customer Support,55937.885714,3.410857
1,Engineering,86333.186047,3.338372
2,Finance,71794.366667,3.653
3,Human Resources,69821.097561,3.511707
4,Marketing,59764.424242,3.754242
5,Operations,63160.809524,3.705952
6,Research & Development,86376.285714,3.406571
7,Sales,63473.0,3.51439


---

### Step 2: **Filter Employees with Above-Average Performance**

**SQL used:**

```sql
WITH avg_performance AS (
    SELECT AVG(performance_score) AS avg_perf
    FROM employee_records
)
SELECT u.name, er.performance_score, er.salary, d.dept_name
FROM employee_records er
JOIN users u ON er.user_id = u.id
JOIN departments d ON er.dept_id = d.dept_id
WHERE er.performance_score >= (SELECT avg_perf FROM avg_performance)
ORDER BY er.performance_score DESC;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,name,performance_score,salary,dept_name
0,Michael Thompson,4.98,74892,Human Resources
1,Samantha White,4.98,51166,Customer Support
2,Justin Ramirez,4.97,58402,Finance
3,Jason Anderson,4.96,54947,Human Resources
4,Joshua Robinson,4.95,61344,Operations
5,Jason Lee,4.94,63365,Marketing
6,Jonathan Taylor,4.93,68162,Marketing
7,Emily Williams,4.92,56778,Marketing
8,Amanda Robinson,4.9,46321,Sales
9,Nicole Martinez,4.9,73401,Engineering


---

### Step 3: **Generate the Comprehensive Employee Performance Report**

**SQL used:**

```sql
WITH department_averages AS (
    SELECT d.dept_name,
           AVG(er.salary) AS avg_salary,
           AVG(er.performance_score) AS avg_performance
    FROM departments d
    LEFT JOIN employee_records er ON d.dept_id = er.dept_id
    GROUP BY d.dept_name
),
avg_performance AS (
    SELECT AVG(performance_score) AS avg_perf
    FROM employee_records
)
SELECT u.name, er.performance_score, er.salary, d.dept_name
FROM employee_records er
JOIN users u ON er.user_id = u.id
JOIN departments d ON er.dept_id = d.dept_id
WHERE er.performance_score >= (SELECT avg_perf FROM avg_performance)
ORDER BY er.performance_score DESC;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,name,performance_score,salary,dept_name
0,Michael Thompson,4.98,74892,Human Resources
1,Samantha White,4.98,51166,Customer Support
2,Justin Ramirez,4.97,58402,Finance
3,Jason Anderson,4.96,54947,Human Resources
4,Joshua Robinson,4.95,61344,Operations
5,Jason Lee,4.94,63365,Marketing
6,Jonathan Taylor,4.93,68162,Marketing
7,Emily Williams,4.92,56778,Marketing
8,Amanda Robinson,4.9,46321,Sales
9,Nicole Martinez,4.9,73401,Engineering


---

## Final Result

**Query that produced the final result:**

```sql
WITH department_averages AS (
    SELECT d.dept_name,
           AVG(er.salary) AS avg_salary,
           AVG(er.performance_score) AS avg_performance
    FROM departments d
    LEFT JOIN employee_records er ON d.dept_id = er.dept_id
    GROUP BY d.dept_name
),
avg_performance AS (
    SELECT AVG(performance_score) AS avg_perf
    FROM employee_records
)
SELECT u.name, er.performance_score, er.salary, d.dept_name
FROM employee_records er
JOIN users u ON er.user_id = u.id
JOIN departments d ON er.dept_id = d.dept_id
WHERE er.performance_score >= (SELECT avg_perf FROM avg_performance)
ORDER BY er.performance_score DESC;
```

Unnamed: 0,name,performance_score,salary,dept_name
0,Michael Thompson,4.98,74892,Human Resources
1,Samantha White,4.98,51166,Customer Support
2,Justin Ramirez,4.97,58402,Finance
3,Jason Anderson,4.96,54947,Human Resources
4,Joshua Robinson,4.95,61344,Operations
...,...,...,...,...
147,James Thompson,3.58,70726,Finance
148,David Nguyen,3.57,103549,Engineering
149,Brandon Young,3.56,81814,Finance
150,John Allen,3.56,81501,Customer Support



SUMMARY:
Step 1: SUCCESS
Step 2: SUCCESS
Step 3: SUCCESS
Test completed: 3/3 steps successful
Completed: Complex CTEs - SUCCESS
Moving to next test case...

--------------------------------------------------
Executing: Recursive CTEs
--------------------------------------------------

TEST CASE 3: Advanced Query with Multiple CTEs and Recursive Logic
Testing query: Generate an organizational hierarchy analysis: 1) Create department hierarchy showing budget allocation 2) Calculate cumulative salary costs by department 3) Identify departments with budget variance (over/under spending) 4) Show department efficiency metrics (avg performance per salary dollar) 5) Create recursive calculation of department cost centers 6) Include employee growth projections based on performance trends
[AGENT] Starting agent execution
[AGENT] Configuration: max_steps=3, retries=2
[PLANNING] Starting task planning for: Generate an organizational hierarchy analysis: 1) Create department hierarchy showing budge

### Step 1: **Create Department Hierarchy Showing Budget Allocation**

**SQL used:**

```sql
WITH department_budget AS (
    SELECT dept_id, SUM(budget) as total_budget
    FROM departments
    GROUP BY dept_id
)
SELECT d.dept_name, d.location, db.total_budget
FROM departments d
JOIN department_budget db ON d.dept_id = db.dept_id;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,dept_name,location,total_budget
0,Engineering,Building A,500000
1,Marketing,Building B,200000
2,Sales,Building B,300000
3,Human Resources,Building C,150000
4,Finance,Building C,250000
5,Operations,Building A,180000
6,Customer Support,Building D,120000
7,Research & Development,Building A,400000
8,Engineering,Building A,500000
9,Marketing,Building B,200000


---

### Step 2: **Calculate Cumulative Salary Costs by Department**

**SQL used:**

```sql
WITH department_budget AS (
    SELECT dept_id, SUM(budget) as total_budget
    FROM departments
    GROUP BY dept_id
),
employee_salary AS (
    SELECT er.dept_id, AVG(er.salary) as avg_salary
    FROM employee_records er
    GROUP BY er.dept_id
)
SELECT d.dept_name, db.total_budget, es.avg_salary, (db.total_budget / es.avg_salary) as cumulative_salary_cost
FROM departments d
JOIN department_budget db ON d.dept_id = db.dept_id
JOIN employee_salary es ON d.dept_id = es.dept_id;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,dept_name,total_budget,avg_salary,cumulative_salary_cost
0,Engineering,500000,91324.058824,5.475009
1,Marketing,200000,57174.388889,3.49807
2,Sales,300000,64186.615385,4.673872
3,Human Resources,150000,71799.0,2.089166
4,Finance,250000,78473.684211,3.185781
5,Operations,180000,63810.0,2.820874
6,Customer Support,120000,50284.045455,2.386443
7,Research & Development,400000,96221.461538,4.157077
8,Engineering,500000,67478.777778,7.409737
9,Marketing,200000,62872.466667,3.181043


---

### Step 3: **Identify Departments with Budget Variance (Over/Under Spending)**

**SQL used:**

```sql
WITH department_budget AS (
    SELECT dept_id, SUM(budget) as total_budget
    FROM departments
    GROUP BY dept_id
),
employee_salary AS (
    SELECT er.dept_id, AVG(er.salary) as avg_salary
    FROM employee_records er
    GROUP BY er.dept_id
)
SELECT d.dept_name, db.total_budget, es.avg_salary, 
       (db.total_budget - es.avg_salary * 12) as budget_variance
FROM departments d
JOIN department_budget db ON d.dept_id = db.dept_id
JOIN employee_salary es ON d.dept_id = es.dept_id;
```

**Result Preview (up to 500 rows):**

Unnamed: 0,dept_name,total_budget,avg_salary,budget_variance
0,Engineering,500000,91324.058824,-595888.705882
1,Marketing,200000,57174.388889,-486092.666667
2,Sales,300000,64186.615385,-470239.384615
3,Human Resources,150000,71799.0,-711588.0
4,Finance,250000,78473.684211,-691684.210526
5,Operations,180000,63810.0,-585720.0
6,Customer Support,120000,50284.045455,-483408.545455
7,Research & Development,400000,96221.461538,-754657.538462
8,Engineering,500000,67478.777778,-309745.333333
9,Marketing,200000,62872.466667,-554469.6


---

## Final Result

**Query that produced the final result:**

```sql
WITH department_budget AS (
    SELECT dept_id, SUM(budget) as total_budget
    FROM departments
    GROUP BY dept_id
),
employee_salary AS (
    SELECT er.dept_id, AVG(er.salary) as avg_salary
    FROM employee_records er
    GROUP BY er.dept_id
)
SELECT d.dept_name, db.total_budget, es.avg_salary, 
       (db.total_budget - es.avg_salary * 12) as budget_variance
FROM departments d
JOIN department_budget db ON d.dept_id = db.dept_id
JOIN employee_salary es ON d.dept_id = es.dept_id;
```

Unnamed: 0,dept_name,total_budget,avg_salary,budget_variance
0,Engineering,500000,91324.058824,-595888.705882
1,Marketing,200000,57174.388889,-486092.666667
2,Sales,300000,64186.615385,-470239.384615
3,Human Resources,150000,71799.0,-711588.0
4,Finance,250000,78473.684211,-691684.210526
5,Operations,180000,63810.0,-585720.0
6,Customer Support,120000,50284.045455,-483408.545455
7,Research & Development,400000,96221.461538,-754657.538462
8,Engineering,500000,67478.777778,-309745.333333
9,Marketing,200000,62872.466667,-554469.6



SUMMARY:
Step 1: SUCCESS
Step 2: SUCCESS
Step 3: SUCCESS
Test completed: 3/3 steps successful
Completed: Recursive CTEs - SUCCESS
Moving to next test case...

--------------------------------------------------
Executing: Window Functions
--------------------------------------------------

TEST CASE 4: Advanced Analytics with Window Functions and Partitions
Testing query: Create advanced workforce analytics dashboard: 1) Calculate running totals of salary costs by department and hire date 2) Rank employees by performance within each department using DENSE_RANK 3) Calculate moving averages of performance scores over time 4) Identify salary percentiles (25th, 50th, 75th, 95th) by department 5) Calculate year-over-year growth in department headcount 6) Use LAG/LEAD functions to compare performance trends 7) Partition data by location and calculate location-based metrics
[AGENT] Starting agent execution
[AGENT] Configuration: max_steps=3, retries=2
[PLANNING] Starting task planning for: Cr

{'Basic Joins': 'SUCCESS',
 'Complex CTEs': 'SUCCESS',
 'Recursive CTEs': 'SUCCESS',
 'Window Functions': 'FAILED'}

In [11]:
# %%
# Test a simple query to verify Ollama timeout fixes
print("Testing Ollama with improved timeout handling...")
print("="*60)

# Test with a simple query first
simple_query = "Show the top 5 employees by salary with their department names."

try:
    print("Starting simple test query...")
    outputs = run_agent(
        user_request=simple_query,
        engine=engine,
        max_steps=2,
        per_step_retries=1,
    )
    
    print(f"\nTest Results:")
    for output in outputs:
        status = "SUCCESS" if output.error is None else "FAILED"
        print(f"Step {output.step_id}: {status}")
        if output.error:
            print(f"  Error: {output.error}")
    
    if any(output.error is None for output in outputs):
        print("\n[SUCCESS] Ollama timeout fix appears to be working!")
        display_final(outputs)
    else:
        print("\n[FAILED] Still encountering issues")
        
except Exception as e:
    print(f"Test failed with error: {e}")
    print("This indicates the Ollama timeout issue persists")

Testing Ollama with improved timeout handling...
Starting simple test query...
[AGENT] Starting agent execution
[AGENT] Configuration: max_steps=2, retries=1
[PLANNING] Starting task planning for: Show the top 5 employees by salary with their department names.
[PLANNING] Retrieved schema context (3008 characters)
[PLANNING] Calling LLM for task breakdown...
[LLM] Calling Ollama (attempt 1/3)...
[LLM] Response received successfully (fallback format)
[PLANNING] Generated 2 steps:
[PLANNING]   Step 1: **Filter Employees by Performance Score**
[PLANNING]   Step 2: **Sort and Limit Employees by Salary**

[STEP 1] Processing: **Filter Employees by Performance Score**
[STEP 1] Description: Select employees with a performance score of 4.0 or higher.
[STEP 1] Generating initial SQL...
[LLM] Calling Ollama (attempt 1/3)...
[LLM] Response received successfully (fallback format)
[STEP 1] Generated SQL: SELECT u.name, er.performance_score, er.salary, d.dept_name
FROM employee_records er
JOIN users 

## Final Result

**Query that produced the final result:**

```sql
WITH FilteredEmployees AS (
    SELECT u.name, er.performance_score, er.salary, d.dept_name
    FROM employee_records er
    JOIN users u ON er.user_id = u.id
    JOIN departments d ON er.dept_id = d.dept_id
    WHERE er.performance_score >= 4.0
)
SELECT name, performance_score, salary, dept_name
FROM FilteredEmployees
ORDER BY salary DESC
LIMIT 5;
```

Unnamed: 0,name,performance_score,salary,dept_name
0,Andrew Robinson,4.26,108054,Engineering
1,Robert Robinson,4.67,106400,Research & Development
2,Michael Davis,4.71,106388,Engineering
3,Kimberly Smith,4.37,104944,Engineering
4,Brandon Smith,4.75,103009,Research & Development


In [12]:
# %%
# Ollama Health Check and Diagnostic Function
def check_ollama_health():
    """Comprehensive Ollama health check and diagnostics"""
    print("OLLAMA HEALTH CHECK")
    print("="*50)
    
    # 1. Check if Ollama service is running
    try:
        import requests
        response = requests.get("http://localhost:11434/api/version", timeout=5)
        if response.status_code == 200:
            version_info = response.json()
            print(f"[SUCCESS] Ollama service is running (version: {version_info.get('version', 'unknown')})")
        else:
            print(f"[FAILED] Ollama service returned status code: {response.status_code}")
            return False
    except requests.exceptions.RequestException as e:
        print(f"[FAILED] Cannot connect to Ollama service: {e}")
        print("  Please run: ollama serve")
        return False
    
    # 2. Check available models
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        if response.status_code == 200:
            models = response.json().get('models', [])
            print(f"[SUCCESS] Found {len(models)} available models:")
            for model in models:
                name = model.get('name', 'unknown')
                size_gb = model.get('size', 0) / (1024**3)
                print(f"  - {name} ({size_gb:.1f} GB)")
            
            # Check if our target model is available
            model_names = [m.get('name', '') for m in models]
            if MODEL_NAME in model_names:
                print(f"[SUCCESS] Target model '{MODEL_NAME}' is available")
            else:
                print(f"[FAILED] Target model '{MODEL_NAME}' not found")
                print(f"  Available models: {', '.join(model_names)}")
                return False
        else:
            print(f"[FAILED] Cannot list models, status code: {response.status_code}")
            return False
    except requests.exceptions.RequestException as e:
        print(f"[FAILED] Error checking models: {e}")
        return False
    
    # 3. Test simple LLM call
    try:
        print("\n[TESTING] Testing simple LLM call...")
        start_time = pd.Timestamp.now()
        test_response = llm.invoke("Respond with exactly: 'Ollama working'")
        end_time = pd.Timestamp.now()
        duration = (end_time - start_time).total_seconds()
        
        if "working" in test_response.lower():
            print(f"[SUCCESS] LLM call successful ({duration:.1f}s)")
            print(f"  Response: {test_response[:100]}...")
        else:
            print(f"[WARNING] LLM responded but content unexpected: {test_response[:100]}...")
        return True
    except Exception as e:
        print(f"[FAILED] LLM call failed: {e}")
        if "timeout" in str(e).lower():
            print("  This is the timeout issue we're trying to fix!")
        return False

# Run the health check
check_ollama_health()

OLLAMA HEALTH CHECK
[SUCCESS] Ollama service is running (version: 0.12.9)
[SUCCESS] Found 1 available models:
  - qwen2.5-coder:7b (4.4 GB)
[SUCCESS] Target model 'qwen2.5-coder:7b' is available

[TESTING] Testing simple LLM call...
[SUCCESS] LLM call successful (3.5s)
  Response: Ollama working...


True