### SQL Validation
Before moving to more complex queries, let's see how the validation system works. The validator ensures that generated SQL queries are safe, syntactically correct, and use valid schema elements.

In [1]:
import logging
import os
import sys
from pathlib import Path

# Add the project root to Python path.
# Assumes the kernel's working directory is one level below the project root,
# so the root is simply the parent of the current directory.
project_root = Path().absolute().parent
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Timestamped, named log records at INFO level for everything run below.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Echo both locations so a wrong working directory is obvious immediately.
print(f"Project root: {project_root}")
print(f"Current working directory: {os.getcwd()}")

Project root: /home/vlad/dev/data-analyser
Current working directory: /home/vlad/dev/data-analyser/notebooks


In [4]:
from src.agent.agent import DataAnalysisAgent
from src.models.schemas import SQLQuery, ValidationResult
from src.tools.validator_tool import ValidatorTool

### Initialize SQL Validation Tool

In [5]:
# Build the agent from <project_root>/config/config.yaml, then reuse the
# agent's LLM handle and schema when constructing the validator.
agent = DataAnalysisAgent(
    config_path=str(project_root.joinpath("config", "config.yaml"))
)
validator_tool = ValidatorTool(
    llm=agent.llm,
    schema_dict=agent.schema,
)

In [6]:
# Sample SELECT statements fed to the validator in the cells below.
# The SQL text is passed to the validator verbatim, including its leading and
# trailing whitespace.
example_queries = [
    # 1. Number of distinct models per segment, largest count first.
    """
    SELECT segment, COUNT(DISTINCT model_id) AS unique_models
    FROM models
    GROUP BY segment
    ORDER BY unique_models DESC;
    """,
    # 2. Average rating and average sales capacity per dealership region,
    #    ordered by average rating (highest first).
    """
    SELECT 
        region, 
        AVG(rating) AS average_rating, 
        AVG(sales_capacity) AS average_sales_capacity 
    FROM 
        dealerships 
    GROUP BY 
        region 
    ORDER BY 
        average_rating DESC;
    """,
    # 3. Average and total service cost per model and service type.
    #    Joins service_records -> sales (on vin) -> models (on model_id).
    """
    SELECT 
        m.model_name, 
        sr.service_type, 
        AVG(sr.cost) AS average_service_cost, 
        SUM(sr.cost) AS total_service_revenue
    FROM 
        service_records sr
    JOIN 
        sales s ON sr.vin = s.vin
    JOIN 
        models m ON s.model_id = m.model_id
    GROUP BY 
        m.model_name, 
        sr.service_type
    ORDER BY 
        total_service_revenue DESC;
    """,
    # 4. Average number of sales per customer, computed over a subquery that
    #    counts sales for each customer_id.
    """
    SELECT AVG(basket_size) AS average_basket_size
    FROM (
    SELECT COUNT(sale_id) AS basket_size
    FROM sales
    GROUP BY customer_id
    ) AS baskets;
    """,
]

In [7]:
# Run the three validator checks on every example query and print a
# pass/fail line per check.
for sql_query in example_queries:
    print("\nQuery:\n", sql_query)

    # Run every check before reporting, so all three always execute and any
    # log output they produce appears ahead of the summary lines.
    check_results = [
        ("Syntax check", validator_tool.check_syntax(sql_query)),
        ("Dangerous code check", validator_tool.check_dangerous_patterns(sql_query)),
        (
            "Schema compatibility check",
            validator_tool.check_schema_compatibility(sql_query),
        ),
    ]

    # Each check returns a (passed, error_message) pair; one loop replaces
    # the three copy-pasted if/else blocks.
    for label, (passed, error) in check_results:
        if passed:
            print(f"✅ {label} passed")
        else:
            print(f"❌ {label} failed:", error)


Query:
 
    SELECT segment, COUNT(DISTINCT model_id) AS unique_models
    FROM models
    GROUP BY segment
    ORDER BY unique_models DESC;
    
✅ Syntax check passed
✅ Dangerous code check passed
✅ Schema compatibility check passed

Query:
 
    SELECT 
        region, 
        AVG(rating) AS average_rating, 
        AVG(sales_capacity) AS average_sales_capacity 
    FROM 
        dealerships 
    GROUP BY 
        region 
    ORDER BY 
        average_rating DESC;
    
✅ Syntax check passed
✅ Dangerous code check passed
✅ Schema compatibility check passed

Query:
 
    SELECT 
        m.model_name, 
        sr.service_type, 
        AVG(sr.cost) AS average_service_cost, 
        SUM(sr.cost) AS total_service_revenue
    FROM 
        service_records sr
    JOIN 
        sales s ON sr.vin = s.vin
    JOIN 
        models m ON s.model_id = m.model_id
    GROUP BY 
        m.model_name, 
        sr.service_type
    ORDER BY 
        total_service_revenue DESC;
    
✅ Syntax check pass

In [8]:
# Run the three validator checks one by one on a single example query and
# assemble the pieces of a validation result by hand.
i = 0
sql_query = example_queries[i]

warnings = []  # TODO: align in the future if it's needed
suggestion = None

# Each check yields a (passed, error_message) pair.
syntax_valid, syntax_error = validator_tool.check_syntax(sql_query)
safe, safety_error = validator_tool.check_dangerous_patterns(sql_query)
schema_valid, schema_error = validator_tool.check_schema_compatibility(sql_query)

# Keep only the messages of the failed checks, in syntax -> safety -> schema order.
errors = [
    message
    for passed, message in (
        (syntax_valid, syntax_error),
        (safe, safety_error),
        (schema_valid, schema_error),
    )
    if not passed
]

# The query counts as valid only when no check reported an error.
is_valid = not errors

is_valid, warnings, suggestion

(True, [], None)

In [9]:
# Package the outcome of the manual checks into the project's result model.
# Pass the `suggestion` variable instead of a hard-coded None so the result
# stays in sync if the cell that sets it ever produces a value.
ValidationResult(
    is_valid=is_valid, errors=errors, warnings=warnings, suggestion=suggestion
)



In [10]:
# Validate every example query through the tool's single entry point and
# collect the results in input order.
val_results = []

for sql_query in example_queries:
    # No task context is supplied for these examples.
    task_description = ""
    outcome = validator_tool.validate_sql(
        sql_query=sql_query, task_description=task_description
    )
    val_results.append(outcome)

val_results



### Incorrect SQL Testing

In [11]:
# Deliberately invalid queries used to exercise the validator's failure paths.
# Each case carries a display name, the SQL text, the task description passed
# to the validator, and the outcome a reader should expect.
test_cases = [
    {
        "name": "❌ Incompatible Schema",
        "query": "SELECT region, SUM(sales_amount) as total_sales FROM sales_data GROUP BY region ORDER BY total_sales DESC",
        "task": "Show total sales by region",
        # Was "VALID", which contradicted the case name and this section:
        # the query is well-formed SQL but is listed as schema-incompatible.
        "expected": "INVALID - Schema validation",
    },
    {
        "name": "❌ Dangerous DROP Query",
        "query": "DROP TABLE sales_data",
        "task": "Remove sales data table",
        "expected": "INVALID - Dangerous operation",
    },
    {
        "name": "❌ Dangerous DELETE Query",
        "query": "DELETE FROM sales_data WHERE region = 'North'",
        "task": "Remove northern region data",
        "expected": "INVALID - Dangerous operation",
    },
    {
        "name": "❌ Invalid Table Reference",
        "query": "SELECT * FROM nonexistent_table",
        "task": "Query non-existent table",
        "expected": "INVALID - Schema validation",
    },
    {
        "name": "❌ Syntax Error Query",
        "query": "SEL * FORM sales_data WHRE region = 'North'",
        "task": "Query with multiple typos",
        "expected": "INVALID - Syntax error",
    },
]

In [12]:
# Push every deliberately broken test case through the validator and collect
# the results in the same order as `test_cases`.
val_results = []

for case in test_cases:
    sql_query = case["query"]
    task_description = case["task"]
    outcome = validator_tool.validate_sql(
        sql_query=sql_query, task_description=task_description
    )
    val_results.append(outcome)

val_results

2025-07-02 14:10:22,320 - Parser - ERROR - Not supported query type: SEL * FORM


