# SQL Generation Tutorial
This notebook demonstrates how to generate SQL queries from natural-language task descriptions.

In [9]:
import os
import sys
import logging
from pathlib import Path

# Resolve the project root as the parent of the current working directory.
# NOTE: this assumes the kernel's working directory is the `notebooks/` folder
# directly under the project root.
project_root = Path().absolute().parent

# Make the project's packages importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

# Configure root logging at INFO so library log records show up in cell output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Echo both locations so a wrong working directory is easy to spot.
print(f"Project root: {project_root}")
print(f"Current working directory: {os.getcwd()}")

Project root: /home/vlad/dev/data-analyser
Current working directory: /home/vlad/dev/data-analyser/notebooks


In [None]:
# Project-local imports. They are placed after the setup cell because that cell
# appends the project root to sys.path (presumably the directory that contains
# the `src` package — confirm against the repository layout).
from src.agent.agent import DataAnalysisAgent
from src.tools.sql_tool import SQLTool
from dotenv import load_dotenv

# Load variables from a .env file into the process environment (python-dotenv).
# As the last expression in the cell, the call's return value is displayed.
load_dotenv()

True

## Initialize Agent and SQL Tool

In [None]:
# Build the agent from the YAML config stored under <project_root>/config.
config_file = project_root.joinpath("config", "config.yaml")
agent = DataAnalysisAgent(config_path=str(config_file))

# The SQL tool is constructed with the agent's own `llm` object.
sql_tool = SQLTool(llm=agent.llm)

In [5]:
# Print every table in the agent's schema together with its columns.
print("📋 Available Database Schema:", "=" * 50, sep="\n")

for table, table_columns in agent.schema.items():
    # One print call emits the blank separator line, the table name and the
    # "Columns:" sub-header.
    print(f"\n🗂️  Table: {table}", "\tColumns:", sep="\n")
    for col in table_columns:
        col_name = col['column_name']
        col_type = col['data_type']
        print(f"\t - {col_name} ({col_type})")

table_count = len(agent.schema)
print(f"\n📊 Total tables: {table_count}")

📋 Available Database Schema:

🗂️  Table: models
	Columns:
	 - model_id (INTEGER)
	 - model_name (TEXT)
	 - model_code (TEXT)
	 - production_start_year (INTEGER)
	 - production_end_year (INTEGER)
	 - segment (TEXT)
	 - base_price (REAL)
	 - horsepower (INTEGER)
	 - body_type (TEXT)
	 - is_electric (INTEGER)
	 - description (TEXT)

🗂️  Table: dealerships
	Columns:
	 - dealership_id (INTEGER)
	 - name (TEXT)
	 - address (TEXT)
	 - city (TEXT)
	 - country (TEXT)
	 - region (TEXT)
	 - opening_date (TEXT)
	 - service_center (INTEGER)
	 - sales_capacity (INTEGER)
	 - rating (REAL)
	 - manager_name (TEXT)

🗂️  Table: customers
	Columns:
	 - customer_id (INTEGER)
	 - first_name (TEXT)
	 - last_name (TEXT)
	 - email (TEXT)
	 - phone (TEXT)
	 - address (TEXT)
	 - city (TEXT)
	 - country (TEXT)
	 - date_of_birth (TEXT)
	 - registration_date (TEXT)
	 - loyalty_points (INTEGER)
	 - preferred_dealership_id (INTEGER)

🗂️  Table: sales
	Columns:
	 - sale_id (INTEGER)
	 - customer_id (INTEGER)
	 - deale

In [6]:
# Sample tickets whose descriptions are fed to the SQL generator.
JIRA_PROJECT_KEY = "KAN"

tickets = [
    {
        'project': JIRA_PROJECT_KEY,
        'summary': 'Car Models Analysis',
        'description': 'How many unique car models we have per car category? Sort the results in descending order!',
        'issuetype': 'Task',
    },
    {
        'project': JIRA_PROJECT_KEY,
        'summary': 'Dealership Performance by Region Analysis',
        'description': 'Analyze the average dealership rating and sales capacity by region. Which regions have the highest performing dealerships? Sort the results by average rating in descending order.',
        'issuetype': 'Task',
    },
    {
        'project': JIRA_PROJECT_KEY,
        'summary': 'Service Cost Analysis by Model and Service Type',
        'description': 'Analyze the average service costs by model and service type. Identify which models have higher maintenance costs and which service types contribute most to overall service revenue.',
        'issuetype': 'Task',
    },
    # Irrelevant task that doesn't match the database schema.
    # NOTE(review): unlike the entries above, this one carries an 'id' and omits
    # 'project' / 'issuetype'. Only 'description' is read by the generation
    # cell, so it works as-is — confirm whether the differing shape is intended.
    {
        'id': 'DA-101',
        'summary': 'Total Sales Overview',
        'description': 'What is the average user basket size?',
    },
]

### SQL Generation

In [7]:
# Generate one SQL query per ticket, in ticket order, from its description text.
# NOTE: the name `sql_queires` (sic) is kept because a later cell indexes it.
task_descriptions = [ticket['description'] for ticket in tickets]
sql_queires = [
    sql_tool.generate_query(task_description=text, schema_dict=agent.schema)
    for text in task_descriptions
]

2025-07-02 14:03:25,355 - src.tools.sql_tool - INFO - Generating SQL query for task: How many unqiue car models we have per car category? Sort the results in descending order!
2025-07-02 14:03:28,300 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-02 14:03:31,292 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-02 14:03:31,298 - src.tools.sql_tool - INFO - Successfully generated SQL query
2025-07-02 14:03:31,300 - src.tools.sql_tool - INFO - Generating SQL query for task: Analyze the average dealership rating and sales capacity by region. Which regions have the highest performing dealerships? Sort the results by average rating in descending order.
2025-07-02 14:03:34,448 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-02 14:03:37,759 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1

In [8]:
# Inspect the result generated for the first ticket.
sql_query = sql_queires[0]

print('ReturnedType -> ', type(sql_query))

# Each section prints its label followed by the matching attribute of the
# result; attributes are looked up one at a time, right before printing.
for label, attr in (
    ('\nGenerated SQL:\n', 'query'),
    ('\nDescription:\n', 'description'),
    ('\nUsed Tables:\n', 'tables_used'),
):
    print(label, getattr(sql_query, attr))

ReturnedType ->  <class 'src.models.schemas.SQLQuery'>

Generated SQL:
 SELECT segment, COUNT(DISTINCT model_id) AS unique_models
FROM models
GROUP BY segment
ORDER BY unique_models DESC;

Description:
 This query counts the number of unique model_ids in each segment from the 'models' table. It then groups the results by segment and orders them in descending order based on the count of unique model_ids.

Used Tables:
 ['models']
