In [1]:
# Install the notebook's runtime dependencies into the kernel's own environment
# (`%pip` rather than `!pip`, so the packages land where the kernel can import them).
# NOTE(review): versions are unpinned, so a fresh install may pull releases that
# behave differently from the ones this notebook was written against — consider pinning.
%pip install -q langchain langchain-community langchain-openai sqlalchemy psycopg2-binary langgraph


Note: you may need to restart the kernel to use updated packages.


In [1]:
import os, re
from sqlalchemy import create_engine, event
from langchain_openai import ChatOpenAI
from langchain_community.utilities import SQLDatabase
from langchain_community.agent_toolkits import SQLDatabaseToolkit
from langchain.agents import AgentType, create_sql_agent
from langchain import hub


In [2]:
# Connection string for the read-only database role, supplied via DATABASE_URL
# (a pooled endpoint is fine for runtime queries).
# Fail fast with a clear message instead of letting create_engine() choke on an
# empty URL further down the notebook.
READONLY_DB_URL = os.getenv("DATABASE_URL", "")
if not READONLY_DB_URL:
    raise RuntimeError(
        "DATABASE_URL is not set; export the read-only connection string before running this notebook."
    )

# The API key is set via the environment, never in code. Copying the variable onto
# itself did nothing when it was set and raised an opaque TypeError (os.environ
# rejects None) when it was missing, so check for it explicitly instead.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this notebook."
    )

In [3]:
# 1) SQLAlchemy engine with timeouts & read-only transactions

# Session settings sent with every new connection: server-side protection
# against runaway queries and against sessions left idle inside a transaction
# (both 5000 ms = 5s).
_session_options = " ".join(
    (
        "-c statement_timeout=5000",
        "-c idle_in_transaction_session_timeout=5000",
    )
)

# Small pool (5 connections + 5 overflow); pool_pre_ping is enabled so pooled
# connections are checked before use.
engine = create_engine(
    READONLY_DB_URL,
    connect_args={"options": _session_options},
    pool_size=5,
    max_overflow=5,
    pool_pre_ping=True,
)

@event.listens_for(engine, "connect")
def _enforce_readonly(dbapi_conn, record):
    """Make every new DBAPI connection default to read-only transactions.

    Belt & suspenders on top of the read-only database role.

    Args:
        dbapi_conn: The raw DBAPI connection that was just opened.
        record: The pool's connection record (unused).
    """
    # Run the SET outside any transaction. With autocommit off, the driver wraps
    # the statement in an implicit transaction; a later rollback would revert the
    # setting, and the default only applies to transactions started afterwards.
    previous_autocommit = dbapi_conn.autocommit
    dbapi_conn.autocommit = True
    try:
        with dbapi_conn.cursor() as c:
            c.execute("SET default_transaction_read_only = on;")
    finally:
        # Hand the connection back in the mode SQLAlchemy expects.
        dbapi_conn.autocommit = previous_autocommit

In [4]:
# 2) LangChain DB wrapper (ALLOWLIST tables for safety)

# The only tables handed to the wrapper via include_tables; the same list is
# reused later when fetching the schema text for the prompt.
# NOTE(review): include_tables presumably limits which tables the wrapper
# advertises to the agent; confirm whether it also blocks SQL that names other
# tables, or whether that relies on the database role's permissions.
ALLOWED_TABLES = ["properties", "market_analytics"]
db = SQLDatabase(engine=engine, include_tables=ALLOWED_TABLES)

In [5]:
# 3) Deterministic LLM for reliable SQL generation

# temperature=0 keeps generation as repeatable as the model allows; the same
# `llm` object is shared by the toolkit and the agent built in later cells.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [6]:
# 4) Toolkit + runtime SQL guardrails (SELECT-only + LIMIT)

# The toolkit is built around the same `db` object whose run() is guarded below.
toolkit = SQLDatabaseToolkit(db=db, llm=llm)

# Guard: block non-SELECT and inject LIMIT 100 if missing
# Take run() from the class, not from the instance: once this cell has been
# executed, db.run is already a wrapper, and capturing it again on a re-run would
# stack the guards (and recurse endlessly while the logging wrapper is installed).
_original_db_run = type(db).run.__get__(db)
# Matches an explicit "LIMIT <n>" anywhere in the statement, case-insensitively.
_limit_re = re.compile(r"\bLIMIT\s+\d+", re.IGNORECASE)

# Matches single-quoted SQL string literals ('' is an escaped quote) so that a
# semicolon inside a text value is not mistaken for a statement separator.
_string_literal_re = re.compile(r"'(?:[^']|'')*'")

def _guarded_run(sql: str, *args, **kwargs):
    """Run ``sql`` through the unguarded ``db.run`` after applying the guardrails.

    Args:
        sql: The SQL text to execute.
        *args, **kwargs: Forwarded unchanged to the original ``db.run``.

    Returns:
        Whatever the original ``db.run`` returns for the guarded statement.

    Raises:
        ValueError: If the text does not start with SELECT, or contains more
            than one statement.
    """
    # Generated SQL often ends in ";". Appending LIMIT after it would produce
    # "...; LIMIT 100", a syntax error, so trim the terminator first.
    statement = sql.strip().rstrip("; \t\r\n")
    if not statement.upper().startswith("SELECT"):
        raise ValueError("Only SELECT queries are permitted.")
    # A semicolon that survives outside string literals separates stacked
    # statements (e.g. "SELECT 1; DROP TABLE t"), which the prefix check alone
    # would let through.
    if ";" in _string_literal_re.sub("''", statement):
        raise ValueError("Only a single SELECT statement is permitted.")
    # LIMIT detection is textual: any "LIMIT <n>" in the statement counts.
    if not _limit_re.search(statement):
        statement += " LIMIT 100"
    return _original_db_run(statement, *args, **kwargs)

db.run = _guarded_run  # enforce guardrails

In [7]:
# Optional: record every SQL statement sent to the guard (set to True to enable)
LOG_SQL = False

def _logging_run(sql: str, *a, **k):
    """Print the SQL as received, then delegate to ``_guarded_run``.

    The text is printed before the guard touches it, so it shows what the agent
    asked for rather than the statement after any guard adjustments.
    """
    print("\n--- SQL EXECUTED ---\n", sql)
    return _guarded_run(sql, *a, **k)

# Install the wrapper matching the flag on every run of this cell, so switching
# LOG_SQL back to False and re-running really turns logging off again.
db.run = _logging_run if LOG_SQL else _guarded_run

In [8]:
# 5) Pull LangChain Hub prompt AND inject live schema snippet

# Grab the prebuilt SQL agent system prompt
prompt_template = hub.pull("langchain-ai/sql-agent-system-prompt")

# Format it for PostgreSQL.
# The hub prompt is a chat template, and .format() renders it as a transcript
# string with a "System: " role prefix. Take the content of the formatted system
# message instead, so that prefix does not end up inside the system prompt.
base_system = prompt_template.format_messages(dialect="PostgreSQL", top_k=5)[0].content

# Fetch a concise schema to guide the model (reduces hallucinations)
schema_snippet = db.get_table_info(ALLOWED_TABLES)

# Merge: Hub prompt + schema (best of both worlds).
# The combined text is a plain string; it is handed to the agent through
# create_sql_agent's `prefix` argument in the next cell.
merged_system_prompt = (
    f"{base_system}\n\n"
    "### ADDITIONAL CONTEXT: DATABASE SCHEMA (READ CAREFULLY)\n"
    "Use ONLY these tables/columns. Prefer aggregates. Always include a LIMIT (<=100) unless a single value is asked.\n\n"
    f"{schema_snippet}"
)



In [9]:
# 6) Create the SQL Agent with the merged system prompt

# Builds the agent from the shared LLM and the toolkit wrapping the guarded `db`.
# The merged Hub+Schema text is supplied through `prefix`, not `prompt`.
agent = create_sql_agent(
    llm=llm,
    toolkit=toolkit,
    agent_type=AgentType.OPENAI_FUNCTIONS,  # robust tool-calling
    verbose=False,                          # keep intermediate agent steps out of the cell output
    prefix=merged_system_prompt,            # <- merged Hub+Schema prompt
)

def ask(question: str) -> str:
    """Natural language → (SQL) → Answer using merged Hub+Schema prompt.

    Args:
        question: Plain-English question for the SQL agent.

    Returns:
        The agent's final answer text.
    """
    # agent.run() is deprecated and emitted a warning on each call; invoke() is
    # the supported entry point and returns a dict whose "output" key holds the
    # final answer.
    return agent.invoke({"input": question})["output"]

In [11]:
# Sanity check: list the tables the wrapper reports as usable.
print("SQL Agent ready. Tables:", db.get_usable_table_names())

SQL Agent ready. Tables: ['market_analytics', 'properties']


In [12]:
# 1. Property listings: active Austin properties with price and bedroom count.
print(ask("List all active properties in Austin with their price and number of bedrooms."))

  return agent.run(question)


Here are the active properties in Austin along with their prices and number of bedrooms:

1. Price: $450,000.00 - Bedrooms: 2
2. Price: $325,000.00 - Bedrooms: 4
3. Price: $280,000.00 - Bedrooms: 3
4. Price: $520,000.00 - Bedrooms: 3
5. Price: $750,000.00 - Bedrooms: None (bedrooms not specified)


In [13]:
# 2. Investment-type property: look up a listing by its description text and
#    report its title, price, and year built.
print(ask("Which property is described as a great rental property? Show its title, price, and year built."))

The property described as a great rental property is titled "Investment Duplex," priced at $280,000.00, and was built in the year 1995.


In [11]:
# 3. Market analytics summary: average and median Austin price, Jan–Mar 2024.
# NOTE(review): the saved run of this cell ended in an OpenAI 429 rate-limit
# error, so no answer is recorded for it — re-run once quota is available.
print(ask("What was the average and median price in Austin across January to March 2024?"))

  return agent.run(question)


RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-mKgENRf6dAMxh9UIYO3X1LU1 on tokens per min (TPM): Limit 100000, Used 100000, Requested 1410. Please try again in 10h9m7.2s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}}

In [None]:
# 4. Trend in Austin: free-form description of the early-2024 market (not yet executed).
print(ask("Describe the market trend in Austin in early 2024."))

In [None]:
# 5. Compare cities: average prices for Austin, Dallas, and Houston in January 2024 (not yet executed).
print(ask("Compare average property prices between Austin, Dallas, and Houston in January 2024."))

In [None]:
# 6. Inventory & days on market: Dallas figures for February 2024 (not yet executed).
print(ask("Show the inventory level and days on market for Dallas in February 2024."))

In [None]:
# 7. High-end listings: the single most expensive property and its type (not yet executed).
print(ask("Which is the most expensive property in the database, and what type is it?"))

In [None]:
# db = SQLDatabase.from_uri('')
# print(db.dialect)
# print(db.get_usable_table_names())


In [None]:
# # initialize llm
# llm = init_chat_model('gpt-4o-mini', model_provider='openai')

# # initialize the toolkit
# toolkit = SQLDatabaseToolkit(db=db, llm=llm)
# tools = toolkit.get_tools()
# print(tools)


In [10]:
# Inspect the hub prompt template used for NL→SQL: pull it again and pretty-print
# its first message so the {dialect} and {top_k} placeholders are visible.
# NOTE(review): this rebinds `prompt_template`, the same name the prompt-merging
# cell assigns, and its execution count is out of order with the cells around it.
prompt_template = hub.pull('langchain-ai/sql-agent-system-prompt')
prompt_template.messages[0].pretty_print()





You are an agent designed to interact with a SQL database.
Given an input question, create a syntactically correct [33;1m[1;3m{dialect}[0m query to run, then look at the results of the query and return the answer.
Unless the user specifies a specific number of examples they wish to obtain, always limit your query to at most [33;1m[1;3m{top_k}[0m results.
You can order the results by a relevant column to return the most interesting examples in the database.
Never query for all the columns from a specific table, only ask for the relevant columns given the question.
You have access to tools for interacting with the database.
Only use the below tools. Only use the information returned by the below tools to construct your final answer.
You MUST double check your query before executing it. If you get an error while executing a query, rewrite the query and try again.

DO NOT make any DML statements (INSERT, UPDATE, DELETE, DROP etc.) to the database.

To start you should ALWAYS look at

In [None]:
# system_message = prompt_template.format(dialect='SQLite', top_k=5)

# # create the sql ai agent
# sql_agent = create_react_agent(llm, tools, prompt=system_message)

# # sample execution
# query = "Identify the ten most expensive products (name, unit price)."

# # stream the events
# for event in sql_agent.stream(
#     {"messages": ("user", query)},
#     stream_mode='values'
# ):
#     event['messages'][-1].pretty_print()
