## 1. Database Schema and Configuration

In [86]:
import os
import google.generativeai as genai
from test import execute_query
from config import GOOGLE_API_KEY, DATABASE_SCHEMA

import re
from langchain_google_genai import ChatGoogleGenerativeAI
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolExecutor
from langchain.tools import Tool
from langchain.schema import HumanMessage
from typing import TypedDict, Annotated, Sequence, Union
from typing import List, Tuple, Dict, Any

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.schema import SystemMessage, HumanMessage


genai.configure(api_key=GOOGLE_API_KEY)

os.environ["LANGCHAIN_API_KEY"] = "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"
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "text2sql"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"


## 2. Core Text-to-SQL Functions


In [91]:
# Load LLMs
llm_sql_generator = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite-preview-02-05")
llm_sql_validator = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite-preview-02-05")
llm_agent = ChatGoogleGenerativeAI(model="gemini-2.0-flash-lite-preview-02-05")

# Extract SQL from the model response
def extract_sql(text):
    match = re.search(r"```sql\s*(.*?)\s*```", text, re.DOTALL)
    return match.group(1) if match else text

# Tool 1: Generate SQL Query
def generate_sql(natural_language_query, DATABASE_SCHEMA=DATABASE_SCHEMA):
    """Agent to generate SQL from a natural language question."""
    prompt = f"""
    Database Schema:
    {DATABASE_SCHEMA}

    **************************
    Answer Repeating the question and evidence, and generating the SQL with a query plan.
    
    <---(Example)--->
    **Question**: How many Thai restaurants can be found in San Pablo Ave, Albany?
    **Evidence**: Thai restaurant refers to food_type = 'thai'; San Pablo Ave Albany refers to street_name
    = 'san pablo ave' AND T1.city = 'albany'
    **Query Plan**:
    ** Preparation Steps:**
    1. Initialize the process: Start preparing to execute the query.
    2. Prepare storage: Set up storage space (registers) to hold temporary results, initializing them to NULL.
    3. Open the location table: Open the location table so we can read from it.
    4. Open the generalinfo table: Open the generalinfo table so we can read from it.
    ** Matching Restaurants:**
    1. Start reading the location table: Move to the first row in the location table.
    2. Check if the street matches: Look at the street_name column of the current row in location. If it's not
    "san pablo ave," skip this row.
    3. Identify the matching row: Store the identifier (row ID) of this location entry.
    4. Find the corresponding row in generalinfo: Use the row ID from location to directly find the matching
    row in generalinfo.
    5. Check if the food type matches: Look at the food_type column in generalinfo. If it's not "thai," skip
    this row.
    6. Check if the city matches: Look at the city column in generalinfo. If it's not "albany," skip this row.
    ** Counting Restaurants:**
    1. Prepare to count this match: If all checks pass, prepare to include this row in the final count.
    2. Count this match: Increment the count for each row that meets all the criteria.
    3. Move to the next row in location: Go back to the location table and move to the next row, repeating
    the process until all rows are checked.
    4. Finalize the count: Once all rows have been checked, finalize the count of matching rows.
    5. Prepare the result: Copy the final count to prepare it for output.
    ** Delivering the Result:**
    1. Output the result: Output the final count, which is the number of restaurants that match all the
    specified criteria.
    2. End the process: Stop the query execution process.
    3. Setup phase: Before starting the actual query execution, the system prepares the specific values it will
    be looking for, like "san pablo ave," "thai," and "albany."
    **Final Optimized SQL Query:**
    SELECT COUNT(T1.id_restaurant) FROM generalinfo AS T1 INNER JOIN location AS T2
    ON T1.id_restaurant = T2.id_restaurant WHERE T1.food_type = 'thai' AND T1.city = 'albany' AND
    T2.street_name = 'san pablo ave' 
    
    Only return the SQL query without ``` backticks, no other text. 
    Ensure the table alias is correctly assigned
    
    ---------------------------------
    Question: {natural_language_query}

    SQL Query: 
    """
    response = llm_sql_generator.invoke([HumanMessage(content=prompt)])
    return extract_sql(response.content)

# Tool 2: Validate and Fix SQL Query for PostgreSQL
def validate_and_fix_sql(sql_query):
    """Agent to validate and correct SQL syntax for PostgreSQL."""
    prompt = f"""
    The following SQL query might have syntax issues. Your task is to analyze it and correct any mistakes 
    so that it works properly in **PostgreSQL**.

    Incorrect SQL:
    {sql_query}

    Return only the corrected SQL query without any explanations or ``` backticks.
    
    Make sure to use "ILIKE" instead of "=" for case insensitive matching. If not asked to be case sensitive

    Corrected SQL Query:
    """
    response = llm_sql_validator.invoke([HumanMessage(content=prompt)])
    return extract_sql(response.content)

# Define state type
class AgentState(TypedDict):
    input: str
    database_schema: str
    sql_query: str
    final_query: str

# Define nodes with updated configuration
def generate_sql_node(state: AgentState) -> AgentState:
    """Generate initial SQL query"""
    try:
        sql_query = generate_sql(
            natural_language_query=state["input"],
            DATABASE_SCHEMA=state["database_schema"]
        )
        state["sql_query"] = sql_query
        return state
    except Exception as e:
        print(f"Error in generate_sql_node: {str(e)}")
        raise

def validate_sql_node(state: AgentState) -> AgentState:
    """Validate and fix SQL query"""
    try:
        final_query = validate_and_fix_sql(state["sql_query"])
        state["final_query"] = final_query
        return state
    except Exception as e:
        print(f"Error in validate_sql_node: {str(e)}")
        raise

# Create workflow
workflow = StateGraph(AgentState)

# Add nodes
workflow.add_node("generate_sql", generate_sql_node)
workflow.add_node("validate_sql", validate_sql_node)

# Add edges
workflow.add_edge("generate_sql", "validate_sql")
workflow.add_edge("validate_sql", END)

# Set entry point
workflow.set_entry_point("generate_sql")

# Compile
agent_executor = workflow.compile()

# Example usage
def process_query(natural_language_query: str):
    try:
        result = agent_executor.invoke({
            "input": natural_language_query,
            "database_schema": DATABASE_SCHEMA,
            "sql_query": "",
            "final_query": ""
        })
        
        print("\nNatural Language Query:", natural_language_query)
        print("\nGenerated SQL:", result["final_query"])
        
        # Execute the query
        query_results = execute_query(result["final_query"].replace("\n", " "))
        print("\nQuery Results:\n", query_results,"\n\n\n")
        
        return result["final_query"], query_results
        
    except Exception as e:
        print(f"Error processing query: {str(e)}")
        return None, None

# Test the processing
# test_query = "Show me the top 5 customers who have rented the most movies"
test_query = "List all actors' first and last names."
sql, results = process_query(test_query)


Natural Language Query: List all actors' first and last names.

Generated SQL: SELECT first_name, last_name FROM actor;

Query Results:
  first_name  |  last_name   
-------------+--------------
 PENELOPE    | GUINESS
 NICK        | WAHLBERG
 ED          | CHASE
 JENNIFER    | DAVIS
 JOHNNY      | LOLLOBRIGIDA
 BETTE       | NICHOLSON
 GRACE       | MOSTEL
 MATTHEW     | JOHANSSON
 JOE         | SWANK
 CHRISTIAN   | GABLE
 ZERO        | CAGE
 KARL        | BERRY
 UMA         | WOOD
 VIVIEN      | BERGEN
 CUBA        | OLIVIER
 FRED        | COSTNER
 HELEN       | VOIGHT
 DAN         | TORN
 BOB         | FAWCETT
 LUCILLE     | TRACY
 KIRSTEN     | PALTROW
 ELVIS       | MARX
 SANDRA      | KILMER
 CAMERON     | STREEP
 KEVIN       | BLOOM
 RIP         | CRAWFORD
 JULIA       | MCQUEEN
 WOODY       | HOFFMAN
 ALEC        | WAYNE
 SANDRA      | PECK
 SISSY       | SOBIESKI
 TIM         | HACKMAN
 MILLA       | PECK
 AUDREY      | OLIVIER
 JUDY        | DEAN
 BURT        | DUKAKIS
 VAL  

In [34]:
import pandas as pd

file_path = r"E:\Assignment projects\Text2Sql-assignment\Pagila Evals Dataset(Sheet1).csv"

# Read CSV with an alternate encoding
test_queries = pd.read_csv(file_path, encoding="ISO-8859-1")  # or encoding="latin1"
print(test_queries.head())


   Query Number                             Natural Language Query Difficulty
0             1             List all actors' first and last names.       Easy
1             2      Show the titles of all films in the database.       Easy
2             3                       Get the names of all cities.       Easy
3             4           List all categories available for films.       Easy
4             5  Show the first name and last name of all custo...       Easy


In [76]:
test_queries

Unnamed: 0,Query Number,Natural Language Query,Difficulty,Query,SQL_Generated,Results
0,1,List all actors' first and last names.,Easy,List all actors' first and last names.,"SELECT first_name, last_name FROM actor;",first_name | last_name \n-------------+--...
1,2,Show the titles of all films in the database.,Easy,Show the titles of all films in the database.,SELECT title FROM film;,title \n---------------...
2,3,Get the names of all cities.,Easy,Get the names of all cities.,SELECT city FROM city;,city \n----------------...
3,4,List all categories available for films.,Easy,List all categories available for films.,SELECT name FROM category;,name \n-------------\n Action\n Animat...
4,5,Show the first name and last name of all custo...,Easy,Show the first name and last name of all custo...,"SELECT first_name, last_name FROM customer;",first_name | last_name \n-------------+--...
5,6,Show all films released in 2006.,Easy,Show all films released in 2006.,SELECT * FROM film WHERE release_year = 2006;,film_id | title | ...
6,7,"Find all actors with the last name ""Smith.""",Easy,"Find all actors with the last name ""Smith.""","SELECT actor_id, first_name, last_name\nFROM a...",actor_id | first_name | last_name \n---------...
7,8,List all customers who are from the city of N...,Easy,List all customers who are from the city of N...,"SELECT\n C.first_name,\n C.last_name\nFROM C...",first_name | last_name \n------------+-------...
8,9,Get all stores located in the country India.,Easy,Get all stores located in the country India.,SELECT S.store_id\nFROM store AS S\nJOIN addre...,store_id \n----------\n 13\n 18\n...
9,10,Show all films with a rental rate greater than...,Easy,Show all films with a rental rate greater than...,"SELECT film_id, title, rental_rate FROM film W...",film_id | title | rental_...


In [80]:
# test_queries = test_queries[:2]
results = []

for index, row in test_queries.iterrows():
    if pd.isna(row["SQL_Generated"]) or (isinstance(row["SQL_Generated"], str) and "(0 rows)" in row["SQL_Generated"]):
        sql, output = process_query(row["Natural Language Query"])
        print(output, "\n\n\n")

        # Update the DataFrame in place
        test_queries.at[index, "SQL_Generated"] = sql
        test_queries.at[index, "Results"] = output

# Save to CSV after updating all rows
test_queries.to_csv("evaluation_results.csv", index=False)



Natural Language Query: Find the films that have been rented more times than the average number of rentals per film.

Generated SQL: SELECT film_id
FROM inventory
GROUP BY film_id
HAVING COUNT(*) > (
    SELECT AVG(rental_count)
    FROM (
        SELECT COUNT(*) AS rental_count
        FROM rental
        JOIN inventory ON rental.inventory_id = inventory.inventory_id
        GROUP BY inventory.film_id
    ) AS film_rental_counts
);

Query Results:
  film_id 
---------
(0 rows)

 



 film_id 
---------
(0 rows)

 




Natural Language Query: For each actor, show their name and the percentage of films they've acted in compared to the total films.

Generated SQL: SELECT
  A.first_name,
  A.last_name,
  CAST(COUNT(FA.film_id) AS REAL) * 100 / (
    SELECT
      COUNT(*)
    FROM film
  ) AS percentage_of_films
FROM actor AS A
JOIN film_actor AS FA
  ON A.actor_id = FA.actor_id
GROUP BY
  A.actor_id,
  A.first_name,
  A.last_name;

Query Results:
  first_name  |  last_name   | percentage

In [81]:

import pandas as pd

file_path = r"E:\Assignment projects\Text2Sql-assignment\evaluation_results.csv"

# Read CSV with an alternate encoding
evaluated_df = pd.read_csv(file_path, encoding="ISO-8859-1")  # or encoding="latin1"
print(evaluated_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 6 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Query Number            40 non-null     int64 
 1   Natural Language Query  40 non-null     object
 2   Difficulty              40 non-null     object
 3   Query                   40 non-null     object
 4   SQL_Generated           40 non-null     object
 5   Results                 40 non-null     object
dtypes: int64(1), object(5)
memory usage: 2.0+ KB
None


In [45]:
evaluated_df["Results"]

0      first_name  |  last_name   \n-------------+--...
1                 title            \n---------------...
2                 city            \n----------------...
3         name     \n-------------\n Action\n Animat...
4      first_name  |  last_name   \n-------------+--...
5      film_id |          title          |          ...
6      actor_id | first_name | last_name \n---------...
7      first_name | last_name \n------------+-------...
8      store_id \n----------\n       13\n       18\n...
9      film_id |           title           | rental_...
10        name     | film_count \n-------------+----...
11                 count \n-------\n   200\n(1 row)\n\n
12          sum    \n----------\n 10923.45\n(1 row)\n\n
13                 count \n-------\n     0\n(1 row)\n\n
14                 count \n-------\n   223\n(1 row)\n\n
15                title            |    name     \n-...
16     first_name | last_name \n------------+-------...
17     first_name  |  last_name   |            t

In [82]:
check = evaluated_df.loc[28,"Results"]

print(check)

 customer_id | store_id | first_name | last_name | email | address_id | activebool | create_date | last_update | active 
-------------+----------+------------+-----------+-------+------------+------------+-------------+-------------+--------
(0 rows)




In [84]:
filtered_df = evaluated_df[evaluated_df["Results"].str.contains(r"\(0 rows\)", regex=True, na=False)]
# filtered_df = evaluated_df[evaluated_df["Results"].isna()]
filtered_df


Unnamed: 0,Query Number,Natural Language Query,Difficulty,Query,SQL_Generated,Results
6,7,"Find all actors with the last name ""Smith.""",Easy,"Find all actors with the last name ""Smith.""","SELECT actor_id, first_name, last_name\nFROM a...",actor_id | first_name | last_name \n---------...
7,8,List all customers who are from the city of Â...,Easy,List all customers who are from the city of Â...,"SELECT\n C.first_name,\n C.last_name\nFROM C...",first_name | last_name \n------------+-------...
16,17,"Show all actors who appeared in the film ""Ince...",Medium,"Show all actors who appeared in the film ""Ince...","SELECT A.first_name, A.last_name\nFROM actor A...",first_name | last_name \n------------+-------...
19,20,"Find all films rented by customer ""John Doe.""",Medium,"Find all films rented by customer ""John Doe.""",SELECT F.title\nFROM film AS F\nJOIN inventory...,title \n-------\n(0 rows)\n\n
25,26,Show all rentals made in the last 7 days.,Medium,Show all rentals made in the last 7 days.,SELECT *\nFROM rental\nWHERE rental_date >= NO...,rental_id | rental_date | inventory_id | cust...
28,29,Find all customers who registered in the last ...,Medium,Find all customers who registered in the last ...,SELECT * FROM customer WHERE create_date >= DA...,customer_id | store_id | first_name | last_na...
31,32,Find the films that have been rented more time...,Hard,Find the films that have been rented more time...,SELECT film_id\nFROM inventory\nGROUP BY film_...,film_id \n---------\n(0 rows)\n\n
35,36,Find customers who have rented more films this...,Hard,Find customers who have rented more films this...,"SELECT c.first_name, c.last_name\nFROM custome...",first_name | last_name \n------------+-------...
37,38,"For each customer, show the number of films re...",Hard,"For each customer, show the number of films re...",WITH MonthlyRentals AS (\n SELECT\n ...,customer_id | rental_month | rental_count | p...
38,39,Show the names of customers who have rented ev...,Hard,Show the names of customers who have rented ev...,"SELECT c.first_name, c.last_name\nFROM custome...",first_name | last_name \n------------+-------...


In [85]:
idx=7

query = filtered_df.iloc[idx]["Query"]
sql_query = filtered_df.iloc[idx]["SQL_Generated"]
print("Query:", query)
print("SQL:", sql_query, "\n\n\n")


result = execute_query(sql_query.replace("\n", " "))
print(result)

Query: Find customers who have rented more films this year than last year.
SQL: SELECT c.first_name, c.last_name
FROM customer c
JOIN rental r ON c.customer_id = r.customer_id
WHERE EXTRACT(YEAR FROM r.rental_date) = EXTRACT(YEAR FROM CURRENT_DATE)
GROUP BY c.customer_id, c.first_name, c.last_name
HAVING COUNT(DISTINCT r.rental_id) > (
    SELECT COUNT(DISTINCT r2.rental_id)
    FROM rental r2
    WHERE r2.customer_id = c.customer_id
      AND EXTRACT(YEAR FROM r2.rental_date) = EXTRACT(YEAR FROM CURRENT_DATE) - 1
); 





 first_name | last_name 
------------+-----------
(0 rows)




In [68]:
print(execute_query("""SELECT actor_id, first_name, last_name FROM actor WHERE last_name = 'Paltrow'"""))

 actor_id | first_name | last_name 
----------+------------+-----------
(0 rows)


