# Imports

In [64]:
import sys, os, importlib
# Absolute, normalized path of <current working directory>/../sql_generator.
path2add = os.path.normpath(os.path.abspath(os.path.join(os.getcwd(), os.path.pardir, 'sql_generator')))
# Disabled alternative: extend sys.path instead of changing the working directory.
#if (not (path2add in sys.path)) :
#    sys.path.append(path2add)

# Make sql_generator the working directory. Every relative path used after this
# point is resolved from there.
# NOTE(review): the local imports below presumably succeed only because the
# kernel searches the working directory for modules - confirm.
os.chdir(path2add)

import index_util, llm_util, db_util
# Re-execute the local modules so edits made on disk since the first import are
# picked up without restarting the kernel.
importlib.reload(index_util)
importlib.reload(llm_util)
importlib.reload(db_util)
# Bind the names after the reloads so they refer to the reloaded definitions.
from index_util import Index
from llm_util import LLM, PromptFactory
from db_util import DBMetadata

# Index Data

In [7]:
# Dataset location: the DATA_PATH environment variable when set, otherwise a
# path relative to the current working directory.
data_path = os.environ.get("DATA_PATH", "../data/rag_dataset")

# Construct the index from the dataset path and the "minsearch" selector string.
idx_minsearch = Index(data_path, "minsearch")

# Sanity check: report how many documents the index holds.
print(len(idx_minsearch.index.docs))

184206


In [30]:
# Natural-language question used for retrieval in this cell and reused by the
# RAG query later in the notebook. Spelling matters: a misspelled word cannot
# match the corresponding term in the indexed text.
query = "What is total revenue for top 5 customers?"

# NOTE(review): the second argument is presumably the maximum number of hits to
# return - confirm against Index.search.
results = idx_minsearch.search(query, 5)

# Print one hit per line; the hits are dicts, and dumping the whole list as a
# single repr is hard to read.
print(*results, sep="\n")


[{'keyword': 'DUMMY', 'instruction': 'Find the total revenue for each manufacturer.It is not neccessary to use all the tables.', 'schema': 'manufacturers:  (name [ VARCHAR ] revenue [ INTEGER) ]', 'output': 'SELECT SUM(revenue), name FROM manufacturers GROUP BY name', '__index_level_0__': 88709}, {'keyword': 'DUMMY', 'instruction': 'Find the total revenue of companies whose revenue is larger than the revenue of some companies based in Austin.It is not neccessary to use all the tables.', 'schema': 'manufacturers:  (revenue [ INTEGER ] headquarter [ VARCHAR) ]', 'output': "SELECT SUM(revenue) FROM manufacturers WHERE revenue > (SELECT MIN(revenue) FROM manufacturers WHERE headquarter = 'Austin')", '__index_level_0__': 97228}, {'keyword': 'DUMMY', 'instruction': 'Name the total number of top 10 with top 25 less than 2 and top 5 more than 0.It is not neccessary to use all the tables.', 'schema': ':  Tournament [ text ] Wins [ real ] Top-5 [ real ] Top-10 [ real ] Top-25 [ real ] Events [ r

# RAG Flow

In [40]:
# Prompt factory plus the LLM client. The API key is read from the environment
# (None when OPENAI_API_KEY is unset) rather than being hard-coded.
pf = PromptFactory()
llm = LLM(
    'openai',
    'gpt-4o-mini',
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [31]:
# Render the "basic_prompt" template with placeholder values to inspect its wording.
prompt = pf.get_prompt(
    "basic_prompt",
    schema='test_schema',
    instruction='test_instruction',
)
print(prompt)

Here is a database schema: test_schema
  Please write me a syntactically correct SQL statement that answers the following question: test_instruction


In [34]:
# Send the placeholder prompt built above straight to the model (no retrieval
# step) and show the raw reply.
response = llm.query(prompt)
print(response)

To provide you with a syntactically correct SQL statement, I need to know the specific question (test_instruction) you want to answer with the database schema "test_schema." Please provide the details of the question or instruction you have in mind.


In [41]:
# Run the retrieval-augmented query for `query` against the minsearch index.
# NOTE(review): 'test_schema' is a placeholder string, not a real schema
# description - TODO pass actual schema text so the model need not guess tables.
result = llm.rag_query(
    query,
    'test_schema',
    idx_minsearch,
)
print(result)

To write a SQL statement that calculates the total revenue for the top 5 customers, we need to assume a structure for the tables involved in your `test_schema`. Let's assume you have a `customers` table and an `orders` table, where the `orders` table contains the revenue for each order along with a reference to the customer.

Here's a generic SQL statement that would compute the total revenue for the top 5 customers based on their total revenue:

```sql
SELECT 
    c.customer_id,
    c.customer_name,
    SUM(o.revenue) AS total_revenue
FROM 
    test_schema.customers c
JOIN 
    test_schema.orders o ON c.customer_id = o.customer_id
GROUP BY 
    c.customer_id, c.customer_name
ORDER BY 
    total_revenue DESC
LIMIT 5;
```

### Explanation:
- `c.customer_id` and `c.customer_name`: Selecting the customer ID and name from the customers table.
- `SUM(o.revenue) AS total_revenue`: Calculating the total revenue for each customer by summing their order revenues.
- `JOIN`: Joining the `customer

In [70]:
# Re-execute db_util so recent edits to the module are picked up, then rebind
# DBMetadata to the reloaded definition.
importlib.reload(db_util)
from db_util import DBMetadata

# The first three arguments are literals (presumably engine, database name and
# host - confirm against DBMetadata's signature). Port, user and password come
# from the environment; a missing variable raises KeyError.
db_meta = DBMetadata(
    'postgres',
    'dvdrental',
    'localhost',
    os.environ['POSTGRES_PORT'],
    os.environ['POSTGRES_USER'],
    os.environ['POSTGRES_PASSWORD'],
)



In [71]:
# Look up metadata for the search term 'customer' and show what comes back.
# NOTE(review): the recorded output for this cell lists table `film`, not
# `customer` - verify that search_metadata matches the intended table.
test_result = db_meta.search_metadata(['customer'])
print(test_result)

  table_name                                        column_name
8       film  [description, film_id, fulltext, language_id, ...
