In [1]:
'''
In this project, we'll develop a start-up that can take a non-technical manager's Natural Language question, 
such as "What counties had the top sales?" and convert that into a SQL query. 

We will then use that SQL to query the data (which in this example will come from a .csv file we read in with 
Pandas and set-up a temporary DB in RAM) and then report back the results!

# Don't forget ot install openai, pandas, etc.
# You could also do this in a notebook by adding a !
# !pip install openai

'''

import openai
import os
import pandas as pd

In [3]:
'''
Our data will be some example Sales Data from:  https://www.kaggle.com/datasets/kyanyoga/sample-sales-data
We've already downloaded the dataset for you as "sales_data_sample.csv".
Let's read it in:
'''

df=pd.read_csv('DATA/sales_data_sample.csv');
df.head(5)

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [4]:
#Example Pandas Query for Sum of Sales by Quarter 
#SELECT SUM(SALES) FROM TABLE WHERE
df.groupby('QTR_ID').sum()['SALES']


  df.groupby('QTR_ID').sum()['SALES']


QTR_ID
1    2350817.73
2    2048120.30
3    1758910.81
4    3874780.01
Name: SALES, dtype: float64

In [5]:
'''
Or we can query via SQL Syntax, let's set-up a temporary in memory (RAM) database, basically export this CSV-->Pandas DF--> SQL DB

#TEMP DB in RAM
#PUSH Pandas DF -> TEM DB
#SQL QUERY on TEMP DB
'''
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import text

In [6]:
#TEMP DB in RAM
temp_db=create_engine('sqlite:///:memory:',echo=True)

In [7]:
#PUSH Pandas DF -> TEM DB
#Here we push our entire DataFrame to a table called Sales:
data=df.to_sql(name='Sales',con=temp_db)

2023-08-04 14:46:16,431 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-08-04 14:46:16,432 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Sales")
2023-08-04 14:46:16,432 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-04 14:46:16,433 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("Sales")
2023-08-04 14:46:16,435 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-08-04 14:46:16,436 INFO sqlalchemy.engine.Engine ROLLBACK
2023-08-04 14:46:16,439 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-08-04 14:46:16,440 INFO sqlalchemy.engine.Engine 
CREATE TABLE "Sales" (
	"index" BIGINT, 
	"ORDERNUMBER" BIGINT, 
	"QUANTITYORDERED" BIGINT, 
	"PRICEEACH" FLOAT, 
	"ORDERLINENUMBER" BIGINT, 
	"SALES" FLOAT, 
	"ORDERDATE" TEXT, 
	"STATUS" TEXT, 
	"QTR_ID" BIGINT, 
	"MONTH_ID" BIGINT, 
	"YEAR_ID" BIGINT, 
	"PRODUCTLINE" TEXT, 
	"MSRP" BIGINT, 
	"PRODUCTCODE" TEXT, 
	"CUSTOMERNAME" TEXT, 
	"PHONE" TEXT, 
	"ADDRESSLINE1" TEXT, 
	"ADDRESSLINE2" TEXT, 
	"CITY" TEXT, 
	"STATE" TEXT, 
	

In [8]:
#Using SQL Alchemy we can establish a connection to this temporary database and query it for the results:
#SQL QUERY on TEMP DB
with temp_db.connect() as conn:
    #makes the connection
    #run code indentation/block
    #auto close connection
    
    result=conn.execute(text("SELECT SUM(SALES) FROM SALES"))

2023-08-04 14:46:20,783 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-08-04 14:46:20,784 INFO sqlalchemy.engine.Engine SELECT SUM(SALES) FROM SALES
2023-08-04 14:46:20,785 INFO sqlalchemy.engine.Engine [generated in 0.00265s] ()
2023-08-04 14:46:20,787 INFO sqlalchemy.engine.Engine ROLLBACK


In [9]:
result.all()

[(10032628.85000001,)]

In [10]:
with temp_db.connect() as conn:
    result = conn.execute(text("Select ORDERNUMBER, SALES from Sales ORDER BY SALES DESC LIMIT 1"))

2023-08-04 14:46:25,235 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-08-04 14:46:25,237 INFO sqlalchemy.engine.Engine Select ORDERNUMBER, SALES from Sales ORDER BY SALES DESC LIMIT 1
2023-08-04 14:46:25,237 INFO sqlalchemy.engine.Engine [generated in 0.00214s] ()
2023-08-04 14:46:25,239 INFO sqlalchemy.engine.Engine ROLLBACK


In [11]:
result.all()

[(10407, 14082.8)]

In [None]:
'''
Remember to use the notebook as shown, you must set your OpenAI API Key as an environment variable. 
Obviously, there are many ways you could provide your API Key to the Python code, input() or even hard-coded, 
but those are typically not recommended for safety reasons. Having it as an environment variable let's 
the key live on the computer, but not actually be present in the code.

# Uncomment below and swap in your key to place your environment key using Python
# Then you can delete the key string and the code cell below will still work!
# os.environ["OPENAI_API_KEY"] = "YOUR KEY GOES HERE, THEN DELETE THIS LINE OF CODE TO PREVENT ANYONE FROM SEEING YOUR KEY"

'''

openai.api_key=os.getenv("OPENAI_API_KEY")

In [None]:
'''
### Inform GPT about the SQL Table Structure

We need to tell GPT what the table structure looks like before it can understand 
the schema enough to create a SQL query. 

Let's create a function to generate the first part of the example text below 
(which we can then attach a user natural language query to!)


**Below is an example input to GPT, we tell it the table structure and the NLP question, 
then at the end we tell it to "SELECT"...**

**Thus GPT must finish with the rest of the most reasonable SQL query.**
'''

### sqlite SQL tables, with their properties:
#
#Employee(id,name,department_id)
#Department(id,name, address)
#Salary_payments(id,employee_id,amount,date)
#
### A query to list the names of departments which employed more that 10 Employee
#SELECT

In [12]:
#This function returns a prompt that informs GPT that we want to work with SQL Tables

'''
This will generate prompt like this:

### sqlite SQL table, with its properties:
#
# Sales(ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,
# ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,
# PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,ADDRESSLINE2,
# CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE)
#
'''
def create_table_defination(df):
    prompt="""### sqlite SQL table, with it properties:
    #
    #Sales({})
    #
    """.format(",".join(str(col) for col in df.columns))
    
    return prompt

In [13]:
df.columns

Index(['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER',
       'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
       'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE',
       'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE',
       'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME',
       'DEALSIZE'],
      dtype='object')

In [14]:
[(str(col) for col in df.columns)]
",".join(str(col) for col in df.columns)

'ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE'

In [15]:
print(create_table_defination(df))

### sqlite SQL table, with it properties:
    #
    #Sales(ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,PRODUCTLINE,MSRP,PRODUCTCODE,CUSTOMERNAME,PHONE,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE)
    #
    


In [None]:
def prompt_input():
    nlp_text=input("Enter the information you want: ")
    return nlp_text

In [None]:
prompt_input()

In [None]:
def combine_prompts(df,query_prompt):
    defination=create_table_defination(df)
    query_init_string=f"### A query to answer: { query_prompt }\nSELECT"
    return defination+query_init_string

In [None]:
nlp_text=prompt_input()# NLP
combine_prompts(df,nlp_text) #DF + query that dose ...+NPL 