# Data

Loading and appending the Chicago Crime dataset

In [1]:
import pandas as pd
import duckdb
import openai
import time 
import os

Set the path of the CSV files:

In [5]:
path = "./data/titanic"

files = [x for x in os.listdir(path = path) if ".csv" in x]

print(files)

['titanic.csv']


In [7]:
data = pd.concat((pd.read_csv(path +"/" + f) for f in files), ignore_index=True)
data.columns = [c.strip().replace(" ", "_").lower() for c in data.columns]
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Source: https://platform.openai.com/examples/default-sql-translate

In [4]:
# Make sure to end the query with with a semicolon.
# Only output the pure SQL code. Do not give an explanation for your answer.

In [8]:
prompt_template = """

Given the following SQL table, your job is to write queries given a user’s request, giving only code, no explanantion, and ending the query with a semicolon. \n

CREATE TABLE {} ({}) \n

Write a SQL query that returns - {}
"""

def sql_prompt_generator(table_name, col_names, query):
    prompt = prompt_template.format(table_name, col_names, query)
    return prompt


In [18]:
table = "data"
col_names = str(list(data.columns)).replace('[', '').replace(']', '')
query = "How many passengers?"

p = sql_prompt_generator(table_name = table, col_names = col_names, query = query)

print(p)





Given the following SQL table, your job is to write queries given a user’s request, giving only code, no explanantion, and ending the query with a semicolon. 


CREATE TABLE data ('passengerid', 'survived', 'pclass', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket', 'fare', 'cabin', 'embarked') 


Write a SQL query that returns - How many passengers?



## Open AI API integration

In [12]:
from openai import OpenAI

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)

In [13]:
os.environ.get("OPENAI_API_KEY")

'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'

In [14]:
openai_api_models = pd.DataFrame(client.models.list().data)
openai_api_models.sort_values(1, inplace=True, ascending=False)
openai_api_models.head()

Unnamed: 0,0,1,2,3
62,"(id, tts-1-hd-1106)","(created, 1699053533)","(object, model)","(owned_by, system)"
61,"(id, tts-1-1106)","(created, 1699053241)","(object, model)","(owned_by, system)"
43,"(id, tts-1-hd)","(created, 1699046015)","(object, model)","(owned_by, system)"
18,"(id, gpt-3.5-turbo-1106)","(created, 1698959748)","(object, model)","(owned_by, system)"
60,"(id, dall-e-2)","(created, 1698798177)","(object, model)","(owned_by, system)"


In [15]:
def lang2sql(client, model, table, col_names, query):
    """
    Takes in a prompt and a query question and returns the corresponding dataframe
    """
    try:
        prompt = sql_prompt_generator(table_name = table, col_names = col_names, query = query)
        
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
        )

        response_text = response.choices[0].message.content
        startidtoken = "```sql"
        endidtoken = ";"
        sqlkey_select_loc = response_text.find(startidtoken)+len(startidtoken)
        sqlkey_semicolon_loc = response_text.find(endidtoken)
        query = response_text[sqlkey_select_loc:sqlkey_semicolon_loc]

        print("Executing query : ")
        print(query)

        query = query.replace("`", "\"")

        output_df = duckdb.sql(query).df()

        return prompt, response, output_df
    except Exception as e:
        print(e)
        return prompt, response, pd.DataFrame()


In [19]:
query = "How many passengers survived?"

prompt, response, output_df = lang2sql(client=client, model="gpt-4-1106-preview", table=table, col_names=col_names, query=query)

display(output_df)

Executing query : 

SELECT COUNT(*) FROM data WHERE survived = 1


Unnamed: 0,count_star()
0,342


In [21]:
data.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
query = "What is the most common last name of a passenger?"

prompt, response, output_df = lang2sql(client=client, model="gpt-4-1106-preview", table=table, col_names=col_names, query=query)

display(output_df)

Executing query : 

SELECT SUBSTRING_INDEX(name, ' ', -1) AS last_name, COUNT(*) AS count
FROM data
GROUP BY last_name
ORDER BY count DESC
LIMIT 1
Catalog Error: Scalar Function with name substring_index does not exist!
Did you mean "substring"?


In [24]:
query = "What was the number of survivors per age group?"

prompt, response, output_df = lang2sql(client=client, model="gpt-4-1106-preview", table=table, col_names=col_names, query=query)

display(output_df)

Executing query : 

SELECT age, COUNT(*) as survivor_count
FROM data 
WHERE survived = 1
GROUP BY age
ORDER BY age


Unnamed: 0,age,survivor_count
0,0.42,1
1,0.67,1
2,0.75,2
3,0.83,2
4,0.92,1
...,...,...
61,60.00,2
62,62.00,2
63,63.00,2
64,80.00,1


In [148]:
# system = """
# Given the following SQL table, your job is to write queries given a user’s request. 


# CREATE TABLE chicago_crime (ID BIGINT, Case Number VARCHAR, Date VARCHAR, Block VARCHAR, IUCR VARCHAR, Primary Type VARCHAR, Description VARCHAR, Location Description VARCHAR, Arrest BOOLEAN, Domestic BOOLEAN, Beat BIGINT, District BIGINT, Ward DOUBLE, Community Area BIGINT, FBI Code VARCHAR, X Coordinate DOUBLE, Y Coordinate DOUBLE, Year BIGINT, Updated On VARCHAR, Latitude DOUBLE, Longitude DOUBLE, Location VARCHAR) 
# """

# prompt = "Write a SQL query that returns - How many cases ended up with arrest?"

# response = openai.ChatCompletion.create(
#   model="gpt-3.5-turbo",
#   messages=[
#     {
#       "role": "system",
#       "content": system
#     },
#     {
#       "role": "user",
#       "content": prompt
#     }
#   ],
#   temperature=1,
#   max_tokens=256,
#   top_p=1,
#   frequency_penalty=0,
#   presence_penalty=0
# )

In [19]:
# print(response)

In [14]:
  message = [
    {
      "role": "system",
      "content": prompt.system
    },
    {
      "role": "user",
      "content": prompt.user
    }
    ]

message

[{'role': 'system',
  'content': '\n\n    Given the following SQL table, your job is to write queries given a user’s request. \n\n\n    CREATE TABLE chicago_crime (ID BIGINT, Case Number VARCHAR, Date VARCHAR, Block VARCHAR, IUCR VARCHAR, Primary Type VARCHAR, Description VARCHAR, Location Description VARCHAR, Arrest BOOLEAN, Domestic BOOLEAN, Beat BIGINT, District BIGINT, Ward DOUBLE, Community Area BIGINT, FBI Code VARCHAR, X Coordinate DOUBLE, Y Coordinate DOUBLE, Year BIGINT, Updated On VARCHAR, Latitude DOUBLE, Longitude DOUBLE, Location VARCHAR) \n\n    '},
 {'role': 'user',
  'content': 'Write a SQL query that returns - How many cases ended up with arrest?'}]

In [15]:
 response = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo",
        messages = message,
        temperature = 0,
        max_tokens = 256,
        frequency_penalty = 0,
        presence_penalty = 0)