# Data

Loading and appending the Chicago Crime dataset

In [1]:
import pandas as pd
import duckdb
import openai
import chicago_data
import datetime
import time 
import os

In [2]:
# City of Chicago open-data resource that the crime records are pulled from.
endpoint = "https://data.cityofchicago.org/resource/ijzp-q8t2"

# Time window for the backfill request; both bounds fall on midnight.
start = datetime.datetime(year=2024, month=1, day=1)
end = datetime.datetime(year=2024, month=4, day=24)


In [3]:
# Pull the crime records for the start/end window through the project helper.
# NOTE(review): the units of `offset` and the meaning of `limit` are defined by
# chicago_data.backfill_chicago_data and are not visible here — confirm there.
chicago_crime = chicago_data.backfill_chicago_data(
    endpoint=endpoint,
    start=start,
    end=end,
    offset=24 * 30,
    limit=100000,
)



In [4]:
# Save the pulled records as CSV (without the index column) under data/,
# the folder the loading cell below scans for CSV files.
chicago_crime.to_csv("data/chicago_crime.csv", index = False)

Set the path to the folder that holds the CSV files and list them:

In [5]:
# Folder that holds the CSV extracts.
path = "./data"

# Keep only names that END in ".csv" (a substring test would also pick up
# files such as "x.csv.bak"), and sort them because os.listdir returns
# entries in arbitrary order — sorting makes the load order reproducible.
files = sorted(x for x in os.listdir(path = path) if x.endswith(".csv"))

print(files)

['chicago_crime_2020-2024.csv', 'chicago_crime.csv']


In [6]:
# Read every CSV named in `files` and stack the frames under a fresh 0..n-1 index.
# NOTE(review): the saved output of this cell is a ParserError raised while
# reading — confirm the files under `path` are complete and readable.
chicago_crime = pd.concat(
    [pd.read_csv(path + "/" + csv_name) for csv_name in files],
    ignore_index=True,
)

chicago_crime.head()

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Display the concatenated crime DataFrame.
chicago_crime

The prompt template below is adapted from the OpenAI SQL-translate example. Source: https://platform.openai.com/examples/default-sql-translate

In [None]:
# Prompt skeleton with three positional `{}` slots, filled in order by
# `str.format`: table name, column list, and the user's question.
prompt_template = """

Given the following SQL table, your job is to write queries given a user’s request. \n

CREATE TABLE {} ({}) \n

Write a SQL query that returns - {}
"""

In [None]:
def sql_prompt_generator(table_name, col_names, query):
    """Render the module-level ``prompt_template`` for one question.

    Args:
        table_name: Value for the first slot (the table named in CREATE TABLE).
        col_names: Value for the second slot (the column list inside the
            parentheses).
        query: Value for the third slot (the natural-language request).

    Returns:
        The formatted prompt string.
    """
    return prompt_template.format(table_name, col_names, query)


In [None]:
table = "chicago_crime"
# Turn the column labels into a comma-separated list by stripping the square
# brackets from the list repr; each name keeps the quotes repr puts around it.
col_names = str(list(chicago_crime.columns)).replace('[', '').replace(']', '')
query = "How many cases are from the year 2023?"

p = sql_prompt_generator(table_name = table, col_names = col_names, query = query)

# Inspect the fully rendered prompt before it is sent to the API.
print(p)



In [None]:
# Read the API key from the OPENAI_KEY environment variable
# (os.getenv returns None when the variable is not set).
openai.api_key = os.getenv('OPENAI_KEY')

In [None]:
# Tabulate the models returned by the API's model listing.
openai_api_models = pd.DataFrame(openai.Model.list()["data"])

# Call head() — without the parentheses the cell only shows the bound method,
# not the first rows of the table.
openai_api_models.head()

In [None]:
# Send the rendered prompt `p` to the completions endpoint; every sampling
# option is left at the library/API default.
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=p,
)

In [None]:
# Take the text of the first returned choice as the SQL statement.
# Note: this rebinds `query`, which until now held the natural-language question.
query = response["choices"][0]["text"]
print(query)
# Run the generated SQL through DuckDB and print the result.
duckdb.sql(query).show()

In [None]:
# Ask for two completions of the same prompt (n = 2).
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=p,
    n=2,
)
print(response)
# Only the first of the returned choices is kept.
query = response["choices"][0]["text"]

In [None]:
# Two completions again, this time with temperature set to 0.
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=p,
    n=2,
    temperature=0,
)
print(response)
# Only the first of the returned choices is kept.
query = response["choices"][0]["text"]

In [None]:
# Two completions with temperature set to 2, for comparison with the
# temperature-0 request.
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=p,
    n=2,
    temperature=2,
)
print(response)
# Only the first of the returned choices is kept.
query = response["choices"][0]["text"]

In [None]:
def lang2sql(api_key, table_name, col_names, query, engine = "text-davinci-003", n = 1, temperature = 0, max_tokens = 256):
    """Ask the completions endpoint to translate a question into SQL.

    Note: a later cell in this notebook defines another ``lang2sql`` with a
    different signature; once that cell runs it replaces this function.

    Args:
        api_key: OpenAI API key; stored on ``openai.api_key`` as a side effect.
        table_name: Table name inserted into the prompt.
        col_names: Column list inserted into the prompt.
        query: Natural-language request inserted into the prompt.
        engine: Completion engine name.
            NOTE(review): confirm this engine is still served by the API.
        n: Number of completions to request.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens. Passed explicitly because
            the endpoint's own default is small enough to cut a SQL statement
            short; 256 matches the chat-based cells in this notebook.

    Returns:
        The raw response object from ``openai.Completion.create``.
    """
    prompt = sql_prompt_generator(table_name = table_name, col_names = col_names, query = query)

    # The openai client reads the key from this module-level attribute.
    openai.api_key = api_key

    response = openai.Completion.create(engine = engine,
                                        prompt = prompt,
                                        n = n,
                                        temperature = temperature,
                                        max_tokens = max_tokens)
    return response


In [None]:
# Inputs for lang2sql: the table name, the comma-separated column labels
# (list repr with the square brackets stripped), and the API key from the
# OPENAI_KEY environment variable.
table_name = "chicago_crime"
col_names = str(list(chicago_crime.columns)).replace('[', '').replace(']', '')
api_key = os.getenv('OPENAI_KEY')

In [None]:
query = "How many cases are from the year 2023?"

# Uses lang2sql's default engine, n and temperature.
response = lang2sql(api_key = api_key, table_name = table_name, col_names= col_names, query = query)

In [None]:
print(response)
# The generated SQL is the text of the first choice.
sql_query = response["choices"][0]["text"]
print(sql_query)

# Execute the generated SQL with DuckDB and print the result.
duckdb.sql(sql_query).show()

In [None]:
query = "How many cases ended up with arrest?"

response = lang2sql(api_key = api_key, table_name = table_name, col_names= col_names, query = query)
# Keep only the text of the first choice; it is printed here but not executed.
sql_query = response["choices"][0]["text"]
print(sql_query)

In [None]:
# Ask DuckDB to describe the columns of a SELECT * over chicago_crime; the
# result is used below for its column_name and column_type fields.
tbl_describe = duckdb.sql("DESCRIBE SELECT * FROM chicago_crime;")
tbl_describe



In [None]:
# Keep only the name and type columns of the DESCRIBE result.
col_attr = tbl_describe.df()[["column_name", "column_type"]]

# "<name> <type>" per column — the form used inside the CREATE TABLE prompt.
col_attr["column_joint"] = col_attr["column_name"] + " " +  col_attr["column_type"]
col_attr

In [None]:
# Rebuild col_names as "name type, name type, ..." by stripping the square
# brackets and single quotes from the list repr.
col_names = str(list(col_attr["column_joint"].values)).replace('[', '').replace(']', '').replace('\'', '')
query = "How many cases ended up with arrest?"
prompt = sql_prompt_generator(table_name = table_name, col_names = col_names, query = query)
print(prompt)

In [None]:

# Same question as before, now with the typed column list in the prompt.
response = lang2sql(api_key = api_key, table_name = table_name, col_names= col_names, query = query)
print(response)
sql_query = response["choices"][0]["text"]
print(sql_query)

In [None]:
# Hand-written system message: the instruction plus a CREATE TABLE statement
# listing each chicago_crime column with its type.
# NOTE(review): multi-word column names appear unquoted in this schema text.
system = """
Given the following SQL table, your job is to write queries given a user’s request. 


CREATE TABLE chicago_crime (ID BIGINT, Case Number VARCHAR, Date VARCHAR, Block VARCHAR, IUCR VARCHAR, Primary Type VARCHAR, Description VARCHAR, Location Description VARCHAR, Arrest BOOLEAN, Domestic BOOLEAN, Beat BIGINT, District BIGINT, Ward DOUBLE, Community Area BIGINT, FBI Code VARCHAR, X Coordinate DOUBLE, Y Coordinate DOUBLE, Year BIGINT, Updated On VARCHAR, Latitude DOUBLE, Longitude DOUBLE, Location VARCHAR) 
"""

# User turn: the question, worded like the last line of the earlier prompt template.
prompt = "Write a SQL query that returns - How many cases ended up with arrest?"

# Chat-style request: the schema travels in the system message and the
# question in the user message.
response = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": system},
        {"role": "user", "content": prompt},
    ],
    temperature=1,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
)

In [None]:
# Dump the raw chat-completion response for inspection.
print(response)

In [None]:
# Chat responses carry the generated text under message.content of each
# choice (the completion responses above used the "text" key instead).
sql_query = response["choices"][0]["message"]["content"]
print(sql_query)


In [None]:
# Execute the SQL returned by the chat model and print the result.
duckdb.sql(sql_query).show()

In [None]:
# Show DuckDB's description of the chicago_crime columns again for reference.
duckdb.sql("DESCRIBE SELECT * FROM chicago_crime;")

In [None]:
def create_message(table_name, query):
    """Build the system and user messages for a chat-based SQL request.

    The table schema is read from DuckDB with ``DESCRIBE SELECT * FROM
    <table_name>`` and rendered as ``name type, name type, ...`` inside a
    CREATE TABLE statement in the system message.

    Args:
        table_name: Name DuckDB can resolve as a table. It is concatenated
            into the DESCRIBE statement as-is, so it must come from trusted
            code, not from user input.
        query: Natural-language request placed in the user message.

    Returns:
        An object with four attributes: ``system`` (system message text),
        ``user`` (user message text), ``column_names`` and ``column_attr``
        (the ``column_name`` and ``column_type`` columns of the DESCRIBE
        result).
    """

    class message:
        """Plain holder for the two prompts and the column metadata."""

        def __init__(self, system, user, column_names, column_attr):
            self.system = system
            self.user = user
            self.column_names = column_names
            self.column_attr = column_attr

    system_template = """

    Given the following SQL table, your job is to write queries given a user’s request. \n

    CREATE TABLE {} ({}) \n
    """

    user_template = "Write a SQL query that returns - {}"

    tbl_describe = duckdb.sql("DESCRIBE SELECT * FROM " + table_name +  ";")
    col_attr = tbl_describe.df()[["column_name", "column_type"]]

    # Join "<name> <type>" pairs directly instead of post-processing the list
    # repr: stripping "[", "]" and "'" from the repr also removed those
    # characters from the names and types themselves (e.g. a "VARCHAR[]" type).
    column_joint = col_attr["column_name"] + " " + col_attr["column_type"]
    col_names = ", ".join(column_joint.astype(str))

    system = system_template.format(table_name, col_names)
    user = user_template.format(query)

    return message(system = system,
                   user = user,
                   column_names = col_attr["column_name"],
                   column_attr = col_attr["column_type"])


    

In [None]:
query = "How many cases ended up with arrest?"
msg = create_message(table_name = "chicago_crime", query = query)

# Inspect each attribute of the returned message object.
print(msg.system)
print(msg.user)
print(msg.column_names)
print(msg.column_attr)

In [None]:
query = "How many cases ended up with arrest?"
# Note: `prompt` held a plain string in earlier cells; from here on it is the
# object returned by create_message (attributes .system, .user, ...).
prompt = create_message(table_name = "chicago_crime", query = query)

In [None]:
  # Chat payload: the system message carries the schema prompt, the user
  # message carries the question.
  message = [
    {
      "role": "system",
      "content": prompt.system
    },
    {
      "role": "user",
      "content": prompt.user
    }
    ]

message

In [None]:
 # Send the chat payload built above with temperature 0 and at most 256
 # generated tokens.
 response = openai.ChatCompletion.create(
        model = "gpt-3.5-turbo",
        messages = message,
        temperature = 0,
        max_tokens = 256,
        frequency_penalty = 0,
        presence_penalty = 0)

In [None]:
# Display the raw chat-completion response.
response

In [None]:
# The generated SQL is the message content of the first choice.
sql = response["choices"][0]["message"]["content"]

print(sql)

In [None]:
# Execute the generated SQL as returned by the model and print the result.
duckdb.sql(sql).show()

In [None]:
def add_quotes(query, col_names):
    """Wrap bare column names in a SQL string in double quotes.

    The query is scanned once, left to right. Text already inside single or
    double quotes is copied through untouched. Elsewhere, a column name is
    quoted only when it stands alone (not embedded in a longer word), and
    longer names win over shorter ones, so "Location Description" is quoted
    as one identifier rather than as "Location" and "Description".

    Args:
        query: SQL text to process (converted with ``str``).
        col_names: Iterable of column names (e.g. a list or pandas Series).

    Returns:
        The query with every bare occurrence of a column name double-quoted.
    """
    import re  # local import keeps this cell self-contained

    query = str(query)

    # Longest first so multi-word names are tried before their sub-words;
    # the secondary alphabetical key only makes the order deterministic.
    names = sorted({str(name) for name in col_names if str(name)},
                   key=lambda name: (-len(name), name))
    if not names:
        return query

    # Quoted spans ('' and "" are the escaped quote inside a span).
    single_quoted = r"'(?:[^']|'')*'"
    double_quoted = r'"(?:[^"]|"")*"'
    # A column name that does not touch a word character or a quote on
    # either side.
    bare_name = (r"""(?<![\w"'])(?:"""
                 + "|".join(re.escape(name) for name in names)
                 + r""")(?![\w"'])""")
    scanner = re.compile("(?P<quoted>" + single_quoted + "|" + double_quoted + ")"
                         + "|(?P<name>" + bare_name + ")")

    def _quote(match):
        # Quoted spans are returned unchanged; bare names get wrapped.
        if match.group("name") is None:
            return match.group(0)
        return '"' + match.group("name") + '"'

    return scanner.sub(_quote, query)

In [None]:
# Three variants of the same filter: the column name bare, double-quoted,
# and single-quoted.
q1 ="SELECT COUNT(*) FROM chicago_crime WHERE Primary Type = 'ROBBERY';"
q2 ="SELECT COUNT(*) FROM chicago_crime WHERE \"Primary Type\" = 'ROBBERY';"
q3 ="SELECT COUNT(*) FROM chicago_crime WHERE 'Primary Type' = 'ROBBERY';"

# Print what add_quotes returns for each variant.
print(add_quotes(query = q1, col_names= prompt.column_names))
print(add_quotes(query = q2, col_names= prompt.column_names))
print(add_quotes(query = q3, col_names= prompt.column_names))

In [None]:
# Apply the quoting helper to the SQL returned by the chat model.
add_quotes(query = sql, col_names = prompt.column_names)

In [None]:
def lang2sql(api_key, table_name, query, model = "gpt-3.5-turbo", temperature = 0, max_tokens = 256, frequency_penalty = 0, presence_penalty = 0):
    """Translate a natural-language question into SQL with a chat model.

    Builds the system/user messages with ``create_message``, sends them to
    ``openai.ChatCompletion.create``, and passes the returned text through
    ``add_quotes`` together with the table's column names.

    Args:
        api_key: OpenAI API key; stored on ``openai.api_key`` as a side effect.
        table_name: Table name handed to ``create_message``.
        query: Natural-language request handed to ``create_message``.
        model, temperature, max_tokens, frequency_penalty, presence_penalty:
            Forwarded unchanged to the chat-completion call.

    Returns:
        An object with ``message`` (the ``create_message`` result),
        ``response`` (the raw API response) and ``sql`` (the processed SQL).
    """
    class response:
        def __init__(self, message, response, sql):
            self.message = message
            self.response = response
            self.sql = sql

    openai.api_key = api_key

    m = create_message(table_name = table_name, query = query)

    chat_messages = [
        {"role": "system", "content": m.system},
        {"role": "user", "content": m.user},
    ]

    completion = openai.ChatCompletion.create(
        model = model,
        messages = chat_messages,
        temperature = temperature,
        max_tokens = max_tokens,
        frequency_penalty = frequency_penalty,
        presence_penalty = presence_penalty)

    # The generated text lives under message.content of the first choice.
    generated_sql = completion["choices"][0]["message"]["content"]
    quoted_sql = add_quotes(query = generated_sql, col_names = m.column_names)

    return response(message = m, response = completion, sql = quoted_sql)


In [None]:
# Read the API key from the OPENAI_KEY environment variable (None if unset).
api_key = os.getenv('OPENAI_KEY')

In [None]:
query = "How many cases ended up with arrest?"
# Uses lang2sql's default model and sampling settings.
response = lang2sql(api_key = api_key, table_name = "chicago_crime", query = query)

In [None]:
# Inspect the three fields of the lang2sql result: the message object, the
# processed SQL, and the raw API response.
print(response.message)
print(response.sql)
print(response.response)



In [None]:
# Execute the SQL produced by lang2sql and print the result.
duckdb.sql(response.sql).show()

In [None]:
# Same statement as the previous cell: runs response.sql again.
duckdb.sql(response.sql).show()

In [None]:
# Same question, restricted to a single year.
query = "How many cases ended up with arrest during 2022"
response = lang2sql(api_key = api_key, table_name = "chicago_crime", query = query)
print(response.sql)

In [None]:
# Execute the SQL generated for the 2022 question.
duckdb.sql(response.sql).show()

In [None]:


# Question spanning a range of years; the generated SQL is executed directly
# without being printed first.
query = "How many cases ended up with arrest between 2022 and 2023?"
response = lang2sql(api_key = api_key, table_name = "chicago_crime", query = query)
duckdb.sql(response.sql).show()

In [None]:
# Request whose wording names the multi-word "Primary Type" column.
query = "Summarize the cases by primary type"
response = lang2sql(api_key = api_key, table_name = "chicago_crime", query = query)

print(response.sql)

duckdb.sql(response.sql).show()

In [None]:
# Question asking for a count of one specific crime type.
query = "How many cases is the type of robbery?"
response = lang2sql(api_key = api_key, table_name = "chicago_crime", query = query)

print(response.sql)

duckdb.sql(response.sql).show()

In [None]:
# Run the most recent generated SQL once more.
duckdb.sql(response.sql).show()