# Data

Load the yearly Chicago Crime CSV files and concatenate them into a single DataFrame.

In [4]:
import pandas as pd
import duckdb
import openai
import time 
import os

Set the path of the CSV files:

In [5]:
# Directory that holds the raw yearly CSV exports.
path = "./data_raw"

# Keep only entries whose name ends with ".csv". A substring test
# (".csv" in name) would also accept names such as "backup.csv.bak".
# NOTE: os.listdir returns entries in arbitrary order, so the row order of
# anything built from this list depends on the filesystem.
files = [name for name in os.listdir(path) if name.endswith(".csv")]

print(files)

['chicago_crime_2022.csv', 'chicago_crime_2023.csv', 'chicago_crime_2021.csv']


In [6]:
# Read every CSV listed in `files` and stack them into one frame.
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each
# file's own index.
chicago_crime = pd.concat(
    [pd.read_csv(f"{path}/{csv_name}") for csv_name in files],
    ignore_index=True,
)

chicago_crime

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,...,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,12589893,JF109865,01/11/2022 03:00:00 PM,087XX S KINGSTON AVE,1565,SEX OFFENSE,INDECENT SOLICITATION OF A CHILD,RESIDENCE,False,True,...,7.0,46,17,1194660.0,1847481.0,2022,09/14/2023 03:41:59 PM,41.736409,-87.562410,"(41.736409029, -87.562410309)"
1,12592454,JF113025,01/14/2022 03:55:00 PM,067XX S MORGAN ST,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,False,True,...,16.0,68,26,1170805.0,1860170.0,2022,09/14/2023 03:41:59 PM,41.771782,-87.649437,"(41.771782439, -87.649436929)"
2,12601676,JF124024,01/13/2022 04:00:00 PM,031XX W AUGUSTA BLVD,1752,OFFENSE INVOLVING CHILDREN,AGGRAVATED CRIMINAL SEXUAL ABUSE BY FAMILY MEMBER,RESIDENCE,False,True,...,36.0,23,17,1155171.0,1906486.0,2022,09/14/2023 03:41:59 PM,41.899206,-87.705506,"(41.899206068, -87.705505587)"
3,12785595,JF346553,08/05/2022 09:00:00 PM,072XX S UNIVERSITY AVE,1544,SEX OFFENSE,SEXUAL EXPLOITATION OF A CHILD,APARTMENT,True,False,...,5.0,69,17,1185135.0,1857211.0,2022,09/14/2023 03:41:59 PM,41.763338,-87.597001,"(41.763337967, -87.597001131)"
4,12808281,JF373517,08/14/2022 02:00:00 PM,055XX W ARDMORE AVE,1562,SEX OFFENSE,AGGRAVATED CRIMINAL SEXUAL ABUSE,RESIDENCE,False,False,...,39.0,11,17,1138383.0,1937953.0,2022,09/14/2023 03:41:59 PM,41.985875,-87.766404,"(41.985875279, -87.766403857)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
648826,26461,JE455267,11/24/2021 12:51:00 AM,107XX S LANGLEY AVE,0110,HOMICIDE,FIRST DEGREE MURDER,VACANT LOT,False,False,...,9.0,50,01A,1182822.0,1833730.0,2021,09/19/2022 03:41:05 PM,41.698957,-87.606206,"(41.698957409, -87.606205674)"
648827,26041,JE281927,06/28/2021 01:12:00 AM,117XX S LAFLIN ST,0110,HOMICIDE,FIRST DEGREE MURDER,AUTO,False,False,...,34.0,53,01A,1168442.0,1826982.0,2021,09/01/2022 03:42:17 PM,41.680761,-87.659052,"(41.680760863, -87.659051873)"
648828,26238,JE353715,08/29/2021 03:07:00 AM,010XX N LAWNDALE AVE,0110,HOMICIDE,FIRST DEGREE MURDER,STREET,False,False,...,27.0,23,01A,1151525.0,1906643.0,2021,09/19/2022 03:41:05 PM,41.899709,-87.718893,"(41.899709327, -87.718893208)"
648829,26479,JE465230,12/03/2021 08:37:00 PM,000XX W 78TH PL,0110,HOMICIDE,FIRST DEGREE MURDER,PORCH,True,False,...,6.0,69,01A,1177156.0,1852951.0,2021,09/01/2022 03:42:17 PM,41.751832,-87.626374,"(41.751831742, -87.626373808)"


The prompt template below is adapted from the OpenAI "SQL translate" example: https://platform.openai.com/examples/default-sql-translate

In [7]:
# Prompt skeleton for text-to-SQL requests. It has three positional `{}`
# slots: the table name, the column list, and the user's request.
# The newlines are spelled out explicitly because they are part of the text
# that is sent to the model.
prompt_template = (
    "\n"
    "\n"
    "Given the following SQL table, your job is to write queries given a user’s request. \n"
    "\n"
    "\n"
    "CREATE TABLE {} ({}) \n"
    "\n"
    "\n"
    "Write a SQL query which return {}\n"
)

In [8]:
def sql_prompt_generator(table_name, col_names, query):
    """Fill the module-level ``prompt_template`` and return the resulting prompt.

    Parameters
    ----------
    table_name
        Inserted into the first ``{}`` slot of the template.
    col_names
        Inserted into the second ``{}`` slot of the template.
    query
        Inserted into the third ``{}`` slot of the template.

    Returns
    -------
    str
        The template with all three slots substituted.
    """
    return prompt_template.format(table_name, col_names, query)


In [9]:
table = "chicago_crime"
# Build the comma-separated, quoted column list from each name's repr.
# Going through str(list(...)) and then deleting "[" / "]" would also delete
# brackets that are part of a column name; joining the reprs gives the same
# text for bracket-free names and leaves any others intact.
col_names = ", ".join(repr(col) for col in chicago_crime.columns)
query = "How many cases are from the year 2023?"

p = sql_prompt_generator(table_name = table, col_names = col_names, query = query)

print(p)





Given the following SQL table, your job is to write queries given a user’s request. 


CREATE TABLE chicago_crime ('ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude', 'Longitude', 'Location') 


Write a SQL query which return How many cases are from the year 2023?



In [44]:
# Take the API key from the OPENAI_KEY environment variable so it is never
# written into the notebook; the value is None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_KEY")

In [52]:
# Send the prompt to the completions endpoint.
# max_tokens is set explicitly because the endpoint's default (16 tokens)
# truncates anything but a very short SQL statement; temperature=0 keeps the
# generated query stable across re-runs of the notebook.
response = openai.Completion.create(
    engine="text-davinci-003",
    prompt=p,
    max_tokens=256,
    temperature=0,
)

In [60]:
# Take the text of the first returned choice as the SQL statement.
# Note: this rebinds `query`, which until now held the natural-language
# question, to the generated SQL text.
query = response["choices"][0]["text"]

# NOTE(review): the SQL below is model-generated and is executed without any
# validation — inspect `query` before running it against data that matters.
# Presumably DuckDB resolves the table name in the generated SQL to the
# `chicago_crime` DataFrame defined in this session — TODO confirm.
duckdb.sql(query).show()

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       200932 │
└──────────────┘



In [12]:
def lang2sql(api_key, table_name, col_names, query, engine = "text-davinci-003",
             max_tokens = 256, temperature = 0):
    """Build a text-to-SQL prompt and send it to the OpenAI completions endpoint.

    Parameters
    ----------
    api_key
        OpenAI API key; assigned to ``openai.api_key`` (module-wide) before
        the request is made.
    table_name
        Table name passed to ``sql_prompt_generator``.
    col_names
        Column list passed to ``sql_prompt_generator``.
    query
        Natural-language request passed to ``sql_prompt_generator``.
    engine
        Name of the completion engine to call.
    max_tokens
        Upper bound on the length of the completion. Set explicitly because
        the endpoint's default of 16 tokens truncates all but very short
        SQL statements.
    temperature
        Sampling temperature; 0 keeps the generated SQL stable between calls.

    Returns
    -------
    The raw response object returned by ``openai.Completion.create``; the
    generated SQL text is at ``response["choices"][0]["text"]``.
    """
    prompt = sql_prompt_generator(table_name = table_name, col_names = col_names, query = query)

    # The key is set on the module, so it stays in effect for later calls too.
    openai.api_key = api_key

    response = openai.Completion.create(engine = engine,
                                        prompt = prompt,
                                        max_tokens = max_tokens,
                                        temperature = temperature)
    return response


In [None]:
table_name = "chicago_crime"
# Join the repr of each column name instead of stringifying the list and
# deleting "[" / "]", which would also remove brackets that belong to a
# column name. For bracket-free names the resulting text is the same.
col_names = ", ".join(repr(col) for col in chicago_crime.columns)
query = "How many cases are from the year 2023?"
# None when OPENAI_KEY is not set in the environment.
api_key = os.getenv('OPENAI_KEY')