# Language to SQL Code Generator with OpenAI

This notebook provides an example for using the OpenAI API to convert natural language questions to SQL queries.

In [18]:
import pandas as pd
import duckdb
import openai
import chicago_data
import prompts
import datetime
import time 
import os

## Loading the Data

In [27]:
endpoint = "https://data.cityofchicago.org/resource/ijzp-q8t2"
start = datetime.datetime(2023,1,1,0,0,0)
end = datetime.datetime(2024,4,24,0,0,0)
offset = 24 * 30
limit = 50000
path = "data/chicago_crime.csv"

In [28]:
# chicago_crime = chicago_data.backfill_chicago_data(endpoint = endpoint, 
#                                        start = start, 
#                                        end = end, 
#                                        offset = offset,
#                                        limit = limit)

In [29]:
# chicago_crime

Unnamed: 0,id,case_number,datetime,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,12938772,JG100243,2023-01-01,073XX S ABERDEEN ST,0266,CRIMINAL SEXUAL ASSAULT,PREDATORY,RESIDENCE,True,False,...,007,17,68,02,1170254,1856107,2023,2024-02-09T15:40:56.000,41.760645062,-87.651574764
1,12947687,JG110648,2023-01-01,080XX S BLACKSTONE AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,...,004,8,45,14,1187560,1851999,2023,2023-08-19T15:40:26.000,41.748978449,-87.588278593
2,12939980,JG101927,2023-01-01,097XX S JEFFERY AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,004,7,51,14,1191212,1840858,2023,2023-08-19T15:40:26.000,41.718318926,-87.575256082
3,12998127,JG171086,2023-01-01,018XX W RACE AVE,1541,OBSCENITY,SALE / DISTRIBUTE OBSCENE MATERIAL TO MINOR,APARTMENT,True,False,...,012,1,24,26,1164129,1903787,2023,2023-08-19T15:40:26.000,41.891615204,-87.672679416
4,13035520,JG215525,2023-01-01,050XX S MARSHFIELD AVE,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,...,009,20,61,26,1166192,1871296,2023,2023-08-19T15:40:26.000,41.802412978,-87.666030177
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330217,13430870,JH227299,2024-04-16,030XX N HAUSSEN CT,1581,SEX OFFENSE,NON-CONSENSUAL DISSEMINATION OF PRIVATE SEXUAL...,RESIDENCE,False,False,...,025,31,21,17,1150314,1920186,2024,2024-04-23T15:41:34.000,41.936896274,-87.722987226
330218,13430783,JH227177,2024-04-16,105XX S VINCENNES AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,022,19,72,14,1168781,1834932,2024,2024-04-23T15:41:34.000,41.702569687,-87.657582667
330219,13432855,JH228770,2024-04-16,020XX N LA CROSSE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,...,025,26,19,08A,1143706,1913051,2024,2024-04-23T15:41:34.000,41.917443653,-87.74745197
330220,13430635,JH226916,2024-04-16,041XX N MILWAUKEE AVE,0610,BURGLARY,FORCIBLE ENTRY,COMMERCIAL / BUSINESS OFFICE,False,False,...,016,45,15,05,1142839,1927019,2024,2024-04-23T15:41:34.000,41.955789401,-87.750288365


In [30]:
# chicago_crime.to_csv(path, index = False)

In [31]:
chicago_crime = pd.read_csv(path)

chicago_crime

Unnamed: 0,id,case_number,datetime,block,iucr,primary_type,description,location_description,arrest,domestic,...,district,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude
0,12938772,JG100243,2023-01-01 00:00:00,073XX S ABERDEEN ST,0266,CRIMINAL SEXUAL ASSAULT,PREDATORY,RESIDENCE,True,False,...,7,17.0,68,02,1170254.0,1856107.0,2023,2024-02-09T15:40:56.000,41.760645,-87.651575
1,12947687,JG110648,2023-01-01 00:00:00,080XX S BLACKSTONE AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,RESIDENCE,False,False,...,4,8.0,45,14,1187560.0,1851999.0,2023,2023-08-19T15:40:26.000,41.748978,-87.588279
2,12939980,JG101927,2023-01-01 00:00:00,097XX S JEFFERY AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,4,7.0,51,14,1191212.0,1840858.0,2023,2023-08-19T15:40:26.000,41.718319,-87.575256
3,12998127,JG171086,2023-01-01 00:00:00,018XX W RACE AVE,1541,OBSCENITY,SALE / DISTRIBUTE OBSCENE MATERIAL TO MINOR,APARTMENT,True,False,...,12,1.0,24,26,1164129.0,1903787.0,2023,2023-08-19T15:40:26.000,41.891615,-87.672679
4,13035520,JG215525,2023-01-01 00:00:00,050XX S MARSHFIELD AVE,2825,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,True,...,9,20.0,61,26,1166192.0,1871296.0,2023,2023-08-19T15:40:26.000,41.802413,-87.666030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330217,13430870,JH227299,2024-04-16 00:00:00,030XX N HAUSSEN CT,1581,SEX OFFENSE,NON-CONSENSUAL DISSEMINATION OF PRIVATE SEXUAL...,RESIDENCE,False,False,...,25,31.0,21,17,1150314.0,1920186.0,2024,2024-04-23T15:41:34.000,41.936896,-87.722987
330218,13430783,JH227177,2024-04-16 00:00:00,105XX S VINCENNES AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,False,False,...,22,19.0,72,14,1168781.0,1834932.0,2024,2024-04-23T15:41:34.000,41.702570,-87.657583
330219,13432855,JH228770,2024-04-16 00:00:00,020XX N LA CROSSE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,...,25,26.0,19,08A,1143706.0,1913051.0,2024,2024-04-23T15:41:34.000,41.917444,-87.747452
330220,13430635,JH226916,2024-04-16 00:00:00,041XX N MILWAUKEE AVE,0610,BURGLARY,FORCIBLE ENTRY,COMMERCIAL / BUSINESS OFFICE,False,False,...,16,45.0,15,05,1142839.0,1927019.0,2024,2024-04-23T15:41:34.000,41.955789,-87.750288


In [32]:
duckdb.sql("DESCRIBE SELECT * FROM chicago_crime;")

┌──────────────────────┬─────────────┬─────────┬─────────┬─────────┬─────────┐
│     column_name      │ column_type │  null   │   key   │ default │  extra  │
│       varchar        │   varchar   │ varchar │ varchar │ varchar │ varchar │
├──────────────────────┼─────────────┼─────────┼─────────┼─────────┼─────────┤
│ id                   │ BIGINT      │ YES     │ NULL    │ NULL    │ NULL    │
│ case_number          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ datetime             │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ block                │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ iucr                 │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ primary_type         │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ description          │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ location_description │ VARCHAR     │ YES     │ NULL    │ NULL    │ NULL    │
│ arrest               │ BOOLEAN     │ YES     │ NUL

## Creating Prompt

In [None]:
my_prompt = prompts.SqlPrompt(table = "chicago_crime")

question = "How many cases are from the year 2024?"

my_prompt.set_prompt(question = question)

print(my_prompt.system)

print(my_prompt.user)

print(my_prompt.message)

## OpenAI API

In [None]:
my_prompt.openai_request(openai_api_key = os.getenv('OPENAI_KEY'))

In [None]:
print(my_prompt.query)


In [None]:
duckdb.sql(my_prompt.query)

In [None]:
my_prompt.get_data()

In [None]:
question = "How many cases ended up with arrest?"

In [None]:
my_prompt.ask_question(question = question, openai_api_key = os.getenv('OPENAI_KEY'))

In [None]:
print(my_prompt.query)

In [None]:
my_prompt.get_data()

In [None]:
print(my_prompt.query)