In [2]:
import json
import os

from datasets import SpiderDataset
from enums import PromptRepresentationType, ShemaInfoOptions, ExampleSelectionType
from prompt.prompt_factory import PromptFactory


  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
# Evaluation split: prompts are generated for the samples of this dataset.
# All locations are passed through to SpiderDataset unchanged.
dev_split_args = {
    "dataset_dir": "spider_data",
    "file_name": "dev.json",
    "path_to_gold": "dev_gold.sql",
    "table_file_path": "tables.json",
}
test_dataset = SpiderDataset(**dev_split_args)

Processing samples: 1034it [00:00, 274054.37it/s]


In [3]:
# Compute question embeddings for the evaluation split with the
# "all-MiniLM-L6-v2" model.
# NOTE(review): presumably consumed by the COSINE_SIM_HARDNESS example
# selection used further down - confirm in PromptFactory.
test_dataset.calculate_question_embeddings("all-MiniLM-L6-v2")

Calculating question embeddings: 100%|██████████| 1034/1034 [00:11<00:00, 93.71it/s] 


In [5]:
# Samples to build prompts for; passed as `samples=` to
# PromptFactory.build_prompts in the cells below.
test_samples = test_dataset.samples

In [6]:
# Training split: its samples serve as the pool of few-shot examples.
# All locations are passed through to SpiderDataset unchanged.
train_split_args = {
    "dataset_dir": "spider_data",
    "file_name": "train_spider.json",
    "path_to_gold": "train_gold.sql",
    "table_file_path": "tables.json",
}
train_dataset = SpiderDataset(**train_split_args)

Processing samples: 7000it [00:00, 550619.41it/s]


In [7]:
# Compute question embeddings for the training split with the same
# "all-MiniLM-L6-v2" model that is used for the evaluation split.
# NOTE(review): presumably required for similarity-based example
# selection - confirm in PromptFactory.
train_dataset.calculate_question_embeddings("all-MiniLM-L6-v2")

Calculating question embeddings: 100%|██████████| 7000/7000 [01:03<00:00, 109.96it/s]


In [8]:
# Candidate pool for few-shot examples; passed as `examples=` to
# PromptFactory.build_prompts in the "BASE + META INFO + EXAMPLES" cell.
samples_for_examples = train_dataset.samples

## Prompt params

In [9]:
# Instruction text; not referenced by any other cell visible in this notebook.
OP_RULE = 'You must minimize execution time while ensuring correctness.'

# Values of `k` passed to build_prompts for the example-based configurations.
ks = [3, 5]

# Prompt representations to generate.
PROMPT_REPRESENTATIONS = [
    PromptRepresentationType.OAI,
    PromptRepresentationType.REASONING,
]

# Schema-information variants to generate (currently a single option).
SCHEMA_INFO_OPTION = [
    ShemaInfoOptions.FULL_SCHEMA,
]

# Example-selection strategies for the few-shot configurations.
EXAMPLE_SELECTION_TYPES = [
    ExampleSelectionType.RAND_HARDNESS,
    ExampleSelectionType.COSINE_SIM_HARDNESS,
]


In [3]:
# JSON file that receives PROMPTS_LIST; the "Check prompts" section reads it
# back from the same path.
OUTPUT_FILE = "./pipeline_results/prompts.json"

In [None]:

# Accumulator for all generated prompts. Each build cell below appends one
# single-key dict per configuration: {"<config key>": <built prompts>}.
# The build cells only append, so re-run this cell first when regenerating,
# otherwise the same configurations are added again.
PROMPTS_LIST = []

## BASE

In [11]:
# Base configurations: one prompt set per (representation, schema option).
# `add_fk_info` and `examples` are not passed here, so build_prompts' own
# defaults apply for them.
for prompt_repr in PROMPT_REPRESENTATIONS:
    for schema_info in SCHEMA_INFO_OPTION:
        print(f'prompt_repr: {prompt_repr}, schema_info: {schema_info}')
        base_prompts = PromptFactory.build_prompts(
            prompt_type=prompt_repr,
            samples=test_samples,
            schema_info_option=schema_info,
        )
        # Key layout: "<representation>$<schema option>".
        config_key = '$'.join((prompt_repr.value, schema_info.value))
        PROMPTS_LIST.append({config_key: base_prompts})

prompt_repr: PromptRepresentationType.OAI, schema_info: ShemaInfoOptions.FULL_SCHEMA


Building prompts: 100%|██████████| 1034/1034 [00:00<00:00, 162680.91prompt/s]


prompt_repr: PromptRepresentationType.REASONING, schema_info: ShemaInfoOptions.FULL_SCHEMA


Building prompts: 100%|██████████| 1034/1034 [00:00<00:00, 180734.72prompt/s]


## BASE + META INFO

In [12]:
# Same grid as the base configurations, but with foreign-key information
# requested from the factory (add_fk_info=True).
for prompt_repr in PROMPT_REPRESENTATIONS:
    for schema_info in SCHEMA_INFO_OPTION:
        print(f'prompt_repr: {prompt_repr}, schema_info: {schema_info} + FK')
        fk_prompts = PromptFactory.build_prompts(
            prompt_type=prompt_repr,
            samples=test_samples,
            schema_info_option=schema_info,
            add_fk_info=True,
        )
        # Key layout: "<representation>$<schema option>$fk".
        config_key = '$'.join((prompt_repr.value, schema_info.value, 'fk'))
        PROMPTS_LIST.append({config_key: fk_prompts})

prompt_repr: PromptRepresentationType.OAI, schema_info: ShemaInfoOptions.FULL_SCHEMA + FK


Building prompts: 100%|██████████| 1034/1034 [00:00<00:00, 161337.39prompt/s]


prompt_repr: PromptRepresentationType.REASONING, schema_info: ShemaInfoOptions.FULL_SCHEMA + FK


Building prompts: 100%|██████████| 1034/1034 [00:00<00:00, 136393.70prompt/s]


## BASE + META INFO + EXAMPLES

In [13]:
# Few-shot configurations: every combination of representation, schema option,
# example-selection strategy and k. Foreign-key info is always requested
# (add_fk_info=True) even though the progress message does not mention it.
# NOTE(review): no random seed is set in this notebook; if RAND_HARDNESS draws
# examples randomly, its prompts will differ between runs - confirm.
for prompt_repr in PROMPT_REPRESENTATIONS:
    for schema_info in SCHEMA_INFO_OPTION:
        for example_selection_type in EXAMPLE_SELECTION_TYPES:
            for k in ks:
                print(f'prompt_repr: {prompt_repr}, schema_info: {schema_info}, example_selection_type: {example_selection_type}, k: {k}')
                few_shot_prompts = PromptFactory.build_prompts(
                    prompt_type=prompt_repr,
                    samples=test_samples,
                    examples=samples_for_examples,
                    example_selection_type=example_selection_type,
                    k=k,
                    schema_info_option=schema_info,
                    add_fk_info=True,
                )
                # Key layout:
                # "<representation>$<schema option>$fk$<selection type>$<k>".
                config_key = '$'.join((
                    prompt_repr.value,
                    schema_info.value,
                    'fk',
                    example_selection_type.value,
                    str(k),
                ))
                PROMPTS_LIST.append({config_key: few_shot_prompts})

prompt_repr: PromptRepresentationType.OAI, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.RAND_HARDNESS, k: 3


Building prompts: 100%|██████████| 1034/1034 [00:34<00:00, 30.03prompt/s]


prompt_repr: PromptRepresentationType.OAI, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.RAND_HARDNESS, k: 5


Building prompts: 100%|██████████| 1034/1034 [00:33<00:00, 30.80prompt/s]


prompt_repr: PromptRepresentationType.OAI, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.COSINE_SIM_HARDNESS, k: 3


Building prompts: 100%|██████████| 1034/1034 [00:36<00:00, 28.41prompt/s]


prompt_repr: PromptRepresentationType.OAI, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.COSINE_SIM_HARDNESS, k: 5


Building prompts: 100%|██████████| 1034/1034 [00:36<00:00, 28.66prompt/s]


prompt_repr: PromptRepresentationType.REASONING, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.RAND_HARDNESS, k: 3


Building prompts: 100%|██████████| 1034/1034 [00:34<00:00, 30.29prompt/s]


prompt_repr: PromptRepresentationType.REASONING, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.RAND_HARDNESS, k: 5


Building prompts: 100%|██████████| 1034/1034 [00:34<00:00, 30.34prompt/s]


prompt_repr: PromptRepresentationType.REASONING, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.COSINE_SIM_HARDNESS, k: 3


Building prompts: 100%|██████████| 1034/1034 [00:36<00:00, 28.18prompt/s]


prompt_repr: PromptRepresentationType.REASONING, schema_info: ShemaInfoOptions.FULL_SCHEMA, example_selection_type: ExampleSelectionType.COSINE_SIM_HARDNESS, k: 5


Building prompts: 100%|██████████| 1034/1034 [00:36<00:00, 28.10prompt/s]


---

In [14]:
# Persist every generated configuration as a single JSON document
# (a list of single-key dicts, in the order they were built).
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    # open(..., "w") does not create missing parent directories, so a fresh
    # checkout without ./pipeline_results would fail with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)
# Explicit encoding keeps the file independent of the platform default.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(PROMPTS_LIST, f, indent=2)

In [14]:
# Write the gold queries of the evaluation samples, one per line, as
# "<gold SQL>\t<db_id>".
# NOTE(review): presumably the layout expected by the downstream evaluation
# script - confirm.
gold_path = './pipeline_results/dev_gold.txt'
# Create the results directory if needed; open(..., 'w') would otherwise raise
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(gold_path), exist_ok=True)
# Explicit UTF-8 so non-ASCII characters in the SQL are written identically on
# every platform instead of depending on the locale's default encoding.
with open(gold_path, 'w', encoding='utf-8') as f:
    for t in test_samples:
        f.write(t.query_gold + '\t' + t.db_id + '\n')

## Check prompts

In [4]:
# Read the saved prompts back from disk so the checks below run against what
# was actually written, not against in-memory state.
with open(OUTPUT_FILE, 'r') as prompts_file:
    list_of_dicts = json.load(prompts_file)

# Flatten the list of single-key dicts into one mapping
# {config key: prompts}; if a key occurs more than once, the last entry wins.
prompt_merged_dict = {
    config_key: config_prompts
    for config_entry in list_of_dicts
    for config_key, config_prompts in config_entry.items()
}

# List the available configuration keys.
for config_key in prompt_merged_dict:
    print(f'Key: {config_key}')

Key: openai$full_schema
Key: reasoning$full_schema
Key: openai$full_schema$fk
Key: reasoning$full_schema$fk
Key: openai$full_schema$fk$random_hardness$3
Key: openai$full_schema$fk$random_hardness$5
Key: openai$full_schema$fk$cosine_sim_hardness$3
Key: openai$full_schema$fk$cosine_sim_hardness$5
Key: reasoning$full_schema$fk$random_hardness$3
Key: reasoning$full_schema$fk$random_hardness$5
Key: reasoning$full_schema$fk$cosine_sim_hardness$3
Key: reasoning$full_schema$fk$cosine_sim_hardness$5


In [5]:
# Spot-check: show the prompt at index 960 of the 'openai$full_schema'
# configuration.
print(prompt_merged_dict['openai$full_schema'][960])

### Complete sqlite SQL query only and with no explanation.
### SQLite SQL tables, with their properties:
#
# breeds: (breed_code, breed_name)
# charges: (charge_id, charge_type, charge_amount)
# sizes: (size_code, size_description)
# treatment_types: (treatment_type_code, treatment_type_description)
# owners: (owner_id, first_name, last_name, street, city, state, zip_code, email_address, home_phone, cell_number)
# dogs: (dog_id, owner_id, abandoned_yn, breed_code, size_code, name, age, date_of_birth, gender, weight, date_arrived, date_adopted, date_departed)
# professionals: (professional_id, role_code, first_name, street, city, state, zip_code, last_name, email_address, home_phone, cell_number)
# treatments: (treatment_id, dog_id, professional_id, treatment_type_code, date_of_treatment, cost_of_treatment)
#
### Question: List the last name of the owner owning the youngest dog.
### Answer: SELECT


In [6]:
# Spot-check: show the prompt at index 960 of the 'reasoning$full_schema'
# configuration (same sample index as the other spot-checks).
print(prompt_merged_dict['reasoning$full_schema'][960])

## Goal:
Your task is to translate any given natural language question into a valid SQL query written in the SQLite dialect.

## Return Format:
Write SQLite query only and with no explanation or comments.
Return only SQLite query in a format: SELECT ...

**Be careful and double-check:**
1. The correctness of SQLite syntax and order of operations and closes.
2. The choice of columns and tables when dealing with ambiguity of natural language
3. The correctness of the join conditions if it is used.

## Context:
**SQLite SQL tables, with their properties:**

breeds: (breed_code, breed_name)
charges: (charge_id, charge_type, charge_amount)
sizes: (size_code, size_description)
treatment_types: (treatment_type_code, treatment_type_description)
owners: (owner_id, first_name, last_name, street, city, state, zip_code, email_address, home_phone, cell_number)
dogs: (dog_id, owner_id, abandoned_yn, breed_code, size_code, name, age, date_of_birth, gender, weight, date_arrived, date_adopted, date_dep

In [6]:
# Spot-check: show the prompt at index 960 of the 'openai$full_schema$fk'
# configuration (built with add_fk_info=True).
print(prompt_merged_dict['openai$full_schema$fk'][960])

### Complete sqlite SQL query only and with no explanation.
### SQLite SQL tables, with their properties:
#
# breeds: (breed_code, breed_name)
# charges: (charge_id, charge_type, charge_amount)
# sizes: (size_code, size_description)
# treatment_types: (treatment_type_code, treatment_type_description)
# owners: (owner_id, first_name, last_name, street, city, state, zip_code, email_address, home_phone, cell_number)
# dogs: (dog_id, owner_id, abandoned_yn, breed_code, size_code, name, age, date_of_birth, gender, weight, date_arrived, date_adopted, date_departed)
# professionals: (professional_id, role_code, first_name, street, city, state, zip_code, last_name, email_address, home_phone, cell_number)
# treatments: (treatment_id, dog_id, professional_id, treatment_type_code, date_of_treatment, cost_of_treatment)
#
### Foreign Key References:
#
# dogs.owner_id references owners.owner_id
# dogs.breed_code references breeds.breed_code
# dogs.size_code references sizes.size_code
# treatments.do

In [7]:
# Spot-check: show the prompt at index 960 of the 'reasoning$full_schema$fk'
# configuration (built with add_fk_info=True).
print(prompt_merged_dict['reasoning$full_schema$fk'][960])

## Goal:
Your task is to translate any given natural language question into a valid SQL query written in the SQLite dialect.

## Return Format:
Write SQLite query only and with no explanation or comments.
Return only SQLite query in a format: SELECT ...

**Be careful and double-check:**
1. The correctness of SQLite syntax and order of operations and closes.
2. The choice of columns and tables when dealing with ambiguity of natural language
3. The correctness of the join conditions if it is used.

## Context:
**SQLite SQL tables, with their properties:**

breeds: (breed_code, breed_name)
charges: (charge_id, charge_type, charge_amount)
sizes: (size_code, size_description)
treatment_types: (treatment_type_code, treatment_type_description)
owners: (owner_id, first_name, last_name, street, city, state, zip_code, email_address, home_phone, cell_number)
dogs: (dog_id, owner_id, abandoned_yn, breed_code, size_code, name, age, date_of_birth, gender, weight, date_arrived, date_adopted, date_dep

In [8]:
# Spot-check: show the prompt at index 960 of the
# 'openai$full_schema$fk$cosine_sim_hardness$3' configuration (k=3 examples
# requested with the COSINE_SIM_HARDNESS selection type).
print(prompt_merged_dict['openai$full_schema$fk$cosine_sim_hardness$3'][960])

### Complete sqlite SQL query only and with no explanation.
### SQLite SQL tables, with their properties:
#
# breeds: (breed_code, breed_name)
# charges: (charge_id, charge_type, charge_amount)
# sizes: (size_code, size_description)
# treatment_types: (treatment_type_code, treatment_type_description)
# owners: (owner_id, first_name, last_name, street, city, state, zip_code, email_address, home_phone, cell_number)
# dogs: (dog_id, owner_id, abandoned_yn, breed_code, size_code, name, age, date_of_birth, gender, weight, date_arrived, date_adopted, date_departed)
# professionals: (professional_id, role_code, first_name, street, city, state, zip_code, last_name, email_address, home_phone, cell_number)
# treatments: (treatment_id, dog_id, professional_id, treatment_type_code, date_of_treatment, cost_of_treatment)
#
### Foreign Key References:
#
# dogs.owner_id references owners.owner_id
# dogs.breed_code references breeds.breed_code
# dogs.size_code references sizes.size_code
# treatments.do

In [9]:
# Spot-check: show the prompt at index 960 of the
# 'reasoning$full_schema$fk$cosine_sim_hardness$3' configuration (k=3 examples
# requested with the COSINE_SIM_HARDNESS selection type).
print(prompt_merged_dict['reasoning$full_schema$fk$cosine_sim_hardness$3'][960])

## Goal:
Your task is to translate any given natural language question into a valid SQL query written in the SQLite dialect.

## Return Format:
Write SQLite query only and with no explanation or comments.
Return only SQLite query in a format: SELECT ...

**Be careful and double-check:**
1. The correctness of SQLite syntax and order of operations and closes.
2. The choice of columns and tables when dealing with ambiguity of natural language
3. The correctness of the join conditions if it is used.

## Context:
**SQLite SQL tables, with their properties:**

breeds: (breed_code, breed_name)
charges: (charge_id, charge_type, charge_amount)
sizes: (size_code, size_description)
treatment_types: (treatment_type_code, treatment_type_description)
owners: (owner_id, first_name, last_name, street, city, state, zip_code, email_address, home_phone, cell_number)
dogs: (dog_id, owner_id, abandoned_yn, breed_code, size_code, name, age, date_of_birth, gender, weight, date_arrived, date_adopted, date_dep