# Testing notebook

## Working with the CSVs and Excel sheets

In [1]:
import pandas as pd

# Load the 'training' dataset into `df` and preview its first rows
df = pd.read_csv(filepath_or_buffer="txt2sql_alerce_train_v3_1.csv")
df.head(n=5)

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,rephrased_request,rephrased_request_gpt-3.5-turbo-0125_t0.4,rephrased_request_gpt-4o-2024-05-13_t0.2
0,13,Give me all the SNe that were first detected b...,"['object', 'probability']",\n-- mjd date for December = 59914.0\n-- mjd d...,\n-- Super Nova (SNe) is a large explosion tha...,"\nSELECT\n object.oid, probability.class_na...",simple,object,none,,,
1,10,Get the object identifiers and probabilities i...,"['probability', 'object']",0,0,"\nSELECT\n sq1.oid, sq1.probability as SN_pro...",medium,object,simple,,,
2,15,"Get the object identifiers, probabilities in t...","['object', 'probability', 'detection', 'magstat']",\n-- mjd date for September 01 = 60188.0\n-- m...,\n-- A fast riser is defined as an object whos...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",advanced,other,multi,,,
3,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,0,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,,,
4,25,Query objects within 10 degress of the next po...,"['probability', 'object']",\n-- mjd date for February 01 = 59976.0\n-- mj...,0,"\nWITH catalog ( source_id, ra, dec) AS (\n ...",advanced,spatial,simple,,,


In [2]:
# Show the 'external_knowledge' text stored for one specific request
# (.item() requires that exactly one row matches)
df.loc[
    df["request"] == "Find at most 10 cases where the feature called 'LinearTrend' is larger than 5 and their ZTF objects have a difference between last and first detection date larger than 1 day. Return the following columns. For the ZTF objects: ZTF identifier and number of detections; for their light curve classifier data at ranking 1: classifier version, class and probability; for the 'LinearTrend' feature: value and filter identifier",
    "external_knowledge",
].item()

"Since no feature version is specified, the combination between ZTF oid and fid can be non-unique in the 'feature' table output"

In [3]:
# Load the use-cases sheet of the Excel workbook and, in the same chained
# expression, discard the two unnamed columns it carries
excel = pd.read_excel(
    "SQLusecases_alerce.xlsx",
    sheet_name="examples_alerce_usecasesV3_1",
).drop(columns=["Unnamed: 0.1", "Unnamed: 0"])

# Preview the first rows of the cleaned sheet
excel.head()

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,Set,python_format
0,0,Get objects that are likely to be YSOs (possib...,"['probability', 'feature']",\n-- feature.name can be 'Multiband_period'\n-...,\n-- Multiband_period: Period obtained using t...,"\nSELECT\n oid, probability, value, name, fid...",advanced,object,tree,Train,"sub_query_1='''\nSELECT\n feature.oid, prob_o..."
1,1,Get all the objects classified as AGN with a p...,"['object', 'probability', 'feature', 'magstat']",\n-- object.ndet represents the number of dete...,\n-- Amplitude: Half of the difference between...,"\nSELECT\n sq.oid, sq.value, sq.name, sq.fid ...",advanced,object,tree,Test,\nsub_query_object='''\nSELECT\n object.oid...
2,2,Give me the objects classified as YSO by their...,['probability'],,,"\nSELECT\n oid, probability\nFROM\n prob...",simple,object,none,Train,"query='''\nSELECT\n oid, probability\nFROM\..."
3,3,Give me the objects classified as YSO by the l...,"['object', 'probability']","\n-- last june in mjd date: [start=60096.0, en...",,\nSELECT\n *\nFROM\n probability\nWHERE\...,simple,object,simple,Train,query=f'''\nSELECT\n *\nFROM\n probabili...
4,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,Train,# objects classified as SN II with probability...


## Tests

### Gold values and query to test

In [4]:
# Select a query
query = "Find ZTF objects that have a probability larger than 0.5 of being a long period variable in the light curve classifier, as well as a WISE W1 magnitude between 7 and 8. Return all columns from the 'probability', 'xmatch' and 'allwise' tables for such objects. Give at most 100 rows in the resulting table"


def _gold_rows(frame, request, source):
    """Return the rows of `frame` whose 'request' column equals `request`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a 'request' column (the Excel sheet or the training CSV).
    request : str
        Natural-language request to look up.
    source : str
        Human-readable name of `frame`, used only in the error message.

    Raises
    ------
    ValueError
        If the request does not match exactly one row. This is the same
        exception type `.item()` would raise later, but with a message that
        says what went wrong.
    """
    matches = frame.loc[frame["request"] == request]
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one row in the {source} for the selected "
            f"request, found {len(matches)}"
        )
    return matches


# The Excel sheet is checked first because it also has a 'python_format'
# column; the training CSV is only used when the request is not in the sheet.
if query in excel["request"].to_list():
    gold_row = _gold_rows(excel, query, "Excel sheet")
    python_gold = gold_row["python_format"].item()
else:
    gold_row = _gold_rows(df, query, "training CSV")
    # No Python version in this branch: reset explicitly so that a value left
    # over from a previous run of this cell cannot be mistaken for this query's.
    python_gold = None

# Obtain the gold SQL query and the necessary tables (filter computed once)
sql_gold = gold_row["gold_query"].item()
gold_tables = gold_row["table_info"].item()

# Print all in orderly fashion
print("Gold values\n")
print("Tables needed for the query:")
print(f"{gold_tables}\n")
print("SQL gold query:")
print(sql_gold)
if python_gold is not None:
    print("Python gold query:\n")
    print(python_gold)

Gold values

Tables needed for the query:
['probability', 'xmatch', 'allwise']

SQL gold query:
SELECT
    *
FROM
    probability INNER JOIN
    xmatch
    ON probability.oid = xmatch.oid
    INNER JOIN
    allwise
    ON xmatch.oid_catalog = allwise.oid_catalog
WHERE
    w1mpro > 7
    AND w1mpro < 8
    AND classifier_name = 'lc_classifier'
    AND class_name = 'LPV'
    AND probability > 0.5
LIMIT 100


In [5]:
# Running the gold query
from urllib.parse import quote

from secret.config import SQL_URL
import requests
import sqlalchemy as sa

# Fetch the connection parameters served at SQL_URL. The timeout keeps the
# cell from hanging forever, and raise_for_status() turns an HTTP error into
# an explicit exception instead of a confusing JSON/KeyError further down.
response = requests.get(SQL_URL, timeout=30)
response.raise_for_status()
params = response.json()['params']

# Percent-encode the credentials: characters such as '@', ':' or '/' in the
# user name or password would otherwise corrupt the database URL.
db_user = quote(str(params['user']), safe="")
db_password = quote(str(params['password']), safe="")

# Setup the query engine
engine = sa.create_engine(
    f"postgresql+psycopg2://{db_user}:{db_password}@{params['host']}/{params['dbname']}"
)
# The former bare `engine.begin()` call was dropped: its result was discarded,
# so it never provided a usable transaction. pandas obtains its own connection
# from the engine below.

pd.read_sql_query(sql_gold, con=engine)

Unnamed: 0,oid,classifier_name,classifier_version,class_name,probability,ranking,oid.1,catid,oid_catalog,dist,...,w1sigmpro,w2sigmpro,w3sigmpro,w4sigmpro,j_m_2mass,h_m_2mass,k_m_2mass,j_msig_2mass,h_msig_2mass,k_msig_2mass
0,ZTF23aamtlec,lc_classifier,lc_classifier_1.1.13,LPV,0.500004,1,ZTF23aamtlec,allwise,J210255.95+404942.9,0.433744,...,0.025,0.020,0.018,0.109,9.526,8.397,7.934,0.023,0.031,0.018
1,ZTF19aasgovc,lc_classifier,lc_classifier_1.1.13,LPV,0.500004,1,ZTF19aasgovc,allwise,J174854.34-202203.3,0.254094,...,0.026,0.022,0.022,0.056,10.268,8.977,8.489,0.051,0.047,0.029
2,ZTF19acsfhqw,lc_classifier,hierarchical_rf_1.1.0,LPV,0.500004,1,ZTF19acsfhqw,allwise,J193821.67+013549.5,0.287550,...,0.033,0.019,0.018,0.114,9.205,8.168,7.808,0.022,0.036,0.024
3,ZTF18aayohex,lc_classifier,hierarchical_rf_1.1.0,LPV,0.500040,1,ZTF18aayohex,allwise,J184523.26+104906.3,0.220967,...,0.032,0.020,0.014,0.038,8.795,7.809,7.317,0.019,0.053,0.018
4,ZTF19addhngs,lc_classifier,lc_classifier_1.1.13,LPV,0.500040,1,ZTF19addhngs,allwise,J172425.68-210330.8,0.428840,...,0.025,0.017,0.013,0.079,9.229,8.142,7.662,0.024,0.042,0.021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ZTF18abchfxi,lc_classifier,lc_classifier_1.1.13,LPV,0.500472,1,ZTF18abchfxi,allwise,J193844.33+153402.4,0.121267,...,0.023,0.018,0.020,0.130,9.508,8.403,7.961,0.020,0.027,0.020
96,ZTF18abdlwsg,lc_classifier,hierarchical_rf_1.1.0,LPV,0.500480,1,ZTF18abdlwsg,allwise,J205801.27+443915.8,0.541367,...,0.025,0.020,0.021,0.103,9.617,8.387,7.986,0.022,0.029,0.034
97,ZTF18acnexym,lc_classifier,hierarchical_rf_1.1.0,LPV,0.500480,1,ZTF18acnexym,allwise,J200645.95+402117.7,0.000027,...,0.029,0.020,0.018,0.134,8.935,7.914,7.539,0.027,0.016,0.016
98,ZTF18abehwsm,lc_classifier,hierarchical_rf_1.1.0,LPV,0.500480,1,ZTF18abehwsm,allwise,J005428.60+665432.8,0.816571,...,0.027,0.019,0.017,0.100,9.297,8.113,7.750,0.026,0.027,0.020


### Trying out the pipelines

In [6]:
from pathlib import Path
from pprint import pprint
from main import run_pipeline, engine
# NOTE(review): the value returned by begin() is discarded. If `engine` is a
# SQLAlchemy Engine this does not give the pipeline a transaction to work
# with — confirm whether the call is needed at all.
engine.begin()

# Model to use
#model = "claude-3-5-sonnet-20240620"
model = "gpt-4o"

# Format for the pipeline
# NOTE: this name shadows the builtin `format()` for the rest of the kernel
# session; it is kept unchanged in case later cells rely on it.
format = "python"

# RAG parameters
max_tokens = 10000
size = 700
overlap = 300
quantity = 10

# Running the pipeline on the `query` selected earlier in the notebook.
# The positional `False` is passed exactly as before — TODO confirm which
# run_pipeline option it controls and pass it by keyword.
result, total_usage, prompts = run_pipeline(query, model, max_tokens, size,
                                            overlap, quantity, format, False,
                                            engine, rag_pipe=True,
                                            self_corr=True)
print("Resulting table:")
print(result)
print("Total usage of the pipeline:")
pprint(total_usage)

# The prompts used will be saved in this file. The directory is created on
# demand so a missing folder cannot discard the prompts after the pipeline
# has already run, and the encoding is pinned so non-ASCII text in the
# prompts is written the same way on every platform.
prompts_path = Path(f"prompts/examples/prompts_query_{model}.txt")
prompts_path.parent.mkdir(parents=True, exist_ok=True)
prompts_path.write_text(str(prompts), encoding="utf-8")

  from .autonotebook import tqdm as notebook_tqdm


Tables needed: [probability, xmatch, allwise]
Difficulty: advanced
Resulting python query: 
# Sub-query to filter objects from the 'probability' table with the specified conditions
sub_query_probability = f'''
SELECT DISTINCT oid
FROM probability
WHERE classifier_name = 'lc_classifier'
  AND class_name = 'LPV'
  AND probability > 0.5
  AND ranking = 1
'''

# Sub-query to filter objects from the 'allwise' table with the specified WISE W1 magnitude range
sub_query_allwise = f'''
SELECT oid_catalog
FROM allwise
WHERE w1mpro BETWEEN 7 AND 8
'''

# Final query to join the results from the sub-queries and retrieve the required information
full_query = f'''
SELECT *
FROM probability
INNER JOIN xmatch ON probability.oid = xmatch.oid
INNER JOIN allwise ON xmatch.oid_catalog = allwise.oid_catalog
WHERE probability.oid IN ({sub_query_probability})
  AND allwise.oid_catalog IN ({sub_query_allwise})
LIMIT 100
'''

Resulting table:
             oid                       classifier_name     classifie

In [12]:
from pathlib import Path
from pprint import pprint
from main import run_pipeline, engine
# NOTE(review): the value returned by begin() is discarded. If `engine` is a
# SQLAlchemy Engine this does not give the pipeline a transaction to work
# with — confirm whether the call is needed at all.
engine.begin()

# Model to use
#model = "claude-3-5-sonnet-20240620"
model = "o1-preview"

# Format for the pipeline
# NOTE: this name shadows the builtin `format()` for the rest of the kernel
# session; it is kept unchanged in case later cells rely on it.
format = "python"

# RAG parameters
max_tokens = 10000
size = 700
overlap = 300
quantity = 10

# Running the pipeline on the `query` selected earlier in the notebook.
# The positional `False` is passed exactly as before — TODO confirm which
# run_pipeline option it controls and pass it by keyword.
result, total_usage, prompts = run_pipeline(query, model, max_tokens, size,
                                            overlap, quantity, format, False,
                                            engine, rag_pipe=True,
                                            self_corr=True)
print("Resulting table:")
print(result)
print("Total usage of the pipeline:")
pprint(total_usage)

# The prompts used will be saved in this file. The directory is created on
# demand so a missing folder cannot discard the prompts after the pipeline
# has already run, and the encoding is pinned so non-ASCII text in the
# prompts is written the same way on every platform.
prompts_path = Path(f"prompts/examples/prompts_query_{model}.txt")
prompts_path.parent.mkdir(parents=True, exist_ok=True)
prompts_path.write_text(str(prompts), encoding="utf-8")

Tables needed: [object, detection, magstat]
Difficulty: advanced
```python
# Get oids of objects classified as 'SN II' with probability > 0.6
sub_query_probability = f'''
SELECT oid
FROM probability
WHERE classifier_name='lc_classifier'
AND class_name='SN II'
AND ranking=1
AND probability > 0.6
'''

# Get oids of objects with first observation between 2019-01-01 and 2022-12-31
# MJD range from 58484 (2019-01-01) to 59580 (2022-01-01)
sub_query_object = f'''
SELECT oid
FROM object
WHERE firstmjd BETWEEN 58484 AND 59946
'''

# Combine previous subqueries to get oids satisfying both class and date criteria
sub_query_oid1 = f'''
SELECT prob_oids.oid
FROM ({sub_query_probability}) AS prob_oids
INNER JOIN ({sub_query_object}) AS obj_oids
ON prob_oids.oid = obj_oids.oid
'''

# Get oids with more than 30 detections
sub_query_ndet = f'''
SELECT oid
FROM detection
GROUP BY oid
HAVING COUNT(*) > 30
'''

# Combine to get oids satisfying class, date, and number of detections criteria
sub_query_oid2

Exception: Failed again: local variable 'results' referenced before assignment