# Testing notebook

## Working with the CSVs and Excel sheets

In [1]:
# Load the 'training' dataset and preview its first rows
import pandas as pd

train_csv_path = "txt2sql_alerce_train_v2.csv"
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,rephrased_request,rephrased_request_gpt-3.5-turbo-0125_t0.4,rephrased_request_gpt-4o-2024-05-13_t0.2
0,13,Give me all the SNe that were first detected b...,"['object', 'probability']",\n-- mjd date for December = 59914.0\n-- mjd d...,\n-- Super Nova (SNe) is a large explosion tha...,"\nSELECT\n object.oid, probability.class_na...",simple,object,none,,,
1,10,Get the object identifiers and probabilities i...,"['probability', 'object']",0,0,"\nSELECT\n sq1.oid, sq1.probability as SN_pro...",medium,object,simple,,,
2,15,"Get the object identifiers, probabilities in t...","['object', 'probability', 'detection', 'magstat']",\n-- mjd date for September 01 = 60188.0\n-- m...,\n-- A fast riser is defined as an object whos...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",advanced,other,multi,,,
3,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,0,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,,,
4,25,Query objects within 10 degress of the next po...,"['probability', 'object']",\n-- mjd date for February 01 = 59976.0\n-- mj...,0,"\nWITH catalog ( source_id, ra, dec) AS (\n ...",advanced,spatial,simple,,,


In [2]:
# Load the use-case sheet from the Excel workbook and, in the same chained
# expression, discard the two "Unnamed" columns that are not needed below
excel = (
    pd.read_excel("SQLusecases_alerce.xlsx", sheet_name="examples_alerce_usecasesV3_1")
    .drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
)
excel.head()

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,Set,python_format
0,0,Get objects that are likely to be YSOs (possib...,"['probability', 'feature']",\n-- feature.name can be 'Multiband_period'\n-...,\n-- Multiband_period: Period obtained using t...,"\nSELECT\n oid, probability, value, name, fid...",advanced,object,tree,Train,"sub_query_1='''\nSELECT\n feature.oid, prob_o..."
1,1,Get all the objects classified as AGN with a p...,"['object', 'probability', 'feature', 'magstat']",\n-- object.ndet represents the number of dete...,\n-- Amplitude: Half of the difference between...,"\nSELECT\n sq.oid, sq.value, sq.name, sq.fid ...",advanced,object,tree,Test,\nsub_query_object='''\nSELECT\n object.oid...
2,2,Give me the objects classified as YSO by their...,['probability'],,,"\nSELECT\n oid, probability\nFROM\n prob...",simple,object,none,Train,"query='''\nSELECT\n oid, probability\nFROM\..."
3,3,Give me the objects classified as YSO by the l...,"['object', 'probability']","\n-- last june in mjd date: [start=60096.0, en...",,\nSELECT\n *\nFROM\n probability\nWHERE\...,simple,object,simple,Train,query=f'''\nSELECT\n *\nFROM\n probabili...
4,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,Train,# objects classified as SN II with probability...


## Tests

In [None]:
# Select a query
query = "Get the object identifier, candidate identifier, psf magnitudes, magnitude errors, and band identifiers as a function of time of the objects classified as SN II with probability larger than 0.6, number of detections greater than 50 and difference between minimum and maximum magnitudes in ZTF g-band greater than 2 mag."

# Filter the sheet once and reuse the result for every gold value, instead of
# re-evaluating the same boolean mask for each column.
matches = excel.loc[excel["request"] == query]

# `.item()` below requires exactly one element; check up front so a missing or
# duplicated request fails with a message that names the actual problem.
if len(matches) != 1:
    raise ValueError(
        f"Expected exactly one row matching the request, found {len(matches)}"
    )

# Obtain the gold SQL query and Python query
sql_gold = matches["gold_query"].item()
python_gold = matches["python_format"].item()

# Obtain the necessary tables
gold_tables = matches["table_info"].item()

# Print all in orderly fashion
print("Gold values\n")
print("Tables needed for the query:")
# f-string instead of `+ "\n"` so a non-string cell (e.g. NaN) still prints
print(f"{gold_tables}\n")
print("SQL gold query:")
print(sql_gold)
print("Python gold query:\n")
print(python_gold)

Golden values

Tables needed for the query:
['object', 'probability', 'feature', 'detection']

SQL gold query:

SELECT
  sq.oid, sq.value, sq.name, sq.fid as feature_fid, sq.version,
  detection.candid, detection.fid as f_id,detection.magpsf, detection.sigmapsf_corr, detection.sigmapsf_corr_ext
FROM
  (
SELECT
  feature.oid, feature.value, feature.name, feature.fid, feature.version
FROM
  (
SELECT
    object.oid
FROM
    object INNER JOIN
    probability
    ON object.oid = probability.oid
WHERE
    probability.classifier_name='lc_classifier'
    AND probability.class_name='SNII'
    AND probability.probability > 0.6
    AND object.ndet > 50
) as obj_oids
    INNER JOIN
    feature ON feature.oid = obj_oids.oid
WHERE
  feature.name = 'delta_mag_fid'
  AND feature.value > 2
  AND feature.fid = 1
) AS sq
  INNER JOIN detection
  ON sq.oid = detection.oid
WHERE
  detection.fid = 1
ORDER BY oid

Python gold query:

# objects classified as SNII with P>0.6 and more than 50 detections
sub_que

In [14]:
# Running the gold query
from secret.config import SQL_URL
import requests
import sqlalchemy as sa

# Fetch the connection parameters. The timeout keeps the cell from hanging
# indefinitely, and raise_for_status() surfaces an HTTP failure here rather
# than as a confusing JSON/KeyError on the next line.
response = requests.get(SQL_URL, timeout=30)
response.raise_for_status()
params = response.json()['params']

# Setup the query engine
# NOTE(review): assumes user/password contain no URL-reserved characters
# (e.g. '@', '/', ':'); if they can, they must be escaped first - confirm.
engine = sa.create_engine(f"postgresql+psycopg2://{params['user']}:{params['password']}@{params['host']}/{params['dbname']}")

# The previous bare `engine.begin()` call was dropped: its return value is a
# context manager meant for a `with` block, and it was being discarded.
# `pd.read_sql_query` accepts the Engine directly.
pd.read_sql_query(sql_gold, con=engine)

Unnamed: 0,oid,value,name,feature_fid,version,candid,f_id,magpsf,sigmapsf_corr,sigmapsf_corr_ext
0,ZTF18aatyqds,2.343400,delta_mag_fid,1,lc_classifier_1.2.1-P,557340746015015007,1,19.247040,0.060598,0.070315
1,ZTF18aatyqds,2.343400,delta_mag_fid,1,lc_classifier_1.2.1-P,536408480615015111,1,18.780142,0.059849,0.064325
2,ZTF18aatyqds,2.343400,delta_mag_fid,1,lc_classifier_1.2.1-P,539416486015015005,1,18.893660,0.051794,0.059117
3,ZTF18aatyqds,2.343400,delta_mag_fid,1,lc_classifier_1.2.1-P,539424060615015051,1,18.885460,0.068458,0.072989
4,ZTF18aatyqds,2.343400,delta_mag_fid,1,lc_classifier_1.2.1-P,542378576015015000,1,18.957990,0.108200,0.112212
...,...,...,...,...,...,...,...,...,...,...
776,ZTF23aaquhaz,3.265625,delta_mag_fid,1,lc_classifier_1.2.1-P-transitional,2461181500815015002,1,19.347256,,
777,ZTF23aaquhaz,3.265625,delta_mag_fid,1,lc_classifier_1.2.1-P-transitional,2463129660815015006,1,19.565609,,
778,ZTF23aaquhaz,3.265625,delta_mag_fid,1,lc_classifier_1.2.1-P-transitional,2467169690815015012,1,19.988169,,
779,ZTF23aaquhaz,3.265625,delta_mag_fid,1,lc_classifier_1.2.1-P-transitional,2472187320815015006,1,20.428522,,


In [13]:
from pathlib import Path
from pprint import pprint

import pandas as pd

from main import run_pipeline, engine

# NOTE: the previous bare `engine.begin()` call was removed. Its result (a
# context manager intended for a `with` block) was discarded, so it did not
# set anything up for the pipeline call below.

# Model to use
#model = "claude-3-5-sonnet-20240620"
model = "gpt-4o-2024-08-06"

# Format for the pipeline
# NOTE(review): this name shadows the builtin `format`; it is kept unchanged
# in case other cells refer to it - consider renaming it notebook-wide.
format = "python"

# RAG parameters
max_tokens = 1000
size = 700
overlap = 300
quantity = 10

# Running the pipeline
# `query` must already be defined by the query-selection cell.
result, total_usage, prompts = run_pipeline(query, model, max_tokens, size, 
                                            overlap, quantity, format, False, 
                                            engine, rag_pipe=False, 
                                            self_corr=True)
print("Resulting table:")
print(result)
print("Total usage of the pipeline:")
pprint(total_usage)

# The prompts used will be saved in this file. Create the target directory if
# it is missing and write as UTF-8 so the result does not depend on the
# platform's default encoding.
prompts_path = Path(f"prompts/examples/prompts_query_{model}.txt")
prompts_path.parent.mkdir(parents=True, exist_ok=True)
with prompts_path.open("w", encoding="utf-8") as f:
    f.write(str(prompts))

  from .autonotebook import tqdm as notebook_tqdm


Raised exception: Running SQL exception: (psycopg2.errors.QueryCanceled) canceling statement due to statement timeout

[SQL: 
SELECT
    detection.oid,
    detection.candid,
    detection.magpsf,
    detection.sigmapsf,
    detection.fid
FROM
    detection
WHERE
    detection.oid IN (
SELECT
    oid
FROM
    (
SELECT DISTINCT
    probability.oid
FROM
    probability
WHERE
    probability.classifier_name = 'lc_classifier'
    AND probability.class_name = 'SNII'
    AND probability.probability > 0.6
    AND probability.ranking = 1
) AS prob_oids
INTERSECT
SELECT
    oid
FROM
    (
SELECT
    object.oid
FROM
    object
WHERE
    object.ndet > 50
) AS det_oids
INTERSECT
SELECT
    oid
FROM
    (
SELECT
    magstat.oid
FROM
    magstat
WHERE
    magstat.fid = 1  -- Assuming fid=1 corresponds to ZTF g-band
    AND (magstat.magmax - magstat.magmin) > 2
) AS mag_oids
)
ORDER BY
    detection.mjd
]
(Background on this error at: https://sqlalche.me/e/20/e3q8)
Start retry with self-correction
Cor

Exception: Failed again: Running SQL exception: (psycopg2.errors.SyntaxError) syntax error at or near "{"
LINE 11:     detection.oid IN ({sub_query_combined})
                               ^

[SQL: 
SELECT
    detection.oid,
    detection.candid,
    detection.magpsf,
    detection.sigmapsf,
    detection.fid
FROM
    detection
WHERE
    detection.oid IN ({sub_query_combined})
ORDER BY
    detection.mjd
]
(Background on this error at: https://sqlalche.me/e/20/f405)