# Testing notebook

## Working with the CSVs and Excel sheets

In [1]:
# Load the 'training' dataset from disk and preview its first five rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer="txt2sql_alerce_train_v2.csv")
df.head(n=5)

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,rephrased_request,rephrased_request_gpt-3.5-turbo-0125_t0.4,rephrased_request_gpt-4o-2024-05-13_t0.2
0,13,Give me all the SNe that were first detected b...,"['object', 'probability']",\n-- mjd date for December = 59914.0\n-- mjd d...,\n-- Super Nova (SNe) is a large explosion tha...,"\nSELECT\n object.oid, probability.class_na...",simple,object,none,,,
1,10,Get the object identifiers and probabilities i...,"['probability', 'object']",0,0,"\nSELECT\n sq1.oid, sq1.probability as SN_pro...",medium,object,simple,,,
2,15,"Get the object identifiers, probabilities in t...","['object', 'probability', 'detection', 'magstat']",\n-- mjd date for September 01 = 60188.0\n-- m...,\n-- A fast riser is defined as an object whos...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",advanced,other,multi,,,
3,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,0,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,,,
4,25,Query objects within 10 degress of the next po...,"['probability', 'object']",\n-- mjd date for February 01 = 59976.0\n-- mj...,0,"\nWITH catalog ( source_id, ra, dec) AS (\n ...",advanced,spatial,simple,,,


In [2]:
# Load the use-case sheet and, in the same chain, discard the two leftover
# "Unnamed" columns that the sheet carries
excel = (
    pd.read_excel(
        "SQLusecases_alerce.xlsx",
        sheet_name="examples_alerce_usecasesV3_1",
    )
    .drop(columns=["Unnamed: 0.1", "Unnamed: 0"])
)

# Preview the cleaned sheet
excel.head()

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,Set,python_format
0,0,Get objects that are likely to be YSOs (possib...,"['probability', 'feature']",\n-- feature.name can be 'Multiband_period'\n-...,\n-- Multiband_period: Period obtained using t...,"\nSELECT\n oid, probability, value, name, fid...",advanced,object,tree,Train,"sub_query_1='''\nSELECT\n feature.oid, prob_o..."
1,1,Get all the objects classified as AGN with a p...,"['object', 'probability', 'feature', 'magstat']",\n-- object.ndet represents the number of dete...,\n-- Amplitude: Half of the difference between...,"\nSELECT\n sq.oid, sq.value, sq.name, sq.fid ...",advanced,object,tree,Test,\nsub_query_object='''\nSELECT\n object.oid...
2,2,Give me the objects classified as YSO by their...,['probability'],,,"\nSELECT\n oid, probability\nFROM\n prob...",simple,object,none,Train,"query='''\nSELECT\n oid, probability\nFROM\..."
3,3,Give me the objects classified as YSO by the l...,"['object', 'probability']","\n-- last june in mjd date: [start=60096.0, en...",,\nSELECT\n *\nFROM\n probability\nWHERE\...,simple,object,simple,Train,query=f'''\nSELECT\n *\nFROM\n probabili...
4,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,Train,# objects classified as SN II with probability...


## Tests

### Gold values and query to test

In [6]:
# Select a query
query = "Get the object identifiers that had their first detection in July 2023, are within a search radius of 1000 arcmin from the center of the Abell 370 galaxy cluster (centered at RA=39.97, Dec=-1.5768), have sgscore1<0.3 and their highest probability in the stamp classifier is not obtained for class Asteroid nor class Bogus"

# Filter the sheet once and reuse the result, instead of rebuilding the same
# boolean mask for every column that is read.
gold_rows = excel.loc[excel["request"] == query]

# The lookup is an exact string match, so a typo in `query` (or a duplicated
# request in the sheet) would otherwise surface only as an opaque error from
# `.item()`. Fail early with a message that says what went wrong.
if len(gold_rows) != 1:
    raise ValueError(
        f"Expected exactly one row whose 'request' equals the query, "
        f"found {len(gold_rows)}"
    )

# Obtain the gold SQL query and Python query
sql_gold = gold_rows["gold_query"].item()
python_gold = gold_rows["python_format"].item()

# Obtain the necessary tables
gold_tables = gold_rows["table_info"].item()

# Print all in orderly fashion
print("Gold values\n")
print("Tables needed for the query:")
# f-string instead of `+` so the line also works if the cell is not a string
print(f"{gold_tables}\n")
print("SQL gold query:")
print(sql_gold)
print("Python gold query:\n")
print(python_gold)

Gold values

Tables needed for the query:
['probability', 'object', 'feature']

SQL gold query:

WITH catalog (source_id, ra, dec) AS (
    VALUES
        ('source_1', 39.970416666666665, -1.5768055555555556)
),
features AS (
SELECT
  feature.oid, feature.value, feature.name, feature.version, obj_oids.meanra, obj_oids.meandec
FROM
  (
SELECT
    object.oid, object.meanra, object.meandec
FROM
    object INNER JOIN
    probability
    ON object.oid = probability.oid
WHERE
    probability.classifier_name='stamp_classifier'
    AND probability.ranking=1
    AND probability.class_name NOT IN ('asteroid','bogus')
    AND object.firstmjd > 60126.0
    AND object.firstmjd < 60156.0

) as obj_oids
    INNER JOIN
    feature ON feature.oid = obj_oids.oid
WHERE
  feature.name = 'sgscore1'
  AND feature.value < 0.3
)

SELECT
    c.source_id, f.oid, f.value, f.name, f.version, f.meanra, f.meandec,
    q3c_dist(c.ra,c.dec,f.meanra,f.meandec) as dist

FROM features f, catalog c
WHERE
    q3c_join(c.r

In [4]:
# Running the gold query
from secret.config import SQL_URL
import requests
import sqlalchemy as sa

# Setup params for query engine.
# The timeout keeps the cell from hanging indefinitely if the endpoint is
# unreachable, and raise_for_status() reports an HTTP failure directly instead
# of letting it show up later as a JSON/KeyError problem.
response = requests.get(SQL_URL, timeout=30)
response.raise_for_status()
params = response.json()['params']

# Parse the host/database part as before, then attach the credentials through
# URL.set() so that characters such as '@', ':' or '/' in the user name or
# password are escaped rather than corrupting the connection string.
db_url = sa.engine.make_url(
    f"postgresql+psycopg2://{params['host']}/{params['dbname']}"
).set(username=params['user'], password=params['password'])
engine = sa.create_engine(db_url)

# NOTE: the former bare `engine.begin()` call was removed. Its return value (a
# context manager) was discarded, so it never wrapped the query below; pandas
# checks out and releases its own connection when it is handed the engine.
pd.read_sql_query(sql_gold, con=engine)

Unnamed: 0,oid,fid,dmdt_first,candid,f_id,magpsf,sigmapsf_corr,sigmapsf_corr_ext
0,ZTF19aapafit,2,-0.558783,829383361815015003,1,17.737854,,
1,ZTF19aapafit,2,-0.558783,832386251815015000,2,17.734913,,
2,ZTF19aapafit,2,-0.558783,858352811815015002,2,17.735107,,
3,ZTF19aapafit,2,-0.558783,878351491815015006,1,18.341246,,
4,ZTF19aapafit,2,-0.558783,912248911815015006,1,18.500607,,
...,...,...,...,...,...,...,...,...
1532,ZTF22aavpkwo,2,-0.741901,2090146050015015011,2,18.845226,0.075151,0.077069
1533,ZTF22aavpkwo,2,-0.741901,2088175550015015009,2,18.898780,0.077255,0.079281
1534,ZTF22aavpkwo,2,-0.741901,2088156800015015007,1,19.567533,0.088220,0.090360
1535,ZTF22aavpkwo,2,-0.741901,2086202140015015009,1,19.556650,0.075521,0.077972


### Trying out the pipelines

In [5]:
import pandas as pd
from pathlib import Path
from pprint import pprint
from main import run_pipeline, engine

# NOTE: the former bare `engine.begin()` call was removed. Its return value (a
# context manager) was discarded, so it had no effect on the pipeline run.

# Model to use
#model = "claude-3-5-sonnet-20240620"
model = "gpt-4o-2024-08-06"

# Format for the pipeline.
# Named `query_format` rather than `format` so the builtin format() is not
# shadowed for every cell executed afterwards.
query_format = "python"

# RAG parameters
max_tokens = 1000
size = 700
overlap = 300
quantity = 10

# Running the pipeline on the `query` selected in the gold-values cell.
# The positional `False` is passed through unchanged; its meaning is defined by
# run_pipeline's signature in main.py.
result, total_usage, prompts = run_pipeline(query, model, max_tokens, size,
                                            overlap, quantity, query_format, False,
                                            engine, rag_pipe=True,
                                            self_corr=True)
print("Resulting table:")
print(result)
print("Total usage of the pipeline:")
pprint(total_usage)

# The prompts used will be saved in this file.
# Create the target directory first so a fresh checkout does not fail with
# FileNotFoundError, and pin the encoding so the file is written the same way
# on every platform.
prompts_path = Path("prompts/examples") / f"prompts_query_{model}.txt"
prompts_path.parent.mkdir(parents=True, exist_ok=True)
with open(prompts_path, "w", encoding="utf-8") as f:
    f.write(str(prompts))

  from .autonotebook import tqdm as notebook_tqdm


Tables needed: [object, detection, magstat]
Difficulty: advanced
Resulting python query: 
# Sub-query to select oids from the probability table where the classifier is 'lc_classifier', class is 'SNII', and probability > 0.6
sub_query_probability = f'''
SELECT DISTINCT
    probability.oid
FROM
    probability
WHERE
    probability.classifier_name = 'lc_classifier'
    AND probability.class_name = 'SNII'
    AND probability.probability > 0.6
    AND probability.ranking = 1
'''

# Sub-query to select oids from the magstat table where the initial rise rate is greater than 0.5 mag/day in ZTF g and r-band
sub_query_magstat = f'''
SELECT
    magstat.oid
FROM
    magstat
WHERE
    (magstat.fid = 1 OR magstat.fid = 2)  -- Assuming fid=1 is ZTF g-band and fid=2 is ZTF r-band
    AND magstat.dmdt_first < -0.5
GROUP BY
    magstat.oid
HAVING
    COUNT(DISTINCT magstat.fid) = 2  -- Ensure both bands are present
'''

# Sub-query to select oids from the object table where the number of detections is 

Exception: Failed again: Running SQL exception: (psycopg2.errors.QueryCanceled) canceling statement due to statement timeout

[SQL: 
SELECT
    detection.oid,
    detection.candid,
    detection.magpsf,
    detection.sigmapsf_corr,
    detection.sigmapsf_corr_ext,
    detection.fid,
    detection.mjd
FROM
    detection
WHERE
    detection.oid IN (
SELECT
    oid
FROM
    (
SELECT DISTINCT
    probability.oid
FROM
    probability
WHERE
    probability.classifier_name = 'lc_classifier'
    AND probability.class_name = 'SNII'
    AND probability.probability > 0.6
    AND probability.ranking = 1
) AS prob_oids
INTERSECT
SELECT
    oid
FROM
    (
SELECT
    magstat.oid
FROM
    magstat
WHERE
    (magstat.fid = 1 OR magstat.fid = 2)  -- Assuming fid=1 is ZTF g-band and fid=2 is ZTF r-band
    AND magstat.dmdt_first < -0.5
GROUP BY
    magstat.oid
HAVING
    COUNT(DISTINCT magstat.fid) = 2  -- Ensure both bands are present
) AS magstat_oids
INTERSECT
SELECT
    oid
FROM
    (
SELECT
    object.oid
FROM
    object
WHERE
    object.ndet > 50
    AND object.firstmjd BETWEEN 58484 AND 59580  -- MJD range for 2019-2022
) AS object_oids
)
ORDER BY
    detection.oid, detection.mjd
]
(Background on this error at: https://sqlalche.me/e/20/e3q8)