# Testing notebook

## Working with the CSV files and Excel sheets

In [1]:
import pandas as pd

# Load the 'training' dataset and preview its first five rows
df = pd.read_csv(filepath_or_buffer="datasets/txt2sql_alerce_train_v3_1.csv")
df.head(n=5)

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,rephrased_request,rephrased_request_gpt-3.5-turbo-0125_t0.4,rephrased_request_gpt-4o-2024-05-13_t0.2
0,13,Give me all the SNe that were first detected b...,"['object', 'probability']",\r\n-- mjd date for December = 59914.0\r\n-- m...,\r\n-- Super Nova (SNe) is a large explosion t...,"\r\nSELECT\r\n object.oid, probability.clas...",simple,object,none,,,
1,10,Get the object identifiers and probabilities i...,"['probability', 'object']",0,0,"\r\nSELECT\r\n sq1.oid, sq1.probability as SN...",medium,object,simple,,,
2,15,"Get the object identifiers, probabilities in t...","['object', 'probability', 'detection', 'magstat']",\r\n-- mjd date for September 01 = 60188.0\r\n...,\r\n-- A fast riser is defined as an object wh...,"\r\nSELECT\r\n sq.oid, sq.probability, sq.c...",advanced,other,multi,,,
3,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\r\n-- mjd date for the start of the year 2019...,0,"\r\nSELECT\r\n sq.oid, sq.fid, sq.dmdt_first,...",advanced,other,multi,,,
4,25,Query objects within 10 degress of the next po...,"['probability', 'object']",\r\n-- mjd date for February 01 = 59976.0\r\n-...,0,"\r\nWITH catalog ( source_id, ra, dec) AS (\r\...",advanced,spatial,simple,,,


In [2]:
# Show the external knowledge stored for one specific request.
# `.item()` requires exactly one matching row and raises ValueError otherwise.
target_request = "Find at most 10 cases where the feature called 'LinearTrend' is larger than 5 and their ZTF objects have a difference between last and first detection date larger than 1 day. Return the following columns. For the ZTF objects: ZTF identifier and number of detections; for their light curve classifier data at ranking 1: classifier version, class and probability; for the 'LinearTrend' feature: value and filter identifier"
is_target_request = df["request"] == target_request
df.loc[is_target_request, "external_knowledge"].item()

"Since no feature version is specified, the combination between ZTF oid and fid can be non-unique in the 'feature' table output"

In [3]:
# Load the use-case sheet and discard its two unnamed columns in one pass
unnamed_columns = ["Unnamed: 0.1", "Unnamed: 0"]
excel = pd.read_excel(
    "datasets/SQLusecases_alerce.xlsx",
    sheet_name="examples_alerce_usecasesV3_1",
).drop(columns=unnamed_columns)
excel.head()

Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,Set,python_format
0,0,Get objects that are likely to be YSOs (possib...,"['probability', 'feature']",\n-- feature.name can be 'Multiband_period'\n-...,\n-- Multiband_period: Period obtained using t...,"\nSELECT\n oid, probability, value, name, fid...",advanced,object,tree,Train,"sub_query_1='''\nSELECT\n feature.oid, prob_o..."
1,1,Get all the objects classified as AGN with a p...,"['object', 'probability', 'feature', 'magstat']",\n-- object.ndet represents the number of dete...,\n-- Amplitude: Half of the difference between...,"\nSELECT\n sq.oid, sq.value, sq.name, sq.fid ...",advanced,object,tree,Test,\nsub_query_object='''\nSELECT\n object.oid...
2,2,Give me the objects classified as YSO by their...,['probability'],,,"\nSELECT\n oid, probability\nFROM\n prob...",simple,object,none,Train,"query='''\nSELECT\n oid, probability\nFROM\..."
3,3,Give me the objects classified as YSO by the l...,"['object', 'probability']","\n-- last june in mjd date: [start=60096.0, en...",,\nSELECT\n *\nFROM\n probability\nWHERE\...,simple,object,simple,Train,query=f'''\nSELECT\n *\nFROM\n probabili...
4,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\n-- mjd date for the start of the year 2019 =...,,"\nSELECT\n sq.oid, sq.fid, sq.dmdt_first,\n ...",advanced,other,multi,Train,# objects classified as SN II with probability...


## Tests

### Gold values and query to test

In [20]:
# Select a query
query = "Get objects that are likely to be YSOs (possibility great than 0.7) and with some periodicities. The photometric period is between 3 to 10 days and variation amplitude is great than 0.5 mag. Please sort the list by the possibility in descending order."


def _rows_for_request(frame, request):
    """Return the rows of `frame` whose 'request' column equals `request`.

    Args:
        frame: DataFrame with a 'request' column.
        request: natural-language request text to look up.

    Returns:
        A DataFrame with zero or one row.

    Raises:
        ValueError: if more than one row matches, because the gold values
            for the request would then be ambiguous.
    """
    matches = frame.loc[frame["request"] == request]
    if len(matches) > 1:
        raise ValueError(
            f"Expected a single row for the request, found {len(matches)}: {request!r}"
        )
    return matches


# The excel file is checked first because it also carries the Python version
# of the gold query; the training CSV is the fallback.
gold_row = _rows_for_request(excel, query)
found_in_excel = not gold_row.empty
if not found_in_excel:
    gold_row = _rows_for_request(df, query)
if gold_row.empty:
    raise ValueError(
        f"Request not found in the excel file nor in the training CSV: {query!r}"
    )

# Obtain the gold SQL query and the necessary tables
sql_gold = gold_row["gold_query"].item()
gold_tables = gold_row["table_info"].item()

# The training CSV has no Python gold query. Assign None explicitly so that a
# value left over from an earlier run of this cell cannot be mistaken for the
# gold query of the current request.
python_gold = gold_row["python_format"].item() if found_in_excel else None

# Print all in orderly fashion
print("Gold values\n")
print("Tables needed for the query:")
print(f"{gold_tables}\n")
print("SQL gold query:")
print(sql_gold)
if found_in_excel:
    print("Python gold query:\n")
    print(python_gold)

Gold values

Tables needed for the query:
['probability', 'feature']

SQL gold query:

SELECT
  oid, probability, value, name, fid, version
FROM
  (
SELECT *
FROM (
SELECT
  feature.oid, prob_oids.probability, feature.value, feature.name, feature.fid, feature.version
FROM
  (SELECT * FROM probability
    WHERE
    probability.classifier_name='lc_classifier'
    AND probability.class_name='YSO'
    AND probability.ranking=1
    AND probability.probability > 0.7
    ) as prob_oids
    INNER JOIN
    feature ON feature.oid = prob_oids.oid
WHERE
  feature.name = 'Multiband_period'
  AND feature.value > 3
  AND feature.value < 10
) as sq1
UNION
SELECT *
FROM (
SELECT
  feature.oid, prob_oids.probability, feature.value, feature.name, feature.fid, feature.version
FROM
  (SELECT * FROM probability
    WHERE
    probability.classifier_name='lc_classifier'
    AND probability.class_name='YSO'
    AND probability.ranking=1
    AND probability.probability > 0.7
    ) as prob_oids
    INNER JOIN
  

In [21]:
# Running the gold query
from urllib.parse import quote

from secret.config import SQL_URL
import requests
import sqlalchemy as sa

# Fetch the connection parameters. The timeout and status check make a hung
# or failed request surface here instead of as a JSON/KeyError further down.
response = requests.get(SQL_URL, timeout=30)
response.raise_for_status()
params = response.json()['params']

# Setup the query engine. User and password are percent-encoded because
# characters such as '@', ':' or '/' would otherwise corrupt the URL.
db_user = quote(str(params['user']), safe="")
db_password = quote(str(params['password']), safe="")
engine = sa.create_engine(
    f"postgresql+psycopg2://{db_user}:{db_password}@{params['host']}/{params['dbname']}"
)

# No explicit `engine.begin()`: it only has an effect when used as a context
# manager (`with engine.begin() as conn:`). Given an Engine, pandas opens and
# releases its own connection for the query.
pd.read_sql_query(sql_gold, con=engine)

Unnamed: 0,oid,probability,value,name,fid,version
0,ZTF18abcxlhm,0.922152,0.682077,Amplitude,2,lc_classifier_1.2.1-P
1,ZTF18abcxlhm,0.922152,7.218844,Multiband_period,12,lc_classifier_1.2.1-P
2,ZTF18abcxlhm,0.922152,0.575987,Amplitude,2,lc_classifier_1.2.1-P-transitional
3,ZTF18abcxlhm,0.922152,1.051982,Amplitude,1,lc_classifier_1.2.1-P
4,ZTF18abcxlhm,0.922152,0.833532,Amplitude,1,lc_classifier_1.2.1-P-transitional
...,...,...,...,...,...,...
1135,ZTF17aaavjvc,0.700040,1.321648,Amplitude,1,lc_classifier_1.2.1-P
1136,ZTF17aaavjvc,0.700040,0.955481,Amplitude,1,lc_classifier_1.2.1-P-transitional
1137,ZTF17aaavjvc,0.700040,0.814443,Amplitude,2,23.12.25
1138,ZTF17aaavjvc,0.700040,1.037813,Amplitude,2,lc_classifier_1.2.1-P


### Trying out the pipelines

In [22]:
from pprint import pprint
from pipeline.eval import run_pipeline
# NOTE(review): `engine` is imported but not passed to `run_pipeline` in this
# cell; kept because it rebinds the notebook-level `engine` name.
from testing.tests import engine
# The former bare `engine.begin()` call was dropped: outside a `with` block it
# either does nothing or checks out a connection that is never released.

# Model to use
#model = "claude-3-5-sonnet-20240620"
model = "gpt-4o"

# Format for the pipeline
# NOTE(review): this name shadows the builtin `format` for the rest of the
# notebook; kept unchanged so other cells relying on it still work.
format = "python"

# RAG parameters
max_tokens = 10000
size = 700
overlap = 300
quantity = 10

# Running the pipeline
# NOTE(review): the trailing positional arguments (False, True, True, 2, 3)
# are not self-describing; confirm their meaning against `pipeline.eval`.
# NOTE(review): `error` is returned but never inspected below.
result, error, total_usage, prompts = run_pipeline(query, model, max_tokens, size, 
                                            overlap, quantity, format, False,
                                            True, True, 2, 3)
print("Resulting table:")
print(result)
print("Total usage of the pipeline:")
pprint(total_usage)

# The prompts used will be saved in this file.
# An explicit UTF-8 encoding keeps the write independent of the platform
# default, which can fail on non-ASCII prompt text.
with open(f"prompts/examples/prompts_query_{model}.txt", "w", encoding="utf-8") as f:
    f.write(str(prompts))

Tables needed: [object, magstat, detection]
Difficulty: advanced
Resulting python query: 
# Sub-query to get objects classified as YSOs with a probability greater than 0.7
sub_query_yso_probability = f'''
SELECT DISTINCT
    probability.oid
FROM
    probability
WHERE
    probability.classifier_name = 'lc_classifier'
    AND probability.class_name = 'YSO'
    AND probability.probability > 0.7
    AND probability.ranking = 1
'''

# Sub-query to get objects with a photometric period between 3 and 10 days
sub_query_periodicity = f'''
SELECT
    feature.oid
FROM
    feature
WHERE
    feature.name = 'period'
    AND feature.value BETWEEN 3 AND 10
    AND feature.oid IN ({sub_query_yso_probability})
'''

# Sub-query to get objects with a variation amplitude greater than 0.5 mag
sub_query_amplitude = f'''
SELECT
    feature.oid
FROM
    feature
WHERE
    feature.name = 'amplitude'
    AND feature.value > 0.5
    AND feature.oid IN ({sub_query_yso_probability})
'''

# Combine periodicity and am

In [None]:
from pprint import pprint
from pipeline.main import run_pipeline, engine
# The former bare `engine.begin()` call was dropped: outside a `with` block it
# either does nothing or checks out a connection that is never released.
# The engine itself is handed to `run_pipeline` below.

# Model to use
#model = "claude-3-5-sonnet-20240620"
model = "o1-preview"

# Format for the pipeline
# NOTE(review): this name shadows the builtin `format` for the rest of the
# notebook; kept unchanged so other cells relying on it still work.
format = "python"

# RAG parameters
max_tokens = 10000
size = 700
overlap = 300
quantity = 10

# Running the pipeline
# NOTE(review): the positional `False` is not self-describing; confirm its
# meaning against `pipeline.main.run_pipeline`.
result, total_usage, prompts = run_pipeline(query, model, max_tokens, size, 
                                            overlap, quantity, format, False, 
                                            engine, rag_pipe=True, 
                                            self_corr=True)
print("Resulting table:")
print(result)
print("Total usage of the pipeline:")
pprint(total_usage)

# The prompts used will be saved in this file.
# An explicit UTF-8 encoding keeps the write independent of the platform
# default, which can fail on non-ASCII prompt text.
with open(f"prompts/examples/prompts_query_{model}.txt", "w", encoding="utf-8") as f:
    f.write(str(prompts))

Tables needed: [object, detection, magstat]
Difficulty: advanced
```python
# Get oids of objects classified as 'SN II' with probability > 0.6
sub_query_probability = f'''
SELECT oid
FROM probability
WHERE classifier_name='lc_classifier'
AND class_name='SN II'
AND ranking=1
AND probability > 0.6
'''

# Get oids of objects with first observation between 2019-01-01 and 2022-12-31
# MJD range from 58484 (2019-01-01) to 59580 (2022-01-01)
sub_query_object = f'''
SELECT oid
FROM object
WHERE firstmjd BETWEEN 58484 AND 59946
'''

# Combine previous subqueries to get oids satisfying both class and date criteria
sub_query_oid1 = f'''
SELECT prob_oids.oid
FROM ({sub_query_probability}) AS prob_oids
INNER JOIN ({sub_query_object}) AS obj_oids
ON prob_oids.oid = obj_oids.oid
'''

# Get oids with more than 30 detections
sub_query_ndet = f'''
SELECT oid
FROM detection
GROUP BY oid
HAVING COUNT(*) > 30
'''

# Combine to get oids satisfying class, date, and number of detections criteria
sub_query_oid2

Exception: Failed again: local variable 'results' referenced before assignment