# Evaluation notebook

## Setting up the training and testing dataframes

In [30]:
import pandas as pd

# Load the training and testing splits from the CSV files under datasets/.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "datasets/txt2sql_alerce_train_v3_1.csv",
        "datasets/txt2sql_alerce_test_v3_1.csv",
    )
)

In [31]:
# Overview of the training dataframe: print its (rows, columns) shape, then
# end the cell with .head() so the notebook renders the first rows as a table.
print(f"Shape of training set: {df_train.shape}")
df_train.head()

Shape of training set: (58, 12)


Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,rephrased_request,rephrased_request_gpt-3.5-turbo-0125_t0.4,rephrased_request_gpt-4o-2024-05-13_t0.2
0,13,Give me all the SNe that were first detected b...,"['object', 'probability']",\r\n-- mjd date for December = 59914.0\r\n-- m...,\r\n-- Super Nova (SNe) is a large explosion t...,"\r\nSELECT\r\n object.oid, probability.clas...",simple,object,none,,,
1,10,Get the object identifiers and probabilities i...,"['probability', 'object']",0,0,"\r\nSELECT\r\n sq1.oid, sq1.probability as SN...",medium,object,simple,,,
2,15,"Get the object identifiers, probabilities in t...","['object', 'probability', 'detection', 'magstat']",\r\n-- mjd date for September 01 = 60188.0\r\n...,\r\n-- A fast riser is defined as an object wh...,"\r\nSELECT\r\n sq.oid, sq.probability, sq.c...",advanced,other,multi,,,
3,4,"Get the object identifier, candidate identifie...","['object', 'probability', 'magstat', 'detection']",\r\n-- mjd date for the start of the year 2019...,0,"\r\nSELECT\r\n sq.oid, sq.fid, sq.dmdt_first,...",advanced,other,multi,,,
4,25,Query objects within 10 degress of the next po...,"['probability', 'object']",\r\n-- mjd date for February 01 = 59976.0\r\n-...,0,"\r\nWITH catalog ( source_id, ra, dec) AS (\r\...",advanced,spatial,simple,,,


In [32]:
# Overview of the testing dataframe: print its (rows, columns) shape, then
# end the cell with .head() so the notebook renders the first rows as a table.
print(f"Shape of testing set: {df_test.shape}")
df_test.head()

Shape of testing set: (52, 12)


Unnamed: 0,req_id,request,table_info,external_knowledge,domain_knowledge,gold_query,difficulty,type,nested_type,rephrased_request,rephrased_request_gpt-3.5-turbo-0125_t0.4,rephrased_request_gpt-4o-2024-05-13_t0.2
0,8,Query all objects that were first classified a...,"['probability', 'object']",\r\n-- mjd date for august 17 = 60173.0\r\n-- ...,,\r\nSELECT\r\n *\r\nFROM\r\n object\r\nI...,simple,object,none,,,
1,27,"Return the oids, meanra, meandec, ndet, firstm...","['probability', 'object']",\r\n-- object.ndet represents the total number...,,"\r\nSELECT\r\n object.oid, object.meanra, o...",simple,object,simple,,,
2,17,Return the objects classified as AGN that show...,"['probability', 'object', 'detection']",\r\n-- firstmjd represents the time of the fir...,,\r\nSELECT\r\n *\r\nFROM\r\n (\r\nSELECT *\r...,advanced,object,tree,,,
3,37,"""Query the top 200 objects classified SNIa acc...",['probability'],,,\r\nSELECT\r\n *\r\nFROM\r\n probability\r...,simple,object,none,,,
4,34,Given this list of oids ['ZTF17aaadpsi' 'ZTF19...,['feature'],\r\n-- feature.name can be 'Amplitude'\r\n-- f...,\r\n-- Multiband_period: Period obtained using...,\r\nSELECT\r\n *\r\nFROM\r\n feature\r\n...,simple,other,none,,,


In [37]:
# Ten advanced queries
# Select the test rows whose difficulty label is "advanced" and show their
# request texts as a NumPy array (the row count is not checked in this cell).
advanced_mask = df_test["difficulty"].eq("advanced")
adv_10 = df_test[advanced_mask]
adv_10.loc[:, "request"].to_numpy()

array(['Return the objects classified as AGN that show variability in the r band, but have no detections or show no variability in the g band. Order the features by their value in ascending order, and the final query by oid.',
       "Get all the objects classified as AGN with a probability larger than 0.6, with light curves covering at least one year that have at least 20 detections at least in one band, and select those that show variations larger than 1 magnitude in the g band and larger than 0.5 in the r band. The magnitudes need to be the corrected ones. Return their oid, the value, name, version and filter id from the features, mean coordinates, number of detections, maximum and minimum magnitudes, the variations in bands as the difference between the maximum and minimum magnitudes aliasing the column as 'delta_mag'. For the filter id columns, use the alias '[table_name]_fid' using the table name as the alias.",
       " Get the object identifiers, probabilities in the stamp clas

## Running experiments

In [4]:
# Experiment and save paths
experiment_path = "experiments"

# Running parameters (forwarded positionally to the pipeline.eval helpers in
# the cells below; their exact meaning is defined there)
ncpus, db_min, n_tries, exps = 3, 10, 3, 3

# Model to use
model = "gpt-4o"

# RAG parameters
max_tokens = 10_000
size, overlap, quantity = 700, 300, 10

# Type of pipeline
# NOTE(review): `format` shadows the Python builtin of the same name for the
# rest of the kernel session; the name is kept because later cells pass it on.
format = "python"
self_corr, rag_pipe, direct = True, True, False

In [1]:
# Running the evaluation in training
from pipeline.eval import new_run_eval_fcn

# Only the first five training rows are evaluated. Every other argument is a
# notebook-level setting passed positionally, so the order below has to match
# the signature of new_run_eval_fcn.
train_head = df_train[0:5]
new_run_eval_fcn(
    train_head,
    experiment_path,
    model,
    max_tokens,
    format,
    ncpus,
    db_min,
    n_tries,
    self_corr,
    rag_pipe,
    direct,
    size,
    overlap,
    quantity,
    exps,
)

KeyboardInterrupt: 

In [5]:
# Running compare oids
from pipeline.eval import new_compare_oids

# Settings forwarded positionally after the dataframe slice and the literal 1.
# NOTE(review): the meaning of that `1` is not visible in this notebook --
# confirm against the signature of new_compare_oids.
compare_settings = (
    model,
    max_tokens,
    format,
    experiment_path,
    db_min,
    n_tries,
    self_corr,
    rag_pipe,
    direct,
    size,
    overlap,
    quantity,
)
new_compare_oids(df_train[:5], 1, *compare_settings)

  from .autonotebook import tqdm as notebook_tqdm


Tables needed: [object, probability]
Difficulty: simple
Raw response: ```sql
SELECT 
    p.class AS probability_class, 
    o.firstmjd AS first_detection_date, 
    o.lastmjd AS last_detection_date, 
    o.oid
FROM 
    object o
WHERE 
    o.firstmjd BETWEEN 59922 AND 60195
    AND o.oid IN (
        SELECT DISTINCT oid 
        FROM probability 
        WHERE classifier_name = 'lc_classifier_transient' 
        AND ranking = 1 
        AND class IN ('SNIa', 'SNIbc', 'SNII', 'SLSN')
    )
```
Resulting sql query: 
SELECT p.class AS probability_class,
       o.firstmjd AS first_detection_date,
       o.lastmjd AS last_detection_date,
       o.oid
FROM OBJECT o
WHERE o.firstmjd BETWEEN 59922 AND 60195
  AND o.oid IN
    (SELECT DISTINCT oid
     FROM probability
     WHERE classifier_name = 'lc_classifier_transient'
       AND ranking = 1
       AND CLASS IN ('SNIa',
                     'SNIbc',
                     'SNII',
                     'SLSN'))

SELECT p.class AS probability_cl

IndexError: list index out of range

## Various tests

In [28]:
import numpy as np
import re
data = pd.read_csv("experiments/python_RAG_False_1.csv")

# Header of a generated result: anchored at the start of the text, captured
# from `oid` through the first `magstat_fid` (a column name hard-coded for
# the query stored in this experiment file).
_GEN_HEADER_PATTERN = r'^\s*(oid.*?magstat_fid)'
# Header of a gold result: anchored at the start of the text, captured from
# `oid` up to the first literal backslash.
# NOTE(review): presumably the backslash is the marker pandas prints when it
# wraps a wide frame -- confirm against how the result text is produced.
_GOLD_HEADER_PATTERN = r'^\s*(oid.*?)\\'
# A data row: a run of digits followed by whitespace and a ZTF identifier.
_OID_ROW_PATTERN = r'^\s*\d+\s+(ZTF\d+[a-z]+)'


def _parse_result_text(result_text, header_pattern):
    """Extract the column names and the oid values from one result text.

    Parameters
    ----------
    result_text : str or NaN
        Text stored in a result cell of the experiment CSV.
    header_pattern : str
        Regular expression whose first group captures the header text.

    Returns
    -------
    tuple
        ``(columns, oids)``. Both are ``np.nan`` when ``result_text`` is
        missing. ``columns`` alone is ``np.nan`` when the header pattern does
        not match, so one unexpected result no longer aborts the whole cell
        with an ``AttributeError``. Otherwise ``columns`` is the header split
        on runs of two or more whitespace characters and ``oids`` is the list
        of identifiers found by ``_OID_ROW_PATTERN``.
    """
    if pd.isna(result_text):
        return np.nan, np.nan

    header_match = re.search(header_pattern, result_text, re.DOTALL)
    if header_match is None:
        columns = np.nan
    else:
        columns = re.split(r'\s{2,}', header_match.group(1).strip())

    oids = re.findall(_OID_ROW_PATTERN, result_text, re.MULTILINE)
    return columns, oids


def _parse_result_column(result_texts, header_pattern):
    """Apply ``_parse_result_text`` to every entry and return two parallel lists."""
    columns_per_row = []
    oids_per_row = []
    for result_text in result_texts:
        columns, oids = _parse_result_text(result_text, header_pattern)
        columns_per_row.append(columns)
        oids_per_row.append(oids)
    return columns_per_row, oids_per_row


# Generated and gold results go through the same parser; only the header
# pattern differs between them.
gen_columns, gen_rows = _parse_result_column(data["results"], _GEN_HEADER_PATTERN)
gold_columns, gold_rows = _parse_result_column(data["resultGold"], _GOLD_HEADER_PATTERN)

data["resultsCols"] = gen_columns
data["resultsRows"] = gen_rows
data["resultGoldCols"] = gold_columns
data["resultGoldRows"] = gold_rows
data

Unnamed: 0,exp,request,model,format,rag,genQuery,goldQuery,results,resultGold,resultsCols,resultsRows,resultGoldCols,resultGoldRows
0,11,"Get the object identifiers, probabilities in t...",gpt-4o,python,False,# Sub-query to get object identifiers with the...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",,oid probability ca...,,,"[oid, probability, candid, fid, mjd]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF..."
1,12,"Get the object identifiers, probabilities in t...",gpt-4o,python,False,# Sub-query to get object identifiers with the...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",,oid probability ca...,,,"[oid, probability, candid, fid, mjd]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF..."
2,13,"Get the object identifiers, probabilities in t...",gpt-4o,python,False,# Sub-query to get object identifiers with the...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",oid candid fid ...,oid probability ca...,"[oid, candid, fid, mjd, magstat_fid]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF...","[oid, probability, candid, fid, mjd]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF..."
3,14,"Get the object identifiers, probabilities in t...",gpt-4o,python,False,# Sub-query to get object identifiers with the...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",,oid probability ca...,,,"[oid, probability, candid, fid, mjd]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF..."
4,15,"Get the object identifiers, probabilities in t...",gpt-4o,python,False,# Sub-query to get object identifiers with the...,"\nSELECT\n sq.oid, sq.probability, sq.candi...",oid candid fid ...,oid probability ca...,"[oid, candid, fid, mjd, magstat_fid]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF...","[oid, probability, candid, fid, mjd]","[ZTF18aaiaclp, ZTF18aakjgzj, ZTF18aaleydm, ZTF..."


In [29]:
# Save the annotated frame without its positional index: `data` already holds
# an "Unnamed: 0" column left over from an earlier index-including save, and
# writing the index again would add one more unnamed column on the next read.
data.to_csv("python_RAG_False_corrected.csv", index=False)

In [8]:
# Manual check of a generated query: the SQL below has the same text as the
# "Raw response" printed by the compare-oids cell (plus a trailing semicolon)
# and is sent to the database as-is.
# The select list references the alias `p`, but the outer FROM only declares
# `object o`; the recorded output of this cell is the resulting
# 'missing FROM-clause entry for table "p"' error.
# NOTE(review): the object returned by create_conn() is created inline and is
# not closed in this cell -- confirm whether it needs to be disposed.
from pipeline.eval import create_conn

query = """
SELECT 
    p.class AS probability_class, 
    o.firstmjd AS first_detection_date, 
    o.lastmjd AS last_detection_date, 
    o.oid
FROM 
    object o
WHERE 
    o.firstmjd BETWEEN 59922 AND 60195
    AND o.oid IN (
        SELECT DISTINCT oid 
        FROM probability 
        WHERE classifier_name = 'lc_classifier_transient' 
        AND ranking = 1 
        AND class IN ('SNIa', 'SNIbc', 'SNII', 'SLSN')
    );
"""

pd.read_sql_query(query, con=create_conn())

ProgrammingError: (psycopg2.errors.UndefinedTable) missing FROM-clause entry for table "p"
LINE 3:     p.class AS probability_class, 
            ^

[SQL: 
SELECT 
    p.class AS probability_class, 
    o.firstmjd AS first_detection_date, 
    o.lastmjd AS last_detection_date, 
    o.oid
FROM 
    object o
WHERE 
    o.firstmjd BETWEEN 59922 AND 60195
    AND o.oid IN (
        SELECT DISTINCT oid 
        FROM probability 
        WHERE classifier_name = 'lc_classifier_transient' 
        AND ranking = 1 
        AND class IN ('SNIa', 'SNIbc', 'SNII', 'SLSN')
    );
]
(Background on this error at: https://sqlalche.me/e/20/f405)