In [1]:
import pandas as pd
import os, json, sqlite3, re

In [2]:
# Schema descriptions: a list of entries, each carrying a 'table' key.
with open("../src/spider/mockup_schema_description.json") as schema_file:
    spider_description = json.load(schema_file)
# Names of every table that has a description entry.
exists_tables = [entry['table'] for entry in spider_description]

# Lookup used by db_of_create_table: table name -> Spider database id.
with open("../src/spider/table_database_map.json") as mapping_file:
    table_map_db = json.load(mapping_file)

In [3]:
# Load the Spider training split of NSText2SQL. The preview below shows three
# columns: Question, Table (CREATE TABLE schema text) and SQL.
spider_df = pd.read_csv('../src/NSText2SQL/train_spider.csv')
spider_df.head()

Unnamed: 0,Question,Table,SQL
0,"What are the first names, office locations of ...","CREATE TABLE course (\n crs_code text,\n ...","SELECT T2.emp_fname, T4.prof_office, T3.crs_de..."
1,Please show the songs that have result 'nomina...,"CREATE TABLE artist (\n artist_id number,\n...",SELECT T2.song FROM music_festival AS T1 JOIN ...
2,Which teams had more than 3 eliminations?,CREATE TABLE elimination (\n elimination_id...,SELECT team FROM elimination GROUP BY team HAV...
3,"Show the names of people, and dates and venues...","CREATE TABLE people (\n people_id number,\n...","SELECT T3.name, T2.date, T2.venue FROM debate_..."
4,Tell me the the date when the first claim was ...,CREATE TABLE settlements (\n settlement_id ...,SELECT date_claim_made FROM claims ORDER BY da...


In [4]:
def extract_sql_query(text):
    """Pull the SQL statement out of a free-form model response.

    The patterns are tried in order and the first one that matches wins:
    a ```sql fenced block, any ``` fenced block, a ``SELECT ...;`` statement
    terminated by a semicolon, and finally everything from the first
    ``SELECT`` to the end of the text.

    Args:
        text: Raw response text. Non-string values (e.g. ``None`` or the NaN
            pandas uses for an empty spreadsheet cell) are returned unchanged
            instead of raising ``TypeError`` inside ``re.search``.

    Returns:
        The captured SQL with surrounding whitespace stripped, or ``text``
        itself when no pattern matches.
    """
    # Empty cells arrive as float NaN / None; there is nothing to extract.
    if not isinstance(text, str):
        return text

    sql_patterns = [r'```sql(.*?)```', r'```(.*?)```', r'(SELECT.*?;)', r'(SELECT.*)']

    for pattern in sql_patterns:
        # DOTALL lets a query span several lines inside the response.
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()
    return text

In [5]:
# Sanity check: the leading "query: '" wrapper is dropped and only the
# semicolon-terminated SELECT statement is returned.
extract_sql_query("query: 'SELECT COUNT(*) AS Number_of_cinemas FROM cinema;'")

'SELECT COUNT(*) AS Number_of_cinemas FROM cinema;'

In [6]:
def extract_table_and_columns(sql_statements):
    """Map each table in a block of CREATE TABLE statements to its column names.

    Args:
        sql_statements: Text containing one or more ``CREATE TABLE name (...)``
            statements. Statements may span several lines.

    Returns:
        dict mapping table name -> list of column names, where a column name
        is the first whitespace-separated token of each comma-separated entry.

    Note:
        The body is matched lazily up to the first ``)``, so definitions that
        contain nested parentheses (e.g. ``varchar(20)``) are cut short at
        that point.
    """
    # Table name in group 1, raw column list in group 2.
    pattern = r'CREATE\s+TABLE\s+(\w+)\s*\((.+?)\);?'

    table_column_pairs = {}
    # DOTALL is required: the column list is normally written one column per
    # line, and without it "." cannot cross the newlines, so nothing matches.
    for match in re.finditer(pattern, sql_statements, re.IGNORECASE | re.DOTALL):
        table_name = match.group(1)
        columns = []
        for definition in match.group(2).split(','):
            tokens = definition.split()
            # A trailing comma leaves an empty fragment; skip it rather than
            # failing with IndexError.
            if tokens:
                columns.append(tokens[0])
        table_column_pairs[table_name] = columns

    return table_column_pairs

def db_of_create_table(query):
    """Return the database id that owns the first table created in ``query``.

    The table name is read directly after ``CREATE TABLE`` (optionally after
    ``IF NOT EXISTS`` and an opening quote character) instead of being guessed
    from the position of the token on the line, so it no longer depends on the
    header line ending in exactly ``name (``.

    Args:
        query: Schema text containing at least one CREATE TABLE statement.

    Returns:
        The value stored in the module-level ``table_map_db`` mapping for the
        first table name found, or ``None`` when ``query`` contains no
        CREATE TABLE statement.

    Raises:
        KeyError: if the table name is not a key of ``table_map_db``.
    """
    match = re.search(
        r'CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?["`\[]?(\w+)',
        query,
        re.IGNORECASE,
    )
    if match is None:
        return None
    return table_map_db[match.group(1)]

def query_db(sql_query, db_name):
    """Run ``sql_query`` against a Spider SQLite database and return all rows.

    Errors are reported through sentinel strings rather than exceptions so the
    function can be applied row by row over a DataFrame without aborting.

    Args:
        sql_query: SQL text to execute.
        db_name: Database id; the file opened is
            ``../src/spider/database/<db_name>/<db_name>.sqlite``.

    Returns:
        The list of result tuples from ``fetchall()``, or the string
        ``"CANNOT CONNECT DATABASE"`` when the connection cannot be opened, or
        ``"CANNOT FETCHING DATA"`` when executing/fetching fails.
    """
    conn = None
    try:
        conn = sqlite3.connect(f'../src/spider/database/{db_name}/{db_name}.sqlite')
        cursor = conn.cursor()
    except Exception:
        # Exception (not a bare except) so Ctrl-C still interrupts a long run.
        if conn is not None:
            conn.close()
        return "CANNOT CONNECT DATABASE"

    try:
        cursor.execute(sql_query)
        return cursor.fetchall()
    except Exception:
        # Covers invalid SQL as well as non-string input such as NaN.
        return "CANNOT FETCHING DATA"
    finally:
        # Always release the handle; previously a failing query leaked it.
        conn.close()


In [7]:
def complexity(sql_query):
    """Score a SQL query from 1 (simplest) to 4 (hardest) by its keywords.

    A query containing two or more ``SELECT`` keywords (sub-query or set
    operation) is level 4. Otherwise the levels are checked from 4 down to 1
    and the first level owning a keyword present in the query is returned.

    Keywords are matched as whole words on whitespace-normalised text, so
    identifiers such as ``date_claim_made`` or ``case_id`` do not count as
    ``DATE`` / ``CASE``, and a keyword pair split across lines
    (``GROUP\\nBY``) is still recognised.

    Args:
        sql_query: SQL text to classify.

    Returns:
        int in ``{1, 2, 3, 4}``; 1 is also the default when nothing matches.
    """
    # Highest level first so the first hit is the final answer.
    level = [
        (4, ["EXCEPT", "CASE"]),
        (3, ["HAVING", "DATE", "JOIN"]),
        (2, ["GROUP BY", "ORDER BY", "LIMIT"]),
        (1, ["WHERE", "SUM", "AVG", "MAX", "SELECT"]),
    ]
    # Collapse newlines/tabs/repeated blanks into single spaces; deleting the
    # newlines outright would glue neighbouring tokens together.
    normalized = " ".join(sql_query.lower().split())

    def has_keyword(keyword):
        pattern = r"\b" + re.escape(keyword.lower()) + r"\b"
        return re.search(pattern, normalized) is not None

    # Checking for nested queries
    if len(re.findall(r"\bselect\b", normalized)) >= 2:
        return 4

    for key, keywords in level:
        if any(has_keyword(keyword) for keyword in keywords):
            return key

    # If none of the keywords match, default complexity level is 1
    return 1

In [8]:
# Model outputs gathered so far, keyed by the natural-language question.
spider_expdf = pd.read_excel("results/temp_spider_experiments.xlsx")

# Pair each evaluated question with its schema and reference SQL from the
# Spider training data, then derive the database id and complexity level.
_df = (
    spider_expdf[['Question']]
    .merge(spider_df, how='inner', on='Question')
    .rename(columns={'SQL': 'Actual SQL'})
    .assign(
        db_id=lambda frame: frame['Table'].apply(db_of_create_table),
        Complexity=lambda frame: frame['Actual SQL'].apply(complexity),
    )
)
# _df.to_excel('spider_qpair.xlsx', index=False)

# Bring the model outputs back alongside the reference columns.
spider_expdf = _df.merge(spider_expdf, how='inner', on='Question')

# Columns holding SQL text that should be cleaned and executed.
sql_columns = ['Actual SQL', 'Desc DeepSeek', 'Desc Gemini', 'ChatQ NSQL', 'ChatQ DeepSeek', 'ChatQ Gemini']
for col in sql_columns:
    cleaned_sql = spider_expdf[col].apply(extract_sql_query)
    spider_expdf[col] = cleaned_sql
    # Execute every cleaned query against the database of its own row.
    spider_expdf[f"{col} result"] = spider_expdf.apply(
        lambda record: query_db(record[col], record['db_id']),
        axis=1,
    )

spider_expdf.head()

Unnamed: 0,Question,Table,Actual SQL,db_id,Complexity,Desc DeepSeek,Desc GPT3.5,Desc GPT4,Desc Gemini,ChatQ NSQL,ChatQ DeepSeek,ChatQ GPT3.5,ChatQ GPT4,ChatQ Gemini,Actual SQL result,Desc DeepSeek result,Desc Gemini result,ChatQ NSQL result,ChatQ DeepSeek result,ChatQ Gemini result
0,Find the number of members living in each addr...,"CREATE TABLE shop (\n shop_id number,\n ...","SELECT COUNT(*), address FROM member GROUP BY ...",coffee_shop,2,"SELECT Address, COUNT(*) FROM member GROUP BY ...",API ERROR,API ERROR,"SELECT Address, COUNT(*) AS Number_of_Members ...","SELECT COUNT(*), address FROM member GROUP BY ...","SELECT COUNT(Member_ID), Address FROM member G...",API ERROR,API ERROR,"SELECT COUNT(*), Address FROM member GROUP BY ...","[(1, Bridgeport), (2, Cheshire), (3, Hartford)...","[(Bridgeport, 1), (Cheshire, 2), (Hartford, 3)...","[(Bridgeport, 1), (Cheshire, 2), (Hartford, 3)...","[(1, Bridgeport), (2, Cheshire), (3, Hartford)...","[(1, Bridgeport), (2, Cheshire), (3, Hartford)...","[(1, Bridgeport), (2, Cheshire), (3, Hartford)..."
1,Count the number of cinemas.,"CREATE TABLE cinema (\n cinema_id number,\n...",SELECT COUNT(*) FROM cinema,cinema,1,SELECT COUNT(*) FROM cinema;,API ERROR,API ERROR,SELECT COUNT(*) AS Number_of_cinemas FROM cinema;,SELECT COUNT(*) FROM cinema;,SELECT COUNT(*) FROM cinema;,API ERROR,API ERROR,SELECT COUNT(*) FROM cinema;,"[(10,)]","[(10,)]","[(10,)]","[(10,)]","[(10,)]","[(10,)]"
2,What is the location with the most cinemas ope...,"CREATE TABLE cinema (\n cinema_id number,\n...",SELECT location FROM cinema WHERE openning_yea...,cinema,2,"SELECT Location, COUNT(*) as Number_of_Cinemas...",API ERROR,API ERROR,"SELECT Location, COUNT(*) AS Number_of_cinemas...",SELECT location FROM cinema WHERE openning_yea...,SELECT Location FROM cinema WHERE Openning_yea...,API ERROR,API ERROR,SELECT Location FROM cinema WHERE Openning_yea...,"[(County Tipperary,)]","[(County Tipperary, 2)]","[(County Tipperary, 2)]","[(County Tipperary,)]","[(County Tipperary,)]","[(County Tipperary,)]"
3,List the name of musicals that do not have act...,"CREATE TABLE musical (\n musical_id number,...",SELECT name FROM musical WHERE NOT musical_id ...,musical,4,SELECT Name FROM musical WHERE Musical_ID NOT ...,API ERROR,API ERROR,SELECT Name\nFROM Musical\nEXCEPT\nSELECT Name...,SELECT name FROM musical WHERE NOT musical_id ...,"I'm sorry, but as an AI model developed by Dee...",API ERROR,API ERROR,SELECT Name FROM musical WHERE Musical_ID NOT ...,"[(Wicked,), (Rent,), (Chicago,)]","[(Wicked,), (Rent,), (Chicago,)]","[(Chicago,), (Rent,), (Wicked,)]","[(Wicked,), (Rent,), (Chicago,)]",CANNOT FETCHING DATA,"[(Wicked,), (Rent,), (Chicago,)]"
4,What is the official name and status of the ci...,CREATE TABLE farm_competition (\n competiti...,"SELECT official_name, status FROM city ORDER B...",farm,2,"SELECT Official_Name, Status FROM city ORDER B...",API ERROR,API ERROR,"SELECT Official_Name, Status\nFROM city\nORDER...","SELECT official_name, status FROM city ORDER B...","SELECT Official_Name, Status FROM city ORDER B...",API ERROR,API ERROR,"SELECT Official_Name, Status FROM city ORDER B...","[(Grand Falls/Grand-Sault, Town)]","[(Grand Falls/Grand-Sault, Town)]","[(Grand Falls/Grand-Sault, Town)]","[(Grand Falls/Grand-Sault, Town)]","[(Grand Falls/Grand-Sault, Town)]","[(Grand Falls/Grand-Sault, Town)]"


In [9]:
# Save the evaluation frame (cleaned SQL plus the "<column> result" columns)
# to Excel; index=False keeps the row index out of the sheet.
spider_expdf.to_excel("results/final_spider_eval.xlsx", index=False)

In [15]:
# spider_eval_df = pd.read_excel('results/spider_measurement_experiments.xlsx')
# spider_eval_df = pd.merge(spider_expdf, spider_eval_df, how='inner', on='Question')
# spider_eval_df.to_excel('results/spider_measurement_experiments.xlsx', index=False)