In [1]:
import sqlite3, os, json, sqlparse, re, string
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from nltk.stem import WordNetLemmatizer
from sql_metadata import Parser

## Get the SQLite Database

In [2]:
folder_path = "../src/spider/database"
select_db = ['musical',
             'farm', 
             'hospital_1', 
             'tvshow', 
             'cinema', 
             'restaurants', 
             'company_employee', 
             'company_offic', 
             'singer', 
             'coffee_shop']

db = []

if os.path.exists(folder_path) and os.path.isdir(folder_path):
    files = os.listdir(folder_path)
    for file in files:
        # if file in select_db:
        db_path = os.path.join(folder_path, file)
        sqlite_db = [os.path.join(db_path, sql) for sql in os.listdir(db_path) if ".sqlite" in sql]
        db.append(*sqlite_db)

db[:5]

['../src/spider/database/browser_web/browser_web.sqlite',
 '../src/spider/database/musical/musical.sqlite',
 '../src/spider/database/farm/farm.sqlite',
 '../src/spider/database/voter_1/voter_1.sqlite',
 '../src/spider/database/game_injury/game_injury.sqlite']

In [3]:
def get_schema(sqlite_db):
    connection = sqlite3.connect(sqlite_db)
    cursor = connection.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    table_names = []
    for table in tables:
        table_name = table[0]
        table_names.append(table_name)
        print(f"Table: {table_name}")
        cursor.execute(f"PRAGMA table_info({table_name});")
        columns = cursor.fetchall()

        for column in columns:
            column_name = column[1]
            print(f"  Column: {column_name}")

        print()
    
    cursor.close()
    connection.close()
    return table_names

In [5]:
# db_map_tables = dict({})
for database_path in db[:2]:
    for db_name in select_db:
        if database_path.split('/')[-2] == db_name:
            # if table in exists_table : continue
            get_schema(database_path)
            # exists_table.append(table)
            print('---------------------------------')

Table: musical
  Column: Musical_ID
  Column: Name
  Column: Year
  Column: Award
  Column: Category
  Column: Nominee
  Column: Result

Table: actor
  Column: Actor_ID
  Column: Name
  Column: Musical_ID
  Column: Character
  Column: Duration
  Column: age

---------------------------------


## Embedding description of tables and columns

In [6]:
src_folder = "../src"
schema_description_file = "mockup_schema_description.json"
with open(os.path.join(src_folder, schema_description_file)) as f:
    dbs = json.load(f)
model = SentenceTransformer('../models/all-MiniLM-L6-v2')
lemmanizer = WordNetLemmatizer()

In [7]:
for i in range(len(dbs)):
    dbs[i]['table'] = dbs[i]['table'].lower()
    dbs[i]['columns'] = {key.lower(): value for key, value in dbs[i]['columns'].items()}

In [8]:
dbs[:2]

[{'table': 'musical',
  'description': 'This table contains information about musicals.',
  'columns': {'musical_id': 'Unique identifier for the musical',
   'name': 'Name of the musical',
   'year': 'Year the musical was produced',
   'award': 'Award received by the musical',
   'category': 'Category of the award',
   'nominee': 'Name of the nominee associated with the musical',
   'result': 'Result of the award nomination for the musical'}},
 {'table': 'actor',
  'description': 'This table contains information about actors in musicals.',
  'columns': {'actor_id': 'Unique identifier for the actor',
   'name': 'Name of the actor',
   'musical_id': 'Identifier of the musical the actor is associated with',
   'character': 'Character played by the actor in the musical',
   'duration': "Duration of the actor's involvement in the musical",
   'age': 'Age of the actor'}}]

In [9]:
schema_vector = []

for db in dbs:
    schema_emb = {}
    table_name = db['table']
    table_description = db['description']
    schema_emb[table_name] = model.encode(table_description).tolist()
    columns = list(db['columns'].keys())
    for col in columns:
        column_description = db['columns'][col]
        schema_emb[col] = model.encode(column_description).tolist()
    schema_vector.append(schema_emb)

# schema_vector_file = "mockup_schema_description_vector.json"
# with open(os.path.join(src_folder, schema_vector_file), "w") as f:
#     json.dump(description_emb,f)

In [9]:
# schema_vector_file = "mockup_schema_description_vector.json"
# with open(os.path.join(src_folder, schema_vector_file)) as f:
#     schema_vector = json.load(f)

## Filtering Columns by Question

Question --> Similarity description-base --> String matching (score weight up to match condition) --> Column

### String Matching

In [10]:
# string mathing , max_score = weight of string match, token_id for foud exact match col-token in question
exact_match_id = []
def column_from_question(question,used_table_col = {}, default_score=1, token_id=None):
    # question_tokens = [token.lower() for token in tokenizer.tokenize(question)]
    question_tokens = [lemmanizer.lemmatize(token.lower()).strip(string.punctuation) for token in question.split()]
    # print(question_tokens)
    for table in schema_vector:
        max_score = default_score
        for token in question_tokens:
            cols = [ key.lower() for key in table.keys()]
            table_name = cols.pop(0)
            if token == table_name: 
                max_score = 1.0
                # plus the score of columns in exact match with table name
                if used_table_col.get(token) is not None: 
                    for key in used_table_col[token]: 
                        used_table_col[token].update({key: max(used_table_col[token][key] + 0.1,1)})
            # exact match table and column
            if token in cols:
                # print("EXACT MATCH:",token)
                exact_match_id.append(token_id)
                used_table_col.setdefault(table_name, {}).update({token : max_score})

    return used_table_col

### Similarity description-base score

In [11]:
# filter_table : filter by table before 
def filter_tables_by_description(question, column_threshold = 0.4, table_threshold = 0.2, filter_tables = True):
    question_emb = model.encode(question)
    used_schema = {}
    for i in range(len(schema_vector)):
        table_name = list(schema_vector[i].keys())[0]

        table_description_vector = schema_vector[i][table_name]
        if filter_tables and util.cos_sim(table_description_vector, question_emb) < table_threshold: continue
        
        used_col = {}
        for col, vec in schema_vector[i].items():
            if col == table_name: continue
            score = round(float(util.cos_sim(vec, question_emb)),2)
            if score > column_threshold:
                # column_description = [dbs[i]['columns'][col] for i in range(len(dbs)) if dbs[i]['table'] == table_name][0]
                # print(f"{table_name} - {col} : {score}\nDescription : {column_description}\n")
                used_col.update({col: score})
        if len(used_col) > 0: used_schema[table_name] = used_col
    return used_schema

### Question --> Similarity description-base --> String matching (score weight up to match condition) --> Column

In [12]:
question = "Find the number of members living in each address"

In [13]:
selected_table_column = filter_tables_by_description(question, column_threshold = 0.4, filter_tables = False)
selected_table_column = column_from_question(question, used_table_col=selected_table_column)
selected_table_column

{'room': {'roomnumber': 0.48},
 'member': {'address': 1.0},
 'patient': {'address': 1},
 'shop': {'address': 1}}

## Filter Spider train dataset for expirement testing

In [14]:
# spider_sql = []
# df_data = {
#     'Question' : [],
#     'Table' : [],
#     'SQL' : []
# }

# with open("src/NSText2SQL/train.jsonl") as f:
#     for line in f:
#         data = json.loads(line)
#         if data['source'] == 'spider': 
#             spider_sql.append(data)
#             df_data['Question'].append(data['instruction'].split('--')[-1].strip())
#             df_data['Table'].append(data['instruction'].split('--')[0].strip())
#             df_data['SQL'].append(data['output'])

# df = pd.DataFrame(df_data)
# df.to_csv('src/NSText2SQL/train_spider.csv', index=False)
# df.head()

In [15]:
df = pd.read_csv('../src/NSText2SQL/train_spider.csv')
print(df.shape)
df.head()

(6994, 3)


Unnamed: 0,Question,Table,SQL
0,"What are the first names, office locations of ...","CREATE TABLE course (\n crs_code text,\n ...","SELECT T2.emp_fname, T4.prof_office, T3.crs_de..."
1,Please show the songs that have result 'nomina...,"CREATE TABLE artist (\n artist_id number,\n...",SELECT T2.song FROM music_festival AS T1 JOIN ...
2,Which teams had more than 3 eliminations?,CREATE TABLE elimination (\n elimination_id...,SELECT team FROM elimination GROUP BY team HAV...
3,"Show the names of people, and dates and venues...","CREATE TABLE people (\n people_id number,\n...","SELECT T3.name, T2.date, T2.venue FROM debate_..."
4,Tell me the the date when the first claim was ...,CREATE TABLE settlements (\n settlement_id ...,SELECT date_claim_made FROM claims ORDER BY da...


## Get the column and table name from SQL query

by String matching, SQLParse library

In [16]:
sql_extract_token_type = {
            sqlparse.sql.IdentifierList, sqlparse.sql.Where,
            sqlparse.sql.Having, sqlparse.sql.Comparison, sqlparse.sql.Function,
            sqlparse.sql.Parenthesis, sqlparse.sql.Operation, sqlparse.sql.Case
        }

def columns_from_query(sql_query):
    # identifiers contain table name and column name
    if type(sql_query) == str:
        sql_query = sqlparse.parse(sql_query)[0]
    columns = []
    for token in sql_query:
        if isinstance(token, sqlparse.sql.Identifier):
            columns.append(token.get_real_name().lower())
        elif hasattr(token, "tokens"):
            columns.extend(columns_from_query(token.tokens))
    return columns

def columns_by_split(sql_query:str, all_columns:list):
    columns = []
    for token in sql_query.split():
        if token[-1] == ",": token = token[:-1]
        if token in all_columns:
            columns.append(token)
    return columns

# Split the SQL query into lines
def table_column_of_create_table(query):
    lines = query.splitlines()

    # Initialize a list to store column names
    # columns = []
    # table_names = []
    schema = {}
    # Look for "CREATE TABLE" and start capturing columns
    capture = False
    for line in lines:
        if "CREATE TABLE" in line:
            capture = True
            table_name = line.split()[-2].lower()
            schema[table_name] = []
            # table_names.append(table_name)
        elif line.strip().endswith(')') or line.strip().endswith(');'):
            capture = False
        elif capture:
            column_name = line.strip().split()[0]
            if column_name in ["CONSTRAINT", "PRIMARY"]: continue
            schema[table_name].append(column_name.lower())
            # columns.append(column_name)

    # print("Table Name:", table_names)
    # print("Columns:", columns)
    return schema

### Map table to db

In [17]:
spider_path = '../src/spider/database'
map_table_db = {}

for folder in os.listdir(spider_path):
    if folder.lower() not in select_db: continue
    schema_path = os.path.join(spider_path, folder, 'schema.sql')
    if os.path.exists(schema_path):
        with open(schema_path, 'r') as sql_file:
            sql_script = sql_file.read()
            table_names = list(table_column_of_create_table(sql_script).keys())
            for table in table_names: 
                table = re.sub(r'[^a-zA-Z_]', '', table).lower()
                map_table_db[table] = folder.lower()

# with open("src/spider/table_database_map.json", "w") as f:
#     json.dump(map_table_db, f, indent=4)

In [18]:
def safe_divide(numerator, denominator):
    try:
        result = round(numerator / denominator,2)
    except :
        result = "ZeroDivisionError"
    return result

### Map db to table

In [19]:
db_to_table_map = {}
for table, db in map_table_db.items():
    if db in db_to_table_map:
        db_to_table_map[db].append(table)
    else:
        db_to_table_map[db] = [table]


### For keep description analysis

In [20]:
def gen_keepinfo_question(db_to_table_map=db_to_table_map, dbs=dbs):
    col_info = {}
    for db in db_to_table_map:
        if db not in select_db: continue
        col_info[db] = {}
        for table in db_to_table_map[db]:
            col_info[db][table] = {}
            for t in dbs:
                if t['table'].lower() == table.lower():
                    for c in t['columns'].keys():
                        col_info[db][table].setdefault(c.lower(), {
                            'expect_question':set({}) , 
                            'use_question': set({})
                        })
    return col_info

#### Structure

In [22]:
# {
#     'db_id':{
#         'table1': {
#             'col1': {
#                 'expect_question':{'q1, q2, q4'} , 
#                 'use_question': {'q2','q3'}
#             }
#         }
#     }
# }

## Expirement test by threshold score to dataframe
automate join column (each treshold score)

In [21]:
col_recall_tid = {'0.6':[], '0.4':[], '0.2':[]}
desc_anlz_threshold = {'0.6':gen_keepinfo_question(), '0.4':gen_keepinfo_question(), '0.2':gen_keepinfo_question()}

In [22]:
def expirement_test(threshold_score:list, dbs=dbs, verbose=False):
    new_df_index = 0
    full_result_df = pd.DataFrame()
    exists_table = [i['table'].lower() for i in dbs]
    for i,row in df.iterrows():
        table_of_query = row['Table']
        expect_schema = table_column_of_create_table(table_of_query)
        tables  = list(expect_schema.keys())
        all_columns = [value for values in expect_schema.values() for value in values]
        is_present = np.all(np.isin(np.array(tables), np.array(exists_table)))
        # all table from db is in exist tables (same database)
        if is_present:
            # print(expect_schema)
            # try:
                
            question = row['Question']

            # Got the expect column and table with string matching, parser and SQLParse lib
            columns = Parser(row['SQL']).columns
            expect_cols = []
            expect_table = []
            for col in columns:
                # found join function (Table1.column1)
                if "." in col:
                    table_name, column_name = col.split('.') 
                    expect_cols.append(column_name)
                    expect_table.append(table_name)
                elif col in all_columns:
                    expect_cols.append(col)
            
            expect_cols.extend([c for c in columns_from_query(row['SQL']) if c in all_columns])
            expect_cols.extend(columns_by_split(row['SQL'], all_columns))
            expect_table.extend(Parser(row['SQL']).tables)

            expect_table = list(set(expect_table))
            expect_cols = list(set(expect_cols))

            # filtering table name in columns list
            expect_cols = [c for c in expect_cols if c not in expect_table]

            db = map_table_db[expect_table[0]]
            table_in_db = db_to_table_map[db]

            if verbose:
                print(question)
                print(row['SQL'])
                print("DATABASE:", db)
                print("TABLES IN DB:", table_in_db)
                print("EXPECT TABLE:", expect_table)
                print("EXPECT COLUMNS:",expect_cols)
                print()

            # dataframe for merge to threshold dataframe
            threshold_result_df = pd.DataFrame({'query': [question],'actual col' : [expect_cols], '# actual col' : [len(expect_cols)]})

            for score in threshold_score:
                desc_anlz = desc_anlz_threshold[str(score)][db]
                for db_table, db_cols in expect_schema.items():
                    if db_table in expect_table:
                        for db_c in db_cols:
                            if db_c in expect_cols:
                                desc_anlz[db_table][db_c]['expect_question'].add(question)


                name_col_selected   = f"T{score} selected col"
                name_col_correct    = f"T{score} # correct"
                name_col_recall     = f"T{score} recall"
                name_col_precision  = f"T{score} precision"
                name_col_f1         = f"T{score} F1"
                

                # filtering schema from question (similarity description-base score --> string matching)
                result = filter_tables_by_description(question, column_threshold = score, filter_tables = False)
                result = column_from_question(question, used_table_col=result, token_id=new_df_index)
                # print(result)
                
                result_tables = []
                result_columns = []
                for t in result:
                    if t in table_in_db: 
                        result_tables.append(t)
                        result_columns.extend(list(result[t].keys()))

                        for rc in result[t].keys():
                            desc_anlz[t][rc]['use_question'].add(question)

                result_columns = list(set([c.lower() for c in result_columns]))

                # calculate accuracy

                table_TP = len(set(expect_table) & set(result_tables))
                table_FP = len(set(result_tables) - set(expect_table))
                table_FN = len(set(expect_table) - set(result_tables))
                col_TP = len(set(expect_cols) & set(result_columns))
                col_FP = len(set(result_columns) - set(expect_cols))
                col_FN = len(set(expect_cols) - set(result_columns))
                
                table_recall, table_precision, col_recall, col_precision, table_f1, col_f1 = (None, ) * 6

                # Calculate table_recall, table_precision, col_recall, and col_precision using safe_divide
                if table_TP is not None and table_FN is not None and table_FP is not None:
                    table_recall = safe_divide(table_TP, table_TP + table_FN)
                    table_precision = safe_divide(table_TP, table_TP + table_FP)

                if col_TP is not None and col_FN is not None and col_FP is not None:
                    if len(expect_cols) == 0 : col_recall = 1
                    else: col_recall = safe_divide(col_TP, col_TP + col_FN)
                    col_precision = safe_divide(col_TP, col_TP + col_FP)

                # Calculate table_f1 and col_f1 using safe_divide
                if table_precision is not None and table_recall is not None and table_recall != "ZeroDivisionError" and table_precision != "ZeroDivisionError":
                    table_f1 = 2 * safe_divide(table_precision * table_recall, table_precision + table_recall)

                if col_precision is not None and col_recall is not None and col_recall != "ZeroDivisionError" and col_precision != "ZeroDivisionError":
                    col_f1 = 2 * safe_divide(col_precision * col_recall, col_precision + col_recall)

                # Check for "division error" and set appropriate values
                if table_f1 == "ZeroDivisionErrorZeroDivisionError":table_f1 = "ZeroDivisionError"
                if col_f1 == "ZeroDivisionErrorZeroDivisionError": col_f1 = "ZeroDivisionError"
                if int(col_recall) == 1: col_recall_tid[str(score)].append(new_df_index)
                if verbose:
                    print("THRESHOLD:", score)
                    print("PREDICT TABLE:", result_tables)
                    print("PREDICT COLUMNS:", result_columns)
                    print("TABLE RECALL:", table_recall, "\tCOLUMNS RECALL:", col_recall)
                    print("TABLE PRECISION:", table_precision, "\tCOLUMNS PRECISION:", col_precision)
                    print("TABLE F1 SCORE:", table_f1, "\tCOLUMNS F1 SCORE:", col_f1)
                    print()

                result_data = {
                    'query' : [question],
                    name_col_selected : [result_columns],
                    name_col_correct : [np.sum(np.isin(np.array(result_columns), np.array(expect_cols)))],
                    name_col_recall : [col_recall],
                    name_col_precision : [col_precision],
                    name_col_f1 : [col_f1]
                }
                
                # dataframe for merge
                result_df = pd.DataFrame(result_data)
                threshold_result_df = pd.merge(threshold_result_df, result_df, on='query', how='outer')
            
            # append (concat) the row of full dataframe
            full_result_df = pd.concat([full_result_df, threshold_result_df], ignore_index=True)
            # for keep question id that recall 100%
            new_df_index += 1

            # except  Exception as e: print(e)
            print('------------------------------------------')
    return full_result_df

In [23]:
exact_match_id = []
result_df = expirement_test([0.6, 0.4, 0.2], verbose=True)

Find the number of members living in each address.
SELECT COUNT(*), address FROM member GROUP BY address
DATABASE: coffee_shop
TABLES IN DB: ['shop', 'member', 'happy_hour', 'happy_hour_member']
EXPECT TABLE: ['member']
EXPECT COLUMNS: ['address']

THRESHOLD: 0.6
PREDICT TABLE: ['shop', 'member']
PREDICT COLUMNS: ['address']
TABLE RECALL: 1.0 	COLUMNS RECALL: 1.0
TABLE PRECISION: 0.5 	COLUMNS PRECISION: 1.0
TABLE F1 SCORE: 0.66 	COLUMNS F1 SCORE: 1.0

THRESHOLD: 0.4
PREDICT TABLE: ['member', 'shop']
PREDICT COLUMNS: ['address']
TABLE RECALL: 1.0 	COLUMNS RECALL: 1.0
TABLE PRECISION: 0.5 	COLUMNS PRECISION: 1.0
TABLE F1 SCORE: 0.66 	COLUMNS F1 SCORE: 1.0

THRESHOLD: 0.2
PREDICT TABLE: ['shop', 'member', 'happy_hour', 'happy_hour_member']
PREDICT COLUMNS: ['level_of_membership', 'address', 'age', 'name', 'membership_card', 'member_id', 'num_of_staff_in_charge', 'total_amount', 'num_of_staff']
TABLE RECALL: 1.0 	COLUMNS RECALL: 1.0
TABLE PRECISION: 0.25 	COLUMNS PRECISION: 0.11
TABLE F1 S

In [24]:
result_df.shape

(244, 18)

In [25]:
exact_match_id = sorted(list(set(exact_match_id)))
print(len(exact_match_id))

216


In [26]:
print(len(set(col_recall_tid['0.6']).intersection(set(exact_match_id))))
print(len(set(col_recall_tid['0.4']).intersection(set(exact_match_id))))
print(len(set(col_recall_tid['0.2']).intersection(set(exact_match_id))))

102
148
207


In [27]:
result_df.head()

Unnamed: 0,query,actual col,# actual col,T0.6 selected col,T0.6 # correct,T0.6 recall,T0.6 precision,T0.6 F1,T0.4 selected col,T0.4 # correct,T0.4 recall,T0.4 precision,T0.4 F1,T0.2 selected col,T0.2 # correct,T0.2 recall,T0.2 precision,T0.2 F1
0,Find the number of members living in each addr...,[address],1,[address],1,1.0,1.0,1.0,[address],1,1.0,1.0,1.0,"[level_of_membership, address, age, name, memb...",1,1.0,0.11,0.2
1,Count the number of cinemas.,[],0,"[capacity, cinema_id]",0,1.0,0.0,0.0,"[show_times_per_day, location, openning_year, ...",0,1.0,0.0,0.0,"[show_times_per_day, directed_by, openning_yea...",0,1.0,0.0,0.0
2,How many rooms does each block floor have?,"[blockfloor, blockcode]",2,"[blockfloor, roomnumber, room, blockcode]",2,1.0,0.5,0.66,"[blockfloor, roomtype, room, blockcode, roomnu...",2,1.0,0.4,0.58,"[blockfloor, roomtype, examinationroom, room, ...",2,1.0,0.25,0.4
3,What procedures cost less than 5000 and have J...,"[treatment, name, employeeid, code, cost]",5,"[physician, cost]",1,0.2,0.5,0.28,"[physician, cost, pcp, name]",2,0.4,0.5,0.44,"[dateundergoes, insuranceid, dose, procedures,...",4,0.8,0.16,0.26
4,What is the location with the most cinemas ope...,"[location, openning_year]",2,"[location, openning_year]",2,1.0,1.0,1.0,"[show_times_per_day, openning_year, capacity, ...",2,1.0,0.33,0.5,"[show_times_per_day, directed_by, openning_yea...",2,1.0,0.13,0.24


In [31]:
# result_df.to_csv("expirement_filtering_columns.csv", index=False)

In [None]:
# {
#     'db_id':{
#         'table1': {
#             'col1': {
#                 'expect_question':{'q1, q2, q4'} , 
#                 'use_question': {'q2','q3'}
#             }
#         }
#     }
# }

## Description analysis Dataframe

In [34]:
# Example sets
expected_set = {'apple', 'banana', 'orange', 'kiwi'}
result_set = {'banana', 'kiwi', 'grape', 'pear'}

# Find the common elements
common_elements = expected_set.intersection(result_set)
print(result_set - expected_set)

# Calculate the length of common elements
length_of_common_elements = len(common_elements)

# Print the result
print("Common Elements:", common_elements)
print("Length of Common Elements:", length_of_common_elements)


{'grape', 'pear'}
Common Elements: {'banana', 'kiwi'}
Length of Common Elements: 2


In [93]:
def pretty_print(questions:set):
    return '\n'.join([f"- {q}" for q in questions])

In [94]:
def column_description(table_name, col_name):
    for table in dbs:
        if table_name == table['table']:
            return str(table['columns'][col_name])

In [95]:
full_result_df = pd.DataFrame()
# threshold_result_df = pd.DataFrame({'query': [question],'actual col' : [expect_cols], '# actual col' : [len(expect_cols)]})
first_time = True
for threshold, threshold_db in desc_anlz_threshold.items():
    threshold_result_df = pd.DataFrame()
    for db_name, db_tables in threshold_db.items():
        for table_name , table_cols in db_tables.items():
            for col_name, count_questions in table_cols.items():

                name_col_threshold = f"T{threshold}"
                num_correct_question = int(len(count_questions['expect_question'].intersection(count_questions['use_question'])))
                num_selected_question = int(len(count_questions['use_question']))
                correct_questions = pretty_print(count_questions['expect_question'].intersection(count_questions['use_question']))
                incorrect_questions = pretty_print(count_questions['use_question'] - count_questions['expect_question'])
                # print(count_questions)s
                data = {'Database' : [db_name],
                        'Table' : [table_name],
                        'Column' : [col_name],
                        f'{name_col_threshold} # selected' : [num_selected_question],
                        f'{name_col_threshold} # correct' : [num_correct_question],
                        f'{name_col_threshold} incorrect query' : [incorrect_questions],
                        f'{name_col_threshold} correct query' : [correct_questions]
                        }
                if first_time:
                    col_desc = column_description(table_name,col_name)
                    num_expect_questions = int(len(count_questions['expect_question']))
                    if num_expect_questions == 0:
                        continue
                    data.update({
                        'Description' : [col_desc],
                        '# queries' : [num_expect_questions],
                    })

                result_df = pd.DataFrame(data)
                threshold_result_df = pd.concat([threshold_result_df, result_df], ignore_index=True)
    if first_time:
        description_col = threshold_result_df.pop('Description')
        num_query_col = threshold_result_df.pop('# queries')
        full_result_df = threshold_result_df.copy()
        full_result_df.insert(3, 'Description', description_col)
        full_result_df.insert(4, '# queries', num_query_col)
        first_time = False
    else:
        full_result_df = pd.merge(full_result_df, threshold_result_df, on=['Database', 'Table', 'Column'], how='outer')

full_result_df = full_result_df.dropna()

{'expect_question': {'What are the names of actors who have been in the musical titled The Phantom of the Opera?', 'What are the names of musicals who have no actors?', 'Show names of musicals and the number of actors who have appeared in the musicals.', 'How many actors have appeared in each musical?', 'Show names of actors in descending order of the year their musical is awarded.', 'What are the names of actors ordered descending by the year in which their musical was awarded?', "Show names of actors that have appeared in musical with name 'The Phantom of the Opera'.", 'Show names of actors and names of musicals they are in.', 'Show names of musicals which have at least three actors.', 'What are the names of musicals who have at 3 or more actors?', 'List the name of musicals that do not have actors.', 'What are the names of actors and the musicals that they are in?'}, 'use_question': set()}
{'expect_question': {'What are the names of actors who have been in the musical titled The Pha

In [96]:
# full_result_df.to_excel('column-description_analysis.xlsx', index=False)
full_result_df

Unnamed: 0,Database,Table,Column,Description,# queries,T0.6 # selected,T0.6 # correct,T0.6 incorrect query,T0.6 correct query,T0.4 # selected,T0.4 # correct,T0.4 incorrect query,T0.4 correct query,T0.2 # selected,T0.2 # correct,T0.2 incorrect query,T0.2 correct query
0,musical,musical,musical_id,Unique identifier for the musical,12.0,0.0,0.0,,,11,4,- List the most common result of the musicals....,- What are the names of actors ordered descend...,29,12,- List the most common result of the musicals....,- What are the names of actors who have been i...
1,musical,musical,name,Name of the musical,14.0,17.0,13.0,- List the name of actors whose age is not 20....,- What are the names of actors who have been i...,27,14,- List the most common result of the musicals....,- What are the names of actors who have been i...,32,14,- List the most common result of the musicals....,- What are the names of actors who have been i...
2,musical,musical,year,Year the musical was produced,2.0,3.0,2.0,- What are the names of actors who are not 20 ...,- What are the names of actors ordered descend...,22,2,- How many musicals has each nominee been nomi...,- What are the names of actors ordered descend...,31,2,- Show the musical nominee with award 'Bob Fos...,- What are the names of actors ordered descend...
3,musical,musical,award,Award received by the musical,6.0,6.0,5.0,- What are the names of actors ordered descend...,- Show the musical nominee with award 'Bob Fos...,18,6,- List the most common result of the musicals....,- Show the musical nominee with award 'Bob Fos...,35,6,- List the name of actors in ascending alphabe...,- Show the musical nominee with award 'Bob Fos...
4,musical,musical,nominee,Name of the nominee associated with the musical,14.0,16.0,14.0,- What are the names of actors ordered descend...,- Who are the nominees who have been nominated...,28,14,- List the most common result of the musicals....,- Who are the nominees who have been nominated...,39,14,- List the name of actors in ascending alphabe...,- Who are the nominees who have been nominated...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,coffee_shop,member,age,Age of the member,3.0,3.0,3.0,,- Find the ids and names of members who are un...,5,3,- How many members have the black membership c...,- Find the ids and names of members who are un...,10,3,- Give me the names of members whose address i...,- Find the ids and names of members who are un...
100,coffee_shop,member,time_of_purchase,Date and time of purchase,1.0,0.0,0.0,,,2,1,- Show the shop addresses ordered by their ope...,"- Find the purchase time, age and address of e...",9,1,- Which months have more than 2 happy hours?\n...,"- Find the purchase time, age and address of e..."
101,coffee_shop,member,address,Address of the member,7.0,10.0,6.0,- Show the shop addresses ordered by their ope...,- Give me the names of members whose address i...,13,6,- How many members have the black membership c...,- Give me the names of members whose address i...,14,7,- How many members have the black membership c...,- Give me the names of members whose address i...
102,coffee_shop,happy_hour,shop_id,Identifier of the shop hosting the happy hour,3.0,3.0,3.0,,- which shop has happy hour most frequently? L...,5,3,- Which month has the most happy hours?\n- Whi...,- which shop has happy hour most frequently? L...,9,3,- Which months have more than 2 happy hours?\n...,- which shop has happy hour most frequently? L...
