# Compare models experiments with PointX

In [24]:
import pandas as pd
import os, json, sqlite3, asyncio, random, re
from dotenv import load_dotenv
from openai import OpenAI

from SchemaLinking import SchemaLinking
import warnings
from transformers import AutoTokenizer, AutoModelForCausalLM
import google.generativeai as genai

In [25]:
load_dotenv('../.env')
tokenizer = AutoTokenizer.from_pretrained("../models/nsql-350M")
model = AutoModelForCausalLM.from_pretrained("../models/nsql-350M")

GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

In [26]:
system_content_puresql = """You are a helpful assistant for generate SQL query from user-specified questions. 
please return only answer of sql string query result !!! 
Do not return any other format the user has provided to you.
This is example of output format which user expect from you
query : 'SELECT...'
"""

system_content_schemaprovide = """You are a helpful assistant for generate SQL query from user-specified questions and schema.
User provides you with a question.
Please return only a sql string query results.
Do not return any other format the user has provided to you.
This is example of output format which user expect from you
query : 'SELECT...'
"""

system_content_fillmask = """You are a helpful assistant for generate SQL query from user-specified questions and schema. 
User has some SQL where the [MASK] columns, condition values and tables are syntaxed and User wants you to respond to output that populates the [MASK] column of the SQL input followed by the question and schema description (name - description - data type).
If you don't know which column to fill in Do not include columns that you have created yourself. And only columns defined from the schema must be used. 
Do not use columns from other tables or schema. must also be used from the same table defined in the input.
If you must enter conditional values Please decide the format or value based on the sample values of that column.
If that column has many too long category value please decide base on column description.
please return only the answer of sql string query result!!! ('SELECT...')
"""

In [27]:
zero_shot_prompt_mask = """For example:
table :     cat - this table contain cat information 
columns :    id - number for identify cat | number
            name - name of cat | text
            age - age of cat | number
            birth_date - pet birthday in format 'YYYY-MM-DD' | datetime
            gender - gender of cat (male, female) | text

question : Show me number of cat for each gender which born before March 23, 2011.
input : SELECT [MASK], COUNT([MASK]) FROM [MASK] WHERE [MASK] < [MASK] GROUP BY [MASK] ;
query : SELECT gender, COUNT(*) FROM cat WHERE birth_date < '2011-03-23' GROUP BY gender;

"""

zero_shot_prompt = """For example:
table :     cat - this table contain cat information 
columns :    id - number for identify cat | number
            name - name of cat | text
            age - age of cat | number
            birth_date - pet birthday in format 'YYYY-MM-DD' | datetime
            gender - gender of cat (male, female) | text

question : 'Show me number of cat for each gender which born before March 23, 2011.'
query : 'SELECT gender, COUNT(*) FROM cat WHERE birth_date < '2011-03-23' GROUP BY gender;'

"""

In [44]:
def create_nsql_prompt(schema_link:object, question:str, used_schema:dict) -> str:
    """
    Generate a prompt for applying into SQL generation model based on the question and schema.

    Parameters:
    schema_link (object): The instance of the class containing schema information.
    question (str): The question for which the prompt is generated.
    used_schema (dict): A dictionary containing tables as keys and lists of columns as values after filtering the schema.

    Returns:
    str: A prompt for applying into SQL generation model.

    Example:
    prompt = create_prompt(schema_instance, "What are the total sales?", 
                          { 'sales': {'date' : 0.3, 'amount' : 0.61}, 
                            'products': {'name' : 0.23, 'price' : 0.57}})
    print(prompt)

    CREATE TABLE sales ( date DATE, amount INT,PRIMARY KEY ("date") )
    -- Using valid SQLite, answer the following questions for the tables provided above.
    -- What are the total sales?
    SELECT
    """
    full_sql = ""
    for table, columns in used_schema.items():
        if not len(columns): continue       # pass this table when no column
        primary_keys = schema_link.schema_datatypes[table]["JOIN_KEY"]["PK"]
        foreign_keys = list(schema_link.schema_datatypes[table]["JOIN_KEY"]["FK"].keys())
        join_table_key = primary_keys + foreign_keys
        
        sql = f"CREATE TABLE {table} ("
        for column in columns:
            if column in join_table_key and len(join_table_key): join_table_key.remove(column)
            try:
                sql += f' {column} {schema_link.schema_datatypes[table]["COLUMNS"][column]},'
            except KeyError: 
                print(f"KeyError :{column}")
                
        if len(join_table_key): # key for join of table are remaining
            for column in join_table_key:
                sql += f' {column} {schema_link.schema_datatypes[table]["COLUMNS"][column]},'

        # All table contain PK (maybe)
        if len(primary_keys):
            sql += 'PRIMARY KEY ('
            for pk_type in primary_keys: sql += f'"{pk_type}" ,'
            sql = sql[:-1] + "),"

        if len(foreign_keys):
            for fk, ref_table_column in schema_link.schema_datatypes[table]["JOIN_KEY"]["FK"].items():
                sql += f' FOREIGN KEY ("{fk}") REFERENCES "{list(ref_table_column.keys())[0]}" ("{list(ref_table_column.values())[0]}"),'

        sql = sql[:-1] + " )\n\n"
        full_sql += sql
    prompt = full_sql + "-- Using valid SQLite, answer the following questions for the tables provided above."
    prompt = prompt + '\n' + '-- ' + question
    prompt = prompt + '\n' + "SELECT"

    return prompt

def generate_nsql_sql(prompt):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        generated_ids = model.generate(input_ids, max_length=1000)
        sql = tokenizer.decode(generated_ids[0], skip_special_tokens=True).split('\n')[-1]
    return sql

async def LLM_gensql(full_prompt:str, system_content:str, llm_model:str) -> str:
        """
        Generate SQL query followed by prompt

        Parameters:
        prompt (str): prompt for generate result
        llm_model (str): model-service name for generate result

        Returns:
        str: The complete SQL query.
        """
        if llm_model in ['gemini-pro']:
            try:
                print(llm_model)
                gemini_prompt = system_content + full_prompt
                genai.configure(api_key=GOOGLE_API_KEY)
                gemini_model = genai.GenerativeModel(llm_model)
                gemini_model.temperature = 0
                response = gemini_model.generate_content(gemini_prompt)
                return response.text
            
            except Exception as e:
                 return f"Google AI Error : {e}"
            
        elif llm_model in ['gpt-3.5-turbo', 'gpt-4-0125-preview']:
            
            API_KEY = OPENAI_API_KEY
            base_url = "https://api.openai.com/v1"
            return "API ERROR"
        
        elif llm_model in ['deepseek-coder', 'deepseek-chat']:
            API_KEY = DEEPSEEK_API_KEY
            base_url = "https://api.deepseek.com/v1"
        try:
            print(llm_model)
            
            client = OpenAI(api_key=API_KEY, base_url=base_url)
            response = client.chat.completions.create(
                model=llm_model,
                messages=[
                        {"role": "system",
                            "content": system_content},
                        {"role": "user", 
                            "content": full_prompt},
                        ],
                stop=['\n'],
                temperature=0
            )
            return response.choices[0].message.content
        
        except Exception as e:
            return f"API Error :{e}"
            
def get_reason(schema_link:object, sql_result:str) -> str:
    """
    Get the reason message related to the selected columns and tables from the schema based on the SQL query.

    Parameters:
    schema_link (object): The instance of the class containing schema information.
    sql_result (str): The SQL query result for which the reason message is generated.

    Returns:
    str: The reason message explaining the selection of columns and tables from the schema.

    Example:
    get_reason(schema_instance, "SELECT column1, column2 FROM table1 WHERE column3 = 'value'")

    Table - table1 : Description of table1
        Column - column1 : Description of column1
        Column - column2 : Description of column2
        Column - column3 : Description of column3
    """

    table_col_sql = schema_link.table_col_of_sql(sql_result)
    reason = ""

    for table, cols in table_col_sql.items():
        _df = schema_link.column_info_df[schema_link.column_info_df['Table'] == table][['Column', 'Description']].drop_duplicates()
        table_reason = f"Table - {table}\t: {schema_link.table_descriptions[table]['text']}\n"
        if len(cols):       # have columns of table
            col_reason = "\n".join([f"\tColumn - {c}\t: {_df.loc[_df['Column'] == c, 'Description'].values[0]}" for c in cols])
        else: col_reason = ""
        reason += str(table_reason + col_reason + "\n\n")

    return reason

In [37]:
def create_llm_prompt(schema_link:object, used_schema:dict, question:str, masked_query:str, 
                      few_shot:str=zero_shot_prompt_mask, is_marked:bool=True, is_fewshot:bool=True) -> str:

    full_prompt = ""
    for table_name, column_score in used_schema.items():
        _df = schema_link.column_info_df[schema_link.column_info_df['Table'] == table_name][['Column', 'Description']].drop_duplicates()
        full_prompt += f"\ntable : {table_name} - {schema_link.table_descriptions[table_name]['text']}\ncolumns:"

        for column_name in column_score:
            full_prompt += f"\t{column_name} - {_df[_df['Column'] == column_name]['Description'].values[0]}"
            full_prompt += f" | {schema_link.schema_datatypes[table_name]['COLUMNS'][column_name]}\n"

    full_prompt += f"question : {question}\n"
    if is_marked: full_prompt += f"input : {masked_query}"
    if is_fewshot: full_prompt = few_shot + full_prompt
    
    return full_prompt + "\nquery : "

In [38]:
def query_pointx_db(sql_query):
    try:
        conn = sqlite3.connect(f'../src/pointx/pointx.db')
        cursor = conn.cursor()
    except:
        return "CANNOT CONNECT DATABASE"
    try:
        cursor.execute(sql_query)
        results = cursor.fetchall()
    except:
        return "CANNOT FETCHING DATA"
    conn.close()
    return results

In [39]:
def table_col_of_sql(schema_link, sql_query:str) -> dict:
        """
        Extract tables and their corresponding columns from the given SQL query.

        Parameters:
        sql_query (str): The SQL query from which tables and columns need to be extracted.

        Returns:
        dict: A dictionary containing tables as keys and lists of columns as values.

        Example:
        SchemaLinking.table_col_of_sql("SELECT column1, column2 FROM table1 WHERE column3 = 'value'")
        {'table1': ['column1', 'column2', 'column3']}
        """
        
        selected_schema = {}
        query_split = re.split(schema_link.split_pattern, sql_query)
        for table in schema_link.schema_datatypes.keys():
            if table in query_split:
                selected_col = []
                for col in schema_link.schema_datatypes[table]['COLUMNS'].keys():
                    if col in query_split: selected_col.append(col)
                selected_schema[table] = selected_col

        return selected_schema

In [40]:
with open(f"../src/pointx/Schema/embedded_data.json", "r") as f:
    domain = json.load(f)

schema_link = SchemaLinking(domain)

async def ChatQ_pipeline(question:str, domain_tables:list, llm_model_name:str, 
                         max_n:int=10,verbose:bool=True, get_final_prompt:bool=False):

    if not domain_tables: domain_tables = list(domain['tables'].keys())
    used_schema = schema_link.filter_schema(question, domain_tables, max_n=max_n)
    nsql_prompt = create_nsql_prompt(schema_link, question, used_schema)
    nsql_sql_result = generate_nsql_sql(nsql_prompt)
    masked_query = schema_link.masking_query(nsql_sql_result)
    llm_prompt = create_llm_prompt(schema_link, used_schema, question, masked_query)
    if get_final_prompt: return llm_prompt
    llm_result = await LLM_gensql(llm_prompt, system_content_fillmask, llm_model_name)
    if verbose:
        reason = get_reason(schema_link, llm_result)
        print("========= QUESTION =========")
        print(question)
        print()
        print("========= NSQL SQL =========")
        print(nsql_sql_result)
        print()
        print("========= LLM SQL =========")
        print(llm_result)
        print()
        print("========= REASON =========")
        print(reason)
        print()
        print("========= SCHEMA =========")
        print(used_schema)
        print()

    return llm_result

In [41]:
q_pair_df = pd.read_csv("../src/pointx/Train set/PointX_questionpair.csv")[['Table', 'Question', 'Actual SQL']]
print(q_pair_df.shape)
q_pair_df.head()

(124, 3)


Unnamed: 0,Table,Question,Actual SQL
0,pointx_keymatrix_dly,What is the total amout of all financial trans...,"SELECT month_id, SUM(ntx_pointx_financial) FRO..."
1,pointx_keymatrix_dly,What is the total amount of points generated b...,SELECT SUM(amt_point_topup) FROM pointx_keymat...
2,pointx_keymatrix_dly,What is the total amount of points generated b...,"SELECT month_id, SUM(amt_point_pay) FROM point..."
3,pointx_keymatrix_dly,What is the average rate of released points fo...,SELECT AVG(rate_point_per_baht_pay) FROM point...
4,pointx_keymatrix_dly,Can you determine the average number of custom...,"SELECT month_id, AVG(ncust_visit) FROM pointx_..."


In [42]:
# # delete duplicate experiment record

# _df = pd.read_excel('results/temp_model_description_experiments.xlsx')
# print(_df.shape)
# _df.drop_duplicates(subset=['Question'], inplace=True)
# print(_df.shape)
# _df.to_excel('results/temp_model_description_experiments.xlsx', index=False)

In [43]:
query = await ChatQ_pipeline("Which payment methods occurred the most from June 5 to July 1, 2022?",
                            ['pointx_fbs_rpt_dly'], "gpt-3.5-turbo", verbose=True)
query

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


gpt-3.5-turbo


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Which payment methods occurred the most from June 5 to July 1, 2022?

SELECT payment_method FROM pointx_fbs_rpt_dly WHERE event_date BETWEEN 'June 5' AND 'July 1, 2022' GROUP BY payment_method ORDER BY COUNT(*) DESC LIMIT 1;

API Error :Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}



{'pointx_fbs_rpt_dly': {'event_date': 0.193, 'event_month': 0.299, 'user_ltv_revenue': 0.206, 'user_ltv_currency': 0.258, 'delivery_fee': 0.211, 'delivery_type': 0.221, 'each_point_card': 0.279, 'payment_method': 0.645, 'stock_code': 0.185, 'total_amount': 0.33, 'total_point': 0.188, 'transaction_type': 0.269, '_dl_load_ts': 0.251, '_date': 0.332}}



"API Error :Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}"

## GenAI Model with provide schema-description

In [None]:
def split_list(input_list, chunk_size):
    for i in range(0, len(input_list), chunk_size):
        yield input_list[i:i + chunk_size]

In [None]:
def sample_columns(schema_link:object, table_name:str, used_columns:dict, noise:int=20) -> dict:
    table_columns = list(schema_link.column_info_df[schema_link.column_info_df['Table'] == table_name]['Column'].unique())
    remaining_values = [value for value in table_columns if value not in used_columns[table_name]]
    remaining_count = noise - len(used_columns[table_name])
    random_selected_values = random.sample(remaining_values, remaining_count)
    result = list(used_columns[table_name]) + random_selected_values
    used_schema = { table_name : result}
    return used_schema

In [None]:
def yeild_columns(schema_link, sql_queries:list) -> list:

    selected_schema = {}
    for sql_query in sql_queries:
        
        query_split = re.split(schema_link.split_pattern, sql_query)
        
        for table in schema_link.schema_datatypes.keys():
            if table in query_split:
                selected_col = []
                for col in schema_link.schema_datatypes[table]['COLUMNS'].keys():
                    if col in query_split:
                        selected_col.append(col)
                selected_schema[table] = selected_col
    
    return selected_schema


In [None]:
sample_columns(schema_link, 'pointx_fbs_rpt_dly', { "pointx_fbs_rpt_dly":{'month_id':0.2}}, 10)

In [45]:
predict_data = {
    "Question" : [],
    "Desc DeepSeek" : [],
    "Desc GPT3.5" : [],
    "Desc GPT4" : [],
    "Desc Gemini" : []
}


i = 0
n_chunk = 5

temp_result_file = "results/temp_model_description_top10_experiments.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()


for table_name in q_pair_df['Table'].unique():
    table_df = q_pair_df[q_pair_df['Table'] == table_name]
    table_questions = table_df['Question'].to_list()
    table_actualSQL = table_df['Actual SQL'].to_list()
    # list_chunk_questions = list(split_list(table_questions, n_chunk))
    # list_chunk_actualSQL = list(split_list(table_actualSQL, n_chunk))
    # cannot used full schema because context in larger than handle
    # used_schema = { table_name : list(schema_link.column_info_df[schema_link.column_info_df['Table'] == table_name]['Column'].unique())}
    
    
    for question, sql in zip(table_questions, table_actualSQL):
        if question in predict_data['Question']: continue
        # used_columns = yeild_columns(schema_link,sql)
        used_columns = table_col_of_sql(schema_link, sql)

        # used_schema = sample_columns(schema_link, table_name, used_columns)
        used_schema = schema_link.filter_schema(question, [table_name], max_n=30)

        table_prompt = create_llm_prompt(schema_link, used_schema, question, zero_shot_prompt, is_marked=False)

        gemini_results = LLM_gensql(table_prompt, system_content_schemaprovide, 'gemini-pro')
        gpt3_5_results = LLM_gensql(table_prompt, system_content_schemaprovide, 'gpt-3.5-turbo')
        gpt4_results = LLM_gensql(table_prompt, system_content_schemaprovide, 'gpt-4-0125-preview')
        deepseek_result = LLM_gensql(table_prompt, system_content_schemaprovide, 'deepseek-coder')

        print("Generating SQL...")
        gemini_results, deepseek_result, gpt3_5_results, gpt4_results = await asyncio.gather(gemini_results, deepseek_result, gpt3_5_results, gpt4_results)

        print(question)
        print(gemini_results, deepseek_result, gpt3_5_results, gpt4_results, sep='\n')

        predict_data['Question'].append(question)
        predict_data['Desc DeepSeek'].append(deepseek_result)
        predict_data['Desc GPT3.5'].append(gpt3_5_results)
        predict_data['Desc GPT4'].append(gpt4_results)
        predict_data['Desc Gemini'].append(gemini_results)

        i += 1
        if not i % 5:
            save_df = pd.DataFrame(predict_data)
            save_df.to_excel(temp_result_file, index=False)
            print("SAVE TEMP COMPLETE", i)
        
    save_df = pd.DataFrame(predict_data)
    save_df.to_excel(temp_result_file, index=False)
    print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
save_df.to_excel("results/model_description_top10_experiments.xlsx", index=False)

Generating SQL...
gemini-pro
deepseek-coder
What is the total amout of all financial transactions for each month?
SELECT month_id, SUM(ntx_pointx_financial) AS total_transactions 
FROM pointx_keymatrix_dly 
GROUP BY month_id;
SELECT month_id, SUM(ntx_pointx_financial) FROM pointx_keymatrix_dly GROUP BY month_id;

API ERROR
API ERROR
String matching ['points']
Generating SQL...
gemini-pro
deepseek-coder
What is the total amount of points generated by all top-up transactions in August 2022?
SELECT SUM(amt_point_topup) FROM pointx_keymatrix_dly WHERE month_id = '2022-08';
SELECT SUM(amt_point_topup) FROM pointx_keymatrix_dly WHERE month_id = '202208';

API ERROR
API ERROR
String matching ['points']
Generating SQL...
gemini-pro
deepseek-coder
What is the total amount of points generated by all payment transactions for each month in 2022?
SELECT month_id, SUM(amt_point_pay) AS total_points_generated
FROM pointx_keymatrix_dly
WHERE month_id LIKE '2022%'
GROUP BY month_id;
SELECT month_id, SU

## GenAI Model with Framework pipeline

In [None]:
predict_data = {
    "Question" : [],
    "Desc DeepSeek" : [],
    "Desc GPT3.5" : [],
    "Desc GPT4" : [],
    "Desc Gemini" : []
}


i = 0
n_chunk = 5

temp_result_file = "results/temp_model_chatq_experiments.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()


for table_name in q_pair_df['Table'].unique():
    table_df = q_pair_df[q_pair_df['Table'] == table_name]
    table_questions = table_df['Question'].to_list()
    table_actualSQL = table_df['Actual SQL'].to_list()
    
    
    for question, sql in zip(table_questions, table_actualSQL):
        if question in predict_data['Question']: continue
        
        table_prompt = await ChatQ_pipeline(question, domain_tables=[table_name], 
                                            llm_model_name=None, max_n=10,get_final_prompt=True)
        gemini_results = LLM_gensql(table_prompt, system_content_fillmask, 'gemini-pro')
        gpt3_5_results = LLM_gensql(table_prompt, system_content_fillmask, 'gpt-3.5-turbo')
        gpt4_results = LLM_gensql(table_prompt, system_content_fillmask, 'gpt-4-0125-preview')
        deepseek_result = LLM_gensql(table_prompt, system_content_fillmask, 'deepseek-coder')

        print("Generating SQL...")
        gemini_results, deepseek_result, gpt3_5_results, gpt4_results = await asyncio.gather(gemini_results, deepseek_result, gpt3_5_results, gpt4_results)

        print(question)
        print(gemini_results, deepseek_result, gpt3_5_results, gpt4_results, sep='\n')

        predict_data['Question'].append(question)
        predict_data['Desc DeepSeek'].append(deepseek_result)
        predict_data['Desc GPT3.5'].append(gpt3_5_results)
        predict_data['Desc GPT4'].append(gpt4_results)
        predict_data['Desc Gemini'].append(gemini_results)

        i += 1
        if not i % 5:
            save_df = pd.DataFrame(predict_data)
            save_df.to_excel(temp_result_file, index=False)
            print("SAVE TEMP COMPLETE", i)
        
    save_df = pd.DataFrame(predict_data)
    save_df.to_excel(temp_result_file, index=False)
    print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
save_df.to_excel("results/model_chatq_experiments.xlsx", index=False)

## Pure question without providing schema and description

In [None]:
predict_data = {
    "Question" : [],
    "Desc DeepSeek" : [],
    "Desc GPT3.5" : [],
    "Desc GPT4" : [],
    "Desc Gemini" : []
}


i = 0
n_chunk = 5

temp_result_file = "results/temp_model_pureQ_experiments.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()


for table_name in q_pair_df['Table'].unique():
    table_df = q_pair_df[q_pair_df['Table'] == table_name]
    table_questions = table_df['Question'].to_list()
    table_actualSQL = table_df['Actual SQL'].to_list()
    
    
    for question, sql in zip(table_questions, table_actualSQL):
        if question in predict_data['Question']: continue

        gemini_results = LLM_gensql(question, system_content_puresql, 'gemini-pro')
        gpt3_5_results = LLM_gensql(question, system_content_puresql, 'gpt-3.5-turbo')
        gpt4_results = LLM_gensql(question, system_content_puresql, 'gpt-4-0125-preview')
        deepseek_result = LLM_gensql(question, system_content_puresql, 'deepseek-coder')

        print("Generating SQL...")
        gemini_results, deepseek_result, gpt3_5_results, gpt4_results = await asyncio.gather(gemini_results, deepseek_result, gpt3_5_results, gpt4_results)

        print(question)
        print(gemini_results, deepseek_result, gpt3_5_results, gpt4_results, sep='\n')

        predict_data['Question'].append(question)
        predict_data['Desc DeepSeek'].append(deepseek_result)
        predict_data['Desc GPT3.5'].append(gpt3_5_results)
        predict_data['Desc GPT4'].append(gpt4_results)
        predict_data['Desc Gemini'].append(gemini_results)

        i += 1
        if not i % 5:
            save_df = pd.DataFrame(predict_data)
            save_df.to_excel(temp_result_file, index=False)
            print("SAVE TEMP COMPLETE", i)
        
    save_df = pd.DataFrame(predict_data)
    save_df.to_excel(temp_result_file, index=False)
    print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
save_df.to_excel("results/model_pureQ_experiments.xlsx", index=False)

## Pure SQL

In [None]:
predict_data = {
    "Question" : [],
    "ChatQ - NSQL" : []
}


i = 0
n_chunk = 5

temp_result_file = "results/temp_model_onlyNSQL_experiments.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()


for table_name in q_pair_df['Table'].unique():
    table_df = q_pair_df[q_pair_df['Table'] == table_name]
    table_questions = table_df['Question'].to_list()
    table_actualSQL = table_df['Actual SQL'].to_list()
    
    
    for question, sql in zip(table_questions, table_actualSQL):
        if question in predict_data['Question']: continue

        gemini_results = LLM_gensql(question, system_content_puresql, 'gemini-pro')
        gpt3_5_results = LLM_gensql(question, system_content_puresql, 'gpt-3.5-turbo')
        gpt4_results = LLM_gensql(question, system_content_puresql, 'gpt-4-0125-preview')
        deepseek_result = LLM_gensql(question, system_content_puresql, 'deepseek-coder')

        print("Generating SQL...")
        gemini_results, deepseek_result, gpt3_5_results, gpt4_results = await asyncio.gather(gemini_results, deepseek_result, gpt3_5_results, gpt4_results)

        print(question)
        print(gemini_results, deepseek_result, gpt3_5_results, gpt4_results, sep='\n')

        predict_data['Question'].append(question)
        predict_data['Desc DeepSeek'].append(deepseek_result)
        predict_data['Desc GPT3.5'].append(gpt3_5_results)
        predict_data['Desc GPT4'].append(gpt4_results)
        predict_data['Desc Gemini'].append(gemini_results)

        i += 1
        if not i % 5:
            save_df = pd.DataFrame(predict_data)
            save_df.to_excel(temp_result_file, index=False)
            print("SAVE TEMP COMPLETE", i)
        
    save_df = pd.DataFrame(predict_data)
    save_df.to_excel(temp_result_file, index=False)
    print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
save_df.to_excel("results/model_pureQ_experiments.xlsx", index=False)

# NSQL only

In [None]:
predict_data = {
    "Question" : [],
    "pure NSQL" : []
}


i = 0
n_chunk = 5

temp_result_file = "results/temp_model_pureNSQL_experiments.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()


for table_name in q_pair_df['Table'].unique():
    table_df = q_pair_df[q_pair_df['Table'] == table_name]
    table_questions = table_df['Question'].to_list()
    table_actualSQL = table_df['Actual SQL'].to_list()
    
    
    for question, sql in zip(table_questions, table_actualSQL):
        if question in predict_data['Question']: continue
        
        used_schema = schema_link.filter_schema(question, [table_name], max_n=50)
        nsql_prompt = create_nsql_prompt(schema_link, question, used_schema)
        nsql_sql_result = generate_nsql_sql(nsql_prompt)

        predict_data['Question'].append(question)
        predict_data['pure NSQL'].append(nsql_sql_result)

        i += 1
        if not i % 5:
            save_df = pd.DataFrame(predict_data)
            save_df.to_excel(temp_result_file, index=False)
            print("SAVE TEMP COMPLETE", i)
        
    save_df = pd.DataFrame(predict_data)
    save_df.to_excel(temp_result_file, index=False)
    print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
save_df.to_excel("results/model_pureNSQL_experiments.xlsx", index=False)

# Structure by masking

In [None]:
import pandas as pd
import json
from SchemaLinking import SchemaLinking

In [18]:
import re

def extract_sql_query(text):
    
    sql_patterns = [r'```sql(.*?)```', r'```(.*?)```', r'(SELECT.*?;)', r'(SELECT.*)']
    
    for pattern in sql_patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()
    return text


In [None]:
with open(f"../src/src_dev/pointx/embedded_data.json", "r") as f:
    domain = json.load(f)

schema_link = SchemaLinking(domain)

result_file = pd.read_excel('../src/pointx/NLQ2SQL model exp result.xlsx').iloc[:,3:]
result_file = result_file.applymap(str)
for col in result_file.columns:
    result_file[col] = result_file[col].apply(extract_sql_query)
result_file.head()

In [None]:
ncols = len(result_file.columns) // 2
sql_df = result_file.iloc[:,:ncols]
sql_df.head()

In [None]:
schema_link.masking_query("SELECT customer_type, SUM(ntx) FROM pointx_cust_mly GROUP BY customer_type;")

In [None]:
def mask_query(sql):
    masked_sql = schema_link.masking_query(sql).replace('\n',' ').strip()
    cleaned_query = re.sub(r'\b(?:AS|as)\s+\w+\b', '', masked_sql)
    return cleaned_query

In [None]:
q = """SELECT customer_type as eiei, SUM(ntx) AS TOTAL
FROM pointx_cust_mly 
GROUP BY customer_type;"""
mask_query(q)

In [None]:
sql_df_copy = sql_df.copy()
columns = sql_df_copy.columns

for col in columns:
    print(col)
    new_col = f"MASK {col}"
    sql_df_copy[new_col] = sql_df_copy[col].apply(mask_query)

In [None]:
sql_df_copy.to_excel("results/temp_experiments.xlsx", index=False)

# Spider

In [None]:
import pandas as pd
import os, json, sqlite3, re, time
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
folder_path = "../src/spider/database"

db = dict()

if os.path.exists(folder_path) and os.path.isdir(folder_path):
    files = os.listdir(folder_path)
    for db_id in files:
        db_path = os.path.join(folder_path, db_id)
        sqlite_db = [os.path.join(db_path, sql) for sql in os.listdir(db_path) if ".sqlite" in sql]
        assert len(sqlite_db) == 1
        db[db_id] = sqlite_db[0]


In [16]:
def get_schema(sqlite_db_path):
    connection = sqlite3.connect(sqlite_db_path)
    cursor = connection.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    tables = cursor.fetchall()
    full_sql = ""
    for table in tables:
        table_name = table[0]
        cursor.execute(f"PRAGMA table_info({table_name});")
        columns = cursor.fetchall()

        sql = f"CREATE TABLE {table_name} ("
        for column in columns:
            column_name = column[1]
            column_datatype = column[2].lower()
            sql += f"{column_name} {column_datatype}, "
        sql = sql[:-2] + ");"
        full_sql += sql
    
    cursor.close()
    connection.close()
    return full_sql

def create_prompt(question, schema):
    full_prompt = ""
    full_prompt += f"{str(schema)}\n\n"
    full_prompt += "-- Using valid SQLite, answer the following questions for the tables provided above.\n\n"
    full_prompt += f"--{question}\n\nSELECT"
    return full_prompt

def extract_table_and_columns(sql_statements):
    # Regular expression pattern to match the table name and column names
    pattern = r'CREATE\s+TABLE\s+(\w+)\s*\((.+?)\);?'

    table_column_pairs = {}
    matches = re.finditer(pattern, sql_statements, re.IGNORECASE)
    for match in matches:
        table_name = match.group(1)
        columns = [col.strip().split()[0] for col in match.group(2).split(',')]
        table_column_pairs[table_name] = columns

    return table_column_pairs

def table_column_of_create_table(query):
    lines = query.splitlines()
    columns = []
    table_names = []

    # Look for "CREATE TABLE" and start capturing columns
    capture = False
    for line in lines:
        if "CREATE TABLE" in line:
            capture = True
            table_names.append(line.split()[-2])
        elif line.strip().endswith(')') or line.strip().endswith(');'):
            capture = False
        elif capture:
            column_name = line.strip().split()[0]
            if column_name in ["CONSTRAINT", "PRIMARY"]: continue
            columns.append(column_name)
    return table_names, columns

def query_db(sql_query, db_name):
    try:
        conn = sqlite3.connect(f'../src/spider/database/{db_name}/{db_name}.sqlite')
        cursor = conn.cursor()
    except:
        return "CANNOT CONNECT DATABASE"
    try:
        cursor.execute(sql_query)
        results = cursor.fetchall()
    except:
        return "CANNOT FETCHING DATA"
    conn.close()
    return results


In [None]:
def spider_masking_query(sql_query:str, tab_columns:list, condition_value_mask:bool=True) -> str:

        tab_columns_lower = [t.lower() for t in tab_columns]
        if '*' in sql_query: sql_query = sql_query.replace('*', "[MASK]")
        query_split = re.split(r'(?<=[() .,;])|(?=[() .,;])', sql_query)
        mask_next = False

        for i in range(len(query_split)):
            token = query_split[i].lower()
            # prepare mask condition value
            if token.lower() == 'where': mask_next = True
            if condition_value_mask and mask_next and (token in {'=', '>', '<', '>=', '<=', '<>', '!='} and i + 1 < len(query_split)):
                step_mask_next = 1
                # find the condition value
                while query_split[i + step_mask_next] == ' ': step_mask_next += 1
                query_split[i + step_mask_next] = "[MASK]"
            
            if token in tab_columns_lower:
                query_split[i] = "[MASK]"

        return "".join(query_split)

In [None]:
load_dotenv('../.env')
tokenizer = AutoTokenizer.from_pretrained("../models/nsql-350M")
model = AutoModelForCausalLM.from_pretrained("../models/nsql-350M")

GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
DEEPSEEK_API_KEY = os.environ.get('DEEPSEEK_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

system_content_schemaprovide = """You are a helpful assistant for generate SQL query from user-specified questions and schema.
User provides you with a question.
Please return only a sql string query results.
Do not return any other format the user has provided to you.
This is example of output format which user expect from you
query : 'SELECT...'
"""

system_content_fillmask = """You are a helpful assistant for generate SQL query from user-specified questions and schema. 
User has some SQL where the [MASK] columns and condition values are syntaxed and User wants you to respond to output that populates the [MASK] column of the SQL input followed by the question and schema description (name - description).
If you don't know which column to fill in Do not include columns that you have created yourself. And only columns defined from the schema must be used. 
Do not use columns from other tables or schema. must also be used from the same table defined in the input.
If you must enter conditional values Please decide the format or value based on the sample values of that column.
If that column has many too long category value please decide base on column description.
please return only the answer of sql string query result!!! ('SELECT...')
"""

zero_shot_prompt = """For example:
table :     cat - this table contain cat information 
columns :    id - number for identify cat
            name - name of cat 
            age - age of cat 
            birth_date - pet birthday in format 'YYYY-MM-DD'
            gender - gender of cat (male, female)

question : 'Show me number of cat for each gender which born before March 23, 2011.'
query : 'SELECT gender, COUNT(*) FROM cat WHERE birth_date < '2011-03-23' GROUP BY gender;'

"""

zero_shot_prompt_mask = """For example:
table :     cat - this table contain cat information 
columns :    id - number for identify cat
            name - name of cat 
            age - age of cat 
            birth_date - pet birthday in format 'YYYY-MM-DD'
            gender - gender of cat (male, female)

question : Show me number of cat for each gender which born before March 23, 2011.
input : SELECT [MASK], COUNT([MASK]) FROM cat WHERE [MASK] < [MASK] GROUP BY [MASK] ;
query : SELECT gender, COUNT(*) FROM cat WHERE birth_date < '2011-03-23' GROUP BY gender;

"""

In [None]:
def generate_nsql_sql(prompt):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        generated_ids = model.generate(input_ids, max_length=1000)
        sql = tokenizer.decode(generated_ids[0], skip_special_tokens=True).split('\n')[-1]
    return sql

In [None]:
with open("../src/spider/mockup_schema_description.json") as f:
    spider_description = json.load(f)
    exists_tables = [tab['table'] for tab in spider_description]

with open("../src/spider/table_database_map.json") as f:
    table_map_db = json.load(f)

In [None]:
spider_df = pd.read_csv('../src/NSText2SQL/train_spider.csv')
spider_df.head()

In [None]:
predict_data = {
    "Question" : [],
    "Desc DeepSeek" : [],
    "Desc GPT3.5" : [],
    "Desc GPT4" : [],
    "Desc Gemini" : [],
    "ChatQ NSQL" : [],
    "ChatQ DeepSeek" : [],
    "ChatQ GPT3.5" : [],
    "ChatQ GPT4" : [],
    "ChatQ Gemini" : []
}

measurement_data = {
    "Question" : [],
    "Time Desc LLM" : [],
    "Time ChatQ NSQL" : [],
    "Time ChatQ LLM" : [],
    "Token Desc LLM" : [],
    "Token ChatQ LLM" : []
}

temp_result_file = "results/temp_spider_experiments.xlsx"
temp_eval_file = "results/temp_spider_measurement_experiments.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()

if os.path.exists(temp_eval_file):
    print("File exist")
    _df = pd.read_excel(temp_eval_file)
    i = _df.shape[0]
    for key in measurement_data:
        if key in _df.columns:
            measurement_data[key] = _df[key].tolist()

for i in range(spider_df.shape[0]):
    tabs, cols = table_column_of_create_table(spider_df.iloc[i,1])
    if len(set(tabs).intersection(set(exists_tables))) == len(tabs) :

        start_time = time.time()
        question = spider_df.iloc[i,0]
        if question in predict_data['Question']: continue
        nsql_prompt = create_prompt(question, spider_df.iloc[i,1])
        actual_sql = spider_df.iloc[i,2]
        
        schema_desc_prompt = ""
        for use_tabl in tabs:
            used_schema = spider_description[exists_tables.index(use_tabl)]
            schema_desc_prompt += f"{used_schema['table']} - {used_schema['description']}\n"
            for col_name, col_desc in used_schema['columns'].items():
                schema_desc_prompt += f"\t{col_name} - {col_desc}\n"

        schema_desc_prompt += f"question: {question}\n"
        prompt_time = time.time() - start_time
        schema_provide_prompt = zero_shot_prompt + schema_desc_prompt + "query: "

        schema_gemini_results = LLM_gensql(schema_provide_prompt, system_content_schemaprovide, 'gemini-pro')
        schema_gpt3_5_results = LLM_gensql(schema_provide_prompt, system_content_schemaprovide, 'gpt-3.5-turbo')
        schema_gpt4_results = LLM_gensql(schema_provide_prompt, system_content_schemaprovide, 'gpt-4-0125-preview')
        schema_deepseek_result = LLM_gensql(schema_provide_prompt, system_content_schemaprovide, 'deepseek-coder')
        schema_gemini_results, schema_deepseek_result, schema_gpt3_5_results, schema_gpt4_results = await asyncio.gather(schema_gemini_results, schema_deepseek_result, schema_gpt3_5_results, schema_gpt4_results)
        llm_time = time.time() - start_time
        llm_time_process_time = llm_time - prompt_time
        llm_tokens = len(schema_provide_prompt) + len(system_content_schemaprovide)
        print("LLM times", llm_time)
        nsql_result = generate_nsql_sql(nsql_prompt)
        nsql_time = time.time() - start_time - llm_time_process_time
        print("NSQL time", nsql_time)

        masked_nsql_result = spider_masking_query(nsql_result, cols)
        chatq_prompt = zero_shot_prompt_mask + schema_desc_prompt + f"input : {masked_nsql_result}\nquery: "

        chatq_gemini_results = LLM_gensql(chatq_prompt, system_content_fillmask, 'gemini-pro')
        chatq_gpt3_5_results = LLM_gensql(chatq_prompt, system_content_fillmask, 'gpt-3.5-turbo')
        chatq_gpt4_results = LLM_gensql(chatq_prompt, system_content_fillmask, 'gpt-4-0125-preview')
        chatq_deepseek_result = LLM_gensql(chatq_prompt, system_content_fillmask, 'deepseek-coder')

        # print("Generating SQL...")
        chatq_gemini_results, chatq_gpt3_5_results, chatq_gpt4_results, chatq_deepseek_result = await asyncio.gather(chatq_gemini_results, chatq_gpt3_5_results, chatq_gpt4_results, chatq_deepseek_result)
        chatq_time = time.time() - start_time - llm_time_process_time
        print("ChatQ time", chatq_time)
        chatq_tokens = len(chatq_prompt) + len(system_content_fillmask)

        print(question)
        # print(schema_gemini_results, schema_deepseek_result, schema_gpt3_5_results, schema_gpt4_results, chatq_gemini_results, chatq_gpt3_5_results, chatq_gpt4_results, chatq_deepseek_result, sep='\n')

        predict_data['Question'].append(question)
        predict_data['Desc DeepSeek'].append(schema_deepseek_result)
        predict_data['Desc GPT3.5'].append(schema_gpt3_5_results)
        predict_data['Desc GPT4'].append(schema_gpt4_results)
        predict_data['Desc Gemini'].append(schema_gemini_results)
        predict_data['ChatQ NSQL'].append(nsql_result)
        predict_data['ChatQ DeepSeek'].append(chatq_deepseek_result)
        predict_data['ChatQ GPT3.5'].append(chatq_gpt3_5_results)
        predict_data['ChatQ GPT4'].append(chatq_gpt4_results)
        predict_data['ChatQ Gemini'].append(chatq_gemini_results)

        measurement_data['Question'].append(question)
        measurement_data["Time Desc LLM"].append(llm_time)
        measurement_data["Time ChatQ NSQL"].append(nsql_time)
        measurement_data["Time ChatQ LLM"].append(chatq_time)
        measurement_data["Token Desc LLM"].append(llm_tokens)
        measurement_data["Token ChatQ LLM"].append(chatq_tokens)

        if not i % 10:
            save_df = pd.DataFrame(predict_data)
            measurement_df = pd.DataFrame(measurement_data)
            save_df.to_excel(temp_result_file, index=False)
            measurement_df.to_excel(temp_eval_file, index=False)
            print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
measurement_df = pd.DataFrame(measurement_data)
save_df.to_excel("results/model_spider_experiments.xlsx", index=False)
measurement_df.to_excel("results/spider_measurement_experiments.xlsx", index=False)

# Eval PointX 

In [None]:
measurement_data = {
    "Question" : [],
    "Time Desc LLM" : [],
    "Time ChatQ LLM" : [],
    "Token Desc LLM" : [],
    "Token ChatQ LLM" : []
}

predict_data = {
    "Question" : [],
    "Desc DeepSeek" : [],
    "Desc GPT3.5" : [],
    "Desc GPT4" : [],
    "Desc Gemini" : [],
    "ChatQ DeepSeek" : [],
    "ChatQ GPT3.5" : [],
    "ChatQ GPT4" : [],
    "ChatQ Gemini" : []
}

temp_result_file = "results/temp_pointx_experiments.xlsx"
temp_eval_file = "results/temp_eval_pointx.xlsx"

if os.path.exists(temp_result_file):
    print("File exist")
    _df = pd.read_excel(temp_result_file)
    i = _df.shape[0]
    for key in predict_data:
        if key in _df.columns:
            predict_data[key] = _df[key].tolist()

if os.path.exists(temp_eval_file):
    print("File exist")
    _df = pd.read_excel(temp_eval_file)
    i = _df.shape[0]
    for key in measurement_data:
        if key in _df.columns:
            measurement_data[key] = _df[key].tolist()

i = 0
for table_name in q_pair_df['Table'].unique():
    table_df = q_pair_df[q_pair_df['Table'] == table_name]
    table_questions = table_df['Question'].to_list()
    table_actualSQL = table_df['Actual SQL'].to_list()
    
    
    for question, sql in zip(table_questions, table_actualSQL):
        if question in predict_data['Question']: continue
        start_time = time.time()
        llm_used_schema = schema_link.filter_schema(question, [table_name], max_n=50)
        llm_prompt = create_llm_prompt(schema_link, llm_used_schema, question, zero_shot_prompt, is_marked=False)

        gemini_results = LLM_gensql(llm_prompt, system_content_schemaprovide, 'gemini-pro')
        gpt3_5_results = LLM_gensql(llm_prompt, system_content_schemaprovide, 'gpt-3.5-turbo')
        gpt4_results = LLM_gensql(llm_prompt, system_content_schemaprovide, 'gpt-4-0125-preview')
        deepseek_result = LLM_gensql(llm_prompt, system_content_schemaprovide, 'deepseek-coder')
        
        schema_gemini_results, schema_deepseek_result, schema_gpt3_5_results, schema_gpt4_results = await asyncio.gather(gemini_results, deepseek_result, gpt3_5_results, gpt4_results)
        llm_time = time.time() - start_time
        llm_tokens = len(llm_prompt) + len(system_content_schemaprovide)
        print('LLM times', llm_time)
        chatq_prompt = await ChatQ_pipeline(question, domain_tables=[table_name], 
                                            llm_model_name=None, max_n=10,get_final_prompt=True)
        
        gemini_results = LLM_gensql(chatq_prompt, system_content_fillmask, 'gemini-pro')
        gpt3_5_results = LLM_gensql(chatq_prompt, system_content_fillmask, 'gpt-3.5-turbo')
        gpt4_results = LLM_gensql(chatq_prompt, system_content_fillmask, 'gpt-4-0125-preview')
        deepseek_result = LLM_gensql(chatq_prompt, system_content_fillmask, 'deepseek-coder')

        chatq_gemini_results, chatq_deepseek_result, chatq_gpt3_5_results, chatq_gpt4_results = await asyncio.gather(gemini_results, deepseek_result, gpt3_5_results, gpt4_results)
        chatq_time = time.time() - start_time - llm_time
        chatq_tokens = len(chatq_prompt) + len(system_content_fillmask)
        print("ChatQ times", chatq_time)
        print(question)

        predict_data['Question'].append(question)
        predict_data['Desc DeepSeek'].append(schema_deepseek_result)
        predict_data['Desc GPT3.5'].append(schema_gpt3_5_results)
        predict_data['Desc GPT4'].append(schema_gpt4_results)
        predict_data['Desc Gemini'].append(schema_gemini_results)
        predict_data['ChatQ DeepSeek'].append(chatq_deepseek_result)
        predict_data['ChatQ GPT3.5'].append(chatq_gpt3_5_results)
        predict_data['ChatQ GPT4'].append(chatq_gpt4_results)
        predict_data['ChatQ Gemini'].append(chatq_gemini_results)

        measurement_data['Question'].append(question)
        measurement_data["Time Desc LLM"].append(llm_time)
        measurement_data["Time ChatQ LLM"].append(chatq_time)
        measurement_data["Token Desc LLM"].append(llm_tokens)
        measurement_data["Token ChatQ LLM"].append(chatq_tokens)

        i += 1
        if not i % 10:
            save_df = pd.DataFrame(predict_data)
            measurement_df = pd.DataFrame(measurement_data)
            save_df.to_excel(temp_result_file, index=False)
            measurement_df.to_excel(temp_eval_file, index=False)
            print("SAVE TEMP COMPLETE", i)

save_df = pd.DataFrame(predict_data)
measurement_df = pd.DataFrame(measurement_data)
save_df.to_excel("results/pointx_experiments.xlsx", index=False)
measurement_df.to_excel("results/evel_pointx.xlsx", index=False)

In [46]:
pointx_exp = pd.read_excel('results/model_description_top10_experiments.xlsx')
cols = ['Desc DeepSeek', 'Desc Gemini']
for c in cols:
    pointx_exp[c] = pointx_exp[c].apply(extract_sql_query)
    pointx_exp[f'{c} result'] = pointx_exp[c].apply(query_pointx_db)

pointx_exp.to_excel("results/model_description_top10_experiments.xlsx", index=False)

In [47]:
pointx_exp

Unnamed: 0,Question,Desc DeepSeek,Desc GPT3.5,Desc GPT4,Desc Gemini,Desc DeepSeek result,Desc Gemini result
0,What is the total amout of all financial trans...,"SELECT month_id, SUM(ntx_pointx_financial) FRO...",API ERROR,API ERROR,"SELECT month_id, SUM(ntx_pointx_financial) AS ...","[(2022-07, 447), (2022-08, 259)]","[(2022-07, 447), (2022-08, 259)]"
1,What is the total amount of points generated b...,SELECT SUM(amt_point_topup) FROM pointx_keymat...,API ERROR,API ERROR,SELECT SUM(amt_point_topup) FROM pointx_keymat...,"[(None,)]","[(178992.0,)]"
2,What is the total amount of points generated b...,"SELECT month_id, SUM(amt_point_pay) FROM point...",API ERROR,API ERROR,"SELECT month_id, SUM(amt_point_pay) AS total_p...","[(2022-07, 30075.0), (2022-08, 30045.0)]","[(2022-07, 30075.0), (2022-08, 30045.0)]"
3,What is the average rate of released points fo...,SELECT AVG(rate_point_per_baht_pay_qr_cs) FROM...,API ERROR,API ERROR,SELECT rate_point_per_baht_pay_weight\nFROM po...,"[(4.995833333333334,)]","[(0.0,), (0.0,), (18.25,), (11.461538461538462..."
4,Can you determine the average number of custom...,SELECT AVG(ncust_visit) FROM pointx_keymatrix_...,API ERROR,API ERROR,"SELECT \n STRFTIME('%Y-%m',date) AS month,\...","[(50.48,)]",CANNOT FETCHING DATA
...,...,...,...,...,...,...,...
119,Top 5 E-coupon with the most transactions,"SELECT e_coupon_display, COUNT(transaction_id)...",API ERROR,API ERROR,"SELECT e_coupon_display, count(*) FROM pointx_...","[(None, 0)]","[(None, 1000)]"
120,Top 3 delivery type with the least transactions,"SELECT delivery_type, COUNT(transaction_id) as...",API ERROR,API ERROR,"SELECT delivery_type, COUNT(*) AS tc FROM poin...","[(None, 0)]","[(None, 1000)]"
121,Top 3 deal of the day with the most transactions,"SELECT deal_title, COUNT(transaction_id) as tr...",API ERROR,API ERROR,"SELECT \n\tdeal_title,\n\tCOUNT(transaction_id...","[(None, 0)]","[(None, 0)]"
122,What is the total amout of transactions for ea...,"SELECT customer_type, SUM(ntx) FROM pointx_cus...",API ERROR,API ERROR,"SELECT customer_type, SUM(ntx) AS total_transa...","[(Easy, 290), (GUEST, 0), (Non Easy, 56)]","[(Easy, 290), (GUEST, 0), (Non Easy, 56)]"
