In [2]:
from openai import OpenAI

# Smoke test: send one fixed question to the chat completions endpoint and
# print the assistant's reply message.
# OpenAI() is constructed with no arguments — presumably the API key is read
# from the environment; confirm before running on a new machine.
client = OpenAI()

completion = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    # The system message sets the persona; the user message is the question.
    {"role": "system", "content": "You are a teacher"},
    {"role": "user", "content": "What is the capital of france?"}
  ]
)
# Only the first choice's message object is printed.
print(completion.choices[0].message)

ChatCompletionMessage(content='The capital of France is Paris.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


### Creating Database from CSV files

In [None]:
import sqlite3
import os
import pandas as pd

def csv_folder_to_database_custom_schema(folder_path, db_path):
    """Load every ``*.csv`` file in a folder into its own SQLite table.

    Each CSV becomes one table named after the file (extension dropped,
    spaces and hyphens replaced by underscores). The table is created with a
    fixed game-record schema if it does not already exist, and every CSV row
    is inserted using the CSV header as the column list.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.csv`` files.
        db_path: Path of the SQLite database file to create or extend.

    Returns:
        sqlite3.Connection: the open connection, with all inserts committed.
        The caller is responsible for closing it.

    Raises:
        sqlite3.Error: if a statement fails, for example a duplicate
            ``game_id`` when the same CSV is imported twice. Pending inserts
            are rolled back and the connection is closed before re-raising.
    """
    def quote_identifier(name):
        # Double-quote an SQL identifier (doubling embedded quotes) so that
        # names starting with a digit or containing punctuation stay valid.
        return '"' + str(name).replace('"', '""') + '"'

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # sorted() makes the import order independent of the file system.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.endswith(".csv"):
                continue

            file_path = os.path.join(folder_path, filename)
            df = pd.read_csv(file_path)
            table_name = os.path.splitext(filename)[0].replace(" ", "_").replace("-", "_")
            print(table_name)
            quoted_table = quote_identifier(table_name)
            cursor.execute(f"""
                    CREATE TABLE IF NOT EXISTS {quoted_table}(
                        day INTEGER NOT NULL CHECK (day BETWEEN 1 AND 31),
                        month TEXT NOT NULL,
                        year INTEGER NOT NULL CHECK (year BETWEEN 1800 AND 2100),
                        dayname TEXT NOT NULL,
                        season INTEGER NOT NULL,
                        stadium TEXT NOT NULL,
                        city TEXT NOT NULL,
                        state TEXT NOT NULL,
                        attendance INTEGER NOT NULL,
                        capacity INTEGER NOT NULL,
                        game_id INTEGER PRIMARY KEY,
                        summary TEXT
                    )
                """)
            print(f"Custom schema applied for table '{table_name}'.")

            placeholders = ', '.join(['?'] * len(df.columns))
            column_names = ', '.join(quote_identifier(column) for column in df.columns)
            insert_query = f"INSERT INTO {quoted_table} ({column_names}) VALUES ({placeholders})"
            # itertuples yields plain Python scalars column by column; missing
            # values are mapped to None so they are stored as SQL NULL.
            records = [
                tuple(None if pd.isna(value) else value for value in record)
                for record in df.itertuples(index=False, name=None)
            ]
            cursor.executemany(insert_query, records)

        conn.commit()
    except Exception:
        # Do not leave a half-finished transaction or an open handle behind.
        conn.rollback()
        conn.close()
        raise

    print("All CSV files have been successfully imported with the custom schema.")
    return conn

# Source folder of CSV files and destination SQLite database file.
# Both are hard-coded absolute Windows paths; edit them for the local machine.
folder_path = r'D:\NSFQA\Question Generation\TestTables_5'
db_path = r'D:\NSFQA\Question Generation\SQL\new_database.db'

# Build the database. The returned connection is kept in `conn` and stays open.
conn = csv_folder_to_database_custom_schema(folder_path, db_path)


sportset_2
Custom schema applied for table 'sportset_2'.
sportset_coldtemp_30_13
Custom schema applied for table 'sportset_coldtemp_30_13'.
sportset_midwest_30_8
Custom schema applied for table 'sportset_midwest_30_8'.
sportset_northeast_30_1
Custom schema applied for table 'sportset_northeast_30_1'.
sportset_west_30_4
Custom schema applied for table 'sportset_west_30_4'.
All CSV files have been successfully imported with the custom schema.


In [None]:
import sqlite3

# Reconnect to the database file and list the tables it contains.
db_path = r"D:\NSFQA\Question Generation\SQL\new_database.db" 
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# sqlite_master is queried for every entry whose type is 'table'.
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
# Each fetched row has a single column: the table name.
tables = [row[0] for row in cursor.fetchall()]
print("Tables in the database:", tables)


# Release the connection; `tables` and `db_path` remain available to later cells.
conn.close()


Tables in the database: ['sportset_2', 'sportset_coldtemp_30_13', 'sportset_midwest_30_8', 'sportset_northeast_30_1', 'sportset_west_30_4']


In [None]:
# Ad-hoc sanity check: run one hand-written query and show the resulting
# DataFrame as text. The connection is only needed for this read, so it is
# closed straight away (matching the other cells) instead of being left open.
conn = sqlite3.connect(db_path)
try:
    result = pd.read_sql_query("SELECT dayname FROM sportset_2 WHERE attendance > 17000 AND state = 'Tennessee'", conn)
finally:
    conn.close()
str(result)

'    dayname\n0  Saturday\n1  Saturday\n2   Tuesday'

In [None]:
import pandas as pd

# Notebook-global connection and cursor for the database at `db_path`.
# The connection is closed again at the end of this cell.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

def get_table_schema_and_rows(table_name, connection=None):
    """Return a table's column types and its first five rows.

    Args:
        table_name: Name of the table to inspect.
        connection: Optional open ``sqlite3.Connection`` to read from. When
            omitted, the notebook-global ``conn`` is used, so existing
            one-argument calls keep working.

    Returns:
        tuple: ``(schema_dict, df)`` where ``schema_dict`` maps each column
        name to its declared type (fields 1 and 2 of ``PRAGMA table_info``)
        and ``df`` is a DataFrame holding at most five rows of the table.
    """
    # Resolve the global lazily so a connection re-opened in a later cell is picked up.
    active_conn = conn if connection is None else connection
    # Quote the identifier so names with spaces or leading digits stay valid SQL.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'
    table_info = active_conn.execute(f"PRAGMA table_info({quoted_name})").fetchall()
    schema_dict = {col[1]: col[2] for col in table_info}
    df = pd.read_sql_query(f"SELECT * FROM {quoted_name} LIMIT 5", active_conn)
    return schema_dict, df

# Inspect the first table; `tables` is the list built by the table-listing cell.
table_name = tables[0]
schema, rows = get_table_schema_and_rows(table_name)
print("Schema:", schema)
print("\nSample Rows:")
print(rows)
# Release the connection opened at the top of this cell.
conn.close()


Schema: {'day': 'INTEGER', 'month': 'TEXT', 'year': 'INTEGER', 'dayname': 'TEXT', 'season': 'INTEGER', 'stadium': 'TEXT', 'city': 'TEXT', 'state': 'TEXT', 'attendance': 'INTEGER', 'capacity': 'INTEGER', 'game_id': 'INTEGER', 'summary': 'TEXT'}

Sample Rows:
   day     month  year    dayname  season                  stadium  \
0   24   January  2015   Saturday    2014               FedExForum   
1    3  December  2014  Wednesday    2014  Time Warner Cable Arena   
2   13  December  2014   Saturday    2014  Time Warner Cable Arena   
3   20     March  2015     Friday    2014             Amway Center   
4   28   October  2014    Tuesday    2014     Smoothie King Center   

          city           state  attendance  capacity  game_id  \
0      Memphis       Tennessee       17600     17800      269   
1    Charlotte  North Carolina       16900     19100      379   
2    Charlotte  North Carolina       17100     19100      382   
3      Orlando         Florida       16200     18800      608

### Generating queries

In [None]:
from openai import OpenAI
import random
import json
import sqlite3
# NOTE(review): this database path (D:\LLMTables\...) differs from the one
# built and inspected in the earlier cells (D:\NSFQA\...) — confirm it is the
# intended database.
db_path = r"D:\LLMTables\Question Generation\test\new_database_2.db"  
# These rebind the notebook-global `conn` and `cursor`; they are not closed in this cell.
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# OpenAI() takes no arguments here — presumably the API key comes from the environment; confirm.
client = OpenAI()

def generate_queries(table_name, schema, rows):
    """Ask the chat model for five single-result SQL queries about one table.

    Args:
        table_name: Name of the table the queries should target.
        schema: JSON-serialisable description of the table's columns; it is
            embedded in the prompt with ``json.dumps``.
        rows: pandas DataFrame of example rows. Up to five distinct rows are
            sampled from it and shown to the model.

    Returns:
        The model's reply parsed with ``json.loads``. The prompt asks for
        ``{"queries": [{"Query": ..., "Table": ...}, ...]}``, which is the
        shape the batch-processing code reads, but the reply is not validated
        here.

    Raises:
        json.JSONDecodeError: if the reply text is not valid JSON.
    """
    # Sample without replacement so the prompt never repeats a row, and cap the
    # sample at the number of rows available so small or empty frames still work.
    sample_size = min(5, len(rows))
    row_samples = rows.sample(
        n=sample_size, random_state=random.randint(1, 1000)
    ).to_dict(orient="records")
    print(f"Generating queries for {table_name}")
    messages = [
        {"role": "system", "content": "Act as an expert in SQL and databases. Please give valid output JSON."},
        {"role": "user", "content": f"""
         Read the table schema and {len(row_samples)} rows given from the table carefully and understand it correctly - 
         
         Table Schema:
         {json.dumps(schema, indent=4)}

         ROW DATA:
         {json.dumps(row_samples, indent=4)}

         SQL TEMPLATE:

         SELECT [column] FROM {table_name} WHERE [condition1] AND [condition2]

         Instruction:
            Please use the information from the table and data provided to fill in the placeholders in the template. Each SQL query should only return a single result using either:

            - A specific column (e.g., city, attendance, capacity, etc.)
            - An aggregate function (e.g., COUNT(), SUM(), MAX(), etc.)
            Follow the template given and try to fill in the placeholders in a way that can lead to logical and complex queries.
            Ensure that the queries generate deterministic answers, such as a single count, maximum, or specific column value (e.g., "How many games were held in Orlando?").
            Generate 5 such SQL queries.
            
            Please follow the Response format while answering. Respond with a single JSON object of exactly this shape, with one list entry per query:
            {{"queries": [{{"Query": "<Single Liner SQL Query>", "Table": "{table_name}"}}]}}
         """}
    ]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        # JSON mode: the model is asked to return one JSON object.
        response_format={"type": "json_object"}
    )
    content = completion.choices[0].message.content
    reply = json.loads(content)
    return reply

# Generate queries for one table. `table_name`, `schema` and `rows` are the
# globals produced by the earlier schema-inspection cell.
queries_json = generate_queries(table_name, schema, rows)

print("Generated JSON Output:")
print(json.dumps(queries_json, indent=4))

ImportError: cannot import name 'generate_queries_prompt' from 'prompts' (d:\NSFQA\Question Generation\SQL\prompts.py)

### Scaled Query Generation

In [None]:
def process_all_tables_and_save_simple(db_path, output_file):
    """Generate queries for every table in a database and save them as JSON.

    For each table listed in ``sqlite_master`` the column types and up to
    five rows are read, ``generate_queries`` is called, and one
    ``{"table_name", "query"}`` entry is collected per returned query.

    Args:
        db_path: Path of the SQLite database to read.
        output_file: Path of the JSON file to write.

    Returns:
        list[dict]: the entries that were written to ``output_file``.

    Raises:
        KeyError: if a reply from ``generate_queries`` has no ``"queries"``
            list or an entry has no ``"Query"`` key.
    """
    final_result = []
    conn = sqlite3.connect(db_path)
    try:
        listing = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        tables = [row[0] for row in listing]

        for table_name in tables:
            print(f"Processing table: {table_name}")

            # Read the schema and sample rows through the connection opened
            # for `db_path`, not through whatever notebook-global connection
            # happens to exist, so the data always comes from the requested file.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            table_info = conn.execute(f"PRAGMA table_info({quoted_name})").fetchall()
            schema = {col[1]: col[2] for col in table_info}
            rows = pd.read_sql_query(f"SELECT * FROM {quoted_name} LIMIT 5", conn)

            queries_json = generate_queries(table_name, schema, rows)

            for query in queries_json["queries"]:
                final_result.append({
                    # Fall back to the real table name if the model omitted it.
                    "table_name": query.get("Table", table_name),
                    "query": query["Query"]
                })
    finally:
        # Always release the handle, including when the model call fails.
        conn.close()

    with open(output_file, "w") as f:
        json.dump(final_result, f, indent=4)

    print(f"JSON output saved to {output_file}")
    # Callers assign and print the return value, so hand the entries back.
    return final_result

# Database to read and JSON file to write (hard-coded absolute Windows paths).
db_path = r"D:\LLMTables\Question Generation\test\new_database_2.db"  
output_file = r"D:\LLMTables\Question Generation\test\simple_queries_output.json"
all_tables_result = process_all_tables_and_save_simple(db_path, output_file)

print("Generated Simplified JSON Output for All Tables:")
# Prints whatever the function returned (JSON `null` if it returned None).
print(json.dumps(all_tables_result, indent=4))


### Execute Queries and Modify JSON

In [None]:
import sqlite3
import json

def execute_queries_and_update_json(db_path, input_json_file, output_json_file):
    """Run each stored SQL query and save the queries together with their results.

    Args:
        db_path: Path of the SQLite database the queries run against.
        input_json_file: JSON file holding a list of objects with
            ``"table_name"`` and ``"query"`` keys.
        output_json_file: Path the augmented list is written to.

    Returns:
        list[dict]: one ``{"table_name", "query", "result"}`` entry per input
        entry. ``result`` is the bare value when the query yields exactly one
        row with one column, ``str(rows)`` for any other result shape, and
        ``"Error: <message>"`` when execution fails.
    """
    with open(input_json_file, "r") as f:
        queries_json = json.load(f)

    updated_queries = []

    conn = sqlite3.connect(db_path)
    try:
        # The SQL text is machine-generated, so make this connection read-only:
        # a stray DROP/DELETE/UPDATE then fails and is recorded as an error
        # instead of altering the database.
        conn.execute("PRAGMA query_only = ON")
        cursor = conn.cursor()

        for entry in queries_json:
            table_name = entry["table_name"]
            query = entry["query"]
            print(query)

            try:
                cursor.execute(query)
                result = cursor.fetchall()
                print(result)
                if len(result) == 1 and len(result[0]) == 1:
                    # Single scalar answer: keep its native type.
                    result_value = result[0][0]
                else:
                    result_value = str(result)
            except Exception as e:
                # Best effort by design: one bad query must not stop the batch.
                result_value = f"Error: {str(e)}"

            updated_queries.append({
                "table_name": table_name,
                "query": query,
                "result": result_value
            })
    finally:
        # Release the handle even if an entry is malformed.
        conn.close()

    with open(output_json_file, "w") as f:
        json.dump(updated_queries, f, indent=4)

    print(f"Updated JSON with results saved to {output_json_file}")
    return updated_queries

# Inputs and output for the execution step (hard-coded absolute Windows paths).
# `input_json_file` is the file written by the scaled query-generation step.
db_path = r"D:\LLMTables\Question Generation\test\new_database_2.db"
input_json_file = r"D:\LLMTables\Question Generation\test\simple_queries_output.json"  
output_json_file = r"D:\LLMTables\Question Generation\test\queries_with_results.json" 

updated_result = execute_queries_and_update_json(db_path, input_json_file, output_json_file)

print("Updated JSON with Results:")
print(json.dumps(updated_result, indent=4))


### Convert SQL Query to Natural Language

In [None]:
from openai import OpenAI
import json

client = OpenAI()
def convert_sql_to_natural_language(input_json_file, output_json_file):
    """Turn each stored SQL query into a natural-language question.

    Reads a JSON list of entries (each with ``"query"`` and ``"table_name"``
    and optionally ``"result"``), asks the chat model for one question per
    query, writes the entries plus a ``"question"`` field to
    ``output_json_file`` and returns that list. If the model call fails, the
    question is replaced by an ``"Error generating question: ..."`` string.
    """
    with open(input_json_file, "r") as source:
        entries = json.load(source)

    converted = []

    for record in entries:
        sql_query = record["query"]
        table_name = record["table_name"]
        result = record.get("result")

        prompt = f"""
        You are an expert data scientist skilled in SQL and natural language processing. Your task is to convert SQL queries into natural language questions. 

        Here is the SQL Query: 
        {sql_query}
        The questions should:
        - Clearly represent the intent of the SQL query.
        - Translate technical terms, column headers, and values into natural, descriptive forms.
        - Avoid technical jargon unless absolutely necessary.
        - Ensure the question retains the same scope and meaning as the SQL query to avoid altering the query's answer.

        For example:
        SQL: SELECT COUNT(game_id) FROM sportset_2 WHERE city = 'Orlando' AND year = 2015
        Output: How many games took place in Orlando in the year 2015?

        Please convert this SQL query into a single natural language question. Ensure the column headers and values are human-readable. 
        Respond STRICTLY in the following format:
        Question: <Natural Language Question>
        """

        messages = [
            {"role": "system", "content": "Act as an expert data scientist skilled in SQL and natural language processing."},
            {"role": "user", "content": prompt},
        ]

        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=messages,
            )
            # Take the first choice; any further choices are ignored.
            first_choice, *_ = response.choices
            content = first_choice.message.content
            print(content)
            # Drop the "Question: " label requested by the prompt.
            question = content.replace("Question: ", "").strip()
        except Exception as exc:
            question = f"Error generating question: {str(exc)}"

        converted.append({
            "table_name": table_name,
            "query": sql_query,
            "result": result,
            "question": question,
        })

    with open(output_json_file, "w") as sink:
        json.dump(converted, sink, indent=4)

    print(f"Updated JSON with natural language questions saved to {output_json_file}")
    return converted
# Input is the file written by the query-execution step; the output gains a
# "question" field per entry. Both are hard-coded absolute Windows paths.
input_json_file = r"D:\LLMTables\Question Generation\test\queries_with_results.json"  
output_json_file = r"D:\LLMTables\Question Generation\test\natural_language_output.json" 

updated_json = convert_sql_to_natural_language(input_json_file, output_json_file)

print("Updated JSON with Natural Language Questions:")
print(json.dumps(updated_json, indent=4))


### Evaluation

In [None]:
import pandas as pd
import json
import openai

def convert_to_pipe_format(path_to_csv):
    """Render a CSV file as a pipe-separated text table for use in a prompt.

    The output has the form::

        /*
        col : <header> | <header> ...
        row 0 : <value> | <value> ...
        ...
        */
        columns:[<list of column names>]

    Args:
        path_to_csv: Path of the CSV file to read with ``pandas.read_csv``.

    Returns:
        str: the rendered table, ending with a newline.
    """
    df = pd.read_csv(path_to_csv)
    col_list = df.columns.values.tolist()

    lines = ['/*', 'col : ' + ' | '.join(df.columns)]
    # itertuples keeps every column's own dtype. iterrows would coerce each row
    # to one common dtype, so integers in an all-numeric table would be
    # rendered as floats (17600 -> 17600.0).
    for row_id, values in zip(df.index, df.itertuples(index=False, name=None)):
        lines.append(f'row {row_id} : ' + ' | '.join(str(value) for value in values))
    lines.append('*/')
    lines.append(f'columns:{col_list}')
    # Collect the lines and join once instead of growing a string in a loop.
    return '\n'.join(lines) + '\n'

def generate_short_answer(table, question):
    """Ask the chat model to answer ``question`` from ``table`` in a few words.

    ``table`` is the text rendering of the table and is embedded verbatim in
    the prompt. The call uses temperature 0 and returns the first choice's
    message text with surrounding whitespace stripped.
    """
    answer_prompt = f"""
    Here is the table to answer this question. Answer the question in 3-4 words max.
    {table}
    Question: {question}
    The answer is: 
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0,
        messages=[
            {"role": "system", "content": "You are an expert in answering questions from tabular data."},
            {"role": "user", "content": answer_prompt},
        ],
    )
    return response.choices[0].message.content.strip()

def evaluate_qa_pair(qa_pair, correct_answers_list):
    """Check whether the model answers one question with the stored result.

    Args:
        qa_pair: Mapping with ``"table_name"``, ``"question"`` and
            ``"result"`` keys. The table is loaded from
            ``<table_name>.csv`` in the hard-coded test directory.
        correct_answers_list: List that ``qa_pair`` is appended to when the
            answer matches.

    Returns:
        bool: True when the generated answer equals the stored result
        (compared as text), False otherwise. Mismatches are printed.
    """
    table_path = rf"D:\LLMTables\Question Generation\test\{qa_pair['table_name']}.csv"
    table = convert_to_pipe_format(table_path)
    question = qa_pair['question']
    generated_answer = generate_short_answer(table, question)
    # The stored result keeps its native JSON type (int, float, None or str),
    # while the model's answer is always text. Compare as text; otherwise a
    # numeric result can never match and the string concatenation in the
    # mismatch report raises TypeError.
    correct_answer = str(qa_pair["result"]).strip()
    if generated_answer == correct_answer:
        correct_answers_list.append(qa_pair)
        return True
    else:
        print("incorrect answer")
        print(question)
        print("actual: " + correct_answer)
        print("generated: " + generated_answer)
        return False
    
def process_evaluation(json_data):
    """Evaluate every question/answer pair and compute the accuracy.

    Args:
        json_data: List of question/answer entries; each one is passed to
            ``evaluate_qa_pair``.

    Returns:
        tuple: ``(correct_answers, accuracy, incorrect_answers)`` — the number
        of correct answers, the accuracy as a percentage (``0.0`` for empty
        input) and the list of entries answered incorrectly.
    """
    total_questions = len(json_data)
    print("total questions: " + str(total_questions))
    correct_answers = 0
    incorrect_answers = []
    # One shared collector instead of a throwaway list per call.
    correct_pairs = []

    for qa_pair in json_data:
        if evaluate_qa_pair(qa_pair, correct_answers_list=correct_pairs):
            correct_answers += 1
        else:
            incorrect_answers.append(qa_pair)

    # Guard against an empty input file, which used to raise ZeroDivisionError.
    accuracy = (correct_answers / total_questions) * 100 if total_questions else 0.0
    return correct_answers, accuracy, incorrect_answers

def save_incorrect_answers(incorrect_answers, output_path):
    """Write ``incorrect_answers`` to ``output_path`` as 4-space-indented JSON.

    Prints a confirmation line with the path once the file has been written.
    """
    with open(output_path, "w") as sink:
        json.dump(incorrect_answers, sink, indent=4)
    print(f"Incorrectly answered questions saved to {output_path}")

def evaluation_pipeline(input_json_path, incorrect_output_json_path):
    """Run the full evaluation: load the QA pairs, score them, report, save misses.

    Loads the JSON list at ``input_json_path``, evaluates it with
    ``process_evaluation``, prints the number of correct answers and the
    accuracy, and writes the incorrectly answered entries to
    ``incorrect_output_json_path``.
    """
    with open(input_json_path, "r") as source:
        qa_pairs = json.load(source)

    n_correct, accuracy_pct, missed = process_evaluation(qa_pairs)

    print(f"Total Correct Answers: {n_correct}")
    print(f"Accuracy: {accuracy_pct:.2f}%")

    save_incorrect_answers(missed, incorrect_output_json_path)

# Entry point: evaluate the questions produced by the natural-language
# conversion step and save the ones answered incorrectly.
if __name__ == "__main__":
    # Hard-coded absolute Windows paths; edit them for the local machine.
    input_json_path = r"D:\LLMTables\Question Generation\test\natural_language_output.json"  
    incorrect_output_json_path = r"D:\LLMTables\Question Generation\test\incorrect_answers.json"  
    evaluation_pipeline(input_json_path, incorrect_output_json_path)