## 1. Import Thư viện Cần Thiết

In [None]:
import pandas as pd
import google.generativeai as genai
import pyodbc
import json
import time
from collections import deque
import re
import datetime
import os

  from .autonotebook import tqdm as notebook_tqdm


## 2. Cấu hình Hệ thống

In [None]:
# Gemini API keys used for key rotation. Keys are read from the environment;
# an unset variable is dropped instead of being stored as a None entry that
# would be tracked and rotated as if it were a real key.
GEMINI_API_KEYS = [
    key
    for key in (
        os.environ.get('GOOGLE_API_KEY'),
    )
    if key
]

API_RATE_LIMIT_PER_MINUTE = 100
API_KEY_COOLDOWN_SECONDS = 70
# One attempt per configured key, but never fewer than one: with no key at all
# the generation call must still run once so it can report the missing key.
MAX_API_RETRIES_PER_QUESTION = max(1, len(GEMINI_API_KEYS))

# Per-key request counter and the start time of the current counting window.
api_key_usage = {key: {"count": 0, "last_reset_time": time.time()} for key in GEMINI_API_KEYS}
current_api_key_index = 0

# NOTE(review): database credentials are hard-coded below; consider loading
# them from the environment before sharing this notebook.
DB_SERVER = 'localhost'
DB_NAME = 'text_to_sql'
username = 'sa'
password = '123456'
DB_CONN_STRING = f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={DB_SERVER};DATABASE={DB_NAME};UID={username};PWD={password}'

INPUT_CSV_FILE = 'question.csv'
OUTPUT_CSV_FILE = 'result_os.csv'  # Output file name for this (online synthetic) variant
SCHEMA_FILE = 'm-schema.txt'

NUM_SYNTHETIC_EXAMPLES_RF = 2  # Number of examples for common SQL features
NUM_SYNTHETIC_EXAMPLES_RT = 2  # Number of examples for schema interpretation (focused on relevant tables)


## 3. Các Hàm Hỗ Trợ

### 3.1. Hàm Chuyển đổi Date/Datetime cho JSON

In [3]:
def json_date_converter(obj):
    """Serialize date/datetime values for ``json.dumps(default=...)``.

    Returns the ISO-8601 string of ``obj`` when it is a ``datetime.date`` or
    ``datetime.datetime``; raises ``TypeError`` for any other type, which is
    what ``json.dumps`` expects from a ``default`` hook.
    """
    is_temporal = isinstance(obj, datetime.date) or isinstance(obj, datetime.datetime)
    if not is_temporal:
        raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable")
    return obj.isoformat()

### 3.2. Hàm Quản lý và Xoay vòng API Key

In [4]:
def get_next_available_api_key():
    """Pick a Gemini API key that still has quota and configure ``genai`` with it.

    Starts at the module-level ``current_api_key_index`` and walks the
    ``GEMINI_API_KEYS`` list. A key's counter is reset once more than
    ``API_KEY_COOLDOWN_SECONDS`` have passed since its last reset. When every
    key has been tried and all are at ``API_RATE_LIMIT_PER_MINUTE``, the
    function sleeps until the earliest key should be usable again and retries.

    Returns:
        ``(api_key, index)`` for the selected key, or ``(None, -1)`` when the
        search loop ends without finding one.
    """
    global current_api_key_index
    global api_key_usage
    start_index = current_api_key_index
    attempts = 0 
    max_attempts_for_key_search = len(GEMINI_API_KEYS) * 2

    while attempts < max_attempts_for_key_search:
        api_key = GEMINI_API_KEYS[current_api_key_index]
        usage_stats = api_key_usage[api_key]
        current_time = time.time()

        # Start a fresh counting window once the cooldown has elapsed.
        if current_time - usage_stats["last_reset_time"] > API_KEY_COOLDOWN_SECONDS:
            usage_stats["count"] = 0
            usage_stats["last_reset_time"] = current_time

        # The request is counted up front, before the caller actually sends it.
        if usage_stats["count"] < API_RATE_LIMIT_PER_MINUTE:
            usage_stats["count"] += 1
            genai.configure(api_key=api_key)
            return api_key, current_api_key_index

        # Current key is exhausted: move to the next one (wrapping around).
        current_api_key_index = (current_api_key_index + 1) % len(GEMINI_API_KEYS)
        attempts += 1

        # Back at the starting key after trying every key: all are rate-limited.
        if current_api_key_index == start_index and attempts >= len(GEMINI_API_KEYS):
            # Time until each key's cooldown ends, plus a 0.5 s safety margin.
            wait_times = [(stats["last_reset_time"] + API_KEY_COOLDOWN_SECONDS + 0.5) - current_time 
                          for _, stats in api_key_usage.items()]
            positive_wait_times = [wt for wt in wait_times if wt > 0]
            min_wait_time = min(positive_wait_times) if positive_wait_times else 0.5
            if min_wait_time > 0:
                 print(f"All keys rate-limited. Waiting for {min_wait_time:.2f} seconds...")
                 time.sleep(min_wait_time)
            # Restart the attempt budget after waiting.
            attempts = 0 
    print("CRITICAL: Could not get an available API key after multiple attempts and waits.")
    return None, -1

### 3.3. Hàm Trích xuất SQL từ Markdown

In [5]:
def extract_sql_from_markdown(markdown_text):
    """Pull a SELECT statement out of LLM output that may be wrapped in markdown.

    Strategies are tried in order:
      1. a fenced block tagged ``sql``;
      2. an untagged fenced block whose content starts with SELECT;
      3. the text from the last line starting with SELECT to the end.

    Returns the extracted SQL string, or ``None`` when nothing matches.
    """
    search_flags = re.DOTALL | re.IGNORECASE

    tagged_fence = re.search(r"```sql\s*(.*?)\s*```", markdown_text, search_flags)
    if tagged_fence is not None:
        return tagged_fence.group(1).strip()

    plain_fence = re.search(r"```\s*(SELECT .*?)\s*```", markdown_text, search_flags)
    if plain_fence is not None:
        candidate = plain_fence.group(1).strip()
        if candidate.lower().startswith("select"):
            return candidate

    def _starts_with_select(text_line):
        return text_line.strip().lower().startswith("select")

    text_lines = markdown_text.strip().split('\n')
    if len(text_lines) == 1 and _starts_with_select(text_lines[0]):
        return text_lines[0].strip()

    # Scan from the bottom: the query is assumed to run from the last
    # SELECT-leading line through the end of the text.
    last_select_pos = next(
        (pos for pos in reversed(range(len(text_lines))) if _starts_with_select(text_lines[pos])),
        None,
    )
    if last_select_pos is None:
        return None
    return "\n".join(text_lines[last_select_pos:]).strip()

### 3.4. Hàm Tạo Ví dụ Tổng hợp (Online Synthetic Examples)

In [None]:
def parse_synthetic_examples(llm_output_text):
    """Extract (question, SQL) pairs from raw LLM output.

    Two formats are recognised:

    * The requested format, ``"input": "...", "output": "..."``. The SQL part
      may contain backslash-escaped characters (for example ``\\"``), which no
      longer cut the query short; literal ``\\n`` / ``\\t`` sequences are
      replaced by spaces.
    * A fallback used only when the first format yields nothing: lines
      beginning with ``Question:`` / ``Input:`` followed by ``SQL:`` /
      ``Output:``. Every non-blank line after the SQL label is treated as a
      continuation of the query until a blank line or the next question, so
      multi-line queries keep their FROM / WHERE / GROUP BY lines.

    Args:
        llm_output_text: Raw text returned by the model.

    Returns:
        A list of ``{"question": str, "sql": str}`` dictionaries (possibly empty).
    """
    examples = []

    # The output group accepts escaped characters so an escaped quote inside
    # the SQL does not terminate the match early.
    pattern = re.compile(
        r'"input":\s*"(.*?)",\s*"output":\s*"((?:\\.|[^"\\])*)"',
        re.DOTALL,
    )
    for q, s in pattern.findall(llm_output_text):
        cleaned_sql = (
            s.replace('\\n', ' ')
            .replace('\\t', ' ')
            .replace('\\"', '"')
            .strip()
        )
        examples.append({"question": q.strip(), "sql": cleaned_sql})

    if examples:
        return examples

    # Fallback: label-based "Question: ... / SQL: ..." layout.
    question_prefixes = ("question:", "input:")
    sql_prefixes = ("sql:", "output:")
    current_q = None
    sql_parts = None  # None until an SQL/output label has been seen

    for raw_line in llm_output_text.split('\n'):
        line = raw_line.strip()
        lowered = line.lower()

        if lowered.startswith(question_prefixes):
            if current_q and sql_parts:
                examples.append({"question": current_q, "sql": " ".join(sql_parts)})
            current_q = line.split(":", 1)[1].strip()
            sql_parts = None
        elif lowered.startswith(sql_prefixes) and current_q:
            first_fragment = line.split(":", 1)[1].strip()
            sql_parts = [first_fragment] if first_fragment else []
        elif sql_parts is not None:
            if not line:
                # A blank line ends a query that has already started, so
                # trailing commentary is not glued onto the SQL.
                if sql_parts:
                    examples.append({"question": current_q, "sql": " ".join(sql_parts)})
                    current_q = None
                    sql_parts = None
            elif not line.startswith("```"):
                # Code-fence markers are skipped; everything else continues the query.
                sql_parts.append(line)

    if current_q and sql_parts:  # Flush the last pending example
        examples.append({"question": current_q, "sql": " ".join(sql_parts)})

    return examples

def _request_synthetic_examples(prompt, num_examples, model_name, label):
    """Send one example-generation prompt and return the parsed examples.

    Returns ``None`` when no API key is available, otherwise a list holding at
    most ``num_examples`` parsed examples (empty if the API call failed).
    ``label`` ("Rf" or "Rt") is only used in the error message.
    """
    active_api_key, _ = get_next_available_api_key()
    if not active_api_key:
        return None

    model = genai.GenerativeModel(model_name)
    try:
        response = model.generate_content(prompt)
        return parse_synthetic_examples(response.text.strip())[:num_examples]
    except Exception as e:
        print(f"Lỗi khi tạo {label} synthetic examples với key ...{active_api_key[-4:]}: {e}")
        if "rate limit" in str(e).lower() or "quota" in str(e).lower() or "429" in str(e):
            # Mark the key as exhausted so rotation skips it until its cooldown ends.
            api_key_usage[active_api_key]["count"] = API_RATE_LIMIT_PER_MINUTE
        # No retry here; the caller decides whether to try again.
        return []


def generate_synthetic_examples_online(question_for_context, db_schema, num_examples_rf, num_examples_rt, model_name='gemini-2.0-flash'):
    """Generate in-context (question, SQL) examples for the current question.

    Two independent requests are made:
      * Rf - examples showcasing common SQL features over the whole schema;
      * Rt - examples focused on schema elements relevant to the question.

    Both prompts now consistently describe the model as an MS SQL Server
    expert (they previously said "SQLite" while requesting MS SQL Server SQL).

    Args:
        question_for_context: The user's question, given to the model as context only.
        db_schema: Schema text inserted into the prompts.
        num_examples_rf: Number of Rf examples to request (0 skips the phase).
        num_examples_rt: Number of Rt examples to request (0 skips the phase).
        model_name: Gemini model used for both requests.

    Returns:
        ``(examples, error)`` where ``examples`` is a list of
        ``{"question", "sql"}`` dicts and ``error`` is ``None`` or a message
        saying that no API key was available for one of the phases. API
        failures are printed and result in fewer examples, not in an error.
    """
    all_synthetic_examples = []

    # Phase 1: common SQL features (Rf) - uses the whole schema.
    if num_examples_rf > 0:
        prompt_rf = f"""
        You are an MS SQL Server SQL expert. Your job is to create {num_examples_rf} example pairs of (natural language question, MS SQL Server query).
        These examples should showcase diverse and common SQL features like various JOIN types (INNER, LEFT, RIGHT), GROUP BY, HAVING, common aggregate functions (COUNT, SUM, AVG, MIN, MAX), subqueries, and simple SELECT statements with WHERE clauses.
        Base these examples on the following MS SQL Server Database Schema:
        ---
        {db_schema}
        ---
        The current user question for context (DO NOT answer this question, just use it to understand the domain if helpful): "{question_for_context}"
        
        Provide the examples strictly in the following format, with each example on a new set of lines:
        "input": "[Your generated natural language question here]",
        "output": "[Your generated MS SQL Server query here]"
        
        Ensure the SQL queries are valid for MS SQL Server.
        Example of desired output format for one pair:
        "input": "How many students are in each major?",
        "output": "SELECT T2.major_name, COUNT(T1.student_id) FROM Students AS T1 JOIN Majors AS T2 ON T1.major_id = T2.major_id GROUP BY T2.major_name;"
        "input": "List all courses with more than 3 credits.",
        "output": "SELECT course_name FROM Courses WHERE no_credit > 3;"
        """
        examples_rf = _request_synthetic_examples(prompt_rf, num_examples_rf, model_name, "Rf")
        if examples_rf is None:
            return [], "Error: No API key for Rf examples"
        all_synthetic_examples.extend(examples_rf)

    # Phase 2: schema interpretation (Rt) - focuses on tables that look
    # relevant to the current question.
    if num_examples_rt > 0:
        prompt_rt = f"""
        You are an MS SQL Server SQL expert. Your job is to create {num_examples_rt} example pairs of (natural language question, MS SQL Server query).
        These examples should focus on correctly interpreting and using the database schema, especially tables and columns that seem relevant to the user's main question provided below for context.
        The user's current question (for context and to guide table/column focus - DO NOT directly answer this question):
        "{question_for_context}"
        
        Base your examples on the following MS SQL Server Database Schema:
        ---
        {db_schema}
        ---
        Provide the examples strictly in the following format, with each example on a new set of lines:
        "input": "[Your generated natural language question here, focusing on schema elements relevant to the user's contextual question]",
        "output": "[Your generated MS SQL Server query here]"
        
        Ensure the SQL queries are valid for MS SQL Server.
        Example of desired output format for one pair:
        "input": "What is the name of the department for lecturer LEC01?",
        "output": "SELECT T2.dep_name FROM Lecturers AS T1 JOIN Departments AS T2 ON T1.dep_id = T2.dep_id WHERE T1.lecturer_id = 'LEC01';"
        """
        examples_rt = _request_synthetic_examples(prompt_rt, num_examples_rt, model_name, "Rt")
        if examples_rt is None:
            return all_synthetic_examples, "Error: No API key for Rt examples"
        all_synthetic_examples.extend(examples_rt)

    return all_synthetic_examples, None  # None means no critical error occurred

### 3.5. Hàm Sinh SQL Chính với Gemini (Sử dụng Online Synthetic Examples)

In [7]:
def generate_sql_with_online_synthetic_examples(question, condition_json, db_schema, synthetic_examples):
    """Ask Gemini for the final MS SQL Server query for ``question``.

    The prompt combines the schema, the user's context/condition JSON and the
    synthetic examples (each a dict with ``question`` and ``sql`` keys). The
    model's reply is passed through ``extract_sql_from_markdown``.

    Returns:
        The extracted SQL string on success, otherwise a string starting with
        "Error" describing the failure (no API key, unparseable reply, or an
        API exception). Intended to be called from a retry loop.
    """
    model_name = 'gemini-1.5-flash-latest'

    # Render the synthetic examples as a block of the prompt.
    if synthetic_examples:
        examples_header = "Here are some examples of questions and their corresponding MS SQL Server queries based on the schema:\n"
        rendered_examples = [
            f"Example Question: {ex['question']}\nExample SQL: {ex['sql']}\n---\n"
            for ex in synthetic_examples
        ]
        formatted_examples = examples_header + "".join(rendered_examples)
    else:
        formatted_examples = "(No synthetic examples were generated for in-context learning for this turn.)\n"

    prompt_text = f"""
    You are an expert Text-to-SQL system. Your goal is to convert a natural language question into an MS SQL Server query.
    You will be provided with the database schema, the user's question, any user context/conditions, and some dynamically generated examples of questions and SQL queries for a similar database schema.

    Database Schema:
    ---
    {db_schema}
    ---

    User Context/Condition (JSON format, use this to filter data if applicable):
    ---
    {condition_json}
    ---

    Dynamically Generated Examples (for in-context learning):
    ---
    {formatted_examples}
    ---

    Now, please convert the following User Question into an MS SQL Server query:
    User Question: {question}

    Chain-of-Thought Process (Think step-by-step before providing the final query):
    1.  **Analyze User Question & Context:** What is the core intent? What entities (tables, columns) are involved? What are the filtering conditions from the question and User Context?
    2.  **Relate to Schema and Examples:** How do the entities and conditions map to the provided database schema? Do any of the synthetic examples show similar patterns or SQL structures that might be useful (e.g., types of JOINs, aggregations, filtering logic)?
    3.  **Formulate SQL Plan:**
        *   FROM clause and JOINs: Identify necessary tables and how to join them.
        *   WHERE clause: Apply all relevant filters from the question and context.
        *   GROUP BY and HAVING clauses (if aggregation is needed).
        *   SELECT clause: Determine the columns to output.
        *   ORDER BY and TOP clauses (if needed).
    4.  **Construct Final SQL Query.**

    Final Optimized MS SQL Server Query:
    (IMPORTANT: Provide ONLY the final, executable MS SQL Server query below this line. Do not include any other text, explanations, or markdown formatting like ```sql ... ``` around the query. The query should be a single, valid SQL statement.)
    """

    # A key is claimed per call; the caller's retry loop handles rotation.
    active_api_key, _ = get_next_available_api_key()
    if not active_api_key:
        return "Error: No available API key for generating final SQL."

    model = genai.GenerativeModel(model_name)

    try:
        response = model.generate_content(prompt_text)
        raw_llm_output = response.text.strip()
        generated_sql = extract_sql_from_markdown(raw_llm_output)
        if generated_sql:
            return generated_sql

        # Extraction found nothing; accept the raw reply if it looks like SQL.
        if raw_llm_output.lower().strip().startswith("select"):
            print(f"Warning: extract_sql_from_markdown failed, but raw output for final SQL seems to be SQL.")
            return raw_llm_output

        error_msg = f"Error: Could not parse final SQL from LLM. Output: {raw_llm_output[:200]}..."
        print(error_msg)
        return error_msg  # Error string lets the caller retry

    except Exception as e:
        print(f"Lỗi khi gọi Gemini API cho final SQL với key ...{active_api_key[-4:]}: {e}")
        error_str = str(e).lower()
        exhaustion_markers = (
            "rate limit",
            "quota",
            "429",
            "resource has been exhausted",
            "service unavailable",
        )
        if any(marker in error_str for marker in exhaustion_markers):
            # Push the counter to the limit so rotation skips this key for now.
            api_key_usage[active_api_key]["count"] = API_RATE_LIMIT_PER_MINUTE
            print(f"API key ...{active_api_key[-4:]} marked as rate-limited/exhausted for final SQL generation.")
        return f"Error generating final SQL: API Exception - {e}"  # Error string lets the caller retry


### 3.6. Hàm Đọc Schema từ File

In [8]:
def get_database_schema_from_file(filepath):
    """Read the schema description from a UTF-8 text file.

    Returns the file content with surrounding whitespace removed. Failures are
    not raised: a message is printed and a string starting with "Error" is
    returned instead ("Error: Schema file not found." for a missing file,
    "Error reading schema file: ..." for anything else).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as schema_file:
            raw_schema = schema_file.read()
    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file schema '{filepath}'.")
        return "Error: Schema file not found."
    except Exception as e:
        print(f"Lỗi khi đọc file schema: {e}")
        return f"Error reading schema file: {e}"
    else:
        return raw_schema.strip()

### 3.7. Hàm Thực thi SQL Query trên MS SQL Server

In [9]:
def execute_sql_query(sql_query):
    """Run a SELECT statement on MS SQL Server and return its rows.

    The connection is now closed on every path (success, early return and
    error); previously it was only closed after a successful fetch.

    Args:
        sql_query: SQL text. It must be a non-empty string that starts with
            "select" (case-insensitive) and does not contain "error:".

    Returns:
        ``(rows, error_message)``:
          * ``(list_of_row_dicts, None)`` on success, one dict per row keyed
            by column name;
          * ``([[]], None)`` when the statement produced no result set but
            reported a row count;
          * ``([[{"QueryExecutionStatusUnknown": True}]], message)`` when there
            is neither a result set nor a row count;
          * ``([], message)`` for rejected input or any execution error.
    """
    if not sql_query or not isinstance(sql_query, str) or not sql_query.strip() or "error:" in sql_query.lower():
        return [], f"Invalid SQL query provided to execute_sql_query: '{sql_query}'"

    cleaned_sql_query = sql_query  # Assumed to be already cleaned by extract_sql_from_markdown
    if not cleaned_sql_query.lower().startswith("select"):
        return [], f"Non-SELECT SQL query provided or malformed: '{cleaned_sql_query}'"

    error_message = None
    conn = None
    try:
        conn = pyodbc.connect(DB_CONN_STRING)
        cursor = conn.cursor()
        cursor.execute(cleaned_sql_query)

        if cursor.description is None:
            if cursor.rowcount != -1:
                # No result set, but the statement ran: empty data, no error.
                return [[]], None
            # Neither a result set nor a row count: outcome cannot be determined.
            return [[{"QueryExecutionStatusUnknown": True}]], "Query executed but status is unknown (no description, rowcount -1)"

        columns = [column[0] for column in cursor.description]
        dict_results = [dict(zip(columns, row_values)) for row_values in cursor.fetchall()]
        return dict_results, None

    except pyodbc.Error as ex:
        # Error args are normally (sqlstate, message); guard against shorter
        # tuples so the handler itself cannot raise.
        sqlstate = ex.args[0] if ex.args else "unknown"
        detail = ex.args[1] if len(ex.args) > 1 else ex
        error_message = f"Lỗi MS SQL Server ({sqlstate}): {str(detail)} executing query: {cleaned_sql_query}"
    except Exception as e:
        error_message = f"Lỗi không xác định khi thực thi SQL '{cleaned_sql_query}': {e}"
    finally:
        if conn is not None:
            try:
                conn.close()
            except Exception:
                # Best effort: a failure while closing must not replace the
                # result or error already being returned.
                pass
    return [], error_message  # Empty data when an error occurred

## 4. Hàm Chính (Main Execution)

In [10]:
def main():
    """Run the whole pipeline: read inputs, generate and execute SQL, save a CSV.

    For every row of ``INPUT_CSV_FILE`` (columns ``Question`` and optional
    ``Condition``):
      1. generate synthetic examples (up to two tries);
      2. generate the final SQL, retrying up to ``MAX_API_RETRIES_PER_QUESTION`` times;
      3. execute the SQL and record the outcome.
    All records are written to ``OUTPUT_CSV_FILE`` at the end.

    Helper failures are reported as strings starting with a known "Error"
    prefix. They are detected by prefix here, because some of those messages
    ("Error reading schema file: ...", "Error generating final SQL: ...") do
    not contain the literal "Error:" and used to be mistaken for valid data.
    """
    schema_error_prefixes = ("Error:", "Error reading schema file:")
    sql_error_prefixes = ("Error:", "Error generating final SQL:")

    print(f"Đang đọc schema cơ sở dữ liệu từ file '{SCHEMA_FILE}'...")
    db_schema_str = get_database_schema_from_file(SCHEMA_FILE)
    if db_schema_str.startswith(schema_error_prefixes):
        print(f"Không thể tiếp tục nếu không có schema: {db_schema_str}")
        return

    try:
        questions_df = pd.read_csv(INPUT_CSV_FILE)
    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file '{INPUT_CSV_FILE}'.")
        return
    except Exception as e:
        print(f"Lỗi khi đọc file CSV: {e}")
        return

    results_log = []
    for index, row in questions_df.iterrows():
        question = row['Question']
        condition_value = row.get('Condition')
        condition_str = str(condition_value) if pd.notna(condition_value) else "{}"

        print(f"\nĐang xử lý câu hỏi {index + 1}/{len(questions_df)}: {question}")

        # Step 1: generate synthetic examples online (two tries).
        print("Đang tạo ví dụ tổng hợp online...")
        synthetic_examples, synth_error = [], None
        for synth_attempt in range(2):
            synthetic_examples, synth_error = generate_synthetic_examples_online(
                question, db_schema_str,
                NUM_SYNTHETIC_EXAMPLES_RF,
                NUM_SYNTHETIC_EXAMPLES_RT
            )
            if not synth_error and synthetic_examples:
                print(f"Đã tạo {len(synthetic_examples)} ví dụ tổng hợp.")
                break
            elif synth_error:
                print(f"Lỗi khi tạo ví dụ tổng hợp (lần {synth_attempt+1}): {synth_error}")
                if "No API key" in synth_error:
                    break  # No key left, retrying cannot help
                time.sleep(5)  # Short pause before the next try
        if synth_error and not synthetic_examples:
            # Not fatal: the final SQL is still generated, just without examples.
            print("Không thể tạo ví dụ tổng hợp, sẽ tiếp tục không có chúng.")

        # Step 2: generate the final SQL using the synthetic examples (if any).
        final_generated_sql = "Error: Initial placeholder"
        llm_call_successful = False

        for attempt in range(MAX_API_RETRIES_PER_QUESTION):
            final_generated_sql = generate_sql_with_online_synthetic_examples(
                question, condition_str, db_schema_str, synthetic_examples
            )
            if final_generated_sql and not final_generated_sql.startswith(sql_error_prefixes):
                llm_call_successful = True
                break
            print(f"  Lỗi từ LLM khi sinh SQL chính (lần {attempt+1}): {final_generated_sql}")
            if final_generated_sql and "No available API key" in final_generated_sql:
                break  # No key left, retrying cannot help

        # Step 3: execute (when generation succeeded) and record the outcome.
        if llm_call_successful:
            query_results, execution_error = execute_sql_query(final_generated_sql)
            if execution_error:
                execution_status = 'DB_ERROR'
                logged_error = execution_error
                result_payload = "[]"
            else:
                execution_status = 'SUCCESS'
                logged_error = ''
                result_payload = json.dumps(query_results, default=json_date_converter, indent=2)
        else:
            # Generation failed: GeneratedSQL holds the last error message.
            execution_status = 'FINAL_SQL_GENERATION_ERROR'
            logged_error = final_generated_sql
            result_payload = "[]"

        # Key order defines the column order of the output CSV.
        results_log.append({
            'Question': question,
            'Condition': condition_str,
            'GeneratedSQL': final_generated_sql,
            'ExecutionResult': execution_status,
            'ErrorMessage': logged_error,
            'SyntheticExamplesUsed': len(synthetic_examples),
            'Result': result_payload
        })

    output_df = pd.DataFrame(results_log)
    try:
        output_df.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8-sig')
        print(f"\nKết quả đã được lưu vào file '{OUTPUT_CSV_FILE}'")
    except Exception as e:
        print(f"Lỗi khi lưu file CSV kết quả: {e}")

## 5. Chạy Chương trình

In [11]:
# Run the pipeline only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()

Đang đọc schema cơ sở dữ liệu từ file 'm-schema.txt'...

Đang xử lý câu hỏi 1/162: What is my student ID?
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 2/162: What is my major?
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 3/162: When did I start studying at the school?
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 4/162: What was the first semester I studied at the school?
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 5/162: Find users with birthdays in January.
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 6/162: Find students whose names start with 'Student'.
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 7/162: How many female students are in the 'Software Engineering' major?
Đang tạo ví dụ tổng hợp online...
Đã tạo 4 ví dụ tổng hợp.

Đang xử lý câu hỏi 8/162: Show the list of students in