# 1. connection database

In [1]:
import pyodbc # Import the pyodbc module for connecting to SQL Server
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get database connection details from environment variables
db_server = os.getenv('DB_SERVER')
db_database = os.getenv('DB_DATABASE')
db_username = os.getenv('DB_USERNAME')
db_password = os.getenv('DB_PASSWORD')
db_driver = os.getenv('DB_DRIVER')

# Check if all required environment variables are set
if not all([db_server, db_database, db_driver]):
    print("Error: Database connection details (server, database, driver) are missing in the .env file.")
    exit()

# Construct the connection string
# Using Trusted_Connection=yes for Windows Authentication
# If using SQL Server Authentication, replace Trusted_Connection=yes with UID={db_username};PWD={db_password}
# Ensure the driver name matches the one installed on your system
conn_str = (
    f"DRIVER={{{db_driver}}};"
    f"SERVER={db_server};"
    f"DATABASE={db_database};"
)

# Add authentication details based on whether username/password are provided
if db_username and db_password:
    print("Connecting using SQL Server Authentication...")
    conn_str += f"UID={db_username};PWD={db_password};"
else:
    print("Connecting using Windows Authentication (Trusted Connection)...")
    conn_str += "Trusted_Connection=yes;"


Connecting using Windows Authentication (Trusted Connection)...


# 2. execute sql syntax

In [2]:
import json
import pyodbc
import pandas as pd

def validate_sql_queries(json_file_path: str, conn_str: str) -> pd.DataFrame:
    """
    Connects to a SQL Server database, reads SQL queries from a JSON file,
    executes each query to validate its syntax, and returns a DataFrame
    containing information about any queries that failed execution.

    Args:
        json_file_path: Path to the JSON file containing query data.
                        Expected keys per item: 'query_en', 'query_zh', 'sql_syntax'.
        conn_str: The pyodbc connection string for the SQL Server database.

    Returns:
        A pandas DataFrame with columns ['query_en', 'query_zh', 'sql_syntax', 'error_message']
        listing all queries that failed to execute. Returns an empty DataFrame if all
        queries executed successfully or if connection/file reading failed before execution.
    """
    failed_queries_data = []
    data = None
    conn = None
    cursor = None

    # 1. Load JSON data
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"Successfully loaded data from '{json_file_path}'.")
    except FileNotFoundError:
        print(f"Error: JSON file not found at '{json_file_path}'.")
        return pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])
    except json.JSONDecodeError as e:
        print(f"Error: Could not decode JSON from '{json_file_path}'. Invalid JSON format? Error: {e}")
        return pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])
    except Exception as e:
        print(f"An unexpected error occurred while reading the JSON file: {e}")
        return pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])

    if not isinstance(data, list):
         print(f"Error: Expected JSON data to be a list of objects, but got {type(data)}.")
         return pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])

    # 2. Connect to Database and Execute Queries
    try:
        conn = pyodbc.connect(conn_str)
        cursor = conn.cursor()
        print("Successfully connected to the database.")

        total_queries = len(data)
        print(f"Starting validation for {total_queries} queries...")

        for i, item in enumerate(data):
            # Ensure item is a dictionary and has the required keys
            if not isinstance(item, dict):
                print(f"Warning: Skipping item at index {i} because it's not a dictionary: {item}")
                continue
            if not all(key in item for key in ['query_en', 'query_zh', 'sql_syntax']):
                 print(f"Warning: Skipping item at index {i} due to missing keys: {item}")
                 continue

            sql_syntax = item['sql_syntax']
            query_en = item['query_en']
            query_zh = item['query_zh']

            # Basic check for empty SQL
            if not sql_syntax or not sql_syntax.strip():
                error_msg = "SQL syntax is empty or whitespace."
                print(f"Failed ({i+1}/{total_queries}): {error_msg} - Query EN: {query_en}")
                failed_queries_data.append({
                    'query_en': query_en,
                    'query_zh': query_zh,
                    'sql_syntax': sql_syntax,
                    'error_message': error_msg
                })
                continue # Skip execution if syntax is empty

            # 3. Execute SQL Syntax
            try:
                # Execute the query. We don't need results, just check for errors.
                # For DML/DDL, this might modify the database.
                # Consider transactions/rollback if modification is undesired.
                cursor.execute(sql_syntax)

                # For DML/DDL statements to persist (if desired and autocommit is off)
                # You might need conn.commit() here, but for pure validation,
                # it's often better *not* to commit.
                # If you encounter issues with multi-statement scripts needing commit,
                # handle accordingly or consider splitting them.
                # print(f"Success ({i+1}/{total_queries}): Query EN: {query_en}") # Optional: uncomment for success logs

            except pyodbc.Error as e:
                # 4. Record Failure
                error_msg = str(e)
                print(f"Failed ({i+1}/{total_queries}): {error_msg} - Query EN: {query_en}")
                failed_queries_data.append({
                    'query_en': query_en,
                    'query_zh': query_zh,
                    'sql_syntax': sql_syntax,
                    'error_message': error_msg
                })
                # Optional: If an error occurs, try to rollback any uncommitted changes
                # within the transaction, although many DDL might be auto-committed.
                try:
                    conn.rollback()
                except pyodbc.Error as rb_err:
                    print(f"Warning: Could not rollback after error: {rb_err}")

            except Exception as e: # Catch non-pyodbc errors during execution phase
                 error_msg = f"Non-ODBC execution error: {str(e)}"
                 print(f"Failed ({i+1}/{total_queries}): {error_msg} - Query EN: {query_en}")
                 failed_queries_data.append({
                    'query_en': query_en,
                    'query_zh': query_zh,
                    'sql_syntax': sql_syntax,
                    'error_message': error_msg
                 })

    except pyodbc.Error as e:
        print(f"Database connection or setup error: {e}")
        # Cannot proceed with validation if connection fails
        return pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])
    except Exception as e:
        print(f"An unexpected error occurred during database operations: {e}")
        return pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])
    finally:
        # 5. Cleanup
        if cursor:
            cursor.close()
        if conn:
            conn.close()
            print("Database connection closed.")

    # 6. Return DataFrame of failures
    df_errors = pd.DataFrame(failed_queries_data, columns=['query_en', 'query_zh', 'sql_syntax', 'error_message'])
    print(f"\nValidation complete. Found {len(df_errors)} problematic queries.")
    return df_errors

# -----------------------------執行-----------------------------

# 2. Specify the path to your JSON file
json_file = 'training_data_bilingual.json' 

# 3. Call the function
df_failed_queries = validate_sql_queries(json_file, conn_str)

# 4. Review the results
if not df_failed_queries.empty:
    print("\n--- Failed Queries ---")
    # Display the DataFrame - adjust display options if needed
    with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.max_colwidth', None):
        print(df_failed_queries)
    # Optional: Save the failed queries to a CSV file for later review
    # df_failed_queries.to_csv('failed_sql_queries.csv', index=False, encoding='utf-8-sig')
    # print("\nFailed queries saved to 'failed_sql_queries.csv'")
else:
    print("\nAll SQL queries in the JSON file executed successfully!")

Successfully loaded data from 'training_data_bilingual.json'.
Database connection or setup error: ('08001', '[08001] [Microsoft][ODBC Driver 17 for SQL Server]Named Pipes Provider: Could not open a connection to SQL Server [2].  (2) (SQLDriverConnect); [08001] [Microsoft][ODBC Driver 17 for SQL Server]Login timeout expired (0); [08001] [Microsoft][ODBC Driver 17 for SQL Server]A network-related or instance-specific error has occurred while establishing a connection to SQL Server. Server is not found or not accessible. Check if instance name is correct and if SQL Server is configured to allow remote connections. For more information see SQL Server Books Online. (2)')

All SQL queries in the JSON file executed successfully!
