In [0]:
import re
import json
import requests
from pyspark.sql import SparkSession
from pyspark.dbutils import DBUtils

# Create (or reuse) the Spark session and wrap it in the Databricks utilities handle
spark = (
    SparkSession.builder
    .appName("TableValidation")
    .getOrCreate()
)
dbutils = DBUtils(spark)

# The list of modified files arrives from GitHub Actions as a JSON-encoded
# string in the "modified_files" widget; decode it in one step.
modified_files = json.loads(dbutils.widgets.get("modified_files"))

# Azure AI Search & OpenAI configuration.
# Endpoints and the deployment name are not sensitive, so they stay in source.
AZURE_SEARCH_ENDPOINT = "https://metadatavalidation.search.windows.net"
AZURE_OPENAI_ENDPOINT = "https://metadata-validation-mss.openai.azure.com"
OPENAI_DEPLOYMENT_NAME = "gpt-4-validator"

# API keys must never be committed to source control: they are read at run
# time from a Databricks secret scope instead of being hard-coded here.
# NOTE(review): the scope and key names below are placeholders - create them
# (or rename to match an existing scope) before running, and rotate the keys
# that were previously committed in this file, since they must be treated as
# leaked.
_SECRET_SCOPE = "metadata-validation"
AZURE_SEARCH_KEY = dbutils.secrets.get(scope=_SECRET_SCOPE, key="azure-search-key")
AZURE_OPENAI_KEY = dbutils.secrets.get(scope=_SECRET_SCOPE, key="azure-openai-key")

# Function to fetch validation rules
def get_validation_rules():
    """Fetch validation rule texts from the Azure AI Search index.

    Runs a match-all query (``search=*``) against the ``validation-rules``
    index and collects the ``rule_text`` field of every document in the
    response.

    Returns:
        list: The ``rule_text`` value of each document in the response's
        ``value`` array, in the order the service returned them.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        KeyError: If the response body has no ``value`` entry or a document
            has no ``rule_text`` field.
    """
    search_url = f"{AZURE_SEARCH_ENDPOINT}/indexes/validation-rules/docs?api-version=2023-10-01-Preview&search=*"
    headers = {"api-key": AZURE_SEARCH_KEY}
    # A timeout keeps the job from hanging forever on an unresponsive service.
    response = requests.get(search_url, headers=headers, timeout=30)
    # Surface auth/quota/server errors as HTTP errors instead of letting them
    # show up later as a confusing KeyError on "value".
    response.raise_for_status()
    data = response.json()
    # NOTE(review): only the first page of results is read; confirm the index
    # is small enough that the service's default page size covers all rules.
    return [rule["rule_text"] for rule in data["value"]]

# Function to call Azure OpenAI for validation
def validate_with_openai(prompt, text):
    """Send ``prompt`` and ``text`` to the Azure OpenAI deployment and return its reply.

    The two arguments are joined with a blank line and posted to the
    completions endpoint of ``OPENAI_DEPLOYMENT_NAME`` with a 50-token limit.

    Args:
        prompt: Instruction text placed first in the request prompt.
        text: Content to be judged, placed after the instruction.

    Returns:
        str: The ``text`` of the first returned choice, stripped of
        surrounding whitespace.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        KeyError, IndexError: If the response body has no usable choice.
    """
    openai_url = f"{AZURE_OPENAI_ENDPOINT}/openai/deployments/{OPENAI_DEPLOYMENT_NAME}/completions?api-version=2023-10-01-Preview"
    headers = {
        "Content-Type": "application/json",
        # Azure OpenAI expects a resource key in the "api-key" header;
        # "Authorization: Bearer ..." is reserved for Microsoft Entra ID
        # tokens, so sending the key that way is rejected as unauthorized.
        "api-key": AZURE_OPENAI_KEY,
    }
    payload = {
        "prompt": f"{prompt}\n\n{text}",
        "max_tokens": 50,
    }
    # NOTE(review): if the deployment is backed by a chat-only model, the
    # legacy /completions route will not work and /chat/completions is needed
    # instead - confirm against the deployment's model.
    response = requests.post(openai_url, headers=headers, json=payload, timeout=60)
    # Fail with the real HTTP error rather than a KeyError on "choices".
    response.raise_for_status()
    return response.json()["choices"][0]["text"].strip()

# Load validation rules
rules = get_validation_rules()

validation_failed = False
missing_comments = {}

# Header of a CREATE TABLE statement, up to and including the "(" that opens
# its column list. Case-insensitive, and it stops at ";" so the match cannot
# run on into a later statement.
_CREATE_TABLE_RE = re.compile(
    r"CREATE\s+(?:OR\s+REPLACE\s+)?TABLE\b[^(;]*\(", re.IGNORECASE
)
# Column name (optionally back-quoted) followed by its type keyword.
_COLUMN_NAME_RE = re.compile(r"`?(\w+)`?\s+\w+")
# A non-empty COMMENT clause anywhere in the column definition, so that
# "id INT NOT NULL COMMENT '...'" is recognised as commented.
_COLUMN_COMMENT_RE = re.compile(
    r"\bCOMMENT\s+(?:'[^']+'|\"[^\"]+\")", re.IGNORECASE
)
# Table-level constraints live in the column list but are not columns.
_TABLE_CONSTRAINT_RE = re.compile(
    r"(?:CONSTRAINT|PRIMARY\s+KEY|FOREIGN\s+KEY)\b", re.IGNORECASE
)


def _iter_column_definitions(script_content):
    """Yield each top-level entry of every CREATE TABLE column list.

    The list is scanned character by character so that nested parentheses
    (``DECIMAL(10,2)``) and commas or parentheses inside quoted text do not
    end the list or split an entry. SQL ``--`` / ``/* */`` comments inside
    the list are not interpreted.
    """
    for header in _CREATE_TABLE_RE.finditer(script_content):
        depth = 1          # the header match already consumed the opening "("
        quote = None       # quote character we are currently inside, if any
        escaped = False    # previous character was a backslash inside quotes
        current = []
        for ch in script_content[header.end():]:
            if quote is not None:
                current.append(ch)
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            if ch in "'\"`":
                quote = ch
            elif ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
                if depth == 0:
                    break  # closing paren of the column list itself
            elif ch == "," and depth == 1:
                yield "".join(current).strip()
                current = []
                continue
            current.append(ch)
        yield "".join(current).strip()


def _columns_missing_comment(script_content):
    """Return the names of all columns in ``script_content`` without a COMMENT."""
    names = []
    for definition in _iter_column_definitions(script_content):
        if _TABLE_CONSTRAINT_RE.match(definition):
            continue
        name_match = _COLUMN_NAME_RE.match(definition)
        if name_match and not _COLUMN_COMMENT_RE.search(definition):
            names.append(name_match[1])
    return names


for file in modified_files:
    if not file.endswith((".sql", ".py")):
        continue

    # Reading is best-effort: an unreadable path (for example a file deleted
    # in the change set) is reported and skipped.
    try:
        with open(file, "r", encoding="utf-8") as f:
            script_content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error processing file {file}: {str(e)}")
        continue

    uncommented = _columns_missing_comment(script_content)
    if not uncommented:
        continue
    # Record every finding before any network call, so an AI outage cannot
    # hide missing comments.
    missing_comments[file] = missing_comments.get(file, []) + uncommented

    if not rules:
        continue  # no rule text available to use as the AI prompt

    # Validate missing comment with OpenAI (advisory output only)
    for column_name in uncommented:
        try:
            validation_response = validate_with_openai(rules[0], f"Column: {column_name}")
        except Exception as e:
            # The AI reply is informational; report the failure and stop
            # calling the service for this file rather than aborting the run.
            print(f"Error processing file {file}: {str(e)}")
            break
        print(f"AI Response for {column_name}: {validation_response}")

if missing_comments:
    validation_failed = True
    print(f"Validation Failed! Missing descriptions in files: {missing_comments}")
else:
    print("Validation Passed! All columns have descriptions.")

# Exit with error if validation fails
# NOTE(review): dbutils.notebook.exit() appears to end the run and hand the
# value back to the caller rather than marking the run as failed - confirm
# that the calling workflow inspects this value, or raise an exception here
# if the job itself is expected to fail.
if validation_failed:
    dbutils.notebook.exit(1)
else:
    dbutils.notebook.exit(0)