In [0]:
import json
import re
import os
import sys
import base64
import requests
from pyspark.sql.utils import AnalysisException

# Step 1: Get modified files from GitHub Actions
# The caller passes a JSON array of repo-relative file paths through the
# "modified_files" widget; the default is an empty array.
dbutils.widgets.text("modified_files", "[]")  # Default to empty list
try:
    modified_files = json.loads(dbutils.widgets.get("modified_files"))
except json.JSONDecodeError as exc:
    # A malformed widget value is a caller error: report it and fail the job
    # instead of surfacing a raw traceback.
    print(f"❌ Error: 'modified_files' is not valid JSON: {exc}")
    sys.exit(1)

if not modified_files:
    print("✅ No modified files to validate.")
    sys.exit(0)  # Exit successfully if there are no changes

# Anything other than a list of strings (e.g. a bare string, which would be
# iterated character by character below) cannot be turned into notebook paths.
if not isinstance(modified_files, list) or not all(
    isinstance(file, str) for file in modified_files
):
    print("❌ Error: 'modified_files' must be a JSON array of file paths.")
    sys.exit(1)

# Step 2: Get the Databricks username
try:
    username = spark.sql("SELECT current_user()").collect()[0][0]
except AnalysisException:
    print("❌ Error: Unable to fetch username.")
    sys.exit(1)  # Fail the job

# Step 3: Get the repo name dynamically
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
repo_path = notebook_context.notebookPath().get()
path_parts = repo_path.split("/")
# The repo name is taken from the fourth "/"-separated component.
# NOTE(review): presumably the path looks like /Repos/<user>/<repo>/<notebook>
# — confirm against how the job is deployed.
if len(path_parts) < 4:
    print(f"❌ Error: Unable to derive repo name from notebook path: {repo_path}")
    sys.exit(1)
repo_name = path_parts[3]  # Extract repo name

# Step 4: Construct full paths to modified notebooks
# The file extension is dropped from each path before it is joined to the
# user's workspace folder.
full_paths = [
    f"/Workspace/Users/{username}/{repo_name}/{os.path.splitext(file)[0]}"
    for file in modified_files
]
print("🔹 Full Paths to Validate:", full_paths)

# Step 5: Databricks API details
DATABRICKS_HOST = "https://adb-845055060386182.2.azuredatabricks.net"
# SECURITY: a personal access token used to be hard-coded on this line.
# Credentials must not live in source control; the token that was committed
# should be treated as leaked and revoked. The token is now read from the same
# notebook context already used for notebookPath() above.
# NOTE(review): confirm the cluster's access mode permits apiToken(); if it
# does not, load the token from a secret scope with dbutils.secrets.get().
DATABRICKS_TOKEN = notebook_context.apiToken().get()

headers = {"Authorization": f"Bearer {DATABRICKS_TOKEN}"}

# Step 6: Function to fetch and decode notebook content
def fetch_notebook_content(notebook_path):
    """Download a notebook's source text through the workspace export API.

    Sends a GET request to ``/api/2.0/workspace/export`` on
    ``DATABRICKS_HOST`` with ``format=SOURCE`` and decodes the Base64
    ``content`` field of the JSON response as UTF-8.

    Args:
        notebook_path: Workspace path of the notebook to export.

    Returns:
        The decoded notebook source as a string, or ``None`` when the request
        fails, the server answers with a non-200 status, or the payload
        cannot be decoded. Every failure is reported with ``print``.
    """
    try:
        response = requests.get(
            f"{DATABRICKS_HOST}/api/2.0/workspace/export",
            headers=headers,
            params={"path": notebook_path, "format": "SOURCE"},  # Fetch raw SQL content
            # Without a timeout an unresponsive endpoint would hang the job.
            timeout=30,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as HTTP
        # errors so the caller only ever has to handle ``None``.
        print(f"❌ Error fetching notebook content ({notebook_path}): {exc}")
        return None

    if response.status_code != 200:
        print(f"❌ Error fetching notebook content ({notebook_path}): {response.text}")
        return None  # Return None on failure

    try:
        encoded_content = response.json().get("content", "")
        return base64.b64decode(encoded_content).decode("utf-8")  # Decode Base64
    except (ValueError, AttributeError) as exc:
        # ValueError covers invalid JSON, invalid Base64 and invalid UTF-8;
        # AttributeError covers a JSON body that is not an object.
        print(f"❌ Error fetching notebook content ({notebook_path}): {exc}")
        return None

# Step 7: Function to validate SQL scripts
# Start of a CREATE TABLE statement, up to and including the "(" that opens
# the column list. Statements without a column list are not matched.
_CREATE_TABLE_HEADER = re.compile(
    r"CREATE\s+(?:OR\s+REPLACE\s+)?TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?[\w.`]+\s*\(",
    re.IGNORECASE,
)
# Items of the column list that declare a table constraint, not a column.
_TABLE_CONSTRAINT = re.compile(
    r"(?:CONSTRAINT|PRIMARY\s+KEY|FOREIGN\s+KEY)\b", re.IGNORECASE
)
# Leading column name, either backtick-quoted or a bare token.
_COLUMN_NAME = re.compile(r"`([^`]+)`|([^\s`]+)")
# A non-empty quoted COMMENT clause anywhere in a column definition.
_COLUMN_COMMENT = re.compile(r"\bCOMMENT\s+(?:'[^']+'|\"[^\"]+\")", re.IGNORECASE)


def _split_column_list(sql, open_index):
    """Split the parenthesised list starting at ``sql[open_index]`` into items.

    Items are separated at commas that sit directly inside the outer
    parentheses; commas and parentheses inside nested parentheses (such as
    ``DECIMAL(10,2)``) or inside quoted text are kept as part of the item.
    ``--`` line comments and ``/* */`` block comments are dropped.

    Returns the list of non-empty, stripped items, or ``None`` when the
    opening parenthesis is never closed.
    """
    items = []
    current = []
    depth = 0
    quote = None
    pos = open_index
    length = len(sql)
    while pos < length:
        char = sql[pos]
        if quote is not None:
            current.append(char)
            # Inside a string literal a backslash escapes the next character,
            # so an escaped quote must not end the literal.
            if char == "\\" and quote != "`" and pos + 1 < length:
                current.append(sql[pos + 1])
                pos += 2
                continue
            if char == quote:
                quote = None
        elif sql.startswith("--", pos):
            # Skip to the end of the line; the newline itself is kept.
            newline = sql.find("\n", pos)
            pos = length if newline == -1 else newline
            continue
        elif sql.startswith("/*", pos):
            end = sql.find("*/", pos + 2)
            pos = length if end == -1 else end + 2
            current.append(" ")
            continue
        elif char in "'\"`":
            quote = char
            current.append(char)
        elif char == "(":
            depth += 1
            if depth > 1:
                current.append(char)
        elif char == ")":
            depth -= 1
            if depth == 0:
                items.append("".join(current).strip())
                return [item for item in items if item]
            current.append(char)
        elif char == "," and depth == 1:
            items.append("".join(current).strip())
            current = []
        else:
            current.append(char)
        pos += 1
    return None


def validate_sql_script(sql_content: str) -> bool:
    """
    Validates that every column of every CREATE TABLE statement has a comment.

    Each ``CREATE [OR REPLACE] TABLE [IF NOT EXISTS] name (...)`` statement in
    ``sql_content`` is inspected. Its column list is split at top-level commas
    and every column definition must contain a non-empty quoted ``COMMENT``
    clause. Table constraints (``CONSTRAINT``, ``PRIMARY KEY``,
    ``FOREIGN KEY``) are skipped. A statement whose column list is never
    closed is reported with a warning and ignored.

    Returns True if valid (including when no CREATE TABLE statement is
    found), False otherwise. Columns without a comment are printed once per
    offending table.
    """
    found_table = False
    is_valid = True

    for header in _CREATE_TABLE_HEADER.finditer(sql_content):
        # header.end() - 1 is the "(" that opens the column list.
        definitions = _split_column_list(sql_content, header.end() - 1)
        if definitions is None:
            print("⚠️ Skipping CREATE TABLE statement with an unterminated column list.")
            continue
        found_table = True

        missing_comments = []
        for definition in definitions:
            if _TABLE_CONSTRAINT.match(definition):
                continue
            name_match = _COLUMN_NAME.match(definition)
            if name_match:
                column_name = name_match.group(1) or name_match.group(2)
            else:
                column_name = definition
            if not _COLUMN_COMMENT.search(definition):
                missing_comments.append(column_name)

        if missing_comments:
            print(f"❌ Missing comments for columns: {missing_comments}")
            is_valid = False  # Fail validation

    if not found_table:
        print("⚠️ No CREATE TABLE statement found in this file.")
        return True  # Ignore files that don't create tables
    return is_valid

# Step 8: Validate each modified file
# Collect every notebook that could not be fetched or did not validate.
# All paths are processed (no early stop) so each problem gets reported.
failed_paths = []
for notebook_path in full_paths:
    notebook_source = fetch_notebook_content(notebook_path)
    # A failed fetch (None) counts as a failure and skips validation;
    # otherwise the validator decides.
    if notebook_source is None or not validate_sql_script(notebook_source):
        failed_paths.append(notebook_path)

validation_failed = bool(failed_paths)

# Step 9: Report the outcome and exit; a non-zero exit code fails the
# Databricks job (which fails GitHub Actions).
if validation_failed:
    exit_code = 1
    print("❌ Validation failed! Some tables have missing column comments.")
else:
    exit_code = 0
    print("✅ Validation passed! All tables have proper column comments.")
sys.exit(exit_code)
