# In [0]:
from pyspark.sql import SparkSession
import re
import sys
import json

# Initialize Spark session
spark = SparkSession.builder.appName("Validate Table Descriptions").getOrCreate()

# Retrieve modified files from input.
# The "modified_files" widget carries a JSON-encoded list of file paths. Its
# default value is the empty string, which json.loads() rejects with a
# JSONDecodeError, so a blank widget is treated as "no files to check".
dbutils.widgets.text("modified_files", "")
modified_files_raw = dbutils.widgets.get("modified_files")
modified_files = json.loads(modified_files_raw) if modified_files_raw.strip() else []

# Regular expression matching the head of a CREATE TABLE statement, up to and
# including the "(" that opens the column list. Optional OR REPLACE /
# EXTERNAL / TEMPORARY / IF NOT EXISTS modifiers and dotted or backticked table
# names are accepted. The column list itself is NOT captured here: it can hold
# nested parentheses (e.g. DECIMAL(10,2)), so it is walked by
# _top_level_definitions() instead of being matched with a regex.
create_table_regex = re.compile(
    r"\bCREATE\s+(?:OR\s+REPLACE\s+)?(?:(?:EXTERNAL|TEMPORARY|TEMP)\s+)?TABLE\s+"
    r"(?:IF\s+NOT\s+EXISTS\s+)?(?:`[^`]+`|\w+)(?:\.(?:`[^`]+`|\w+))*\s*\(",
    re.IGNORECASE,
)

# Matches the start of a column definition: a (possibly backticked) column
# name followed by whitespace and the beginning of its type. Group 1 is the name.
column_regex = re.compile(r"(`[^`]+`|\w+)\s+\S")

# Either a quoted token (group 1, kept as-is) or an SQL comment (dropped).
# Quoted tokens are matched first so "--" or "/*" inside a string survive.
_COMMENT_OR_QUOTED = re.compile(
    r"""('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*"|`[^`]*`)|--[^\n]*|/\*.*?\*/""",
    re.DOTALL,
)

# Tokens that matter when splitting a column list: string literals, backticked
# identifiers, and the bracket/comma punctuation.
_STRUCTURE_TOKEN = re.compile(
    r"""(?P<literal>'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")"""
    r"""|(?P<ident>`[^`]*`)|(?P<punct>[()<>,])""",
    re.DOTALL,
)

# Table-level clauses that sit in the column list but are not columns.
_TABLE_CONSTRAINT = re.compile(
    r"(?:CONSTRAINT\b|PRIMARY\s+KEY\b|FOREIGN\s+KEY\b|(?:UNIQUE|CHECK)\s*$)",
    re.IGNORECASE,
)

# A COMMENT clause in a flattened definition (every literal is reduced to '').
_COMMENT_CLAUSE = re.compile(r"\bCOMMENT\s*''", re.IGNORECASE)


def _strip_sql_comments(sql):
    """Return *sql* with ``--`` and ``/* */`` comments replaced by a space."""
    return _COMMENT_OR_QUOTED.sub(lambda m: m.group(1) or " ", sql)


def _top_level_definitions(sql, start):
    """Split the parenthesised list whose body begins at ``sql[start]``.

    Returns one flattened string per top-level, comma-separated entry. In a
    flattened string everything nested inside ``(...)`` or ``<...>`` is removed
    and every string literal is replaced by ``''``, so commas and keywords
    inside type parameters, nested struct fields or comment text cannot be
    mistaken for column-level syntax. Returns an empty list when the closing
    parenthesis of the list is never found.
    """
    definitions = []
    pieces = []
    parens = 0
    angles = 0
    cursor = start
    for token in _STRUCTURE_TOKEN.finditer(sql, start):
        top_level = parens == 0 and angles == 0
        if top_level:
            # Plain text between the previous token and this one.
            pieces.append(sql[cursor:token.start()])
        cursor = token.end()
        text = token.group()
        if token.lastgroup == "literal":
            if top_level:
                pieces.append("''")
        elif token.lastgroup == "ident":
            if top_level:
                pieces.append(text)
        elif text == "(":
            parens += 1
        elif text == ")":
            if parens == 0:
                # This is the parenthesis that closes the column list.
                definitions.append("".join(pieces))
                return definitions
            parens -= 1
        elif parens == 0:
            # "<", ">" and "," only carry structure outside parentheses;
            # inside them they belong to an expression such as (a < b).
            if text == "<":
                angles += 1
            elif text == ">":
                angles = max(angles - 1, 0)
            elif angles == 0:
                # Top-level comma: the current entry is complete.
                definitions.append("".join(pieces))
                pieces = []
    return []


def _columns_missing_comments(sql):
    """Return the names of columns in *sql* declared without a COMMENT clause.

    Every CREATE TABLE statement with an explicit column list is inspected.
    Names are returned in source order, with surrounding backticks removed.
    """
    sql = _strip_sql_comments(sql)
    missing = []
    for header in create_table_regex.finditer(sql):
        for definition in _top_level_definitions(sql, header.end()):
            definition = definition.strip()
            if _TABLE_CONSTRAINT.match(definition):
                continue
            column_match = column_regex.match(definition)
            if column_match and not _COMMENT_CLAUSE.search(definition):
                missing.append(column_match.group(1).strip("`"))
    return missing


# Names of all columns, across every modified file, that lack a COMMENT.
missing_comments = []

for file in modified_files:
    # Read with an explicit encoding so the result does not depend on the
    # platform default; the file is closed before parsing starts.
    with open(file, "r", encoding="utf-8") as f:
        content = f.read()
    missing_comments.extend(_columns_missing_comments(content))

# Report the outcome: a success message when nothing was collected, otherwise
# the offending column names followed by a non-zero exit status.
if not missing_comments:
    print("Validation successful: All columns have comments.")
else:
    print("ERROR: The following columns are missing comments:", missing_comments)
    # Exit code 1 is intended to fail the Databricks job (and, through it,
    # the GitHub Actions run that triggered it).
    sys.exit(1)
