In [0]:
import os, re
from datetime import datetime, timezone

# Derive the repository root from the workspace path of this notebook.
nb_path = (
    dbutils.notebook.entry_point
    .getDbutils()
    .notebook()
    .getContext()
    .notebookPath()
    .get()
)
# Keep the first four "/"-separated pieces of the path (the leading empty piece
# plus three path segments), e.g. /Repos/<user>/<repo>.
repo_ws_root = "/".join(nb_path.split("/")[:4])
# Same location as seen through the /Workspace file system mount,
# e.g. /Workspace/Repos/<user>/<repo>.
repo_fs_root = "/Workspace" + repo_ws_root

def extract_callable_name(py_file_text: str, fallback: str) -> str:
    """
    Return the name of the first top-level function defined in a rule file.

    Only ``def`` statements that start at column 0 are considered, so a class
    method or nested helper that appears earlier in the file is never picked
    up as the rule's callable.

    Args:
        py_file_text: Full source text of the rule module.
        fallback: Name to return when the file has no top-level ``def``
            (callers pass the rule_id).

    Returns:
        The first top-level function name, or ``fallback`` if there is none.
    """
    # No leading whitespace is allowed before `def`: an indented `def` is a
    # method or a nested function, not a module-level entry point.
    m = re.search(r"(?m)^def\s+([A-Za-z_]\w*)\s*\(", py_file_text)
    return m.group(1) if m else fallback

# Scan every rule-group folder and build one registry row per rule file.
# Row layout: (rule_id, enabled, rule_group, module_path, callable_name, lookback_minutes)
rows = []
# rule_id -> module_path of the file that first claimed it; used to detect
# the same file name appearing in more than one rule group.
seen_rule_ids = {}
# The second tuple element is the lookback in minutes: 1440 = 1 day, 43200 = 30 days.
for rule_group, lookback in [("binary", 1440), ("behavioral", 43200), ("custom", 1440)]:
    folder_fs = f"{repo_fs_root}/base/detections/{rule_group}"

    # sorted() keeps the scan order (and therefore `rows`) deterministic.
    for fname in sorted(os.listdir(folder_fs)):
        # Keep only rule modules: skip non-Python files and files whose name
        # starts with "_" (for example __init__.py).
        if not fname.endswith(".py") or fname.startswith("_"):
            continue

        rule_id = fname[:-3]  # file name without the ".py" suffix
        # Python import path, e.g. base.detections.binary.<rule_id>
        module_path = f"base.detections.{rule_group}.{rule_id}"

        # rule_id is the registry MERGE key. Two source rows with the same key
        # would insert duplicate registry rows on the first run and can make
        # the MERGE fail on later runs, so stop here with a clear message.
        if rule_id in seen_rule_ids:
            raise ValueError(
                f"Duplicate rule_id '{rule_id}': defined by both "
                f"{seen_rule_ids[rule_id]} and {module_path}"
            )
        seen_rule_ids[rule_id] = module_path

        with open(f"{folder_fs}/{fname}", "r", encoding="utf-8") as f:
            text = f.read()

        # Falls back to rule_id when the file has no function definition to use.
        callable_name = extract_callable_name(text, fallback=rule_id)

        # Every scanned rule is registered as enabled (True).
        rows.append((rule_id, True, rule_group, module_path, callable_name, lookback))

# Turn the collected rule tuples into a Spark DataFrame with an explicit schema,
# then expose it to SQL under the temp view name `new_rules`.
df = spark.createDataFrame(
    data=rows,
    schema="rule_id string, enabled boolean, rule_group string, module_path string, callable_name string, lookback_minutes int",
)
df.createOrReplaceTempView("new_rules")

# Upsert the scanned rules into the registry table, keyed on rule_id.
# created_at / updated_at are filled with the current timestamp
# (leaving them out caused NULL issues in the MERGE).
# Existing rules get their scanned columns refreshed and updated_at bumped;
# new rules are inserted with the remaining metadata columns set to NULL.
# NOTE(review): the MATCHED branch overwrites `enabled` and `lookback_minutes`
# with the values from `new_rules` on every run — confirm that resetting
# manually edited values during a re-sync is intended.
merge_registry_sql = """
MERGE INTO sandbox.audit_poc.rule_registry t
USING new_rules s
ON t.rule_id = s.rule_id
WHEN MATCHED THEN UPDATE SET
  t.enabled = s.enabled,
  t.rule_group = s.rule_group,
  t.module_path = s.module_path,
  t.callable_name = s.callable_name,
  t.lookback_minutes = s.lookback_minutes,
  t.updated_at = current_timestamp()
WHEN NOT MATCHED THEN INSERT (
  rule_id, enabled, rule_group, module_path, callable_name, lookback_minutes,
  schedule_hint, severity, description, tags, owner, params_json, created_at, updated_at
) VALUES (
  s.rule_id, s.enabled, s.rule_group, s.module_path, s.callable_name, s.lookback_minutes,
  NULL, NULL, NULL, NULL, NULL, NULL, current_timestamp(), current_timestamp()
)
"""
spark.sql(merge_registry_sql)

# Post-merge sanity checks, shown in order:
#   1. number of registered rules per rule_group
#   2. the registry listing, ordered by rule_group and rule_id
for verification_query in (
    "SELECT rule_group, COUNT(*) cnt FROM sandbox.audit_poc.rule_registry GROUP BY rule_group ORDER BY rule_group",
    "SELECT rule_id, rule_group, module_path, callable_name, lookback_minutes, enabled FROM sandbox.audit_poc.rule_registry ORDER BY rule_group, rule_id",
):
    display(spark.sql(verification_query))


In [0]:
%sql
-- Seed an empty checkpoint row for every registered rule that has none yet.
-- Rules that already have a checkpoint row are skipped, so re-running this
-- cell does not add a second row for them.
INSERT INTO sandbox.audit_poc.rule_checkpoint (
  rule_id,
  last_success_end_ts,
  last_attempt_start_ts,
  last_attempt_end_ts,
  last_status,
  last_error,
  updated_at
)
SELECT
  reg.rule_id,
  NULL AS last_success_end_ts,    -- no successful run recorded yet
  NULL AS last_attempt_start_ts,
  NULL AS last_attempt_end_ts,
  NULL AS last_status,
  NULL AS last_error,
  current_timestamp() AS updated_at
FROM sandbox.audit_poc.rule_registry AS reg
WHERE NOT EXISTS (
  -- keep only registry rules with no matching checkpoint row
  SELECT 1
  FROM sandbox.audit_poc.rule_checkpoint AS chk
  WHERE chk.rule_id = reg.rule_id
);
