# Pipeline Post-Run Check Notebook (Updated)
스크린샷에 있는 실제 DLT 테이블명을 반영한 **Silver 1개 + Gold 1개** 점검 노트북

생성일: 2026-02-20 02:33:58


In [0]:
# =========================================================
# 0) 기본 설정
# =========================================================
from pyspark.sql import functions as F

CATALOG = "signalcraft_databricks"
SCHEMA = "default"

# Gold: physical table name fixed for this project.
GOLD_SNAPSHOT = f"{CATALOG}.{SCHEMA}.dlt_gold_user_behavior_snapshot"

# Silver: fully qualified candidate names in priority order (taken from the
# table names that appear in the screenshot). The first one that exists wins.
silver_candidates = [
    f"{CATALOG}.{SCHEMA}.{table_name}"
    for table_name in (
        # Lowest-level event table comes first.
        "dlt_silver_watch_events_all",
        "dlt_silver_daily_watch_time_rt",
        "dlt_silver_daily_watch_time",
        "dlt_silver_daily_watch_time_full",
        "dlt_silver_daily_watch_time_history",
        # Campaign messages may not be watch logs, so they are the last resort.
        "dlt_silver_campaign_messages",
    )
]

def first_existing_table(candidates):
    """Return the first table name in *candidates* that can be read, else None.

    Each candidate is probed by reading at most one row through the
    notebook's global ``spark`` session. A candidate whose probe raises for
    any reason is skipped, and the reason is printed so that a wrong
    catalog/schema or a permission problem is visible instead of silently
    ending in ``None``.

    Args:
        candidates: Iterable of table names, in priority order.

    Returns:
        The first name whose probe succeeded, or ``None`` if every probe
        failed or *candidates* is empty.
    """
    for table_name in candidates:
        try:
            # Execute a one-row read rather than only building the DataFrame.
            spark.table(table_name).limit(1).collect()
        except Exception as exc:
            # Best-effort probe: any failure means "try the next candidate",
            # but report it so the cause is not lost.
            message_lines = str(exc).strip().splitlines()
            first_line = message_lines[0] if message_lines else ""
            print(f"⚠️ skip {table_name}: {type(exc).__name__}: {first_line}")
            continue
        return table_name
    return None

# Resolve the Silver table once; stays None when no candidate could be read.
SILVER_TABLE = first_existing_table(silver_candidates)

# Echo the resolved table names so the run log shows what was checked.
print(f"✅ SILVER_TABLE = {SILVER_TABLE}")
print(f"✅ GOLD_SNAPSHOT = {GOLD_SNAPSHOT}")


✅ SILVER_TABLE = signalcraft_databricks.default.dlt_silver_watch_events_all
✅ GOLD_SNAPSHOT = signalcraft_databricks.default.dlt_gold_user_behavior_snapshot


In [0]:
# =========================================================
# 1) Silver Quick Check
# - row count / date range / row count per recent date / sample rows
# =========================================================
# Raise explicitly instead of using `assert`: assert statements are removed
# when Python runs with -O, which would let a missing table surface later as
# a less helpful error from spark.table(None).
if SILVER_TABLE is None:
    raise RuntimeError(
        "Silver 테이블을 찾지 못했어. "
        "silver_candidates에 실제 테이블명을 추가하거나 CATALOG/SCHEMA가 맞는지 확인해줘."
    )

silver = spark.table(SILVER_TABLE)
print(f"📌 [SILVER] {SILVER_TABLE} rows = {silver.count():,}")

cols = set(silver.columns)

# Pick the column that becomes the date key `d`, in this order:
# 1) a ready-made date-like column: event_date, date, dt, event_dt
# 2) otherwise event_ts, converted with to_date
# 3) otherwise no date is available and `d` is NULL for every row
date_col = next(
    (name for name in ("event_date", "date", "dt", "event_dt") if name in cols),
    None,
)

if date_col is not None:
    silver_d = silver.withColumn("d", F.col(date_col).cast("date"))
elif "event_ts" in cols:
    silver_d = silver.withColumn("d", F.to_date(F.col("event_ts")))
else:
    # Make the fallback visible; otherwise the report below just shows
    # "None ~ None" with no hint about why.
    print(
        "⚠️ [SILVER] no date column found "
        "(event_date/date/dt/event_dt/event_ts); date-based results below will be NULL"
    )
    silver_d = silver.withColumn("d", F.lit(None).cast("date"))

rng = silver_d.agg(F.min("d").alias("min_date"), F.max("d").alias("max_date")).collect()[0]
print(f"📌 [SILVER] date range = {rng['min_date']} ~ {rng['max_date']}")

print("📌 [SILVER] 최근 날짜별 row 수 (Top 10)")
silver_d.groupBy("d").count().orderBy(F.col("d").desc()).show(10, truncate=False)

print("📌 [SILVER] 샘플 20행")
# Random sample: orders the table by a random key and shows the first 20 rows.
silver.orderBy(F.rand()).show(20, truncate=False)


📌 [SILVER] signalcraft_databricks.default.dlt_silver_watch_events_all rows = 1,048,349
📌 [SILVER] date range = 2025-02-01 ~ 2026-02-20
📌 [SILVER] 최근 날짜별 row 수 (Top 10)
+----------+-----+
|d         |count|
+----------+-----+
|2026-02-20|2651 |
|2026-02-19|3664 |
|2026-02-18|2589 |
|2026-02-17|2703 |
|2026-02-16|2716 |
|2026-02-15|2618 |
|2026-02-14|1679 |
|2026-02-13|4542 |
|2026-02-12|2836 |
|2026-02-11|1940 |
+----------+-----+
only showing top 10 rows
📌 [SILVER] 샘플 20행
+-------------------+-------+-------+------------+--------+-------------------+----------+
|event_ts           |user_id|show_id|session_time|device  |event_ts_ts        |event_date|
+-------------------+-------+-------+------------+--------+-------------------+----------+
|2025-05-05 14:48:15|5212   |s307   |24          |mobile  |2025-05-05 14:48:15|2025-05-05|
|2025-10-02 22:44:17|9462   |s6572  |16          |smart_tv|2025-10-02 22:44:17|2025-10-03|
|2025-06-20 20:27:01|5162   |s590   |9           |mobile  |2025-06-2

In [0]:
# =========================================================
# 2) Gold Snapshot Quick Check
# - event_date range / user counts on the latest date / segment & churn distribution
# =========================================================
snap = spark.table(GOLD_SNAPSHOT)
print(f"📌 [GOLD SNAPSHOT] {GOLD_SNAPSHOT} rows = {snap.count():,}")

snap_range = snap.agg(
    F.min("event_date").alias("min_date"),
    F.max("event_date").alias("max_date")
).collect()[0]

print(f"📌 [GOLD SNAPSHOT] date range = {snap_range['min_date']} ~ {snap_range['max_date']}")
latest_date = snap_range["max_date"]
print("📌 [GOLD SNAPSHOT] latest event_date =", latest_date)

if latest_date is None:
    # max(event_date) is NULL when the table is empty or event_date is NULL on
    # every row. Comparing event_date to NULL matches no rows, so the
    # latest-day reports would print empty results with no explanation.
    print("⚠️ [GOLD SNAPSHOT] no non-null event_date found; skipping latest-day checks")
else:
    # Build the latest-day slice once and reuse it for all three reports.
    latest_snap = snap.filter(F.col("event_date") == latest_date)

    # Key metrics for the latest date.
    (latest_snap
         .agg(
             F.countDistinct("user_id").alias("users"),
             F.sum(F.col("is_active").cast("int")).alias("active_users"),
             F.avg(F.col("daily_watch_time_min")).alias("avg_daily_watch_time_min")
         )
         .show(truncate=False))

    print("📌 [GOLD SNAPSHOT] segment 분포 (latest)")
    (latest_snap
         .groupBy("segment")
         .count()
         .orderBy(F.col("count").desc())
         .show(truncate=False))

    print("📌 [GOLD SNAPSHOT] churn_risk_level 분포 (latest)")
    (latest_snap
         .groupBy("churn_risk_level")
         .count()
         .orderBy(F.col("count").desc())
         .show(truncate=False))


📌 [GOLD SNAPSHOT] signalcraft_databricks.default.dlt_gold_user_behavior_snapshot rows = 3,625,791
📌 [GOLD SNAPSHOT] date range = 2025-02-01 ~ 2026-02-19
📌 [GOLD SNAPSHOT] latest event_date = 2026-02-19
+-----+------------+------------------------+
|users|active_users|avg_daily_watch_time_min|
+-----+------------+------------------------+
|10000|2367        |12.3412                 |
+-----+------------+------------------------+

📌 [GOLD SNAPSHOT] segment 분포 (latest)
+-------+-----+
|segment|count|
+-------+-----+
|Light  |5098 |
|Mid    |4781 |
|Heavy  |121  |
+-------+-----+

📌 [GOLD SNAPSHOT] churn_risk_level 분포 (latest)
+----------------+-----+
|churn_risk_level|count|
+----------------+-----+
|Active          |7660 |
|Churned         |1339 |
|Soft Churn      |558  |
|Dormant         |443  |
+----------------+-----+

