In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType

# Target table coordinates (catalog.schema.table).
CATALOG = "signalcraft_databricks"
SCHEMA = "default"
HISTORY_TABLE = f"{CATALOG}.{SCHEMA}.bronze_watch_history"

# Source folder (abfss URI) containing the watch-event CSV files.
base = "abfss://signalcraft-data@signalcraftstorage.dfs.core.windows.net/watch_event_log/"

# Read every CSV column as a nullable string for safety; values are parsed
# into proper types only after the load.
_CSV_COLUMNS = ("event_ts", "user_id", "show_id", "session_time", "device")
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in _CSV_COLUMNS]
)

# The files carry a header row; the explicit schema above is applied instead
# of letting Spark infer types.
csv_reader = spark.read.format("csv").option("header", "true").schema(schema)
raw = csv_reader.load(base)

# Build the enriched frame in a single chain from `raw`, so re-running this
# cell always starts from the same input instead of mutating `df` step by step.
df = (
    raw.select(
        # Strip surrounding whitespace from every source column.
        *[F.trim(F.col(c)).alias(c) for c in raw.columns],
        # Hidden file-source metadata column: full path of the originating file.
        F.col("_metadata.file_path").alias("src_file"),
    )
    # File name pattern: watch_event_log_YYYYMMDD.csv.
    # regexp_extract yields "" when the pattern does not match.
    .withColumn(
        "file_yyyymmdd",
        F.regexp_extract("src_file", r"watch_event_log_(\d{8})\.csv", 1),
    )
    # Convert only when 8 digits were extracted; otherwise file_date is NULL.
    # try_to_timestamp (instead of to_date) returns NULL rather than raising
    # for digit strings that are not real dates (e.g. 20250231), which
    # to_date would reject with an error when ANSI mode is enabled.
    .withColumn(
        "file_date",
        F.when(
            F.length("file_yyyymmdd") == 8,
            F.expr("CAST(try_to_timestamp(file_yyyymmdd, 'yyyyMMdd') AS DATE)"),
        ),
    )
    # event_ts may arrive in several formats: try each in turn and keep the
    # first successful parse. event_ts is already trimmed by the select above.
    .withColumn(
        "event_ts_parsed",
        F.coalesce(
            F.expr("try_to_timestamp(event_ts, 'yyyy-MM-dd HH:mm:ss.SSS')"),
            F.expr("try_to_timestamp(event_ts, 'yyyy-MM-dd HH:mm:ss')"),
            F.expr("try_to_timestamp(event_ts)"),  # last-resort default parsing
        ),
    )
)

# Keep the filter deliberately loose: a row is loaded as long as its timestamp
# parsed and user_id is present.
clean = df.filter(
    F.col("event_ts_parsed").isNotNull() & F.col("user_id").isNotNull()
)

# Replace the raw string event_ts with the parsed timestamp under the same name.
final_df = (
    clean
    .drop("event_ts")
    .withColumnRenamed("event_ts_parsed", "event_ts")
)

# Replace the contents of the Delta history table with this run's result.
history_writer = final_df.write.format("delta").mode("overwrite")
history_writer.saveAsTable(HISTORY_TABLE)

print("✅ saved:", HISTORY_TABLE)


✅ saved: signalcraft_databricks.default.bronze_watch_history


In [0]:
%sql
/* Check the date range covered by the loaded history data */
SELECT
  MIN(h.file_date) AS min_file_date,
  MAX(h.file_date) AS max_file_date
FROM signalcraft_databricks.default.bronze_watch_history AS h;

min_file_date,max_file_date
2025-02-01,2026-02-10


In [0]:
%sql
/* Count rows whose file date is missing */
SELECT
  COUNT(*) AS null_file_date
FROM signalcraft_databricks.default.bronze_watch_history AS h
WHERE h.file_date IS NULL;

null_file_date
0
