In [0]:
# Notebook-wide configuration:
#   RAW_TABLE - fully qualified name of the table the raw events are saved to.
#   RAW_FILE  - path of the raw CSV export that is loaded below.
RAW_TABLE = "workspace.default.raw_events_2019_dec"
RAW_FILE = "/Volumes/workspace/default/raw/2019-Dec.csv"

# Echo the input path so the run output records which file was used.
print("RAW_FILE =", RAW_FILE)


In [0]:
from pyspark.sql import functions as F

# Exploratory load of the raw CSV: the first row is treated as the header and
# Spark infers the column types from the data.
df = spark.read.options(header="true", inferSchema="true").csv(RAW_FILE)

# Sanity checks: total row count, a five-row preview, and the inferred schema.
print("Rows:", df.count())
display(df.limit(5))
df.printSchema()


In [0]:
from pyspark.sql import functions as F

# Make sure the workspace.default schema exists before anything is written to it.
spark.sql("CREATE DATABASE IF NOT EXISTS workspace.default")

# Read the CSV again so this cell starts from a fresh, consistent load.
csv_options = {"header": "true", "inferSchema": "true"}
df = spark.read.options(**csv_options).csv(RAW_FILE)

# Derive event_date from event_time; it is the partition column used when the
# table is written and the key for daily slicing.
df_raw = df.withColumn("event_date", F.to_date("event_time"))

# Preview the source timestamp next to the derived date.
date_preview = df_raw.select("event_time", "event_date")
display(date_preview.limit(5))


In [0]:
# Create the RAW Delta table, partitioned by day (event_date).
# mode("overwrite") replaces the existing table contents, so re-running this
# cell rebuilds the table instead of appending duplicate rows.
(df_raw.write
 .format("delta")
 .mode("overwrite")
 .partitionBy("event_date")
 .saveAsTable(RAW_TABLE))

# Confirmation message once the write has finished.
print("✅ RAW Delta table created:", RAW_TABLE)


In [0]:
# Re-open the saved table so the checks run against what was written to it.
raw_tbl = spark.table(RAW_TABLE)

print("RAW_TABLE rows:", raw_tbl.count())

# Earliest and latest event_time plus the number of distinct event_date values,
# computed as one global aggregate (a single output row).
coverage_exprs = [
    F.min("event_time").alias("min_event_time"),
    F.max("event_time").alias("max_event_time"),
    F.countDistinct("event_date").alias("distinct_days"),
]
raw_tbl.agg(*coverage_exprs).show(truncate=False)


In [0]:
# Row count per event_date, in date order; only the first 10 dates are printed.
daily_counts = raw_tbl.groupBy("event_date").count()
daily_counts.orderBy("event_date").show(10, truncate=False)
