In [0]:
# Exploratory draft: descriptive stats, weekday/weekend split, correlation,
# and per-event feature columns.
# NOTE(review): this cell uses `events`, `F`, and `Window`, which are only
# defined by later cells (the imports and the table load). On a fresh kernel
# it raises NameError; run those cells first or move this cell below them.

# Descriptive stats
events.describe(["price"]).show()

# Hypothesis: weekday vs weekend conversion
# dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
weekday = events.withColumn(
    "is_weekend",
    F.dayofweek("event_date").isin([1, 7]),
)
weekday.groupBy("is_weekend", "event_type").count().show()

# Correlation
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed.
# NOTE(review): assumes `events` carries a numeric `conversion_rate` column —
# TODO confirm against the table schema.
print(events.stat.corr("price", "conversion_rate"))

# Feature engineering
features = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    .withColumn("price_log", F.log(F.col("price") + 1))
    .withColumn(
        "time_since_first_view",
        # Both operands must be epoch seconds: first() over the window yields
        # the `event_time` value itself, so it is converted before subtracting.
        F.unix_timestamp("event_time")
        - F.unix_timestamp(
            F.first("event_time").over(
                Window.partitionBy("user_id").orderBy("event_time")
            )
        ),
    )
)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
File <command-4886487906999805>, line 2
      1 # Descriptive stats
----> 2 events.describe(["price"]).show()
      4 # Hypothesis: weekday vs weekend conversion
      5 weekday = events.withColumn("is_weekend",
      6     F.dayofweek("event_date").isin([1,7]))

NameError: name 'events' is not defined

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Reuse the active Spark session if one exists; otherwise create it.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)


In [0]:
# Load events table
# NOTE(review): the saved output shows an AnalysisException raised by the
# first action on this frame — confirm that `silver` resolves in the current
# catalog/schema.
events = spark.table("silver")

# Quick sanity check
events.printSchema()

# Basic describe
events.describe(["price"]).show()

# Advanced stats
events.select(
    F.count("price").alias("count"),
    F.mean("price").alias("mean"),
    F.stddev("price").alias("stddev"),
    F.expr("percentile_approx(price, 0.5)").alias("median"),
    # The array form returns the two quartiles [Q1, Q3]; label it as such.
    F.expr("percentile_approx(price, array(0.25, 0.75))").alias("quartiles"),
    # Interquartile range proper: Q3 - Q1.
    F.expr(
        "percentile_approx(price, 0.75) - percentile_approx(price, 0.25)"
    ).alias("iqr"),
).show()

# Peek at a handful of raw rows.
events.limit(5).show()


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4886487906999808>, line 5[0m
[1;32m      2[0m events [38;5;241m=[39m spark[38;5;241m.[39mtable([38;5;124m"[39m[38;5;124msilver[39m[38;5;124m"[39m)
[1;32m      4[0m [38;5;66;03m# Quick sanity check[39;00m
[0;32m----> 5[0m events[38;5;241m.[39mprintSchema()[38;5;66;03m# Basic describe[39;00m
[1;32m      6[0m events[38;5;241m.[39mdescribe([[38;5;124m"[39m[38;5;124mprice[39m[38;5;124m"[39m])[38;5;241m.[39mshow()
[1;32m      8[0m [38;5;66;03m# Advanced stats[39;00m

File [0;32m/databricks/python/lib/python3.12/site-packages/pyspark/sql/connect/dataframe.py:1975[0m, in [0;36mDataFrame.printSchema[0;34m(self, level)[0m
[1;32m   1973[0m     [38;5;28mprint[39m([38;5;28mself[39m[38;5;241m.[39mschema[38;5;241m.[39mtreeString(level))
[1;32m   1974[0m 

In [0]:
# Clean the events and derive calendar fields.
# Order matters: rows lacking a timestamp or price are dropped first, then
# `event_date` is derived from `event_time`, and only then is `is_weekend`
# computed, so the flag is based on the same `event_date` that downstream
# cells see rather than on whatever the column held (if anything) beforehand.
events = (
    events
    .filter(F.col("event_time").isNotNull())
    .filter(F.col("price").isNotNull())
    .withColumn("event_date", F.to_date("event_time"))
    .withColumn(
        "is_weekend",
        F.dayofweek("event_date").isin([1, 7]),  # Sun, Sat
    )
)


In [0]:
# Row counts for each (is_weekend, event_type) pair, sorted by is_weekend.
(
    events
    .groupBy("is_weekend", "event_type")
    .count()
    .orderBy("is_weekend")
    .show()
)


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4886487906999810>, line 1[0m
[0;32m----> 1[0m events[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124mis_weekend[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mevent_type[39m[38;5;124m"[39m)[38;5;241m.[39mcount()[38;5;241m.[39morderBy([38;5;124m"[39m[38;5;124mis_weekend[39m[38;5;124m"[39m)[38;5;241m.[39mshow()

File [0;32m/databricks/python/lib/python3.12/site-packages/pyspark/sql/connect/dataframe.py:1123[0m, in [0;36mDataFrame.show[0;34m(self, n, truncate, vertical)[0m
[1;32m   1122[0m [38;5;28;01mdef[39;00m [38;5;21mshow[39m([38;5;28mself[39m, n: [38;5;28mint[39m [38;5;241m=[39m [38;5;241m20[39m, truncate: Union[[38;5;28mbool[39m, [38;5;28mint[39m] [38;5;241m=[39m [38;5;28;01mTrue[39;00m, vertical: [38;5;28mbool[39m [38;5;241m=[39m [38

In [0]:
# Weekend vs weekday conversion: number of purchase events divided by the
# number of distinct users in each bucket.
# NOTE(review): the numerator counts purchase events, not purchasing users,
# so the ratio can exceed 1 when users buy more than once — confirm this is
# the intended definition.
conversion = (
    events
    .groupBy("is_weekend")
    .agg(
        F.sum(
            F.when(F.col("event_type") == "purchase", 1).otherwise(0)
        ).alias("purchases"),
        F.countDistinct("user_id").alias("users"),
    )
    .withColumn("conversion_rate", F.col("purchases") / F.col("users"))
)

conversion.show()


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4886487906999811>, line 9[0m
[1;32m      1[0m conversion [38;5;241m=[39m events[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124mis_weekend[39m[38;5;124m"[39m)[38;5;241m.[39magg(
[1;32m      2[0m     F[38;5;241m.[39msum(F[38;5;241m.[39mwhen(F[38;5;241m.[39mcol([38;5;124m"[39m[38;5;124mevent_type[39m[38;5;124m"[39m) [38;5;241m==[39m [38;5;124m"[39m[38;5;124mpurchase[39m[38;5;124m"[39m, [38;5;241m1[39m)[38;5;241m.[39motherwise([38;5;241m0[39m))[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mpurchases[39m[38;5;124m"[39m),
[1;32m      3[0m     F[38;5;241m.[39mcountDistinct([38;5;124m"[39m[38;5;124muser_id[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124musers[39m[38;5;124m"[39m)
[0;32m   (...)[0m
[1;32m      6[0m  

In [0]:
# Flag each event row: 1 when it is a purchase, 0 for anything else
# (including rows whose event_type is null).
events = events.withColumn(
    "converted",
    F.when(F.col("event_type") == "purchase", F.lit(1)).otherwise(F.lit(0)),
)


In [0]:
# Per-user window in chronological order, shared by the window expressions below.
user_window = Window.partitionBy("user_id").orderBy("event_time")

# Event-level features: time-of-day / day-of-week, log-scaled price, seconds
# elapsed since the user's first event, and the event's position in the
# user's ordered sequence.
features = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    .withColumn("price_log", F.log(F.col("price") + 1))
    .withColumn(
        "time_since_first_event",
        F.unix_timestamp("event_time")
        - F.unix_timestamp(F.first("event_time").over(user_window)),
    )
    .withColumn("event_step", F.row_number().over(user_window))
)


In [0]:
# Preview ten rows of the engineered feature columns without truncating values.
(
    features
    .select(
        "user_id",
        "event_time",
        "event_type",
        "hour",
        "day_of_week",
        "price",
        "price_log",
        "time_since_first_event",
        "event_step",
        "converted",
    )
    .limit(10)
    .show(truncate=False)
)


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4886487906999814>, line 8[0m
[1;32m      1[0m features[38;5;241m.[39mselect(
[1;32m      2[0m     [38;5;124m"[39m[38;5;124muser_id[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mevent_time[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mevent_type[39m[38;5;124m"[39m,
[1;32m      3[0m     [38;5;124m"[39m[38;5;124mhour[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mday_of_week[39m[38;5;124m"[39m,
[1;32m      4[0m     [38;5;124m"[39m[38;5;124mprice[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mprice_log[39m[38;5;124m"[39m,
[1;32m      5[0m     [38;5;124m"[39m[38;5;124mtime_since_first_event[39m[38;5;124m"[39m,
[1;32m      6[0m     [38;5;124m"[39m[38;5;124mevent_step[39m[38;5;124m"[39m,
[1;32m      7[0m     [38;5;124m"[39m[38;5;124mc

In [0]:
# Collapse event-level features to one row per user.
# `label` is 1 if the user has at least one purchase event, else 0.
# NOTE(review): `event_step` is a per-user row_number, so `session_depth`
# (its max) equals `event_count` for every user — confirm both are wanted.
user_features = (
    features
    .groupBy("user_id")
    .agg(
        F.max("converted").alias("label"),
        F.avg("price").alias("avg_price"),
        F.max("price").alias("max_price"),
        F.count("*").alias("event_count"),
        F.max("event_step").alias("session_depth"),
        F.avg("time_since_first_event").alias("avg_time_to_action"),
        F.avg("hour").alias("avg_hour"),
        F.avg("day_of_week").alias("avg_day"),
    )
)

user_features.show(10)


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4886487906999815>, line 12[0m
[1;32m      1[0m user_features [38;5;241m=[39m features[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124muser_id[39m[38;5;124m"[39m)[38;5;241m.[39magg(
[1;32m      2[0m     F[38;5;241m.[39mmax([38;5;124m"[39m[38;5;124mconverted[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mlabel[39m[38;5;124m"[39m),
[1;32m      3[0m     F[38;5;241m.[39mavg([38;5;124m"[39m[38;5;124mprice[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mavg_price[39m[38;5;124m"[39m),
[0;32m   (...)[0m
[1;32m      9[0m     F[38;5;241m.[39mavg([38;5;124m"[39m[38;5;124mday_of_week[39m[38;5;124m"[39m)[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mavg_day[39m[38;5;124m"[39m)
[1;32m     10[0m )
[0;32m-