In [0]:
# Summary statistics for the price column
# The events data is read from its volume path with the session's default data source format.
BRONZE_EVENTS_PATH = '/Volumes/workspace/ecommerce/ecommerce_data/bronze_events/'
events = spark.read.load(BRONZE_EVENTS_PATH)

# count / mean / stddev / min / max for "price", printed as a text table
events.describe("price").show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean| 292.4593165647889|
| stddev|355.67449958606727|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+



In [0]:
# Correlation between two existing numeric columns
# ('conversion_rate' is not a column of `events`, so 'category_id' is used as the second column.)
# NOTE(review): 'category_id' looks like a nominal identifier; a correlation
# coefficient against it is probably not interpretable — confirm or pick a true numeric measure.
corr_left, corr_right = "price", "category_id"
events.stat.corr(corr_left, corr_right)

-0.05718421046857468

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Feature engineering
# Builds `features` from `events` by adding:
#   hour                  - hour of day extracted from event_time
#   day_of_week           - day of week extracted from event_date
#   price_log             - log(1 + price)
#   time_since_first_view - seconds between this event and the user's earliest event

# Per-user window used to look up each user's earliest event_time.
# No ordering is required: min() scans the whole partition and ignores nulls,
# whereas first() over an ascending sort would return a null if any
# event_time in the partition is null (nulls sort first).
user_window = Window.partitionBy("user_id")

features = (
    events
    .withColumn("hour", F.hour("event_time"))
    # NOTE(review): this reads "event_date" while every other time feature reads
    # "event_time" — confirm the column exists and agrees with event_time.
    .withColumn("day_of_week", F.dayofweek("event_date"))
    # log1p(x) == log(1 + x); a price of 0 maps to 0 instead of -infinity.
    .withColumn("price_log", F.log1p(F.col("price")))
    # Both operands are converted to epoch seconds before subtracting. The
    # previous expression subtracted a raw timestamp from a bigint, which is a
    # type mismatch. The reference point is the user's earliest event of any
    # kind; no event-type filter is applied here.
    .withColumn(
        "time_since_first_view",
        F.unix_timestamp("event_time")
        - F.unix_timestamp(F.min("event_time").over(user_window)),
    )
)

In [0]:
# Weekday vs. weekend price comparison
# Descriptive only: this cell reports the mean price and the row count per
# group; it does not compute a test statistic or p-value.
import pyspark.sql.functions as F

# Tag every event with its day of week and a weekend/weekday label.
# Days 1 and 7 are treated as the weekend (dayofweek() numbers Sunday as 1 and Saturday as 7).
events_with_day = (
    events
    .withColumn("day_of_week", F.dayofweek("event_time"))
    .withColumn(
        "is_weekend",
        F.when(F.col("day_of_week").isin(1, 7), "weekend").otherwise("weekday"),
    )
)

# Both summaries share the same grouping.
by_day_type = events_with_day.groupBy("is_weekend")

# Mean price per group
mean_prices = by_day_type.agg(F.avg("price").alias("mean_price"))
display(mean_prices)

# Number of events per group
counts = by_day_type.count()
display(counts)

is_weekend,mean_price
weekend,294.6800148352487
weekday,291.17383282528664


is_weekend,count
weekend,24748486
weekday,42753493
