In [0]:
## Calculate statistical summaries

# Fully qualified name of the events table that the cells below analyse.
EVENTS_TABLE = "ecommerce.silver.events"

# Lazy handle on the table; nothing is read until an action runs.
events = spark.table(EVENTS_TABLE)


In [0]:
# count / mean / stddev / min / max for the price column only.
price_summary = events.describe("price")
price_summary.show()


+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|         42448764|
|   mean|290.3236606849907|
| stddev|358.2691553394021|
|    min|              0.0|
|    max|          2574.07|
+-------+-----------------+



In [0]:
## Hypothesis Testing (Weekday vs Weekend)

from pyspark.sql import functions as F

# Calendar date of each event, taken from its timestamp.
event_date_col = F.to_date("event_time")

# dayofweek() numbers days 1 (Sunday) through 7 (Saturday), so the
# values 1 and 7 together mark the weekend.
weekend_flag = F.dayofweek("event_date").isin([1, 7])

weekday = (
    events
    .withColumn("event_date", event_date_col)
    .withColumn("is_weekend", weekend_flag)
)

# Number of events for every (is_weekend, event_type) combination.
weekday.groupBy("is_weekend", "event_type").count().show()



+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|     false|  purchase|  546439|
|     false|      view|29775216|
|     false|      cart|  664318|
|      true|      view|11004183|
|      true|      cart|  262198|
|      true|  purchase|  196410|
+----------+----------+--------+



In [0]:
## Identify Correlations

# Per-row indicator: 1 when the event is a purchase, 0 otherwise.
# NOTE(review): despite the column name this is a 0/1 flag, not a rate.
is_purchase = F.col("event_type") == "purchase"
purchase_flag = F.when(is_purchase, 1).otherwise(0)

events = events.withColumn("conversion_rate", purchase_flag)

In [0]:
# Pearson correlation between price and the 0/1 purchase flag.
events.corr("price", "conversion_rate", method="pearson")


0.007166380934035528

In [0]:
## Feature Engineering for Machine Learning

from pyspark.sql.window import Window

# All rows belonging to one user. No ordering is required because the
# earliest event is found with min(), which also skips null timestamps.
user_window = Window.partitionBy("user_id")

# Event time as epoch seconds, so the elapsed-time subtraction below is
# bigint - bigint instead of bigint - timestamp (a type mismatch).
event_epoch = F.unix_timestamp("event_time")

features = (
    events
    .withColumn("hour", F.hour("event_time"))
    # `events` has no "event_date" column (it was only added to `weekday`),
    # so the date is derived from "event_time" here.
    .withColumn("day_of_week", F.dayofweek(F.to_date("event_time")))
    .withColumn("price_log", F.log(F.col("price") + 1))
    # Seconds elapsed since the user's earliest event of any type.
    # NOTE(review): the column name says "first_view" but event_type is
    # not filtered to views — confirm which is intended.
    .withColumn(
        "time_since_first_view",
        event_epoch - F.min(event_epoch).over(user_window),
    )
)
