In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Load the silver-layer events table and take a first look at it:
# total row count, column types, and a few untruncated sample rows.
events = spark.table("ecommerce.silver.events")

row_count = events.count()
print("Rows:", row_count)

events.printSchema()
events.show(n=5, truncate=False)


Rows: 42341904
root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)
 |-- source_file: string (nullable = true)
 |-- event_date: date (nullable = true)
 |-- price_tier: string (nullable = true)

+-------------------+----------+----------+-------------------+---------------------------+-------+------+---------+------------------------------------+--------------------------+-----------+----------+----------+
|event_time         |event_type|product_id|category_id        |category_code              |brand  |price |user_id  |user_session                        |ingestion_ts              |source_file|event_date|price_tier|
+--

In [0]:
# Distribution of `price`: count, mean/stddev, extremes and quartiles.
price_stats = ("count", "mean", "stddev", "min", "25%", "50%", "75%", "max")
events.select("price").summary(*price_stats).show()

# Total row count next to the number of rows with a missing price.
# isNull() is cast to 0/1 so that summing it counts the null rows.
price_is_missing = F.col("price").isNull().cast("int")
events.agg(
    F.count("*").alias("rows"),
    F.sum(price_is_missing).alias("price_nulls"),
).show()


+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|         42341904|
|   mean|290.7812638390561|
| stddev|358.3993774731591|
|    min|             0.77|
|    25%|            66.64|
|    50%|            163.9|
|    75%|           358.57|
|    max|          2574.07|
+-------+-----------------+

+--------+-----------+
|    rows|price_nulls|
+--------+-----------+
|42341904|          0|
+--------+-----------+



In [0]:
# Flag weekend events. Spark's dayofweek() numbers days from 1 (Sunday) to
# 7 (Saturday), so the values 1 and 7 pick out the weekend.
weekend_flag = F.dayofweek("event_date").isin([1, 7])
events_w = events.withColumn("is_weekend", weekend_flag)

# Per-group event counters: a row contributes 1 when its event_type matches
# and 0 otherwise.
view_count = F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0))
purchase_count = F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0))

# Purchases per 100 views, rounded to 2 decimals. nullif() turns a zero view
# count into NULL so the ratio becomes NULL instead of dividing by zero.
conversion_pct = F.round(
    F.col("purchases") * 100.0 / F.nullif(F.col("views"), F.lit(0)),
    2,
)

# View -> purchase conversion for weekend vs. weekday, pooled over all events.
funnel_week = (
    events_w
    .groupBy("is_weekend")
    .agg(
        view_count.alias("views"),
        purchase_count.alias("purchases"),
    )
    .withColumn("conversion_rate_pct", conversion_pct)
)

funnel_week.show()


+----------+--------+---------+-------------------+
|is_weekend|   views|purchases|conversion_rate_pct|
+----------+--------+---------+-------------------+
|      true|10992259|   196402|               1.79|
|     false|29711315|   546365|               1.84|
+----------+--------+---------+-------------------+



In [0]:
import numpy as np

# 1) Daily conversion (purchases / views per day), labelled weekend/weekday.
#    The unit of analysis is one calendar day; days without any views are
#    dropped so the ratio is always defined.
daily_conv = (events_w
  .groupBy("event_date","is_weekend")
  .agg(
      F.sum(F.when(F.col("event_type")=="view", 1).otherwise(0)).alias("views"),
      F.sum(F.when(F.col("event_type")=="purchase", 1).otherwise(0)).alias("purchases")
  )
  .where(F.col("views") > 0)
  .withColumn("conv", F.col("purchases") / F.col("views"))
  .select("is_weekend","conv")
)

# One row per day, so collecting to the driver is cheap.
data = daily_conv.toPandas()
weekend = data[data["is_weekend"]==True]["conv"].to_numpy()
weekday = data[data["is_weekend"]==False]["conv"].to_numpy()

# Guard: with an empty group the mean is NaN, every comparison in the loop
# below is False, and the p-value would collapse to 1 / (N + 1) -- a result
# that looks highly significant but is meaningless. Fail loudly instead.
if weekend.size == 0 or weekday.size == 0:
    raise ValueError(
        "Permutation test needs at least one weekend day and one weekday "
        f"(got {weekend.size} weekend, {weekday.size} weekday)"
    )

# Observed statistic: difference of mean daily conversion, weekend - weekday.
observed = weekend.mean() - weekday.mean()

# 2) Permutation test (two-sided): repeatedly reassign the weekend/weekday
#    labels at random and count how often the shuffled difference is at least
#    as large in magnitude as the observed one.
combined = np.concatenate([weekend, weekday])
n_w = len(weekend)

rng = np.random.default_rng(42)  # fixed seed so the p-value is reproducible
N = 10000                        # number of random relabellings
count_extreme = 0

for _ in range(N):
    rng.shuffle(combined)        # in-place shuffle of the pooled daily rates
    w = combined[:n_w]           # first n_w values play the "weekend" role
    d = combined[n_w:]           # the remainder play the "weekday" role
    diff = w.mean() - d.mean()
    if abs(diff) >= abs(observed):
        count_extreme += 1

# The +1 in numerator and denominator counts the observed labelling itself,
# so the estimated p-value can never be exactly zero.
p_value = (count_extreme + 1) / (N + 1)

observed, p_value


(np.float64(-0.0004758526391207378), 0.38176182381761825)

In [0]:
# Event-level correlation between price and a 0/1 purchase indicator.
# Both inputs are cast to double; rows with a missing price are excluded.
price_value = F.col("price").cast("double").alias("price")
purchase_indicator = (
    (F.col("event_type") == "purchase").cast("double").alias("is_purchase")
)

corr_df = (
    events
    .select(price_value, purchase_indicator)
    .filter(F.col("price").isNotNull())
)

corr = corr_df.stat.corr("price", "is_purchase")
corr


0.0070015989652334595

In [0]:
# Product-level view: correlate each product's average price with its
# purchases-per-view ratio. Products with no views are dropped because
# their conversion ratio is undefined.
view_hit = F.when(F.col("event_type") == "view", 1).otherwise(0)
purchase_hit = F.when(F.col("event_type") == "purchase", 1).otherwise(0)

by_product = (
    events
    .groupBy("product_id")
    .agg(
        F.avg("price").alias("avg_price"),
        F.sum(view_hit).alias("views"),
        F.sum(purchase_hit).alias("purchases"),
    )
    .filter(F.col("views") > 0)
    .withColumn("product_conv", F.col("purchases") / F.col("views"))
)

price_vs_conv = by_product.select("avg_price", "product_conv")
corr2 = price_vs_conv.stat.corr("avg_price", "product_conv")
corr2


-0.044202943113122735

In [0]:
# Event-level feature frame: calendar features derived from the event
# timestamp/date, plus a log-scaled price.
hour_of_day = F.hour("event_time")
weekday_number = F.dayofweek("event_date")
# Refers to the day_of_week column added just before it in the chain;
# 1 and 7 are Sunday and Saturday in Spark's dayofweek() numbering.
is_weekend_expr = F.col("day_of_week").isin([1, 7])
# Natural log of (price + 1).
log_price = F.log(F.col("price") + F.lit(1.0))

feat = (
    events
    .withColumn("hour", hour_of_day)
    .withColumn("day_of_week", weekday_number)
    .withColumn("is_weekend", is_weekend_expr)
    .withColumn("price_log", log_price)
)


In [0]:
# Per-user window ordered by event time (not referenced in this cell).
w_user = Window.partitionBy("user_id").orderBy("event_time")
# Per-user window without ordering: an aggregate over it covers all of the
# user's events, so min(event_time) is the user's first event in the data.
w_user_all = Window.partitionBy("user_id")

# Whole seconds between this event and the user's first event.
elapsed_sec = (
    F.unix_timestamp("event_time") - F.unix_timestamp("first_event_time")
)

feat = (
    feat
    # Helper column, only needed to compute the elapsed time; dropped below.
    .withColumn("first_event_time", F.min("event_time").over(w_user_all))
    .withColumn("time_since_first_event_sec", elapsed_sec)
    .drop("first_event_time")
)

preview_cols = ["user_id", "event_time", "time_since_first_event_sec"]
feat.select(*preview_cols).show(5, truncate=False)


+---------+-------------------+--------------------------+
|user_id  |event_time         |time_since_first_event_sec|
+---------+-------------------+--------------------------+
|205053188|2019-10-09 10:30:19|0                         |
|205053188|2019-10-09 10:30:44|25                        |
|209714031|2019-10-29 19:42:57|781992                    |
|209714031|2019-10-29 19:56:17|782792                    |
|209714031|2019-10-29 19:51:15|782490                    |
+---------+-------------------+--------------------------+
only showing top 5 rows


In [0]:
# Persist the feature frame as a Delta table, replacing any previous
# contents, then read the table back and show a few rows as a sanity check.
features_table = "ecommerce.silver.events_features"

feat.write.saveAsTable(features_table, format="delta", mode="overwrite")

spark.table(features_table).show(5)


+-------------------+----------+----------+-------------------+-------------+-----+------+---------+--------------------+--------------------+-----------+----------+----------+----+-----------+----------+-----------------+--------------------------+
|         event_time|event_type|product_id|        category_id|category_code|brand| price|  user_id|        user_session|        ingestion_ts|source_file|event_date|price_tier|hour|day_of_week|is_weekend|        price_log|time_since_first_event_sec|
+-------------------+----------+----------+-------------------+-------------+-----+------+---------+--------------------+--------------------+-----------+----------+----------+----+-----------+----------+-----------------+--------------------------+
|2019-10-20 03:19:52|      view|  15400069|2070005009256284935|         NULL|intex| 74.65|279068900|48123886-e9cc-449...|2026-01-22 06:01:...|       NULL|2019-10-20|   premium|   3|          1|      true|4.326117440234365|                         0|
