# DAY 11 : Statistical Analysis & ML Prep

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Read 2019-Nov.csv (header row present, column types inferred by Spark), then
# parse the "yyyy-MM-dd HH:mm:ss UTC" event_time string into a real timestamp
# and derive a calendar date from it for day-level analysis.
events = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
    .withColumn("event_time", F.to_timestamp(F.col("event_time"), "yyyy-MM-dd HH:mm:ss 'UTC'"))
    .withColumn("event_date", F.to_date(F.col("event_time")))
)

# Caching is currently disabled; uncomment to keep the frame in memory.
# events.cache()

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the price column
events.describe("price").show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean|292.45931656479536|
| stddev|355.67449958606727|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+



In [0]:
# Flag weekend events and compare event-type counts between weekend and weekdays.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday).
weekday = events.withColumn(
    "is_weekend",
    F.dayofweek(F.col("event_date")).isin(1, 7),
)
(
    weekday
    .groupBy("is_weekend", "event_type")
    .count()
    .show()
)

+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|     false|  purchase|  500258|
|     false|      view|40453993|
|      true|      view|23102117|
|     false|      cart| 1799242|
|      true|      cart| 1229688|
|      true|  purchase|  416681|
+----------+----------+--------+



In [0]:
# Check whether price moves together with the likelihood of an event being a purchase.
# is_purchase is a 0/1 indicator: 1 for "purchase" rows, 0 for everything else.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
corr_df = events.withColumn("is_purchase", purchase_flag)

# Correlation coefficient between the raw price and the purchase indicator
correlation = corr_df.corr("price", "is_purchase")
print(f"Correlation between Price and Purchase: {correlation}")

Correlation between Price and Purchase: 0.0025286683578114658


In [0]:
# Engineer temporal and behavioral features for model training:
#   hour                  - hour of day extracted from event_time
#   day_of_week           - 1 (Sunday) .. 7 (Saturday), from event_date
#   price_log             - log(price + 1); the +1 keeps zero prices finite
#   time_since_first_view - seconds between this event and the user's earliest event
#
# The per-user baseline is min(event_time) over an UNORDERED partition rather
# than first(event_time) over an ordered window. For valid timestamps the two
# agree, but min() ignores nulls, whereas an ascending sort places nulls first,
# so a single unparseable event_time would become the "first" value and null
# out the feature for every row of that user. It also avoids a per-user sort.
#
# Both sides of the subtraction go through unix_timestamp so the result is a
# plain BIGINT number of seconds (avoids a BIGINT vs TIMESTAMP mismatch).
#
# NOTE(review): despite the column name, the baseline is the user's earliest
# event of ANY type, not specifically a "view" event - confirm whether
# restricting to views is intended.
features = (
    events
    .withColumn("hour", F.hour("event_time"))
    .withColumn("day_of_week", F.dayofweek("event_date"))
    .withColumn("price_log", F.log(F.col("price") + 1))
    .withColumn(
        "time_since_first_view",
        F.unix_timestamp("event_time")
        - F.unix_timestamp(F.min("event_time").over(Window.partitionBy("user_id"))),
    )
)

features.select("user_id", "hour", "day_of_week", "price_log", "time_since_first_view").show(5)

+--------+----+-----------+-----------------+---------------------+
| user_id|hour|day_of_week|        price_log|time_since_first_view|
+--------+----+-----------+-----------------+---------------------+
|65800726|   4|          4|4.416428061391214|                    0|
|65800726|   4|          4|4.416428061391214|                  128|
|81255481|   7|          6|4.209902902856373|                    0|
|81255481|  14|          5|4.206779991551889|              1146401|
|82079354|   4|          5|5.155831718251282|                    0|
+--------+----+-----------+-----------------+---------------------+
only showing top 5 rows


In [0]:
# Make sure the destination schema exists (no-op if it already does), then
# persist the engineered features as a managed table. The write replaces any
# existing table contents and is allowed to replace its schema as well.
spark.sql("CREATE SCHEMA IF NOT EXISTS workspace.ml_prep_lab")

(
    features.write
    .mode("overwrite")
    .options(overwriteSchema="true")
    .saveAsTable("workspace.ml_prep_lab.engineered_features")
)

print("Success: Data saved to workspace.ml_prep_lab.engineered_features")

Success: Data saved to workspace.ml_prep_lab.engineered_features
