### Import required functions and libraries

In [0]:
from pyspark.sql import functions as f
from pyspark.sql import Window as w

### Descriptive Statistics

In [0]:
# Handle to the cleaned events table that every later cell builds on.
events = spark.table("workspace.ecommerce_analysis.refined_events")

# Descriptive statistics for the price column, computed in a single pass.
# The row count is rendered through format_number (a string column); the
# remaining statistics are rounded to two decimal places.
price = f.col("price")
summary_columns = [
    f.format_number(f.count(price), 0).alias("count"),
    f.round(f.mean(price), 2).alias("mean_price"),
    f.round(f.stddev(price), 2).alias("stddev_price"),
    f.round(f.min(price), 2).alias("min_price"),
    f.round(f.max(price), 2).alias("max_price"),
]
stats_summary = events.select(*summary_columns)

display(stats_summary)

count,mean_price,stddev_price,min_price,max_price
67392950,292.49,355.74,0.0,2574.07


### Hypothesis Testing (Weekday vs Weekend)

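`is_weekend = True` marks weekend events (Sat/Sun), which show a higher conversion rate and therefore stronger buying intent; `is_weekend = False` marks weekday events, which account for more total traffic but convert at a lower rate, suggesting more casual browsing.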

In [0]:
# Define the behavioural segments.
# Spark's dayofweek() numbers days 1 (Sunday) through 7 (Saturday), so the
# set {1, 7} selects weekend events. is_purchase is a 0/1 flag per event so
# that its average is the share of events that are purchases.
hypo_df = (
    events
    .withColumn("is_weekend", f.dayofweek("event_timestamp").isin([1, 7]))
    .withColumn("is_purchase", f.when(f.col("event_type") == "purchase", 1).otherwise(0))
)

# Calculate the formatted results: per segment, the total number of events
# and the purchase share. format_number returns string columns, so this
# frame is meant for display rather than further arithmetic.
hypothesis_results = (
    hypo_df
    .groupBy("is_weekend")
    .agg(
        f.format_number(f.count("*"), 0).alias("total_events"),
        f.format_number(f.avg("is_purchase"), 4).alias("conversion_rate"),
    )
)

# Must start at column 0: a leading space on a top-level statement raises
# IndentationError and prevents the whole cell from running.
display(hypothesis_results)


is_weekend,total_events,conversion_rate
True,24704292,0.0169
False,42688658,0.0117


### Identify Correlations

In [0]:
# Pearson correlation needs two numeric columns, so encode 'event_type' as a
# binary label: 1 for a purchase event, 0 for anything else.
# NOTE: the column is named 'conversion_rate' (later cells select it by that
# name), but it holds a per-event 0/1 indicator, not an aggregated rate.
events_with_rate = hypo_df.withColumn(
    "conversion_rate",
    f.when(f.col("event_type") == "purchase", 1).otherwise(0),
)

# Measure the linear relationship between product price and the binary
# conversion indicator. stat.corr is an action that scans the data, so it is
# evaluated exactly once here.
correlation_val = events_with_rate.stat.corr("price", "conversion_rate")

# Output the result
print(f"Correlation: {round(correlation_val, 4)}")

Correlation: 0.0025


In [0]:
# Per-user window with events ordered chronologically.
ws = w.partitionBy("user_id").orderBy("event_timestamp")

# Gap between each event and the first event in that user's ordered window,
# as a difference of unix_timestamp values.
first_event_ts = f.first("event_timestamp").over(ws)
elapsed_since_first = (
    f.unix_timestamp("event_timestamp") - f.unix_timestamp(first_event_ts)
)

# Build the signals (features): calendar parts of the timestamp, a
# log1p-transformed price rounded to 4 decimals, and the elapsed time above.
features = (
    events_with_rate
    .withColumn("hour", f.hour("event_timestamp"))
    .withColumn("day_of_week", f.dayofweek("event_timestamp"))
    .withColumn("price_log", f.round(f.log1p(f.col("price")), 4))
    .withColumn("time_since_first_view", elapsed_since_first)
)

# Preview the final feature vector alongside the binary label.
feature_columns = [
    "user_id",
    "hour",
    "day_of_week",
    "price_log",
    "time_since_first_view",
    "conversion_rate",
]
display(features.select(*feature_columns).limit(5))

user_id,hour,day_of_week,price_log,time_since_first_view,conversion_rate
65800726,4,4,4.4164,0,0
65800726,4,4,4.4164,128,0
81255481,7,6,4.2099,0,0
81255481,14,5,4.2068,1146401,0
106416780,5,5,5.5511,0,0
