In [19]:
# Install and setup
!pip install pyspark -q
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [20]:
# Initialize Spark
spark = SparkSession.builder.appName("Ecommerce").getOrCreate()

***TASK 1: Load full e-commerce dataset***

In [21]:
# 1. Load dataset
df = spark.read.csv("/content/ecommerce_DS.csv", header=True, inferSchema=True)
print(f"Loaded: {df.count()} rows, {len(df.columns)} cols")
df.show(3)

Loaded: 5000 rows, 9 cols
+-------------------+----------+----------+-----------+--------------------+-------+------+---------+------------+
|         event_time|event_type|product_id|category_id|       category_code|  brand| price|  user_id|user_session|
+-------------------+----------+----------+-----------+--------------------+-------+------+---------+------------+
|2019-10-11 23:15:00|      view|  22081788| 2013315092|  computers.notebook|samsung|208.95|526301898|     27b42e3|
|2019-10-12 03:03:00|  purchase|  22300885| 2004981505|furniture.living_...|  creed| 11.55|533657787|     52de3b9|
|2019-10-20 22:13:00|      view|  44250411| 2099052376|electronics.smart...|samsung| 771.1|558574328|     47cc302|
+-------------------+----------+----------+-----------+--------------------+-------+------+---------+------------+
only showing top 3 rows


***TASK 2: Perform Complex JoinsTASK 2: Perform Complex Joins***

In [23]:
# 2. Simple joins
# Create user summary
from pyspark.sql.functions import countDistinct
user_agg = df.groupBy("user_id").agg(
    count("*").alias("total_events"),
    countDistinct("user_session").alias("total_sessions"),
    sum(when(col("event_type") == "purchase", 1).otherwise(0)).alias("user_purchases")
)

In [24]:
# Create product summary
product_agg = df.groupBy("product_id", "brand").agg(
    count("*").alias("product_views"),
    sum(when(col("event_type") == "purchase", 1).otherwise(0)).alias("product_purchases"),
    avg("price").alias("avg_price")
)

In [25]:
# Join everything
joined_df = df.join(user_agg, "user_id", "left") \
              .join(product_agg, ["product_id", "brand"], "left")

print("Joined data sample:")
joined_df.select("user_id", "product_id", "event_type", "user_purchases", "product_views").show(5)

Joined data sample:
+---------+----------+----------+--------------+-------------+
|  user_id|product_id|event_type|user_purchases|product_views|
+---------+----------+----------+--------------+-------------+
|526301898|  22081788|      view|             0|            1|
|533657787|  22300885|  purchase|             1|            1|
|558574328|  44250411|      view|             0|            1|
|520193880|  37217478|  purchase|             1|            1|
|511392366|  34237768|      cart|             0|            1|
+---------+----------+----------+--------------+-------------+
only showing top 5 rows


***TASK 3: Calculate Running Totals with Window Functions***

In [26]:
# 3. Window functions
window_user = Window.partitionBy("user_id").orderBy("event_time")
window_product = Window.partitionBy("product_id").orderBy("event_time")

df_windowed = joined_df \
    .withColumn("user_event_seq", row_number().over(window_user)) \
    .withColumn("time_since_last_event",
                unix_timestamp(col("event_time")) -
                lag(unix_timestamp(col("event_time")), 1).over(window_user)) \
    .withColumn("user_running_total", count("*").over(window_user)) \
    .withColumn("product_running_views",
                sum(when(col("event_type") == "view", 1).otherwise(0)).over(window_product))

print("Window functions applied:")
df_windowed.select("user_id", "event_time", "event_type", "user_event_seq", "time_since_last_event").show(5)

Window functions applied:
+---------+-------------------+----------+--------------+---------------------+
|  user_id|         event_time|event_type|user_event_seq|time_since_last_event|
+---------+-------------------+----------+--------------+---------------------+
|500001755|2019-10-25 15:35:00|      view|             1|                 NULL|
|500018179|2019-10-27 02:24:00|      cart|             1|                 NULL|
|500025957|2019-10-28 06:44:00|      cart|             1|                 NULL|
|500026505|2019-10-23 21:00:00|  purchase|             1|                 NULL|
|500039102|2019-10-24 09:15:00|      cart|             1|                 NULL|
+---------+-------------------+----------+--------------+---------------------+
only showing top 5 rows


***TASK 4: Create Derived Features***

In [27]:
# 4. Derived features (simplified)
final_df = df_windowed \
    .withColumn("event_hour", hour("event_time")) \
    .withColumn("event_day", dayofweek("event_time")) \
    .withColumn("is_weekend", when(col("event_day").isin([1, 7]), 1).otherwise(0)) \
    .withColumn("price_bucket",
                when(col("price") < 10, "budget")
                .when(col("price") < 50, "mid")
                .otherwise("premium")) \
    .withColumn("purchase_flag", when(col("event_type") == "purchase", 1).otherwise(0)) \
    .withColumn("conversion_rate", col("product_purchases") / col("product_views")) \
    .withColumn("user_frequency", col("total_events") / col("total_sessions"))

print("Final features:")
final_df.select("user_id", "event_hour", "is_weekend", "price_bucket", "purchase_flag", "conversion_rate").show(5)

Final features:
+---------+----------+----------+------------+-------------+---------------+
|  user_id|event_hour|is_weekend|price_bucket|purchase_flag|conversion_rate|
+---------+----------+----------+------------+-------------+---------------+
|526301898|        23|         0|     premium|            0|            0.0|
|533657787|         3|         1|         mid|            1|            1.0|
|558574328|        22|         1|     premium|            0|            0.0|
|520193880|        11|         0|         mid|            1|            1.0|
|511392366|        20|         1|     premium|            0|            0.0|
+---------+----------+----------+------------+-------------+---------------+
only showing top 5 rows


In [28]:
# Basic insights
print("=== INSIGHTS ===")
# Most viewed products
top_products = final_df.filter(col("event_type") == "view") \
    .groupBy("product_id", "brand") \
    .count() \
    .orderBy(desc("count")) \
    .limit(5)
print("Top 5 viewed products:")
top_products.show()


=== INSIGHTS ===
Top 5 viewed products:
+----------+-------+-----+
|product_id|  brand|count|
+----------+-------+-----+
|   3345286|samsung|    1|
|  31596588|  creed|    1|
|  14151982|samsung|    1|
|  35143954| pulser|    1|
|  16463649|samsung|    1|
+----------+-------+-----+



In [29]:
# Purchase by hour
purchase_by_hour = final_df.filter(col("event_type") == "purchase") \
    .groupBy("event_hour") \
    .count() \
    .orderBy("event_hour")
print("Purchases by hour:")
purchase_by_hour.show()

print(f"Final dataset: {final_df.count()} rows, {len(final_df.columns)} columns")

Purchases by hour:
+----------+-----+
|event_hour|count|
+----------+-----+
|         0|   78|
|         1|   66|
|         2|   71|
|         3|   58|
|         4|   63|
|         5|   76|
|         6|   69|
|         7|   74|
|         8|   61|
|         9|   67|
|        10|   70|
|        11|   54|
|        12|   64|
|        13|   70|
|        14|   59|
|        15|   78|
|        16|   78|
|        17|   75|
|        18|   81|
|        19|   68|
+----------+-----+
only showing top 20 rows
Final dataset: 5000 rows, 26 columns


In [30]:
# Cleanup
spark.stop()
