### Read Sample CSV file into Dataframe

In [0]:
# Load Ecommerce Sample data
events = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv", header=True, inferSchema=True)
events.printSchema()


root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



### Display dataframe with First 5 rows 

In [0]:
# Display first 5 rows interactively
display(events.limit(5))

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-01T00:00:00.000Z,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
2019-11-01T00:00:00.000Z,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2019-11-01T00:00:01.000Z,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
2019-11-01T00:00:01.000Z,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
2019-11-01T00:00:01.000Z,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


### Extract product_name from category code and displays sample rows

In [0]:
from pyspark.sql.functions import split, col
# Extract product_name from category_code
events = events\
.withColumn(
"product_name",
split(col("category_code"), r"\.")[1])

# Filter out rows with no product_name
events_filtered = events.filter(col("product_name").isNotNull())

# Select key columns and show first 10 rows
events_filtered.select("event_type", "product_name", "price").show(10)

+----------+--------------+------+
|event_type|  product_name| price|
+----------+--------------+------+
|      view|    smartphone|489.07|
|      view|sewing_machine|293.65|
|      view|       kitchen|712.87|
|      view|    smartphone|183.27|
|      view|      notebook|360.09|
|      view|      notebook|514.56|
|      view|    smartphone|732.07|
|      view|   living_room|437.33|
|      view|       kitchen|155.11|
|      view|       kitchen| 31.64|
+----------+--------------+------+
only showing top 10 rows


### Analyze Events: Using Filter, Select, GroupBy, and OrderBy

In [0]:
# Filter expensive products (price > 100)
filtered_events = events_filtered.filter(col("price") > 100)
print("Number of events with price > 100:", filtered_events.count())

# Count events by event_type
event_type_counts = events_filtered.groupBy("event_type").count()
event_type_counts.show()
# Top 5 brands by event count
top_products = (events_filtered
                .groupBy("brand")
                .count()
                .orderBy(col("count").desc())
                .limit(10))
display(top_products)

Number of events with price > 100: 34648939
+----------+--------+
|event_type|   count|
+----------+--------+
|  purchase|  682721|
|      cart| 2202437|
|      view|42718650|
+----------+--------+



brand,count
samsung,7733328
apple,6213902
xiaomi,4138112
,3514238
huawei,1384154
lg,1024251
oppo,811698
respect,732666
lenovo,727279
acer,698910
