In [0]:
from pyspark.sql.types import StructType, StructField, TimestampType, StringType, LongType, DoubleType

# Explicit schema for the e-commerce event CSV files, passed to the CSV reader
# so that column types are fixed up front. Every column is declared nullable.
_EVENT_COLUMNS = [
    ("event_time", TimestampType()),
    ("event_type", StringType()),
    ("product_id", LongType()),
    ("category_id", LongType()),
    ("category_code", StringType()),
    ("brand", StringType()),
    ("price", DoubleType()),
    ("user_id", LongType()),
    ("user_session", StringType()),
]

schema = StructType(
    [StructField(column_name, data_type, True) for column_name, data_type in _EVENT_COLUMNS]
)

In [0]:
# Load the October and November 2019 event exports using the explicit schema;
# the first line of each CSV file is treated as a header row.
df_october = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv")
)

df_november = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
# List the distinct event types that occur in the October data.
october_event_types = df_october.select("event_type").distinct()
display(october_event_types)

event_type
purchase
cart
view


In [0]:
# Combine both months into a single DataFrame; unionByName lines columns up
# by name rather than by position.
df = df_november.unionByName(df_october)

In [0]:
from pyspark.sql.functions import avg, count

# Purchase events with a known category, summarised per
# (category_code, brand, event_type): average price and number of events.
# Only the 10 groups with the highest event count are kept.
purchases_with_category = df.where(
    (df["event_type"] == "purchase") & (df["category_code"].isNotNull())
)

Top10Purchase = (
    purchases_with_category
    .groupBy("category_code", "brand", "event_type")
    .agg(
        avg("price").alias("avg_price"),
        count("category_code").alias("category_count"),
    )
    .orderBy("category_count", ascending=False)
    .limit(10)
)
     

In [0]:
%sql

-- List the databases that are available to this session.
SHOW DATABASES

databaseName
default
ecommerce
information_schema


In [0]:
%sql

-- Make ecommerce the current database for the cells that follow.
USE DATABASE ecommerce

In [0]:
# Expose the combined events to the %sql cells as the temporary view
# `ecommerce_tbl`. Temporary views are session-scoped and do not belong to a
# database (the table listing shows this view with an empty database column),
# so the name is given unqualified. A dotted name such as
# "ecommerce.ecommerce_tbl" is a multi-part temp view name, which Spark rejects
# by default; the SQL cells already refer to the view as plain `ecommerce_tbl`.
df.createOrReplaceTempView("ecommerce_tbl")

In [0]:
%sql

-- Persist, as ecommerce.top10Cart, the ten (category_code, brand) groups of
-- 'cart' events with the highest average price, together with how many such
-- events each group has.
-- GROUP BY already yields exactly one row per (category_code, brand, event_type),
-- so no DISTINCT is needed on the select list.
-- NOTE(review): the ranking here is by avg_price, whereas the purchase summary
-- built in Python ranks by category_count -- confirm which ranking is intended.
CREATE OR REPLACE TABLE ecommerce.top10Cart AS
SELECT
  category_code,
  brand,
  event_type,
  AVG(price) AS avg_price,
  COUNT(category_code) AS category_count
FROM ecommerce_tbl
WHERE event_type = 'cart'
  AND category_code IS NOT NULL
GROUP BY category_code, brand, event_type
ORDER BY avg_price DESC
LIMIT 10;

num_affected_rows,num_inserted_rows


In [0]:
%sql

-- Persist, as ecommerce.top10View, the ten (category_code, brand) groups of
-- 'view' events with the highest average price, together with how many such
-- events each group has.
-- GROUP BY already yields exactly one row per (category_code, brand, event_type),
-- so no DISTINCT is needed on the select list.
-- NOTE(review): the ranking here is by avg_price, whereas the purchase summary
-- built in Python ranks by category_count -- confirm which ranking is intended.
CREATE OR REPLACE TABLE ecommerce.top10View AS
SELECT
  category_code,
  brand,
  event_type,
  AVG(price) AS avg_price,
  COUNT(category_code) AS category_count
FROM ecommerce_tbl
WHERE event_type = 'view'
  AND category_code IS NOT NULL
GROUP BY category_code, brand, event_type
ORDER BY avg_price DESC
LIMIT 10;
     

num_affected_rows,num_inserted_rows


In [0]:
%sql
-- List the tables and temporary views visible from the current database.
SHOW TABLES

database,tableName,isTemporary
ecommerce,top10cart,False
ecommerce,top10view,False
,ecommerce_tbl,True


In [0]:
%sql
-- Show the persisted cart ranking. Reading a table gives no guaranteed row
-- order, so sort explicitly by the metric the table was ranked on.
SELECT *
FROM ecommerce.top10cart
ORDER BY avg_price DESC

category_code,brand,event_type,avg_price,category_count
appliances.kitchen.refrigerators,climadiff,cart,2524.77,1
electronics.clocks,rado,cart,2131.709259259259,27
furniture.living_room.sofa,trevi,cart,1982.033333333333,3
computers.desktop,apple,cart,1944.92154109589,292
electronics.clocks,louiserard,cart,1927.9758333333327,12
kids.skates,minimotors,cart,1803.7702777777772,36
electronics.audio.acoustic,dynacord,cart,1802.25,7
computers.notebook,dreammachines,cart,1801.82,4
sport.trainer,kettler,cart,1781.23,4
electronics.video.projector,xiaomi,cart,1744.610714285714,14


In [0]:
%sql

-- Show the persisted view ranking. Reading a table gives no guaranteed row
-- order, so sort explicitly by the metric the table was ranked on.
SELECT *
FROM ecommerce.top10view
ORDER BY avg_price DESC

category_code,brand,event_type,avg_price,category_count
electronics.audio.acoustic,fly,view,2562.63,5
sport.bicycle,pinarello,view,2533.238401486989,538
sport.trainer,nordictrack,view,2522.59,431
sport.ski,kessler,view,2372.0,106
appliances.kitchen.refrigerators,climadiff,view,2261.348371559635,436
computers.notebook,dreammachines,view,2232.6271808510637,188
appliances.kitchen.washer,siemens,view,2208.03445026178,191
construction.tools.pump,helix,view,2199.0477358490566,53
appliances.environment.water_heater,gree,view,2073.989333333333,15
electronics.clocks,rado,view,2062.901870767496,10632


In [0]:
# Save the three top-10 summaries in Delta format under the volume, replacing
# whatever an earlier run left at each path.
cart_summary = spark.table("ecommerce.top10Cart")
cart_summary.write.format("delta").mode("overwrite").save(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Cart"
)

view_summary = spark.table("ecommerce.top10View")
view_summary.write.format("delta").mode("overwrite").save(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10View"
)

Top10Purchase.write.format("delta").mode("overwrite").save(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Purchase"
)

In [0]:
# Read the three saved summaries back, stack them into one DataFrame (columns
# matched by name), and display the rows with the largest category_count first.
saved_cart = spark.read.format("delta").load(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Cart"
)
saved_view = spark.read.format("delta").load(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10View"
)
saved_purchase = spark.read.format("delta").load(
    "/Volumes/workspace/ecommerce/ecommerce_data/top10Purchase"
)

Top10category = saved_cart.unionByName(saved_view).unionByName(saved_purchase)
display(Top10category.orderBy("category_count", ascending=False))

category_code,brand,event_type,avg_price,category_count
electronics.smartphone,samsung,purchase,262.64350656250696,304914
electronics.smartphone,apple,purchase,897.8349827903909,242597
electronics.smartphone,xiaomi,purchase,205.02228251783487,87329
electronics.smartphone,huawei,purchase,212.63314904177585,43518
electronics.audio.headphone,apple,purchase,177.3836740669023,43377
electronics.smartphone,oppo,purchase,227.2342428092871,25971
electronics.video.tv,samsung,purchase,517.928933747907,20905
electronics.clocks,apple,purchase,431.9547117677853,17191
computers.notebook,acer,purchase,543.3168494228751,11436
appliances.environment.vacuum,samsung,purchase,100.37209391974774,11414
