In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

In [0]:
 # Azure storage access info
blob_account_name = "bgupb202402juanbarriento"
blob_container_name = "marketplace"
blob_sas_token = 'sv=2022-11-02&ss=bfqt&srt=sco&sp=rwdlacupiytfx&se=2024-12-01T01:56:16Z&st=2024-10-16T17:56:16Z&spr=https&sig=cM7vK4vvrL9YH31ZYI%2BX%2BmssMUTdRieP%2BMGcVNPoagA%3D'

# Allow SPARK to read from Blob remotely
wasbs_path = f'wasbs://{blob_container_name}@{blob_account_name}.blob.core.windows.net/'
spark.conf.set( f'fs.azure.sas.{blob_container_name}.{blob_account_name}.blob.core.windows.net' , blob_sas_token)
print('Remote blob path: ' + wasbs_path)

Remote blob path: wasbs://marketplace@bgupb202402juanbarriento.blob.core.windows.net/


In [0]:
silver_df = spark.read.format("delta").load(f"{wasbs_path}silver")

In [0]:
# Create the sales by product table, including total sales and average price
gold_df_sales_by_product = silver_df.groupBy(
    "year", "month","product_id", "brand", "category", "subcategory"
).agg(
    F.count("event_type").alias("event_count"),  # Count number of events (sales/visits)
    F.sum("price").alias("total_sales"),        # Sum of sales (total revenue)
    F.avg("price").alias("avg_price"),          # Average price of the product
    F.first("category_code").alias("category_code")  # Take the first non-null category code
)

In [0]:
gold_df_sales_by_product.write.format("delta").mode("overwrite").partitionBy("year", "month").save(f"{wasbs_path}gold4/sales_by_product")

In [0]:
%sql
USE jdbr_mkp;
DROP TABLE IF EXISTS mkp_gold_sales_by_product;
CREATE TABLE mkp_gold_sales_by_product
USING DELTA
LOCATION 'wasbs://marketplace@bgupb202402juanbarriento.blob.core.windows.net/gold4/sales_by_product'

In [0]:
%sql
SELECT * FROM jdbr_mkp.mkp_gold_sales_by_product
LIMIT 10

year,month,product_id,brand,category,subcategory,event_count,total_sales,avg_price,category_code
2019,10,1005116,apple,electronics,smartphone,53560,56773462.94999258,1059.9974411873147,electronics.smartphone
2019,10,1004919,apple,electronics,smartphone,2946,2619088.1400000127,889.031955193487,electronics.smartphone
2019,10,1002629,apple,electronics,smartphone,33885,12312750.210000291,363.36875343073024,electronics.smartphone
2019,10,1005117,apple,electronics,smartphone,8986,10669561.96000027,1187.3538793679356,electronics.smartphone
2019,10,1004564,meizu,electronics,smartphone,14838,1937084.4800000144,130.54889338185836,electronics.smartphone
2019,10,1004239,apple,electronics,smartphone,2232,2971727.770000021,1331.419251792124,electronics.smartphone
2019,10,1003428,meizu,electronics,smartphone,5511,2253821.1800000463,408.9677336236701,electronics.smartphone
2019,10,1004888,samsung,electronics,smartphone,20339,4640477.900000142,228.156639952807,electronics.smartphone
2019,10,1003713,samsung,electronics,smartphone,2678,1932235.0899999936,721.521691560864,electronics.smartphone
2019,10,1005152,xiaomi,electronics,smartphone,3916,1612503.1500000064,411.77302093973606,electronics.smartphone


In [0]:
# Create the user sessions table, counting the number of events per session
gold_df_sessions = silver_df.groupBy(
    "user_id", "user_session", "category", "subcategory"
).agg(
    F.count("event_type").alias("event_count"),  # Number of events per session
    F.sum("price").alias("total_spent"),         # Total amount spent by the user during the session
    F.first("month").alias("month"),             # Capture the session's month
    F.first("year").alias("year")                # Capture the session's year
)

In [0]:
gold_df_sessions.write.format("delta").mode("overwrite").partitionBy("year", "month").save(f"{wasbs_path}gold4/sessions")

In [0]:
%sql
USE jdbr_mkp;
DROP TABLE IF EXISTS mkp_gold_sessions;
CREATE TABLE mkp_gold_sessions
USING DELTA
LOCATION 'wasbs://marketplace@bgupb202402juanbarriento.blob.core.windows.net/gold4/sessions'

In [0]:
%sql
SELECT * FROM jdbr_mkp.mkp_gold_sessions
LIMIT 10

user_id,user_session,category,subcategory,event_count,total_spent,month,year
523102056,308011e2-0326-48b3-89c7-47329e7baee2,apparel,belt,3,38.82,1,2020
579437195,52d7b9fa-3973-411a-8b1f-d88a88e2773c,apparel,belt,4,211.48,1,2020
599622270,65acc7f5-c358-446a-82bf-dc2495bde9b8,apparel,belt,3,110.94,1,2020
513084984,0debd018-8eb7-46eb-aa32-8e1cdc7d0c68,apparel,belt,2,121.14,1,2020
598465711,18ae322a-d915-469a-bf4a-538078151806,apparel,skirt,2,1112.88,1,2020
526864734,648ed2cf-434c-49ff-b174-c81395b2c3b9,apparel,belt,1,43.76,1,2020
578353151,d59a6f50-8bac-4f44-a2c4-6e3f7557fc1c,apparel,belt,1,20.54,1,2020
547887517,878163f4-999a-4ff8-925b-ed5a420c8bad,apparel,belt,1,25.74,1,2020
588123954,a826a252-a5ed-44aa-89c5-db7158321656,apparel,jacket,2,132.66,1,2020
575351074,dde4191f-eb2f-4c4d-ab14-976725711e3a,apparel,jacket,1,66.33,1,2020


In [0]:
# Create the sales by day of the week and category table
gold_df_sales_by_day = silver_df.groupBy(
    "day_of_week", "category", "subcategory"
).agg(
    F.count("event_type").alias("event_count"),  # Count the number of events per day/category
    F.sum("price").alias("total_sales"),         # Sum of total sales per day/category
    F.first("year").alias("year"),               # Capture the year
    F.first("month").alias("month")              # Capture the month
)

In [0]:
gold_df_sales_by_day.write.format("delta").mode("overwrite").partitionBy("year", "month").save(f"{wasbs_path}gold4/sales_by_day")

In [0]:
%sql
USE jdbr_mkp;
DROP TABLE IF EXISTS mkp_gold_sales_by_day;
CREATE TABLE mkp_gold_sales_by_day
LOCATION 'wasbs://marketplace@bgupb202402juanbarriento.blob.core.windows.net/gold4/sales_by_day'

In [0]:
%sql
SELECT * FROM jdbr_mkp.mkp_gold_sales_by_day
LIMIT 10

day_of_week,category,subcategory,event_count,total_sales,year,month
3,kids,fmcg,19832,1292846.3999999969,2019,10
7,kids,fmcg,18411,1106415.0799999963,2019,10
6,kids,fmcg,19303,1155904.0099999944,2019,10
4,kids,fmcg,18693,1198632.980000016,2019,10
2,kids,fmcg,19507,1297272.7900000005,2019,10
5,kids,fmcg,19384,1153125.370000002,2019,10
1,kids,fmcg,18909,1262839.3200000068,2019,10
3,appliances,iron,56975,7273869.199999969,2019,10
4,appliances,iron,54135,7026003.969999998,2019,10
2,appliances,iron,58510,7446217.500000173,2019,10
