In [14]:
# standard library
import os
# for time
import time
import datetime as dt

# keep findspark.init() ahead of the pyspark imports, as in the original ordering
import findspark
findspark.init()

# for sql
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
# NOTE: this import rebinds `sum` and `max` to the Spark aggregates, shadowing
# Python's built-ins for the rest of the notebook; prefer F.sum / F.max.
from pyspark.sql.functions import sum,avg,max,count


In [2]:
# Path pattern for the input data (translated note: this can be changed to *.csv).
root = "../../*.csv"
# Obtain the Spark session for this notebook under the app name 'eCommerce'.
spark = (
    SparkSession.builder
    .appName("eCommerce")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/23 22:02:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:

# Read every CSV matched by `root`; the header row supplies the column names
# and Spark infers each column's type from the data.
ecommerce = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(root)
)

                                                                                

In [4]:
# Register the DataFrame as a temporary view so it can be queried by name.
# NOTE(review): the view name says "2019_oct", but the sample rows and the
# maximum event_time printed further down reach November/December 2019 —
# confirm whether this view is meant to be October-only.
ecommerce.createOrReplaceTempView('ecommerce_2019_oct')

In [5]:
# Print the column names and inferred types as a sanity check on the load.
ecommerce.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [6]:
# Preview the first five rows of the loaded data.
ecommerce.show(5)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 01:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 01:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 01:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

### Try with smaller (one day) to larger (whole month) data sizes and show when the run time gets too long

- Smaller size (a single day)

In [7]:
# Keep only the rows whose event_type equals 'purchase'.
only_purchases = ecommerce.where(
    col("event_type") == 'purchase'
)

In [8]:
# Preview the first five purchase rows to confirm the filter worked.
only_purchases.show(5)

+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+
|2019-11-01 01:00:41|  purchase|  13200605|2053013557192163841|furniture.bedroom...|   NULL| 566.3|559368633|d6034fa2-41fb-4ac...|
|2019-11-01 01:01:04|  purchase|   1005161|2053013555631882655|electronics.smart...| xiaomi|211.92|513351129|e6b7ce9b-1938-4e2...|
|2019-11-01 01:04:51|  purchase|   1004856|2053013555631882655|electronics.smart...|samsung|128.42|562958505|0f039697-fedc-40f...|
|2019-11-01 01:05:34|  purchase|  26401669|2053013563651392361|                NULL|lucente|109.66|541854711|c41c44d5-ef9b-41b...|
|2019-11-01 01:06:33|  purchase|   1801881|2053013554415534427|electronics.video.tv

In [15]:
# Half-open window [start_date, end_date): the single day 2019-10-01.
start_date = dt.datetime(2019,10,1)
end_date = dt.datetime(2019,10,2)

# filter data by date period
specified_period = only_purchases.filter((col("event_time") >= start_date) & (col("event_time") < end_date))

# count number of rows (runs before the timer starts, so it is not part of the timing)
print(f'Number of rows: {specified_period.count()}')

# perf_counter() is a monotonic clock intended for measuring elapsed time,
# unlike time.time(), which follows the adjustable wall clock.
start = time.perf_counter()
# calculate average price for specified period
average_price_specified_period = specified_period.agg(avg("price")).collect()
# Stop the clock right after the Spark job so printing is not included.
elapsed = time.perf_counter() - start
print(f'average_price_specified_period: {average_price_specified_period}')
print(f'Average price for specified period ({start_date} to {end_date}): {average_price_specified_period[0][0]}')
print(f'Time taken: {elapsed} sec')

                                                                                

Number of rows: 19141




average_price_specified_period: [Row(avg(price)=324.98067760305406)]
Average price for specified period (2019-10-01 00:00:00 to 2019-10-02 00:00:00): 324.98067760305406
Time taken: 85.04525589942932 sec


                                                                                

- Larger size (whole month)

In [44]:
# count number of rows
oct30_date = dt.datetime(2019,10,31,23,59,59)
print(f'Number of rows: {only_purchases.count()}')
all_oct_purchases = only_purchases.filter(col("event_time") <= oct30_date)
print(f'Number of rows: {all_oct_purchases.count()}') 

                                                                                

Number of rows: 1659788




Number of rows: 742752


                                                                                

In [49]:
# Latest purchase timestamp inside the October subset.
# F is pyspark.sql.functions, imported once in the notebook's import cell;
# F.max is the Spark column aggregate rather than Python's built-in max.
max_event_time = all_oct_purchases.agg(F.max("event_time")).collect()
# collect() returns a list of rows; [0][0] is the single aggregated value.
print(f"Maximum event time: {max_event_time[0][0]}")




Maximum event time: 2019-10-31 23:59:06


                                                                                

In [50]:
# Latest purchase timestamp across all purchases (not only October).
# F is pyspark.sql.functions, imported once in the notebook's import cell;
# F.max is the Spark column aggregate rather than Python's built-in max.
# Note: this rebinds max_event_time, replacing the October-only value.
max_event_time = only_purchases.agg(F.max("event_time")).collect()
# collect() returns a list of rows; [0][0] is the single aggregated value.
print(f"Maximum event time: {max_event_time[0][0]}")




Maximum event time: 2019-12-01 00:59:44


                                                                                

In [33]:

# perf_counter() is a monotonic clock intended for measuring elapsed time,
# unlike time.time(), which follows the adjustable wall clock.
start = time.perf_counter()
# calculate average price over the October purchases (all_oct_purchases)
average_proce_all = all_oct_purchases.agg(avg("price")).collect()
# Stop the clock right after the Spark job so printing is not included.
elapsed = time.perf_counter() - start
print(f'average_proce_all: {average_proce_all}')
print(f'Average price for whole period: {average_proce_all[0][0]}')
print(f'Time taken: {elapsed} sec')




average_proce_all: [Row(avg(price)=309.55203582083766)]
Average price for whole period: 309.55203582083766
Time taken: 88.99952006340027 sec


                                                                                

In [51]:
# NOTE(review): scratch cell that only prints the literal "a"; it does not use
# any of the analysis results and looks like a candidate for removal.
print("a")

a
