In [1]:
import os 
import findspark 
findspark.init()

# for sql
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col
from pyspark.sql.functions import sum,avg,max,count

# for time 
import time 


In [2]:
# 可以改成 *.csv 
root = '../../Data/eCommerce-behavior-data/*.csv'
spark = SparkSession.builder.appName('eCommerce').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/20 14:52:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:

ecommerce = spark.read\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .csv(root)

                                                                                

In [4]:
ecommerce.createOrReplaceTempView('ecommerce_2019_oct')

In [5]:
ecommerce.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [6]:
ecommerce.show(5)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 01:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 01:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 01:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

### Try with smaller (day) to larger size (months) and show when the time takes to long

- Smaller days 

In [7]:
start_date = "2019-10-01"
end_date = "2019-10-02"

# filter data by date peirod 
specified_period = ecommerce.filter((col("event_time") >= start_date) & (col("event_time") < end_date))

# count number of rows
print(f'Number of rows: {specified_period.count()}')

start = time.time() 
# calculate average price for specified period
average_price_specified_period = specified_period.agg(avg("price")).collect()
print(f'average_price_specified_period: {average_price_specified_period}')
print(f'Average price for specified period ({start_date} to {end_date}): {average_price_specified_period[0][0]}')
print(f'Time taken: {time.time() - start} sec')

                                                                                

Number of rows: 1230402




average_price_specified_period: [Row(avg(price)=297.6879048473118)]
Average price for specified period (2019-10-01 to 2019-10-02): 297.6879048473118
Time taken: 28.88580584526062 sec


                                                                                

- larger size (whole month)

In [8]:
start = time.time() 

# count number of rows
print(f'Number of rows: {ecommerce.count()}')

# calculate average price for specified period
average_proce_all = ecommerce.agg(avg("price")).collect()
print(f'average_proce_all: {average_proce_all}')
print(f'Average price for whole period: {average_proce_all[0][0]}')
print(f'Time taken: {time.time() - start} sec')

                                                                                

Number of rows: 109950743




average_proce_all: [Row(avg(price)=291.63480233260026)]
Average price for whole period: 291.63480233260026
Time taken: 27.878618001937866 sec


                                                                                

# Make it faster with using ... 
1. Increase the number of parallel partitions 
2. Using cache

In [9]:
# Make DataFrame to RDD to check the number of partitions
print("Partition count:", ecommerce.rdd.getNumPartitions())

Partition count: 110


In [10]:
ecommerce = ecommerce.repartition(3000)
start_date = "2019-10-01"
end_date = "2019-10-02"

# For one day
specified_period = ecommerce.filter((col("event_time") >= start_date) & (col("event_time") < end_date))
print(f'Number of rows: {specified_period.count()}')
start = time.time() 
average_price_specified_period = ecommerce.agg(avg("price")).collect()
print(f'average_price_specified_period: {average_price_specified_period}')
print(f'Average price for specified period ({start_date} to {end_date}): {average_price_specified_period[0][0]}')
print(f'Time taken: {time.time() - start} sec')


# For whole period
start = time.time() 
print(f'Number of rows: {ecommerce.count()}')
average_proce_all = ecommerce.agg(avg("price")).collect()
print(f'average_proce_all: {average_proce_all}')
print(f'Average price for whole period: {average_proce_all[0][0]}')
print(f'Time taken: {time.time() - start} sec')


                                                                                

Number of rows: 1230402


                                                                                

average_price_specified_period: [Row(avg(price)=291.63480233262527)]
Average price for specified period (2019-10-01 to 2019-10-02): 291.63480233262527
Time taken: 33.21626877784729 sec


                                                                                

Number of rows: 109950743




average_proce_all: [Row(avg(price)=291.63480233262527)]
Average price for whole period: 291.63480233262527
Time taken: 47.80622410774231 sec


                                                                                