In [1]:
import os 
import findspark 
findspark.init()

# for sql
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col
from pyspark.sql.functions import sum,avg,max,count

# for time 
import time 


In [2]:
# Glob pattern for the input CSV files; it can be changed to '*.csv'.
root = '../../*.csv'
# getOrCreate() returns the already-running session if there is one,
# otherwise it starts a new session under the given application name.
session_builder = SparkSession.builder.appName('eCommerce')
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/21 23:51:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:

# Explicit schema for the event CSV files, written as a DDL string.
# Declaring the types up front replaces inferSchema, which needs an extra
# full pass over the input files just to guess the column types.
# Column order follows the CSV header; names and types mirror what
# inference reported for these files (see the printSchema() output).
ecommerce_schema = (
    "event_time TIMESTAMP, "
    "event_type STRING, "
    "product_id INT, "
    "category_id BIGINT, "
    "category_code STRING, "
    "brand STRING, "
    "price DOUBLE, "
    "user_id INT, "
    "user_session STRING"
)

# Load every file matched by `root` into a single DataFrame.
# header=true makes Spark skip the first line of each file instead of
# treating it as a data row.
ecommerce = (
    spark.read
    .schema(ecommerce_schema)
    .option("header", "true")
    .csv(root)
)

                                                                                

In [4]:
# Register the DataFrame as a temporary view so it can be queried with spark.sql().
# NOTE(review): the view name says October 2019, but `root` is a glob and may
# match more than one month of data — confirm the name still fits.
ecommerce.createOrReplaceTempView('ecommerce_2019_oct')

In [5]:
# Print the column names, types and nullability of the loaded DataFrame.
ecommerce.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [6]:
# Preview the first 5 rows of the loaded DataFrame.
ecommerce.show(5)

+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code| brand| price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+------+------+---------+--------------------+
|2019-11-01 01:00:00|      view|   1003461|2053013555631882655|electronics.smart...|xiaomi|489.07|520088904|4d3b30da-a5e4-49d...|
|2019-11-01 01:00:00|      view|   5000088|2053013566100866035|appliances.sewing...|janome|293.65|530496790|8e5f4f83-366c-4f7...|
|2019-11-01 01:00:01|      view|  17302664|2053013553853497655|                NULL| creed| 28.31|561587266|755422e7-9040-477...|
|2019-11-01 01:00:01|      view|   3601530|2053013563810775923|appliances.kitche...|    lg|712.87|518085591|3bfb58cd-7892-48c...|
|2019-11-01 01:00:01|      view|   1004775|2053013555631882655|electronics.smart...|xiaomi

### Try a small range (one day) first, then a larger one (whole months), and see when the query time becomes too long

- Smaller range (a single day)

In [7]:


# Average price of all events before 2019-10-02, computed with Spark SQL
# against the temporary view.
query = """
SELECT AVG(price) FROM ecommerce_2019_oct
WHERE event_time < '2019-10-02'
"""
result = spark.sql(query)
# Explain the DataFrame that is actually executed below instead of
# building a second, throwaway DataFrame from the same query text.
result.explain()

# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments.
start = time.perf_counter()
avg_price = result.collect()[0][0]  # single row, single column
# Stop the clock before printing so only the Spark job is timed.
elapsed = time.perf_counter() - start
print(f"Average price: {avg_price}")
print(f'Time taken: {elapsed} sec')


== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[avg(price#23)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=49]
      +- HashAggregate(keys=[], functions=[partial_avg(price#23)])
         +- Project [price#23]
            +- Filter (isnotnull(event_time#17) AND (event_time#17 < 2019-10-02 00:00:00))
               +- FileScan csv [event_time#17,price#23] Batched: false, DataFilters: [isnotnull(event_time#17), (event_time#17 < 2019-10-02 00:00:00)], Format: CSV, Location: InMemoryFileIndex(2 paths)[file:/Users/clara/hka_code/2019-Nov.csv, file:/Users/clara/hka_code/20..., PartitionFilters: [], PushedFilters: [IsNotNull(event_time), LessThan(event_time,2019-10-02 00:00:00.0)], ReadSchema: struct<event_time:timestamp,price:double>






Average price: 297.6879048473118
Time taken: 78.49041295051575 sec


                                                                                

- Larger size (the whole dataset)

In [8]:
# Average price over the whole view (no date filter).
query_all = "SELECT AVG(price) FROM ecommerce_2019_oct"
result_all = spark.sql(query_all)
# Explain the DataFrame that is actually executed below instead of
# building a second, throwaway DataFrame from the same query text.
result_all.explain()

# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be distorted by system clock adjustments.
start = time.perf_counter()
avg_all_price = result_all.collect()[0][0]  # single row, single column
# Stop the clock before printing so only the Spark job is timed.
elapsed = time.perf_counter() - start
print(f"Average price: {avg_all_price}")
print(f'Time taken: {elapsed} sec')

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[], functions=[avg(price#23)])
   +- Exchange SinglePartition, ENSURE_REQUIREMENTS, [plan_id=108]
      +- HashAggregate(keys=[], functions=[partial_avg(price#23)])
         +- FileScan csv [price#23] Batched: false, DataFilters: [], Format: CSV, Location: InMemoryFileIndex(2 paths)[file:/Users/clara/hka_code/2019-Nov.csv, file:/Users/clara/hka_code/20..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<price:double>






Average price: 291.63480233260026
Time taken: 39.81396293640137 sec


                                                                                

In [9]:
# Display the whole-dataset average as a one-row table.
result_all.show()




+------------------+
|        avg(price)|
+------------------+
|291.63480233260026|
+------------------+



                                                                                

# Make it faster by ...
1. Increasing the number of parallel partitions
2. Using cache

In [None]:
# The partition count is read from the DataFrame's underlying RDD.
partition_count = ecommerce.rdd.getNumPartitions()
print("Partition count:", partition_count)

In [None]:
# Redistribute the data over more partitions before running the aggregations.
NUM_PARTITIONS = 3000
ecommerce = ecommerce.repartition(NUM_PARTITIONS)

# Half-open window [start_date, end_date): exactly one day.
start_date = "2019-10-01"
end_date = "2019-10-02"

# For one day
specified_period = ecommerce.filter((col("event_time") >= start_date) & (col("event_time") < end_date))
print(f'Number of rows: {specified_period.count()}')
start = time.perf_counter()
# Aggregate the filtered rows. Aggregating `ecommerce` here would silently
# report the whole-dataset average under the "specified period" label.
average_price_specified_period = specified_period.agg(avg("price")).collect()
# Stop the clock before printing so only the aggregation is timed.
elapsed = time.perf_counter() - start
print(f'average_price_specified_period: {average_price_specified_period}')
print(f'Average price for specified period ({start_date} to {end_date}): {average_price_specified_period[0][0]}')
print(f'Time taken: {elapsed} sec')


# For whole period
# The row count runs outside the timed region so that both sections time
# the same thing (the average-price aggregation only) and stay comparable.
print(f'Number of rows: {ecommerce.count()}')
start = time.perf_counter()
# Variable name (and its label below) kept unchanged in case other cells reference it.
average_proce_all = ecommerce.agg(avg("price")).collect()
elapsed = time.perf_counter() - start
print(f'average_proce_all: {average_proce_all}')
print(f'Average price for whole period: {average_proce_all[0][0]}')
print(f'Time taken: {elapsed} sec')
