In [2]:
# Load everything

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [3]:
# Read data
data_csv = "../../../2019-Oct.csv"
raw_data = pd.read_csv(data_csv)

# Get only purchases
only_purchases = raw_data.loc[raw_data.event_type == 'purchase']

In [4]:
only_purchases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 742849 entries, 162 to 42448657
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     742849 non-null  object 
 1   event_type     742849 non-null  object 
 2   product_id     742849 non-null  int64  
 3   category_id    742849 non-null  int64  
 4   category_code  569424 non-null  object 
 5   brand          684544 non-null  object 
 6   price          742849 non-null  float64
 7   user_id        742849 non-null  int64  
 8   user_session   742849 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 56.7+ MB


In [5]:
# Get 2019-10-01
only_purchases_1day = only_purchases.loc[only_purchases.event_time.str.contains('2019-10-01')]

In [8]:
# save to csv
only_purchases_1day.to_csv('../../data/only_purchases_1day.csv', index=False)


In [9]:
only_purchases_1day.dtypes

event_time        object
event_type        object
product_id         int64
category_id        int64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object

In [10]:
only_purchases_data_1day=only_purchases_1day.groupby(by='user_session').agg(Date_order=('event_time',lambda x: x.max()),
                                                user_id=('user_id',lambda x: x.max()),
                                                Quantity=('user_session','count'),
                                                money_spent=('price','sum')).reset_index(drop=True)
only_purchases_data_1day

Unnamed: 0,Date_order,user_id,Quantity,money_spent
0,2019-10-01 06:10:46 UTC,541539898,1,9.76
1,2019-10-01 13:37:06 UTC,520206104,1,259.98
2,2019-10-01 14:13:11 UTC,516675926,1,1619.09
3,2019-10-01 08:31:51 UTC,541146978,1,383.51
4,2019-10-01 05:49:27 UTC,532770412,2,1507.12
...,...,...,...,...
16244,2019-10-01 07:57:39 UTC,549516290,1,453.27
16245,2019-10-01 11:23:22 UTC,515581326,1,413.14
16246,2019-10-01 16:23:37 UTC,512440842,1,975.57
16247,2019-10-01 12:35:37 UTC,555627492,1,66.87


In [11]:
only_purchases_data=only_purchases.groupby(by='user_session').agg(Date_order=('event_time',lambda x: x.max()),
                                                user_id=('user_id',lambda x: x.max()),
                                                Quantity=('user_session','count'),
                                                money_spent=('price','sum')).reset_index(drop=True)
only_purchases_data

Unnamed: 0,Date_order,user_id,Quantity,money_spent
0,2019-10-06 11:34:30 UTC,546521725,1,289.52
1,2019-10-25 08:39:11 UTC,560486342,1,171.90
2,2019-10-17 13:37:59 UTC,560744406,2,379.78
3,2019-10-15 08:09:02 UTC,520649833,1,90.07
4,2019-10-23 14:27:08 UTC,534210306,2,571.92
...,...,...,...,...
629555,2019-10-29 05:18:45 UTC,565273783,2,2715.20
629556,2019-10-04 09:01:28 UTC,548691169,1,193.03
629557,2019-10-14 09:19:57 UTC,560049739,1,131.64
629558,2019-10-10 12:34:49 UTC,517051774,1,130.12


# use Pyspark to get the data

In [9]:
import os 
import findspark 
findspark.init()

# for sql
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col
from pyspark.sql.functions import sum,avg,max,count

# for time 
import time 
import datetime as dt

In [10]:
# 可以改成 *.csv 
root = '../../data/only_purchases_1day.csv'
spark = SparkSession.builder.appName('eCommerce').getOrCreate()

In [11]:

ecommerce_1day = spark.read\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .csv(root)

In [12]:
ecommerce_1day.createOrReplaceTempView('ecommerce_2019_oct_1st')

In [13]:
ecommerce_1day.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [14]:
from pyspark.sql import functions as F

# Assuming your DataFrame is named only_purchases_1day and is a Spark DataFrame
spark_data_1day = (ecommerce_1day
                            .groupBy("user_session")
                            .agg(
                                F.max("event_time").alias("Date_order"),
                                F.max("user_id").alias("user_id"),
                                F.count("user_session").alias("Quantity"),
                                F.sum("price").alias("money_spent")
                            )
                            .withColumnRenamed("user_session", "user_session_id")
                           )
spark_data_1day.show()

+--------------------+-------------------+---------+--------+-----------------+
|     user_session_id|         Date_order|  user_id|Quantity|      money_spent|
+--------------------+-------------------+---------+--------+-----------------+
|2af9b570-0942-4dc...|2019-10-01 02:09:26|524601178|       1|           189.91|
|62a3b59a-de32-450...|2019-10-01 05:28:56|543624132|       1|           254.76|
|3a8a2e45-3c9b-4d1...|2019-10-01 05:31:53|521819296|       1|           360.11|
|194fc2ad-6a50-4dc...|2019-10-01 05:57:31|555477458|       1|           130.76|
|f70b875e-caf2-4c1...|2019-10-01 06:03:31|550692948|       1|           583.28|
|7842bc7f-6fa3-4b3...|2019-10-01 06:09:46|515346540|       1|           738.61|
|2ef63cbc-4d37-4bf...|2019-10-01 06:12:59|525109856|       1|           463.02|
|8c7c087e-8018-404...|2019-10-01 06:42:17|555484845|       1|            66.88|
|420ef1df-cc8f-4a1...|2019-10-01 06:55:56|519083227|       2|           590.98|
|ca468ec2-460d-467...|2019-10-01 07:04:2

In [15]:

print(f'Number of rows: {spark_data_1day.count()}')

start = time.time() 
# calculate average price for specified period
average_price_1day = spark_data_1day.agg(avg("money_spent")).collect()
print(f'average_price_specified_period: {average_price_1day}')
print(f'Time taken: {time.time() - start} sec')

Number of rows: 16249
average_price_specified_period: [Row(avg(money_spent)=386.23693827312155)]
Time taken: 0.4167046546936035 sec


## calculate average price for 1 month

In [16]:
all_csv='../../../2019-Oct.csv'
ecommerce_1month = spark.read\
    .option("inferSchema", "true")\
    .option("header", "true")\
    .csv(all_csv)

In [17]:
# Assuming your DataFrame is named only_purchases_1day and is a Spark DataFrame
spark_data_1month = (ecommerce_1month
                            .groupBy("user_session")
                            .agg(
                                F.max("event_time").alias("Date_order"),
                                F.max("user_id").alias("user_id"),
                                F.count("user_session").alias("Quantity"),
                                F.sum("price").alias("money_spent")
                            )
                            .withColumnRenamed("user_session", "user_session_id")
                           )
spark_data_1month.show()

+--------------------+-------------------+---------+--------+------------------+
|     user_session_id|         Date_order|  user_id|Quantity|       money_spent|
+--------------------+-------------------+---------+--------+------------------+
|c3012f56-70f3-419...|2019-10-01 02:03:35|519829701|       1|            975.57|
|2af9b570-0942-4dc...|2019-10-01 02:09:54|524601178|       3|            569.73|
|78c0b329-af93-44f...|2019-10-01 02:13:26|519194796|       1|            115.32|
|c7e588c7-78a9-403...|2019-10-01 04:18:59|512694696|       2|           1822.26|
|a6b41834-75dd-4cd...|2019-10-01 04:24:02|400972610|       5|            162.25|
|898ff23f-fc70-423...|2019-10-01 04:30:26|525268787|       3|            169.38|
|85881243-7b33-409...|2019-10-01 04:25:47|517438582|       3|            339.01|
|b36c1441-930a-4a6...|2019-10-01 04:24:22|537486678|       1|            308.37|
|fa1df743-4ef7-427...|2019-10-01 05:20:31|513384234|       9|            601.32|
|ce054142-5e1d-4ed...|2019-1

In [19]:
print(f'Number of rows: {spark_data_1month.count()}')

start = time.time() 
average_price = spark_data_1month.agg(avg("money_spent")).collect()
print(f'average_price_specified_period: {average_price}')

# calculate average price for specified period
print(f'Time taken: {time.time() - start} sec')

Number of rows: 9244422
average_price_specified_period: [Row(avg(money_spent)=1333.1153160283807)]
Time taken: 19.60007882118225 sec
