In [2]:
# Load everything

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [3]:
# Load the raw CSV export into a DataFrame.
data_csv = "../../../2019-Oct.csv"
raw_data = pd.read_csv(filepath_or_buffer=data_csv)

# Keep only the rows whose event_type is 'purchase'.
is_purchase = raw_data['event_type'].eq('purchase')
only_purchases = raw_data[is_purchase]

In [4]:
# Print the column names, non-null counts, dtypes and memory usage of the
# purchase-only frame.
only_purchases.info()

<class 'pandas.core.frame.DataFrame'>
Index: 742849 entries, 162 to 42448657
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     742849 non-null  object 
 1   event_type     742849 non-null  object 
 2   product_id     742849 non-null  int64  
 3   category_id    742849 non-null  int64  
 4   category_code  569424 non-null  object 
 5   brand          684544 non-null  object 
 6   price          742849 non-null  float64
 7   user_id        742849 non-null  int64  
 8   user_session   742849 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 56.7+ MB


In [5]:
# Keep only the purchases made on 2019-10-01.
# event_time is a string column that begins with the date (for example
# "2019-10-01 06:10:46 UTC"), so an anchored prefix test selects the day
# directly instead of the unanchored regex search that str.contains performs
# by default.
only_purchases_1day = only_purchases.loc[only_purchases.event_time.str.startswith('2019-10-01')]

In [8]:
# Write the single-day purchase subset to disk, leaving out the row index.
one_day_csv_path = '../../data/only_purchases_1day.csv'
only_purchases_1day.to_csv(one_day_csv_path, index=False)


In [9]:
# Show the dtype of every column in the single-day subset.
only_purchases_1day.dtypes

event_time        object
event_type        object
product_id         int64
category_id        int64
category_code     object
brand             object
price            float64
user_id            int64
user_session      object
dtype: object

In [10]:
# Collapse the one-day purchases into one row per user_session:
#   Date_order  - maximum of the session's event_time strings
#   user_id     - maximum user_id found in the session
#   Quantity    - number of purchase rows in the session
#   money_spent - sum of price over the session
# The built-in 'max' aggregation replaces `lambda x: x.max()`: it produces the
# same values without a Python-level function call for every group.
# reset_index(drop=True) discards the user_session key from the result.
only_purchases_1day_data = (
    only_purchases_1day
    .groupby(by='user_session')
    .agg(
        Date_order=('event_time', 'max'),
        user_id=('user_id', 'max'),
        Quantity=('user_session', 'count'),
        money_spent=('price', 'sum'),
    )
    .reset_index(drop=True)
)
only_purchases_1day_data

Unnamed: 0,Date_order,user_id,Quantity,money_spent
0,2019-10-01 06:10:46 UTC,541539898,1,9.76
1,2019-10-01 13:37:06 UTC,520206104,1,259.98
2,2019-10-01 14:13:11 UTC,516675926,1,1619.09
3,2019-10-01 08:31:51 UTC,541146978,1,383.51
4,2019-10-01 05:49:27 UTC,532770412,2,1507.12
...,...,...,...,...
16244,2019-10-01 07:57:39 UTC,549516290,1,453.27
16245,2019-10-01 11:23:22 UTC,515581326,1,413.14
16246,2019-10-01 16:23:37 UTC,512440842,1,975.57
16247,2019-10-01 12:35:37 UTC,555627492,1,66.87


In [11]:
# Collapse every purchase in the file into one row per user_session:
#   Date_order  - maximum of the session's event_time strings
#   user_id     - maximum user_id found in the session
#   Quantity    - number of purchase rows in the session
#   money_spent - sum of price over the session
# The built-in 'max' aggregation replaces `lambda x: x.max()`: it produces the
# same values without a Python-level function call for each of the several
# hundred thousand session groups.
# reset_index(drop=True) discards the user_session key from the result.
only_purchases_data = (
    only_purchases
    .groupby(by='user_session')
    .agg(
        Date_order=('event_time', 'max'),
        user_id=('user_id', 'max'),
        Quantity=('user_session', 'count'),
        money_spent=('price', 'sum'),
    )
    .reset_index(drop=True)
)
only_purchases_data

Unnamed: 0,Date_order,user_id,Quantity,money_spent
0,2019-10-06 11:34:30 UTC,546521725,1,289.52
1,2019-10-25 08:39:11 UTC,560486342,1,171.90
2,2019-10-17 13:37:59 UTC,560744406,2,379.78
3,2019-10-15 08:09:02 UTC,520649833,1,90.07
4,2019-10-23 14:27:08 UTC,534210306,2,571.92
...,...,...,...,...
629555,2019-10-29 05:18:45 UTC,565273783,2,2715.20
629556,2019-10-04 09:01:28 UTC,548691169,1,193.03
629557,2019-10-14 09:19:57 UTC,560049739,1,131.64
629558,2019-10-10 12:34:49 UTC,517051774,1,130.12


# Use PySpark to get the data

In [13]:
import os 
import findspark 
# findspark.init() has to run before the pyspark imports below; it puts the
# local Spark installation's pyspark package on sys.path.
findspark.init()

# for sql
from pyspark.sql import SparkSession 
from pyspark.sql.functions import col
# NOTE(review): this rebinds the names `sum` and `max` to the Spark column
# functions, shadowing the Python built-ins in every cell that runs after
# this one. Consider `from pyspark.sql import functions as F` once all
# callers are updated.
from pyspark.sql.functions import sum,avg,max,count

# for time 
import time 
import datetime as dt

In [14]:
# The path can be changed to *.csv
root = '../../data/only_purchases_1day.csv'
# Reuse the running Spark session if there is one; otherwise start a new one
# under the application name 'eCommerce'.
spark = (
    SparkSession.builder
    .appName('eCommerce')
    .getOrCreate()
)

In [15]:

# Load the CSV at `root` into a Spark DataFrame, treating the first row as
# column names and letting Spark infer each column's type.
csv_reader = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
)
ecommerce = csv_reader.csv(root)

In [16]:
# Register the DataFrame as a temporary view named 'ecommerce_2019_oct_1st'
# so it can be referenced by that name from Spark SQL.
ecommerce.createOrReplaceTempView('ecommerce_2019_oct_1st')

In [17]:
# Print the column names and the types Spark inferred for them.
ecommerce.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [18]:

print(f'Number of rows: {ecommerce.count()}')

# Time the aggregation with perf_counter(): it is monotonic and
# high-resolution, whereas time.time() follows the system clock and can jump
# if that clock is adjusted during the measurement.
start = time.perf_counter()
# Average of the price column over every row of `ecommerce`; collect()
# returns the result to the driver as a list holding a single Row.
average_price_specified_period = ecommerce.agg(avg("price")).collect()
print(f'average_price_specified_period: {average_price_specified_period}')
print(f'Time taken: {time.perf_counter() - start} sec')

Number of rows: 19307
average_price_specified_period: [Row(avg(price)=325.0615843994414)]
Time taken: 0.5733463764190674 sec


## Calculate the average price for one month

In [19]:
# Read the complete CSV and rebind `ecommerce` to it, treating the first row
# as column names and letting Spark infer each column's type.
all_csv='../../../2019-Oct.csv'
full_csv_reader = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
)
ecommerce = full_csv_reader.csv(all_csv)

In [20]:
print(f'Number of rows: {ecommerce.count()}')

# Time the aggregation with perf_counter(): it is monotonic and
# high-resolution, whereas time.time() follows the system clock and can jump
# if that clock is adjusted during the measurement.
start = time.perf_counter()
# Average of the price column over every row of `ecommerce`; collect()
# returns the result to the driver as a list holding a single Row.
# NOTE(review): the full CSV is not filtered by event_type, so this average
# covers every event row rather than purchases only - confirm that is the
# intended comparison with the purchase-only one-day figure.
average_price_specified_period = ecommerce.agg(avg("price")).collect()
print(f'average_price_specified_period: {average_price_specified_period}')
print(f'Time taken: {time.perf_counter() - start} sec')

Number of rows: 42448764
average_price_specified_period: [Row(avg(price)=290.3236606848809)]
Time taken: 9.183218955993652 sec
