In [32]:
import numpy as np
import pandas as pd
# Display floats with two decimal places in every rendered DataFrame.
pd.options.display.float_format = '{:.2f}'.format

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size (width, height) in inches.
plt.rcParams["figure.figsize"] = (10, 6)


# Suppress warnings such as library version / deprecation notices.
# NOTE(review): this filter silences every warning category for the whole session,
# so genuine problems (e.g. invalid escape sequences, dtype issues) are hidden too.
import warnings
warnings.filterwarnings('ignore')
import os
from datetime import date

# 数据导入

In [3]:
# Load data (disabled variant): reads the CSV into `original_data` and works on a
# copy, so the raw frame stays untouched. Left commented out; the next cell reads the
# CSV straight into `data` instead.
# original_data = pd.read_csv('data\processed_data.csv')
# data = original_data.copy()
# data.shape

In [4]:
# Load the preprocessed event log into `data` and show its (rows, columns) shape.
# The path is assembled with os.path.join rather than the literal
# 'data\processed_data.csv': "\p" is an invalid escape sequence in a non-raw string
# (deprecated, and reported as a SyntaxWarning by newer Python versions), and a
# backslash separator only resolves on Windows. On Windows the joined path is the
# same as before; on other platforms it now resolves as well.
data = pd.read_csv(os.path.join('data', 'processed_data.csv'))
data.shape

(42413557, 11)

In [5]:
# Parse the timestamp column so the .dt accessor can be used on it.
data['event_time'] = pd.to_datetime(data['event_time'])

# Store the repeated string columns as pandas categoricals.
for categorical_col in ('category_code', 'brand'):
    data[categorical_col] = data[categorical_col].astype('category')

# Derive the calendar date and hour-of-day used by the per-day / per-hour breakdowns.
event_ts = data['event_time'].dt
data['event_day'] = event_ts.date
data['event_hour'] = event_ts.hour

In [6]:
# Print the column dtypes and memory usage of the prepared frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42413557 entries, 0 to 42413556
Data columns (total 13 columns):
 #   Column         Dtype         
---  ------         -----         
 0   event_time     datetime64[ns]
 1   event_type     object        
 2   product_id     int64         
 3   category_id    int64         
 4   category_code  category      
 5   brand          category      
 6   price          float64       
 7   user_id        int64         
 8   category       object        
 9   sub_category   object        
 10  product_name   object        
 11  event_day      object        
 12  event_hour     int32         
dtypes: category(2), datetime64[ns](1), float64(1), int32(1), int64(3), object(5)
memory usage: 3.4+ GB


In [7]:
# Preview the first rows to sanity-check the parsed and derived columns.
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,category,sub_category,product_name,event_day,event_hour
0,2019-10-01 00:00:00,view,44600062,2103807459595387724,,shiseido,35.79,541312140,,,,2019-10-01,0
1,2019-10-01 00:00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,appliances,environment,water_heater,2019-10-01,0
2,2019-10-01 00:00:01,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,furniture,living_room,sofa,2019-10-01,0
3,2019-10-01 00:00:01,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,computers,notebook,notebook,2019-10-01,0
4,2019-10-01 00:00:04,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,electronics,smartphone,smartphone,2019-10-01,0


# 流量指标
- PV、UV、人均浏览量(PV/UV)


In [23]:
# Overall traffic metrics computed from "view" events only:
#   PV  - number of view rows
#   UV  - number of distinct user_id values among those rows
#   PUV - PV divided by UV
view_info = data.query('event_type == "view"')
puv = (
    view_info
    .groupby('event_type')['user_id']
    .agg(PV='count', UV='nunique')
    .assign(PUV=lambda totals: totals['PV'] / totals['UV'])
)
puv

Unnamed: 0_level_0,PV,UV,PUV
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
view,40772341,3022130,13.49


### 分日

In [9]:
# Daily traffic: PV (view rows) and UV (distinct user_id) per event_day,
# plus their ratio rounded to two decimals.
view_info_byday = (
    view_info
    .groupby(['event_day'])['user_id']
    .agg(PV='count', UV='nunique')
    .assign(puv=lambda daily: (daily['PV'] / daily['UV']).round(2))
)
view_info_byday

Unnamed: 0_level_0,PV,UV,puv
event_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-10-01,1208165,190158,6.35
2019-10-02,1154436,184955,6.24
2019-10-03,1088577,170655,6.38
2019-10-04,1346162,209393,6.43
2019-10-05,1271176,194949,6.52
2019-10-06,1263876,193194,6.54
2019-10-07,1160929,186939,6.21
2019-10-08,1328893,221194,6.01
2019-10-09,1306089,214133,6.1
2019-10-10,1242891,205307,6.05


In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) of the daily PV / UV / puv table.
view_info_byday.describe()

Unnamed: 0,PV,UV,puv
count,31.0,31.0,31.0
mean,1315236.81,208810.32,6.3
std,111084.61,16233.32,0.25
min,1088577.0,170655.0,5.96
25%,1232293.0,195246.5,6.11
50%,1328893.0,208456.0,6.25
75%,1383097.0,223301.0,6.4
max,1584488.0,231826.0,7.01


### 分时

In [11]:
# Hourly traffic: PV (view rows) and UV (distinct user_id) per event_hour,
# pooled over all days in the data.
view_info_by_hour = (
    view_info
    .groupby(['event_hour'])['user_id']
    .agg(PV='count', UV='nunique')
)
view_info_by_hour

Unnamed: 0_level_0,PV,UV
event_hour,Unnamed: 1_level_1,Unnamed: 2_level_1
0,299740,59584
1,545513,116376
2,1036894,209100
3,1485498,290737
4,1826073,351996
5,2020202,387131
6,2154159,413004
7,2217399,426678
8,2267568,432146
9,2228945,421753


# 销售
- 下单数、GMV、客单价
- order_info

In [30]:
# 订单数 gmv
order_info  = data.query('event_type == "purchase"')
temp = order_info.groupby('event_type').agg(order_num = ('user_id','count'),
                                    gmv = ('price','sum'),
                                    ATV = ('price','mean'),
                                    order_user_num = ('user_id','nunique')
                                    )
temp['avg_user_value'] = temp['gmv']/temp['order_user_num']
temp['avg_user_order_num'] = temp['order_num']/temp['order_user_num']
temp

Unnamed: 0_level_0,order_num,gmv,ATV,order_user_num,avg_user_value,avg_user_order_num
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
purchase,742773,229933212.63,309.56,347118,662.41,2.14


### 分日

In [31]:
# Daily sales metrics: the same measures as the overall summary
# (order_num, gmv, ATV, order_user_num and the two per-user ratios),
# computed per event_day.
order_info_byday = (
    order_info
    .groupby('event_day')
    .agg(
        order_num=pd.NamedAgg(column='user_id', aggfunc='count'),
        gmv=pd.NamedAgg(column='price', aggfunc='sum'),
        ATV=pd.NamedAgg(column='price', aggfunc='mean'),
        order_user_num=pd.NamedAgg(column='user_id', aggfunc='nunique'),
    )
    .assign(
        avg_user_value=lambda daily: daily['gmv'] / daily['order_user_num'],
        avg_user_order_num=lambda daily: daily['order_num'] / daily['order_user_num'],
    )
)
order_info_byday

Unnamed: 0_level_0,order_num,gmv,ATV,order_user_num,avg_user_value,avg_user_order_num
event_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-10-01,19305,6275579.06,325.08,14064,446.22,1.37
2019-10-02,19469,6213628.53,319.15,13894,447.22,1.4
2019-10-03,19255,6233782.98,323.75,13722,454.29,1.4
2019-10-04,27039,8623058.19,318.91,19214,448.79,1.41
2019-10-05,23492,7341094.46,312.49,16734,438.69,1.4
2019-10-06,22169,6737258.17,303.9,16096,418.57,1.38
2019-10-07,21378,6348189.06,296.95,15438,411.21,1.38
2019-10-08,23071,6819701.26,295.6,16590,411.07,1.39
2019-10-09,22747,6855326.05,301.37,16417,417.57,1.39
2019-10-10,21992,6665413.21,303.08,15959,417.66,1.38


In [33]:
# Summary statistics (count, mean, std, quartiles, min/max) of the daily order metrics.
order_info_byday.describe()

Unnamed: 0,order_num,gmv,ATV,order_user_num,avg_user_value,avg_user_order_num
count,31.0,31.0,31.0,31.0,31.0,31.0
mean,23960.42,7417200.41,309.81,17193.35,431.44,1.39
std,3083.45,957899.74,11.09,2042.37,19.65,0.02
min,19255.0,6213628.53,286.11,13722.0,396.7,1.36
25%,21685.0,6649647.68,303.01,15698.5,418.11,1.38
50%,23883.0,7307691.57,311.28,17200.0,430.39,1.39
75%,25616.0,7887907.54,318.73,18379.5,443.38,1.4
max,31393.0,9747164.72,329.4,21993.0,482.6,1.47


### 分时

In [19]:
# Hourly sales: number of purchase rows and summed price (gmv) per event_hour,
# pooled over all days in the data.
order_info_byhour = (
    order_info
    .groupby('event_hour')
    .agg(
        order_num=pd.NamedAgg(column='user_id', aggfunc='count'),
        gmv=pd.NamedAgg(column='price', aggfunc='sum'),
    )
)
order_info_byhour

Unnamed: 0_level_0,order_num,gmv
event_hour,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2949,980317.85
1,5526,1577532.87
2,13968,3873027.44
3,29632,8696837.45
4,41143,12064072.73
5,48068,14398553.24
6,52002,15808753.76
7,53404,16338607.12
8,55195,17058580.92
9,55182,17551669.73
