In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# 让画图内置
%matplotlib inline

# 忽略版本号之类的warnings
import warnings
warnings.filterwarnings('ignore')
import os
from datetime import date

# 数据导入

In [3]:
# 导入数据
# original_data = pd.read_csv('data\processed_data.csv')
# data = original_data.copy()
# data.shape

In [4]:
# 导入数据
data = pd.read_csv('data\processed_data.csv')
data.shape

(42413557, 11)

In [5]:
data['event_time'] = pd.to_datetime(data['event_time'])
data['category_code'] = data['category_code'].astype('category')
data['brand'] = data['brand'].astype('category')

data['event_day'] = data['event_time'].dt.date
data['event_hour'] = data['event_time'].dt.hour

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42413557 entries, 0 to 42413556
Data columns (total 13 columns):
 #   Column         Dtype         
---  ------         -----         
 0   event_time     datetime64[ns]
 1   event_type     object        
 2   product_id     int64         
 3   category_id    int64         
 4   category_code  category      
 5   brand          category      
 6   price          float64       
 7   user_id        int64         
 8   category       object        
 9   sub_category   object        
 10  product_name   object        
 11  event_day      object        
 12  event_hour     int32         
dtypes: category(2), datetime64[ns](1), float64(1), int32(1), int64(3), object(5)
memory usage: 3.4+ GB


In [7]:
data.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,category,sub_category,product_name,event_day,event_hour
0,2019-10-01 00:00:00,view,44600062,2103807459595387724,,shiseido,35.79,541312140,,,,2019-10-01,0
1,2019-10-01 00:00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.2,554748717,appliances,environment,water_heater,2019-10-01,0
2,2019-10-01 00:00:01,view,17200506,2053013559792632471,furniture.living_room.sofa,,543.1,519107250,furniture,living_room,sofa,2019-10-01,0
3,2019-10-01 00:00:01,view,1307067,2053013558920217191,computers.notebook,lenovo,251.74,550050854,computers,notebook,notebook,2019-10-01,0
4,2019-10-01 00:00:04,view,1004237,2053013555631882655,electronics.smartphone,apple,1081.98,535871217,electronics,smartphone,smartphone,2019-10-01,0


# 热销商品
- '下单数','动销商品数','gmv','单均价'

In [8]:
# 商品价值
product_info = data.query('event_type == "purchase"').groupby('event_type').agg({'product_id': ['count','nunique'],'price': ['sum','mean']})
product_info.columns = ['下单数','动销商品数','gmv','单均价']
product_info

Unnamed: 0_level_0,下单数,动销商品数,gmv,单均价
event_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
purchase,742773,42241,229933200.0,309.560542


## 分类别

In [15]:
product_info_bycategory = data.query('event_type == "purchase"').groupby('category').agg({'product_id': ['count','nunique'],'price': ['sum','mean']})
product_info_bycategory.columns = ['下单数','动销商品数','gmv','单均价']
product_info_bycategory.sort_values(by='gmv',ascending=False)

Unnamed: 0_level_0,下单数,动销商品数,gmv,单均价
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
electronics,422979,4698,176445600.0,417.149735
appliances,74988,4838,13581760.0,181.11912
computers,27853,2638,11377870.0,408.497139
furniture,8299,1634,1673243.0,201.619835
auto,10619,609,1273956.0,119.969461
construction,7801,939,932995.0,119.599413
kids,5482,1007,678140.7,123.70315
apparel,8002,2512,624937.8,78.097694
sport,1236,270,322559.0,260.970065
accessories,1587,554,68783.88,43.342079


## 分小类

In [19]:
product_info_bysubcate = data.query('event_type == "purchase"').groupby(['category','sub_category']).agg({'product_id': ['count','nunique'],'price': ['sum','mean']})

product_info_bysubcate.columns =  ['下单数','动销商品数','gmv','单均价']

product_info_bysubcate.query('下单数 > 0').sort_values('gmv',ascending=False)[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,下单数,动销商品数,gmv,单均价
category,sub_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
electronics,smartphone,337979,924,157033900.0,464.626246
appliances,kitchen,48119,3084,10533020.0,218.895239
computers,notebook,15588,614,8978883.0,576.012537
electronics,video,21643,406,8459179.0,390.85059
electronics,clocks,17903,1439,4817089.0,269.066025
electronics,audio,35590,1356,4151286.0,116.641928
appliances,environment,18073,953,2320239.0,128.381508
electronics,tablet,5602,222,1610917.0,287.561123
auto,accessories,10619,609,1273956.0,119.969461
computers,desktop,3232,445,1116822.0,345.551355


## 分商品名 

In [17]:
product_info_bypname = data.query('event_type == "purchase"').groupby('product_name').agg({'product_id': ['count','nunique'],'price': ['sum','mean']})

product_info_bypname.columns = ['下单数','动销商品数','gmv','单均价']

product_info_bypname.query('下单数 > 0').sort_values('gmv',ascending=False)[:20]

Unnamed: 0_level_0,下单数,动销商品数,gmv,单均价
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
smartphone,337979,924,157033900.0,464.626246
notebook,15588,614,8978883.0,576.012537
tv,21561,387,8422119.0,390.618217
clocks,17903,1439,4817089.0,269.066025
washer,16146,342,4658223.0,288.506346
refrigerators,11218,624,3830077.0,341.422447
headphone,30501,678,3538807.0,116.022661
vacuum,12378,414,1716425.0,138.667427
tablet,5602,222,1610917.0,287.561123
desktop,3232,445,1116822.0,345.551355


## 分品牌

In [18]:
product_info_bybrand = data.query('event_type == "purchase"').groupby('brand').agg({'product_id': ['count','nunique'],'price': ['sum','mean']})

product_info_bybrand.columns = ['下单数','动销商品数','gmv','单均价']

product_info_bybrand.query('下单数 > 0').sort_values('gmv',ascending=False)[:20]

Unnamed: 0_level_0,下单数,动销商品数,gmv,单均价
brand,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
apple,142858,394,111199500.0,778.3918
samsung,172878,687,46402040.0,268.409152
xiaomi,56609,474,9192640.0,162.388317
huawei,23499,104,4883105.0,207.800533
acer,6880,174,3575716.0,519.726118
lg,8725,257,3387361.0,388.236207
lucente,11576,445,3123439.0,269.820228
sony,6729,442,2478197.0,368.286028
oppo,10887,35,2412034.0,221.551757
lenovo,4578,206,1752639.0,382.839347


## 分商品id

In [13]:
product_info_bybrand = data.query('event_type == "purchase"').groupby('product_id').agg({'product_id': ['count','nunique'],'price': ['sum','mean']})

product_info_bybrand.columns = ['下单数','动销商品数','gmv','单均价']

product_info_bybrand.query('下单数 > 0')

Unnamed: 0_level_0,下单数,动销商品数,gmv,单均价
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1000978,12,1,3840.13,320.010833
1001588,12,1,1538.18,128.181667
1002042,3,1,231.42,77.140000
1002062,18,1,1723.47,95.748333
1002098,13,1,4818.32,370.640000
...,...,...,...,...
59000007,1,1,47.88,47.880000
60400006,1,1,332.05,332.050000
60500001,4,1,102.34,25.585000
60500002,4,1,170.82,42.705000


## 分下单方式
- 直接下单 / 加购后下单