In [1]:
import pandas as pd
import numpy as np
import random
import datetime

import plotly.express as px
import plotly.graph_objects as go


In [2]:
# 绘制子图
from plotly.subplots import make_subplots


In [3]:
# 时间
time_range = pd.date_range(start="2019/1/1", end="2021/12/31")
time_range


DatetimeIndex(['2019-01-01', '2019-01-02', '2019-01-03', '2019-01-04',
               '2019-01-05', '2019-01-06', '2019-01-07', '2019-01-08',
               '2019-01-09', '2019-01-10',
               ...
               '2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
               '2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
               '2021-12-30', '2021-12-31'],
              dtype='datetime64[ns]', length=1096, freq='D')

In [4]:
len(time_range)


1096

In [5]:
# 水果
fruits = ["香蕉", "苹果", "葡萄", "橙子", "哈密瓜", "芭乐", "梨", "桃子"]
fruits_list = np.random.choice(fruits, size = len(time_range), replace=True)
fruits_list


array(['葡萄', '哈密瓜', '葡萄', ..., '哈密瓜', '橙子', '葡萄'], dtype='<U3')

In [6]:
# 客户
names = ["Mike", "John", "Tom", "xiaoming", "Jimmy", "Lym", "Michk"]
names_list = np.random.choice(names, size = len(time_range), replace=True)
names_list


array(['John', 'John', 'Lym', ..., 'xiaoming', 'xiaoming', 'Mike'],
      dtype='<U8')

In [7]:
# 生成订单数据
order = pd.DataFrame({
    "time": time_range,
    "fruit": fruits_list,
    "name": names_list,
    "kilogram": np.random.choice(list(range(50, 100)), size=len(time_range), replace=True)
})
order


Unnamed: 0,time,fruit,name,kilogram
0,2019-01-01,葡萄,John,60
1,2019-01-02,哈密瓜,John,89
2,2019-01-03,葡萄,Lym,50
3,2019-01-04,橙子,Mike,61
4,2019-01-05,橙子,Lym,92
...,...,...,...,...
1091,2021-12-27,桃子,Tom,82
1092,2021-12-28,芭乐,Michk,60
1093,2021-12-29,哈密瓜,xiaoming,77
1094,2021-12-30,橙子,xiaoming,50


In [8]:
# 水果信息
information = pd.DataFrame({
    "fruit": fruits,
    "price": [3.8, 8.9, 12.8, 6.8, 15.8, 4.9, 5.8, 7],
    "region": ["华南", "华北", "西北", "华中", "西北", "华南", "华北", "华中"]
})
information


Unnamed: 0,fruit,price,region
0,香蕉,3.8,华南
1,苹果,8.9,华北
2,葡萄,12.8,西北
3,橙子,6.8,华中
4,哈密瓜,15.8,西北
5,芭乐,4.9,华南
6,梨,5.8,华北
7,桃子,7.0,华中


In [9]:
# 数据合并
df = pd.merge(
    order,
    information,
    how="outer"
).sort_values("time").reset_index(drop=True)

df.head()


Unnamed: 0,time,fruit,name,kilogram,price,region
0,2019-01-01,葡萄,John,60,12.8,西北
1,2019-01-02,哈密瓜,John,89,15.8,西北
2,2019-01-03,葡萄,Lym,50,12.8,西北
3,2019-01-04,橙子,Mike,61,6.8,华中
4,2019-01-05,橙子,Lym,92,6.8,华中


In [10]:
# 生成订单金额字段
df["amount"] = df["kilogram"] * df["price"]

df.head()


Unnamed: 0,time,fruit,name,kilogram,price,region,amount
0,2019-01-01,葡萄,John,60,12.8,西北,768.0
1,2019-01-02,哈密瓜,John,89,15.8,西北,1406.2
2,2019-01-03,葡萄,Lym,50,12.8,西北,640.0
3,2019-01-04,橙子,Mike,61,6.8,华中,414.8
4,2019-01-05,橙子,Lym,92,6.8,华中,625.6


In [11]:
df1 = pd.pivot_table(
    df,
    index=pd.Grouper(key="time", freq="M"),
    values="kilogram",
    aggfunc=np.sum
).reset_index()

# df1["time"] = df1["time"].dt.strftime("%Y-%m")

fig1 = px.bar(df1, x="time", y = "kilogram", color="kilogram")
fig1.update_layout(xaxis_tickangle = 45, xaxis_tickformat = "%Y-%m")
fig1.show()

In [12]:
# 2019-2021年销售额走势
df2 = pd.pivot_table(
    df,
    index=pd.Grouper(key="time", freq="M"),
    values="amount",
    aggfunc=np.sum
).reset_index()


In [13]:
fig2 = px.line(
    df2,
    x = "time",
    y = "amount",
    markers = True,
    hover_data={"time": "|%Y年%m月"}
)

fig2.update_layout(
    xaxis_tickangle = 45,
    xaxis_tickformat = "%Y-%m",
    yaxis_tickformat = "0"
)
fig2.show()

In [14]:
# 年度销量、销售额和平均销售额

df3 = pd.pivot_table(
    df,
    index=pd.Grouper(key="time", freq="Y"),
    values=["kilogram", "amount"],
    aggfunc=np.sum
).reset_index()


In [15]:
df3["mean_amount"] = df3["amount"] / df3["kilogram"]
df3


Unnamed: 0,time,amount,kilogram,mean_amount
0,2019-12-31,231699.8,27121,8.543188
1,2020-12-31,227548.9,26914,8.454667
2,2021-12-31,231515.9,27594,8.390081


In [16]:
# 水果年度销量占比

df4 = pd.pivot_table(
    df,
    index=[pd.Grouper(key="time", freq="Y"), "fruit"],
    values=["kilogram", "amount"],
    aggfunc=np.sum
).reset_index()

df4["time"] = df4["time"].dt.strftime("%Y")
df4

Unnamed: 0,time,fruit,amount,kilogram
0,2019,哈密瓜,65854.4,4168
1,2019,桃子,23254.0,3322
2,2019,梨,19760.6,3407
3,2019,橙子,23303.6,3427
4,2019,芭乐,13876.8,2832
5,2019,苹果,24065.6,2704
6,2019,葡萄,48345.6,3777
7,2019,香蕉,13239.2,3484
8,2020,哈密瓜,62125.6,3932
9,2020,桃子,25487.0,3641


In [17]:
fig3 = make_subplots(
    rows=1, cols=3,
    subplot_titles=["2019年", "2020年", "2021年"],
    specs=[[{"type": "domain"}, {"type": "domain"}, {"type": "domain"}]]
)

years = df4["time"].unique().tolist()

for i, year in enumerate(years):
    name = df4[df4["time"] == year].fruit.tolist()
    value = df4[df4["time"] == year].kilogram.tolist()
    fig3.add_trace(
        go.Pie(
            labels = name,
            values = value
        ),
        row = 1, col = i + 1
    )

fig3.update_traces(
    textposition = "inside",
    textinfo = "percent+label",
    hole = 0.4,
    insidetextorientation = "radial",
    hoverinfo = "label+percent+name"
)

fig3.show()

In [18]:
for i, year in enumerate(years):
    df5 = df4[df4["time"] == year]
    fig4 = go.Figure(go.Treemap(
        labels = df5["fruit"].tolist(),
        parents = df5["time"].tolist(),
        values = df5["amount"].tolist(),
        textinfo = "label+value+percent root"
    ))
    fig4.show()

In [19]:
# 商品月度销量变化

df6 = pd.pivot_table(
    df,
    index=[pd.Grouper(key="time", freq="M"), "fruit"],
    values="amount",
    aggfunc=np.sum
).reset_index()

df6["time"] = df6["time"].dt.strftime("%Y-%m")
df6

Unnamed: 0,time,fruit,amount
0,2019-01,哈密瓜,5008.6
1,2019-01,桃子,1001.0
2,2019-01,梨,852.6
3,2019-01,橙子,3495.2
4,2019-01,芭乐,769.3
...,...,...,...
279,2021-12,橙子,3291.2
280,2021-12,芭乐,294.0
281,2021-12,苹果,2189.4
282,2021-12,葡萄,6668.8


In [20]:
fig5 = px.bar(
    df6,
    x = "time",
    y = "amount",
    color = "fruit",
    hover_data={"time": "|%Y年%m月"}
)
fig5.update_layout(
    xaxis_tickangle = 45,
    xaxis_tickformat = "%Y-%m",
    yaxis_tickformat = "0"
)

fig5.show()

In [21]:
# 不同地区的销量
df7 = pd.pivot_table(
    df,
    index=[pd.Grouper(key="time", freq="Y"), "region"],
    values="kilogram",
    aggfunc=np.sum
).reset_index()
df7["time"] = df7["time"].dt.strftime("%Y")


df7

Unnamed: 0,time,region,kilogram
0,2019,华中,6749
1,2019,华北,6111
2,2019,华南,6316
3,2019,西北,7945
4,2020,华中,6436
5,2020,华北,6692
6,2020,华南,6585
7,2020,西北,7201
8,2021,华中,6966
9,2021,华北,5534


In [22]:
fig6 = px.bar(
    df7,
    x = "region",
    y = "kilogram",
    color="region",
    facet_col="time",
    text = "kilogram"
)

fig6.show()

In [23]:
# 不同地区年度平均销售额
df8 = pd.pivot_table(
    df,
    index=[pd.Grouper(key="time", freq="Y"), "region"],
    values="amount",
    aggfunc=np.mean
).reset_index()

df8["time"] = df8["time"].dt.strftime("%Y")
df8["time"] = df8["time"].astype("int")
df8.style.background_gradient(cmap="Spectral_r")

Unnamed: 0,time,region,amount
0,2019,华中,506.06087
1,2019,华北,547.8275
2,2019,华南,319.011765
3,2019,西北,1057.407407
4,2020,华中,494.366667
5,2020,华北,547.33587
6,2020,华南,330.255172
7,2020,西北,1071.843299
8,2021,华中,527.868132
9,2021,华北,583.830556


In [24]:
# 用户订单量、金额对比
df9 = pd.pivot_table(
    df,
    index = "name",
    aggfunc={
        "time": "count",
        "amount": "sum"
    }
).reset_index()\
    .rename(columns = {"time": "order_number"})

df9.style.background_gradient(cmap="Spectral_r")

Unnamed: 0,name,amount,order_number
0,Jimmy,99264.0,162
1,John,95849.1,152
2,Lym,100757.9,157
3,Michk,108559.1,174
4,Mike,98849.0,152
5,Tom,93745.8,150
6,xiaoming,93739.7,149


In [25]:
# 用户水果喜好
df10 = pd.pivot_table(
    df,
    index=["name", "fruit"],
    aggfunc={
        "time": "count",
        "amount": "sum" 
    }
).reset_index()\
    .rename(columns={"time": "number"})

df10.sort_values(["name", "number", "amount"], ascending=[True, False, False])
df10.style.bar(subset=["number", "amount"], color = "#a97fcf")

Unnamed: 0,name,fruit,amount,number
0,Jimmy,哈密瓜,28076.6,24
1,Jimmy,桃子,6370.0,12
2,Jimmy,梨,9581.6,23
3,Jimmy,橙子,14028.4,28
4,Jimmy,芭乐,7972.3,22
5,Jimmy,苹果,10831.3,17
6,Jimmy,葡萄,17779.2,19
7,Jimmy,香蕉,4624.6,17
8,John,哈密瓜,26638.8,23
9,John,桃子,11403.0,22


In [26]:
fig7 = px.bar(
    df10,
    x = "fruit", 
    y = "amount",
    facet_col="name",
    color = "number"
)

fig7.update_layout(
    yaxis_tickformat = "0"
)

fig7.show()

In [27]:
# 用户分层——RFM模型
df11 = pd.pivot_table(
    df,
    index="name",
    aggfunc={
        "fruit": "count",
        "amount": "sum"
    }
).reset_index()\
    .rename(columns={"fruit": "F", "amount": "M"})

df11

Unnamed: 0,name,M,F
0,Jimmy,99264.0,162
1,John,95849.1,152
2,Lym,100757.9,157
3,Michk,108559.1,174
4,Mike,98849.0,152
5,Tom,93745.8,150
6,xiaoming,93739.7,149


In [28]:
now = datetime.datetime.now()
now

datetime.datetime(2022, 1, 14, 15, 9, 58, 551577)

In [29]:
df["R"] = df["time"].apply(lambda x: (now - x).days)
df.sort_values(["name", "R"], ascending=[False, True])

Unnamed: 0,time,fruit,name,kilogram,price,region,amount,R
1094,2021-12-30,橙子,xiaoming,50,6.8,华中,340.0,15
1093,2021-12-29,哈密瓜,xiaoming,77,15.8,西北,1216.6,16
1085,2021-12-21,苹果,xiaoming,64,8.9,华北,569.6,24
1084,2021-12-20,香蕉,xiaoming,90,3.8,华南,342.0,25
1077,2021-12-13,葡萄,xiaoming,75,12.8,西北,960.0,32
...,...,...,...,...,...,...,...,...
54,2019-02-24,苹果,Jimmy,82,8.9,华北,729.8,1055
53,2019-02-23,橙子,Jimmy,94,6.8,华中,639.2,1056
46,2019-02-16,苹果,Jimmy,60,8.9,华北,534.0,1063
36,2019-02-06,葡萄,Jimmy,68,12.8,西北,870.4,1073


In [30]:
df12 = pd.pivot_table(
    df,
    index="name",
    values="R",
    aggfunc="min"
).reset_index()

df12

Unnamed: 0,name,R
0,Jimmy,20
1,John,19
2,Lym,33
3,Michk,17
4,Mike,14
5,Tom,18
6,xiaoming,15


In [31]:
df13 = pd.merge(df11, df12)
df13 = df13[["name", "F", "M", "R"]]
df13.style.background_gradient(cmap = "Spectral_r")

Unnamed: 0,name,F,M,R
0,Jimmy,162,99264.0,20
1,John,152,95849.1,19
2,Lym,157,100757.9,33
3,Michk,174,108559.1,17
4,Mike,152,98849.0,14
5,Tom,150,93745.8,18
6,xiaoming,149,93739.7,15


In [32]:
# 用户复购周期分析
# 每个用户的购买时间升序
df14 = df[["name", "time"]].sort_values(["name", "time"], ascending=[False, True])
df14

Unnamed: 0,name,time
5,xiaoming,2019-01-06
12,xiaoming,2019-01-13
15,xiaoming,2019-01-16
16,xiaoming,2019-01-17
20,xiaoming,2019-01-21
...,...,...
1070,Jimmy,2021-12-06
1072,Jimmy,2021-12-08
1082,Jimmy,2021-12-18
1087,Jimmy,2021-12-23


In [33]:
df15 = df14.groupby("name").shift(1).rename(columns={"time": "time1"})
df15

Unnamed: 0,time1
5,NaT
12,2019-01-06
15,2019-01-13
16,2019-01-16
20,2019-01-17
...,...
1070,2021-11-26
1072,2021-12-06
1082,2021-12-08
1087,2021-12-18


In [34]:
df16 = pd.concat([df14, df15], axis=1)
df16.dropna(inplace=True)
df16["timedelta"] = df16["time"] - df16["time1"]
df16["timedelta"] = df16["timedelta"].apply(lambda x: x.days)
df16

Unnamed: 0,name,time,time1,timedelta
12,xiaoming,2019-01-13,2019-01-06,7
15,xiaoming,2019-01-16,2019-01-13,3
16,xiaoming,2019-01-17,2019-01-16,1
20,xiaoming,2019-01-21,2019-01-17,4
22,xiaoming,2019-01-23,2019-01-21,2
...,...,...,...,...
1070,Jimmy,2021-12-06,2021-11-26,10
1072,Jimmy,2021-12-08,2021-12-06,2
1082,Jimmy,2021-12-18,2021-12-08,10
1087,Jimmy,2021-12-23,2021-12-18,5


In [35]:
fig8 = px.bar(
    df16,
    x = "timedelta",
    y = "name",
    orientation="h",
    color = "timedelta",
    color_continuous_scale="spectral_r"
)
fig8.show()

In [36]:
df16.groupby("name")["timedelta"].agg(["count", "mean"])

Unnamed: 0_level_0,count,mean
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Jimmy,161,6.695652
John,151,7.218543
Lym,156,6.884615
Michk,173,6.277457
Mike,151,7.231788
Tom,149,7.181208
xiaoming,148,7.358108


In [37]:
fig9 = px.violin(
    df16,
    y = "timedelta",
    color="name"
)

fig9.show()