In [2]:
import pandas as pd
import numpy as np

# 数据处理

In [3]:
df = pd.read_csv('data/OnlineRetail.csv')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
# 去除CustomerID最后一位小数点
df = df.dropna(subset=['CustomerID'])
df['CustomerID'] = df['CustomerID'].astype(int)
# df = df[df['CustomerID'] != 'nan']
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850,United Kingdom


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    406829 non-null  object 
 1   StockCode    406829 non-null  object 
 2   Description  406829 non-null  object 
 3   Quantity     406829 non-null  int64  
 4   InvoiceDate  406829 non-null  object 
 5   UnitPrice    406829 non-null  float64
 6   CustomerID   406829 non-null  int64  
 7   Country      406829 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 27.9+ MB


In [7]:
# 检查各字段是否有空值↑
# 直接删除空包含空值数据
# 删除空值后数据集基本信息
# 处理完后用info查看都没有空值了，isnull查看也一样
df.isnull().sum()

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64

In [8]:
# 定义日期处理函数
def process_date(series, format = "%m/%d/%Y %H:%M"):
    return pd.to_datetime(series, format = format)

In [9]:
# 将 InvoiceDate 数据列处理为 datatime 类型
df['InvoiceDate'] = process_date(df['InvoiceDate'])

In [10]:
# 数量 与 单价 乘积的都用户每个笔交易M指标
df['Monetary'] = df['Quantity'] * df['UnitPrice']

In [11]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Monetary
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [12]:
# 得到最大日期数据 (用做基准，也可以用今天？)
max_invoice_date = df['InvoiceDate'].max()
max_invoice_date

Timestamp('2011-12-09 12:50:00')

In [13]:
# 得到CustomerID唯一值
unique_ID = df['CustomerID'].nunique()
unique_ID

4372

In [14]:
# 计算客户F指标
# 计算客户R指标
# 计算M指标
RFM = df.groupby('CustomerID').agg({
    'InvoiceDate': lambda x: (max_invoice_date - x.max()).days,
    'InvoiceNo': 'nunique',
    'Monetary': 'sum'
})
RFM.head()

Unnamed: 0_level_0,InvoiceDate,InvoiceNo,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,325,2,0.0
12347,1,7,4310.0
12348,74,4,1797.24
12349,18,1,1757.55
12350,309,1,334.4


In [15]:
RFM[['InvoiceDate', 'InvoiceNo', 'Monetary']].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
InvoiceDate,4372.0,91.047118,100.765435,0.0,16.0,49.0,142.0,373.0
InvoiceNo,4372.0,5.07548,9.338754,1.0,1.0,3.0,5.0,248.0
Monetary,4372.0,1898.459701,8219.345141,-4287.63,293.3625,648.075,1611.725,279489.02


# 根据RFM五等分法对用户进行分类

In [20]:
# 查看RFM各个指标的统计量
# 定义R/F/M评分函数（五等分，评分1-5，5为最高）
def RScore(x):
    """R评分：Recency越小（最近消费），评分越高，5分最高，1分最低"""
    # qcut分箱，因R是越小越好，分箱后逆序赋值评分
    labels = [5,4,3,2,1]
    return pd.qcut(x, q=5, labels=labels, duplicates='drop')  # duplicates='drop'处理重复值导致的分箱失败

def FScore(x):
    """F评分：Frequency越大（消费频次越高），评分越高，5分最高，1分最低"""
    return pd.cut(x, bins = 5, labels=[1,2,3,4,5], include_lowest = True)

def MScore(x):
    """M评分：Monetary越大（消费金额越高），评分越高，5分最高，1分最低"""
    labels = [1,2,3,4,5]
    return pd.qcut(x, q=5, labels=labels, duplicates='drop')

In [21]:
# 根据RFM指标值，标记每个客户RFM所属类别（评分1-5）
RFM['R_Score'] = RScore(RFM['InvoiceDate'])
RFM['F_Score'] = FScore(RFM['InvoiceNo'])
RFM['M_Score'] = MScore(RFM['Monetary'])
RFM.head()

Unnamed: 0_level_0,InvoiceDate,InvoiceNo,Monetary,R_Score,F_Score,M_Score
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346,325,2,0.0,1,1,1
12347,1,7,4310.0,5,1,5
12348,74,4,1797.24,2,1,4
12349,18,1,1757.55,4,1,4
12350,309,1,334.4,1,1,2


In [22]:
# 合并RFM各类别标记
RFM['RFM_Segment'] = RFM['R_Score'].astype(str) + RFM['F_Score'].astype(str) + RFM['M_Score'].astype(str)
RFM.head()

Unnamed: 0_level_0,InvoiceDate,InvoiceNo,Monetary,R_Score,F_Score,M_Score,RFM_Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
12346,325,2,0.0,1,1,1,111
12347,1,7,4310.0,5,1,5,515
12348,74,4,1797.24,2,1,4,214
12349,18,1,1757.55,4,1,4,414
12350,309,1,334.4,1,1,2,112


In [24]:
# 客户共可分为63个类别，这里统计出31个，怀疑前面数据筛选有误解
rfm_segment_count = RFM['RFM_Segment'].nunique()
print(f"客户共可分为 {rfm_segment_count} 个类别")

客户共可分为 31 个类别
