In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
time_parse = lambda x:datetime.strptime(x,"%d-%m-%Y %H:%M")
user_data = pd.read_csv('./data/OnlineRetail.csv',parse_dates=['InvoiceDate'],date_parser=time_parse)
user_data = user_data.dropna(how='any',axis=0)
index_valid_quantity = user_data['Quantity']>0 
index_valid_price = user_data['UnitPrice']>0
user_data['CustomerID'] = user_data['CustomerID'].astype(np.int64)
index_valid = [all(x) for x in zip(index_valid_quantity,index_valid_price)]
user_data = user_data[index_valid]
user_data.reset_index(drop=True,inplace=True)
user_data['orderAmount'] = user_data['Quantity'] * user_data['UnitPrice']
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397884 entries, 0 to 397883
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   CustomerID   397884 non-null  int64         
 1   Quantity     397884 non-null  int64         
 2   InvoiceDate  397884 non-null  datetime64[ns]
 3   UnitPrice    397884 non-null  float64       
 4   orderAmount  397884 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(2)
memory usage: 15.2 MB


In [3]:
user_pivot = user_data.pivot_table(index='CustomerID',aggfunc={
    'Quantity':'sum',
    'InvoiceDate':'max',
    'orderAmount':'sum'
})
date_now = user_pivot['InvoiceDate'].max()
date_gap_mean = date_now - user_pivot['InvoiceDate'].mean()
quantity_mean = user_pivot['Quantity'].mean()
order_amount_mean = user_pivot['orderAmount'].mean()
user_pivot['M'] = user_pivot['orderAmount'] - order_amount_mean
user_pivot['F'] = user_pivot['Quantity'] - quantity_mean
user_pivot['R'] = (date_now - user_pivot['InvoiceDate'] - date_gap_mean)/np.timedelta64(1,'D')
user_pivot

Unnamed: 0_level_0,InvoiceDate,Quantity,orderAmount,M,F,R
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346,2011-01-18 10:01:00,74215,77183.60,75129.33354,73023.710927,233.068045
12347,2011-12-07 15:52:00,2458,4310.00,2255.73354,1266.710927,-90.175705
12348,2011-09-25 13:13:00,2341,1797.24,-257.02646,1149.710927,-17.065289
12349,2011-11-21 09:51:00,631,1757.55,-296.71646,-560.289073,-73.925011
12350,2011-02-02 16:01:00,197,334.40,-1719.86646,-994.289073,217.818045
...,...,...,...,...,...,...
18280,2011-03-07 09:52:00,45,180.60,-1873.66646,-1146.289073,185.074295
18281,2011-06-12 10:53:00,54,80.82,-1973.44646,-1137.289073,88.031933
18282,2011-12-02 11:43:00,103,178.05,-1876.21646,-1088.289073,-85.002789
18283,2011-12-06 12:02:00,1397,2094.88,40.61354,205.710927,-89.015983


In [4]:
value_map = {
    '111':'重要价值客户',
    '011':'重要保持客户',
    '101':'重要挽留客户',
    '001':'重要发展客户',
    '110':'一般价值客户',
    '010':'一般保持客户',
    '100':'一般挽留客户',
    '000':'一般发展客户'
}
def add_label(x):
    level = x.map(lambda x : '1' if x >= 0 else '0')
    label = level.R + level.F + level.M
    return value_map[label]
RFM = user_pivot[['R','F','M']]
RFM['label'] = RFM.apply(add_label,axis=1)
RFM.to_csv('./data/RFMTable.csv')
RFM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RFM['label'] = RFM.apply(add_label,axis=1)


Unnamed: 0_level_0,R,F,M,label
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
12346,233.068045,73023.710927,75129.33354,重要价值客户
12347,-90.175705,1266.710927,2255.73354,重要保持客户
12348,-17.065289,1149.710927,-257.02646,一般保持客户
12349,-73.925011,-560.289073,-296.71646,一般发展客户
12350,217.818045,-994.289073,-1719.86646,一般挽留客户
...,...,...,...,...
18280,185.074295,-1146.289073,-1873.66646,一般挽留客户
18281,88.031933,-1137.289073,-1973.44646,一般挽留客户
18282,-85.002789,-1088.289073,-1876.21646,一般发展客户
18283,-89.015983,205.710927,40.61354,重要保持客户
