# Prepare data -- extract RFM features

In [2]:
import pandas as pd

# Load the raw transaction log and parse the purchase dates into datetimes.
trade = pd.read_csv('data/transactions.txt')
trade['Date'] = pd.to_datetime(trade['Date'])

# One row per card: latest purchase date, number of transactions and total
# amount spent (the inputs for the R, F and M features respectively).
rfm_aggregations = {'Date': 'max', 'CardID': 'count', 'Amount': 'sum'}
RFM = trade.groupby('CardID').agg(rfm_aggregations)
RFM.head(10)

Unnamed: 0_level_0,Date,CardID,Amount
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0100000199,2016-12-29,3,597.0
C0100000343,2016-09-07,6,700.94
C0100000375,2016-11-01,4,223.98
C0100000482,2016-12-10,4,197.98
C0100000689,2016-12-26,2,428.0
C0100000789,2016-12-29,3,777.0
C0100000915,2016-12-20,1,49.0
C0100001116,2016-08-20,6,942.97
C0100001139,2016-09-10,4,339.49
C0100001156,2016-10-26,2,528.0


# Standardize the Amount field to have a mean of 0 and a variance of 1

In [3]:
from sklearn import preprocessing

# StandardScaler: z-score scaling, i.e. subtract the column mean and divide by
# the standard deviation so the result has mean 0 and variance 1.
std = preprocessing.StandardScaler()

# reshape is a NumPy method: -1 lets NumPy work out the number of rows and 1
# asks for a single column, giving the one-column 2-D array that the scaler
# expects as its feature matrix.
Amount = RFM['Amount'].values.reshape(-1, 1)

# Learn the scaling parameters and apply them in one call; `std` stays fitted.
RFM['Amount_std'] = std.fit_transform(Amount)

RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
C0100000199,2016-12-29,3,597.0,0.473691
C0100000343,2016-09-07,6,700.94,0.766177
C0100000375,2016-11-01,4,223.98,-0.575983
C0100000482,2016-12-10,4,197.98,-0.649147
C0100000689,2016-12-26,2,428.0,-0.001874


# Min-Max normalize the Amount column to [0, 1]

In [4]:
# MinMaxScaler: linearly rescale Amount so that its minimum maps to 0 and its
# maximum maps to 1.
min_max_scaler = preprocessing.MinMaxScaler()

# Build the one-column 2-D input inside this cell. The name was previously
# misspelled ("Amonut"), so the fit/transform below only worked when an
# `Amount` array happened to be left over from an earlier cell.
Amount = RFM['Amount'].values.reshape(-1, 1)

min_max_scaler.fit(Amount)
RFM['Amount_range'] = min_max_scaler.transform(Amount)

RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std,Amount_range
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
C0100000199,2016-12-29,3,597.0,0.473691,0.082819
C0100000343,2016-09-07,6,700.94,0.766177,0.097274
C0100000375,2016-11-01,4,223.98,-0.575983,0.030942
C0100000482,2016-12-10,4,197.98,-0.649147,0.027326
C0100000689,2016-12-26,2,428.0,-0.001874,0.059316


# Log transformation on the Amount field

In [5]:
# Noun-phrase style heading (see above) ↑

In [6]:
import numpy as np
# Natural-log transform of the total amount. np.log is only finite for
# positive inputs: a 0 would give -inf and a negative value NaN.
RFM['Amount_log'] = np.log(RFM.Amount)
RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std,Amount_range,Amount_log
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
C0100000199,2016-12-29,3,597.0,0.473691,0.082819,6.391917
C0100000343,2016-09-07,6,700.94,0.766177,0.097274,6.552422
C0100000375,2016-11-01,4,223.98,-0.575983,0.030942,5.411557
C0100000482,2016-12-10,4,197.98,-0.649147,0.027326,5.288166
C0100000689,2016-12-26,2,428.0,-0.001874,0.059316,6.059123


# Perform a square-root transformation on the Amount column

In [7]:
# Phrase-style heading (see above) ↑

In [8]:
# Square-root transform of the total amount (a milder compression of large
# values than the log transform).
RFM['Amount_sqrt'] = np.sqrt(RFM.Amount)
RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std,Amount_range,Amount_log,Amount_sqrt
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
C0100000199,2016-12-29,3,597.0,0.473691,0.082819,6.391917,24.433583
C0100000343,2016-09-07,6,700.94,0.766177,0.097274,6.552422,26.475271
C0100000375,2016-11-01,4,223.98,-0.575983,0.030942,5.411557,14.965961
C0100000482,2016-12-10,4,197.98,-0.649147,0.027326,5.288166,14.070537
C0100000689,2016-12-26,2,428.0,-0.001874,0.059316,6.059123,20.688161


# Discretize the Amount field into custom intervals

In [9]:
# Verb-sentence heading, well suited to written step-by-step descriptions (see above) ↑

In [10]:
# Hand-picked bin edges. pd.cut builds right-closed intervals from them:
# (0, 200], (200, 500], (500, 800], (800, 1000].
cut_points = [0, 200, 500, 800, 1000]

# NOTE: values outside the outermost edges get no bin and become NaN, so an
# Amount of exactly 0 or anything above 1000 is left unlabelled here.
RFM['Amount_bin'] = pd.cut(RFM.Amount, bins=cut_points)
RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std,Amount_range,Amount_log,Amount_sqrt,Amount_bin
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
C0100000199,2016-12-29,3,597.0,0.473691,0.082819,6.391917,24.433583,"(500, 800]"
C0100000343,2016-09-07,6,700.94,0.766177,0.097274,6.552422,26.475271,"(500, 800]"
C0100000375,2016-11-01,4,223.98,-0.575983,0.030942,5.411557,14.965961,"(200, 500]"
C0100000482,2016-12-10,4,197.98,-0.649147,0.027326,5.288166,14.070537,"(0, 200]"
C0100000689,2016-12-26,2,428.0,-0.001874,0.059316,6.059123,20.688161,"(200, 500]"


# Equal-width discretization on the Amount field

In [11]:
# Equal-width binning: split the observed Amount range into 20 intervals of
# identical width (bin populations may therefore be very uneven).
RFM['Amount_width_bin'] = pd.cut(RFM['Amount'], bins=20)
RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std,Amount_range,Amount_log,Amount_sqrt,Amount_bin,Amount_width_bin
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
C0100000199,2016-12-29,3,597.0,0.473691,0.082819,6.391917,24.433583,"(500, 800]","(361.016, 720.541]"
C0100000343,2016-09-07,6,700.94,0.766177,0.097274,6.552422,26.475271,"(500, 800]","(361.016, 720.541]"
C0100000375,2016-11-01,4,223.98,-0.575983,0.030942,5.411557,14.965961,"(200, 500]","(-5.701, 361.016]"
C0100000482,2016-12-10,4,197.98,-0.649147,0.027326,5.288166,14.070537,"(0, 200]","(-5.701, 361.016]"
C0100000689,2016-12-26,2,428.0,-0.001874,0.059316,6.059123,20.688161,"(200, 500]","(361.016, 720.541]"


In [12]:
# Number of cards per equal-width bin. observed=True reports only the bins
# that actually contain rows, so empty intervals are omitted from the result.
grouped = RFM.groupby(by='Amount_width_bin', observed=True)
grouped['CardID'].count()

Amount_width_bin
(-5.701, 361.016]       6747
(361.016, 720.541]      4037
(720.541, 1080.067]     1137
(1080.067, 1439.592]     430
(1439.592, 1799.118]     135
(1799.118, 2158.643]      51
(2158.643, 2518.168]      28
(2518.168, 2877.694]      12
(2877.694, 3237.22]        6
(3237.22, 3596.745]        2
(3596.745, 3956.27]        1
(5034.847, 5394.372]       1
(5753.898, 6113.424]       1
(6832.474, 7192.0]         1
Name: CardID, dtype: int64

# Equal-depth discretization on the Amount field

In [13]:
# Equal-depth (quantile) binning: choose 20 intervals whose edges are the
# Amount quantiles, so each bin holds roughly the same number of cards.
RFM['Amount_depth_bin'] = pd.qcut(RFM['Amount'], q=20)
RFM.head(5)

Unnamed: 0_level_0,Date,CardID,Amount,Amount_std,Amount_range,Amount_log,Amount_sqrt,Amount_bin,Amount_width_bin,Amount_depth_bin
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C0100000199,2016-12-29,3,597.0,0.473691,0.082819,6.391917,24.433583,"(500, 800]","(361.016, 720.541]","(528.0, 598.612]"
C0100000343,2016-09-07,6,700.94,0.766177,0.097274,6.552422,26.475271,"(500, 800]","(361.016, 720.541]","(700.738, 873.006]"
C0100000375,2016-11-01,4,223.98,-0.575983,0.030942,5.411557,14.965961,"(200, 500]","(-5.701, 361.016]","(206.8, 230.186]"
C0100000482,2016-12-10,4,197.98,-0.649147,0.027326,5.288166,14.070537,"(0, 200]","(-5.701, 361.016]","(179.0, 206.8]"
C0100000689,2016-12-26,2,428.0,-0.001874,0.059316,6.059123,20.688161,"(200, 500]","(361.016, 720.541]","(399.94, 437.0]"


In [14]:
# Number of cards per quantile bin; the counts should be close to equal.
# observed=True reports only the bins that actually contain rows.
grouped = RFM.groupby(by='Amount_depth_bin', observed=True)
grouped['CardID'].count()

Amount_depth_bin
(1.489, 74.94]        631
(74.94, 118.998]      628
(118.998, 149.99]     656
(149.99, 179.0]       653
(179.0, 206.8]        580
(206.8, 230.186]      629
(230.186, 256.842]    629
(256.842, 282.99]     634
(282.99, 308.99]      628
(308.99, 341.44]      627
(341.44, 368.98]      634
(368.98, 399.94]      625
(399.94, 437.0]       636
(437.0, 477.976]      622
(477.976, 528.0]      636
(528.0, 598.612]      623
(598.612, 700.738]    629
(700.738, 873.006]    630
(873.006, 1098.0]     631
(1098.0, 7192.0]      628
Name: CardID, dtype: int64