In [5]:
import pandas as pd

# Load the raw transaction log and convert the Date column to datetimes.
trade = pd.read_csv('data/transactions.txt')
trade = trade.assign(Date=pd.to_datetime(trade['Date']))

# Collapse the log to one row per card (the index is CardID):
#   Date   -> most recent transaction date for the card
#   CardID -> count of the card's transactions (NOTE: this column shares its
#             name with the index but holds a count, not an identifier)
#   Amount -> total amount across the card's transactions
RFM = (
    trade
    .groupby('CardID')
    .agg({'Date': 'max', 'CardID': 'count', 'Amount': 'sum'})
)
RFM.head(n=10)

Unnamed: 0_level_0,Date,CardID,Amount
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C0100000199,2016-12-29,3,597.0
C0100000343,2016-09-07,6,700.94
C0100000375,2016-11-01,4,223.98
C0100000482,2016-12-10,4,197.98
C0100000689,2016-12-26,2,428.0
C0100000789,2016-12-29,3,777.0
C0100000915,2016-12-20,1,49.0
C0100001116,2016-08-20,6,942.97
C0100001139,2016-09-10,4,339.49
C0100001156,2016-10-26,2,528.0


# One-Hot Encoding

In [6]:
# 1: Use Pandas
# Expand the per-card transaction count (RFM['CardID']) into one indicator
# column per distinct count value, named Freq_<count>. drop_first=False keeps
# every level, so no category is dropped as a reference level.
one_hot = pd.get_dummies(
    data=RFM['CardID'],
    prefix='Freq',
    drop_first=False,
)
one_hot.head(n=5)

Unnamed: 0_level_0,Freq_1,Freq_2,Freq_3,Freq_4,Freq_5,Freq_6,Freq_7,Freq_8,Freq_9,Freq_10,...,Freq_21,Freq_22,Freq_23,Freq_24,Freq_25,Freq_26,Freq_27,Freq_28,Freq_29,Freq_30
CardID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C0100000199,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C0100000343,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C0100000375,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C0100000482,False,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
C0100000689,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
## 2: Use sklearn OneHotEncoder
from sklearn import preprocessing

# Reshape the per-card transaction counts into a single-feature column
# (shape (n_cards, 1)) before handing them to the encoder.
Freq = RFM['CardID'].values.reshape(-1, 1)

# fit() returns the fitted encoder, so construction and fitting are chained.
OneHot = preprocessing.OneHotEncoder().fit(Freq)

# Encode the same counts and convert the result to a dense NumPy array
# so it displays as a plain matrix.
Freq_OneHot = OneHot.transform(Freq).toarray()
Freq_OneHot

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
# Wrap the dense one-hot matrix in a DataFrame for tabular display. No index
# or column labels are supplied, so both default to 0-based integers.
df = pd.DataFrame(data=Freq_OneHot)
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
