* 피처엔지니어링 통합본
* 생각한 피처와 참고한 커널 등 모든 피처에 대해 정리

# 사전작업

## 모듈 임포트

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', 400)

## 데이터 로드

In [6]:
path = '../data/'

In [7]:
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
history = pd.read_csv(path + 'historical_transactions.csv')
new_history = pd.read_csv(path + 'new_merchant_transactions.csv')
# merchant = pd.read_csv(path + 'merchants.csv')

In [8]:
debug = True

if debug:
    history = history[:10000]
    new_history = new_history[:10000]

## 데이터 크기 줄이기

In [55]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [110]:
for df in [train, test, history, new_history]:
    df = reduce_mem_usage(df)
    gc.collect()

Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)
Mem. usage decreased to  0.57 Mb (46.4% reduction)
Mem. usage decreased to  0.57 Mb (46.4% reduction)


# Feature Engineering

* 1) 단일 피처에서의 엔지니어링 (2월 3일)
* 2) 피처 사이의 엔지니어링 (2월 5일)

## 단일 피처 엔지니어링

### train & test

In [7]:
for df in [train, test]:
    # first_active_month
    df['first_active'] = pd.to_datetime(df['first_active_month'])
    df['first_active_year'] = df['first_active'].dt.year
    df['first_active_month'] = df['first_active'].dt.month
    df['first_active_quarter'] = df['first_active'].dt.quarter
    df['first_active_weekofyear'] = df['first_active'].dt.weekofyear
    df['first_active_dayofweek'] = df['first_active'].dt.dayofweek
    # 모든 데이터의 마지막 거래날짜가 2018년 4월 30일 23시 59분 59초
    df['first_active_elapsed_time_from_trade'] = (datetime.datetime(2018, 4, 30, 23, 59, 59) - df['first_active']).dt.days
    df['first_active_elapsed_time_from_today'] = (datetime.datetime.today() - df['first_active']).dt.days

* feature_1, feature_2, feature_3 추가 피처엔지니어링 필요

### history & new_history

#### util functions

In [14]:
from scipy import stats
def mode(x):
    return stats.mode(x)[0][0]

In [15]:
def null_cnt(x):
    return np.sum(x.isna() * 1)

In [16]:
def over_550(data):
    return len(data[data > 550])

In [76]:
def ref_date(data):
    ref_month = data.purchase_month - data.month_lag
    
    if ref_month % 12 == 0:
        ref_year = ref_month // 12 - 1
        ref_month = 12
        
    else:
        ref_year = ref_month // 12
        ref_month = ref_month % 12
        
    ref_year = data.purchase_year + ref_year
    return datetime.datetime(ref_year, ref_month, 1, 0, 0, 0)

#### 통합 전처리

In [77]:
for df in [history, new_history]:
    # fillna - category_3, merchant_id, category_2, installments, city_id, state_id
    df['category_2'].fillna(1.0,inplace=True)
    df['category_3'].fillna('A',inplace=True)
    df['merchant_id'].fillna(np.nan,inplace=True)
    df['installments'].replace(-1, np.nan,inplace=True)
    df['installments'].replace(999, np.nan,inplace=True)
    
    # label encoding
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0})
    df['category_3'] = df['category_3'].map({'A':0, 'B':1, 'C':2})
    
    # trim purchase amount
    df['purchase_amount_trim'] = df['purchase_amount'].apply(lambda x: min(x, 0.8))
    
    # purchase date
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    df['purchase_year'] = df['purchase_date'].dt.year
    df['purchase_month'] = df['purchase_date'].dt.month
    df['purchase_day'] = df['purchase_date'].dt.day
    df['purchase_hour'] = df['purchase_date'].dt.hour
    df['purchase_dayofweek'] = df['purchase_date'].dt.dayofweek
    df['purchase_weekofyear'] = df['purchase_date'].dt.weekofyear
    df['purchase_weekend'] = (df['purchase_date'].dt.weekday >=5).astype(int)
    df['ref_date'] = df.apply(ref_date, axis = 1)

In [78]:
history

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchase_amount_trim,purchase_year,purchase_month,purchase_day,purchase_hour,purchase_dayofweek,purchase_weekofyear,purchase_weekend,ref_date
0,,C_ID_4e6213e9bc,88,,0.0,,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,-0.703331,2017,6,25,15,6,25,1,2018-02-01
1,,C_ID_4e6213e9bc,88,,0.0,,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,-0.733128,2017,7,15,12,5,28,1,2018-02-01
2,,C_ID_4e6213e9bc,88,,0.0,,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,-0.720386,2017,8,9,22,2,32,0,2018-02-01
3,,C_ID_4e6213e9bc,88,,0.0,,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,-0.735352,2017,9,2,10,5,35,1,2018-02-01
4,,C_ID_4e6213e9bc,88,,0.0,,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,-0.722865,2017,3,10,1,4,10,0,2018-02-01
5,,C_ID_4e6213e9bc,333,,0.0,,80,M_ID_50af771f8d,0,-0.734887,2018-02-24 08:45:05,1.0,9,37,-0.734887,2018,2,24,8,5,8,1,2018-02-01
6,,C_ID_4e6213e9bc,88,,0.0,,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,1.0,16,37,-0.716855,2017,3,21,0,1,12,0,2018-02-01
7,,C_ID_4e6213e9bc,3,,0.0,,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,1.0,16,37,-0.657049,2017,11,18,20,5,46,1,2018-02-01
8,,C_ID_4e6213e9bc,88,,0.0,,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,1.0,16,37,-0.737967,2017,6,1,22,3,22,0,2018-02-01
9,,C_ID_4e6213e9bc,88,,0.0,,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,1.0,16,37,-0.715352,2017,3,16,15,3,11,0,2018-02-01


In [60]:
history.apply(ref_date, axis = 1)

authorized_flag                           1
card_id                     C_ID_4e6213e9bc
city_id                                  88
category_1                                0
installments                              0
category_3                                0
merchant_category_id                     80
merchant_id                 M_ID_e020e9b302
month_lag                                -8
purchase_amount                   -0.703331
purchase_date           2017-06-25 15:33:07
category_2                                1
state_id                                 16
subsector_id                             37
purchase_amount_trim              -0.703331
purchase_year                          2017
purchase_month                            6
purchase_day                             25
purchase_hour                            15
purchase_dayofweek                        6
purchase_weekofyear                      25
purchase_weekend                          1
Name: 0, dtype: object
authorize

Name: 353, dtype: object
authorized_flag                           1
card_id                     C_ID_4e6213e9bc
city_id                                  88
category_1                                0
installments                              0
category_3                                0
merchant_category_id                     80
merchant_id                 M_ID_979ed661fc
month_lag                                -4
purchase_amount                   -0.726321
purchase_date           2017-10-11 00:57:54
category_2                                1
state_id                                 16
subsector_id                             37
purchase_amount_trim              -0.726321
purchase_year                          2017
purchase_month                           10
purchase_day                             11
purchase_hour                             0
purchase_dayofweek                        2
purchase_weekofyear                      41
purchase_weekend                          0
Name: 3

Name: 704, dtype: object
authorized_flag                           1
card_id                     C_ID_0e171c1b48
city_id                                 277
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    309
merchant_id                 M_ID_5d686bfc66
month_lag                                -8
purchase_amount                   -0.716855
purchase_date           2017-06-10 11:59:31
category_2                                4
state_id                                 13
subsector_id                             21
purchase_amount_trim              -0.716855
purchase_year                          2017
purchase_month                            6
purchase_day                             10
purchase_hour                            11
purchase_dayofweek                        5
purchase_weekofyear                      23
purchase_weekend                          1
Name: 7

Name: 1014, dtype: object
authorized_flag                           1
card_id                     C_ID_fc8e41b9cf
city_id                                 170
category_1                                0
installments                              1
category_3                                1
merchant_category_id                    367
merchant_id                 M_ID_68e26f3cca
month_lag                                -4
purchase_amount                   -0.622488
purchase_date           2017-10-07 00:00:00
category_2                                3
state_id                                  8
subsector_id                             16
purchase_amount_trim              -0.622488
purchase_year                          2017
purchase_month                           10
purchase_day                              7
purchase_hour                             0
purchase_dayofweek                        5
purchase_weekofyear                      40
purchase_weekend                          1
Name: 

Name: 1356, dtype: object
authorized_flag                           1
card_id                     C_ID_4bed29d75c
city_id                                  19
category_1                                0
installments                              1
category_3                                1
merchant_category_id                    414
merchant_id                 M_ID_62ebcb8b8b
month_lag                                -4
purchase_amount                   -0.659754
purchase_date           2017-09-16 14:58:53
category_2                                1
state_id                                  9
subsector_id                             29
purchase_amount_trim              -0.659754
purchase_year                          2017
purchase_month                            9
purchase_day                             16
purchase_hour                            14
purchase_dayofweek                        5
purchase_weekofyear                      37
purchase_weekend                          1
Name: 

Name: 1663, dtype: object
authorized_flag                           1
card_id                     C_ID_3fff3df454
city_id                                  30
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    683
merchant_id                 M_ID_e053bbe5fb
month_lag                                 0
purchase_amount                   -0.698147
purchase_date           2018-02-18 15:41:00
category_2                                3
state_id                                 17
subsector_id                             34
purchase_amount_trim              -0.698147
purchase_year                          2018
purchase_month                            2
purchase_day                             18
purchase_hour                            15
purchase_dayofweek                        6
purchase_weekofyear                       7
purchase_weekend                          1
Name: 

authorized_flag                           1
card_id                     C_ID_3fff3df454
city_id                                  69
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    879
merchant_id                 M_ID_00a6ca8a8a
month_lag                                -2
purchase_amount                   -0.581932
purchase_date           2017-12-20 22:33:50
category_2                                1
state_id                                  9
subsector_id                             29
purchase_amount_trim              -0.581932
purchase_year                          2017
purchase_month                           12
purchase_day                             20
purchase_hour                            22
purchase_dayofweek                        2
purchase_weekofyear                      51
purchase_weekend                          0
Name: 2013, dtype: object
author

Name: 2317, dtype: object
authorized_flag                           1
card_id                     C_ID_3898518baa
city_id                                  69
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    884
merchant_id                 M_ID_66fd5e43ae
month_lag                               -10
purchase_amount                   -0.641737
purchase_date           2017-04-15 18:45:59
category_2                                1
state_id                                  9
subsector_id                             27
purchase_amount_trim              -0.641737
purchase_year                          2017
purchase_month                            4
purchase_day                             15
purchase_hour                            18
purchase_dayofweek                        5
purchase_weekofyear                      15
purchase_weekend                          1
Name: 

Name: 2590, dtype: object
authorized_flag                           1
card_id                     C_ID_058b254a5b
city_id                                 233
category_1                                0
installments                              1
category_3                                1
merchant_category_id                     80
merchant_id                 M_ID_83dd584bfc
month_lag                                -7
purchase_amount                   -0.651039
purchase_date           2017-07-21 20:43:29
category_2                                1
state_id                                  9
subsector_id                             37
purchase_amount_trim              -0.651039
purchase_year                          2017
purchase_month                            7
purchase_day                             21
purchase_hour                            20
purchase_dayofweek                        4
purchase_weekofyear                      29
purchase_weekend                          0
Name: 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




authorized_flag                           1
card_id                     C_ID_21117571cf
city_id                                  53
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    309
merchant_id                 M_ID_06c8870594
month_lag                                -8
purchase_amount                   -0.676584
purchase_date           2017-06-22 10:04:38
category_2                                5
state_id                                 20
subsector_id                             21
purchase_amount_trim              -0.676584
purchase_year                          2017
purchase_month                            6
purchase_day                             22
purchase_hour                            10
purchase_dayofweek                        3
purchase_weekofyear                      25
purchase_weekend                          0
Name: 5410, dtype: object
autho

Name: 5747, dtype: object
authorized_flag                           1
card_id                     C_ID_198914cf94
city_id                                  36
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    705
merchant_id                 M_ID_82e1b2a782
month_lag                                -6
purchase_amount                   -0.590377
purchase_date           2017-08-18 19:45:31
category_2                                2
state_id                                 24
subsector_id                             33
purchase_amount_trim              -0.590377
purchase_year                          2017
purchase_month                            8
purchase_day                             18
purchase_hour                            19
purchase_dayofweek                        4
purchase_weekofyear                      33
purchase_weekend                          0
Name: 

Name: 6063, dtype: object
authorized_flag                           0
card_id                     C_ID_198914cf94
city_id                                  36
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    307
merchant_id                 M_ID_cfa40e419c
month_lag                                -5
purchase_amount                   -0.671775
purchase_date           2017-09-11 15:54:16
category_2                                2
state_id                                 24
subsector_id                             19
purchase_amount_trim              -0.671775
purchase_year                          2017
purchase_month                            9
purchase_day                             11
purchase_hour                            15
purchase_dayofweek                        0
purchase_weekofyear                      37
purchase_weekend                          0
Name: 

authorized_flag                           1
card_id                     C_ID_ad99f08d4c
city_id                                 261
category_1                                0
installments                              1
category_3                                1
merchant_category_id                    307
merchant_id                 M_ID_57bbb8f378
month_lag                                -5
purchase_amount                   -0.671775
purchase_date           2017-09-06 21:36:07
category_2                                1
state_id                                  9
subsector_id                             19
purchase_amount_trim              -0.671775
purchase_year                          2017
purchase_month                            9
purchase_day                              6
purchase_hour                            21
purchase_dayofweek                        2
purchase_weekofyear                      36
purchase_weekend                          0
Name: 6475, dtype: object
author

Name: 6857, dtype: object
authorized_flag                           1
card_id                     C_ID_12b7811978
city_id                                  57
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    419
merchant_id                 M_ID_7d87558807
month_lag                                -1
purchase_amount                   -0.709342
purchase_date           2018-01-06 09:07:00
category_2                                5
state_id                                  5
subsector_id                              1
purchase_amount_trim              -0.709342
purchase_year                          2018
purchase_month                            1
purchase_day                              6
purchase_hour                             9
purchase_dayofweek                        5
purchase_weekofyear                       1
purchase_weekend                          1
Name: 

Name: 7233, dtype: object
authorized_flag                           1
card_id                     C_ID_25236b70f6
city_id                                  17
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    683
merchant_id                 M_ID_309752ddea
month_lag                                -7
purchase_amount                   -0.677786
purchase_date           2017-07-12 21:42:46
category_2                                4
state_id                                 22
subsector_id                             34
purchase_amount_trim              -0.677786
purchase_year                          2017
purchase_month                            7
purchase_day                             12
purchase_hour                            21
purchase_dayofweek                        2
purchase_weekofyear                      28
purchase_weekend                          0
Name: 

authorized_flag                           1
card_id                     C_ID_25236b70f6
city_id                                  17
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    560
merchant_id                 M_ID_c50d5d8042
month_lag                                -4
purchase_amount                   -0.735623
purchase_date           2017-10-20 21:35:33
category_2                                4
state_id                                 22
subsector_id                             34
purchase_amount_trim              -0.735623
purchase_year                          2017
purchase_month                           10
purchase_day                             20
purchase_hour                            21
purchase_dayofweek                        4
purchase_weekofyear                      42
purchase_weekend                          0
Name: 7505, dtype: object
author

Name: 7908, dtype: object
authorized_flag                           1
card_id                     C_ID_9e6b3e491a
city_id                                 116
category_1                                0
installments                              0
category_3                                0
merchant_category_id                    307
merchant_id                 M_ID_bd7c95b5f6
month_lag                               -11
purchase_amount                   -0.710844
purchase_date           2017-03-08 22:17:32
category_2                                1
state_id                                  9
subsector_id                             19
purchase_amount_trim              -0.710844
purchase_year                          2017
purchase_month                            3
purchase_day                              8
purchase_hour                            22
purchase_dayofweek                        2
purchase_weekofyear                      10
purchase_weekend                          0
Name: 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



#### history

In [19]:
aggs = {}
aggs['authorized_flag'] = [mode, 'sum', 'mean']
aggs['card_id'] = ['size']
aggs['city_id'] = [mode, 'nunique']
aggs['category_1'] = [mode, 'sum', 'mean']
aggs['installments'] = [mode, 'sum', 'mean', 'var', 'max', 'min', null_cnt]
aggs['category_3'] = [mode, 'mean']
aggs['merchant_category_id'] = [mode, 'nunique']
aggs['merchant_id'] = ['nunique']
aggs['month_lag'] = [mode, 'sum', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_amount'] = ['sum', 'mean', 'var', 'max', 'min', 'skew', over_550]
aggs['category_2'] = [mode, 'mean']
aggs['state_id'] = [mode, 'nunique']
aggs['subsector_id'] = [mode, 'nunique']
aggs['purchase_amount_trim'] = ['sum', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_year'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_month'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_day'] = [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_hour'] = [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_dayofweek'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_weekofyear'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_weekend'] = [mode, 'sum', 'mean']

# aggregation
history_agg = history.groupby('card_id').agg(aggs)

# change column name
history_agg.columns = pd.Index([e[0] + "_" + e[1] for e in history_agg.columns.tolist()])
history_agg.columns = ['hist_'+ c for c in history_agg.columns]

# reduce memory usage
# history_agg = reduce_mem_usage(history_agg)

In [20]:
history_agg

Unnamed: 0_level_0,hist_authorized_flag_mode,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_mode,hist_category_1_sum,hist_category_1_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_mode,hist_category_3_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean
card_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1
C_ID_058b254a5b,1,461,0.946612,487,233,17,0,57,0.117043,1.0,549.0,1.12963,0.793471,12.0,1.0,1.0,1,1.030801,307,31,142,-3,-1806,-3.708419,5.713159,0,-8,-0.159905,-329.192856,-0.675961,0.005045,0.139656,-0.741649,4.372763,0.0,1.0,1.112936,9,4,37,16,-329.192856,-0.675961,0.005045,0.139656,-0.741649,4.372763,2017,2,2017.219713,2018,2017,11,9,7.655031,12,1,25,31,16.856263,88.226211,31,1,-0.307781,0,24,13.63655,57.779155,23,0,-0.675573,4,7,3.090349,6,0,48,37,31.560575,52,1,0,125,0.256674
C_ID_0d6b8c2d0f,1,35,0.972222,36,19,2,0,0,0.0,1.0,46.0,1.314286,0.515966,3.0,1.0,1.0,1,1.138889,367,11,15,-2,-219,-6.083333,11.507143,-2,-13,-0.338128,-23.222407,-0.645067,0.008894,-0.243234,-0.726021,2.640387,0.0,1.0,1.0,9,1,16,6,-23.222407,-0.645067,0.008894,-0.243234,-0.726021,2.640387,2017,1,2017.0,2017,2017,12,10,7.916667,12,1,8,11,9.833333,15.8,21,5,1.191065,20,13,16.166667,11.4,21,9,-0.491747,2,7,2.861111,6,0,50,14,31.638889,51,1,0,6,0.166667
C_ID_0e171c1b48,1,261,0.935484,279,277,4,0,0,0.0,0.0,3.0,0.010753,0.010675,1.0,0.0,0.0,0,0.010753,705,36,72,-8,-1550,-5.555556,12.046363,0,-12,0.108347,-197.34928,-0.707345,0.004387,-0.296112,-0.745405,4.023859,0.0,4.0,3.695341,13,3,33,17,-197.34928,-0.707345,0.004387,-0.296112,-0.745405,4.023859,2017,2,2017.175627,2018,2017,2,12,6.336918,12,1,20,31,15.842294,78.212449,31,1,-0.064511,20,20,15.716846,45.153348,23,0,-1.161603,5,7,3.476703,6,0,8,50,25.584229,52,1,0,112,0.401434
C_ID_12b7811978,1,36,0.878049,41,57,2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,419,14,19,-8,-190,-4.634146,9.037805,0,-9,0.000204,-27.723178,-0.676175,0.005849,-0.319253,-0.731881,3.033629,0.0,5.0,4.804878,5,2,27,9,-27.723178,-0.676175,0.005849,-0.319253,-0.731881,3.033629,2017,2,2017.195122,2018,2017,6,9,7.02439,12,1,6,17,11.585366,75.59878,31,1,0.741202,11,11,12.195122,15.560976,19,0,-1.213682,0,6,1.780488,5,0,22,18,28.219512,50,1,0,1,0.02439
C_ID_190bd090c9,1,29,0.828571,35,286,3,0,2,0.057143,1.0,36.0,1.028571,0.028571,2.0,1.0,0.0,1,1.028571,705,10,17,-9,-241,-6.885714,17.515966,-1,-11,0.650099,-23.946204,-0.684177,0.003484,-0.508963,-0.745405,1.436238,0.0,3.0,2.742857,3,3,33,8,-23.946204,-0.684177,0.003484,-0.508963,-0.745405,1.436238,2017,1,2017.0,2017,2017,3,5,5.114286,11,1,21,14,21.6,16.835294,31,12,-0.283093,21,13,17.228571,26.946218,23,0,-1.511275,0,7,2.885714,6,0,47,9,21.171429,48,3,0,13,0.371429
C_ID_198914cf94,1,623,0.912152,683,36,8,0,0,0.0,0.0,5.0,0.007321,0.007278,1.0,0.0,0.0,0,0.007321,705,48,109,-4,-3927,-5.749634,9.097049,0,-13,-0.846061,10835.490261,15.864554,185973.380946,11269.666518,-0.744654,26.134242,1.0,2.0,1.98243,24,3,33,19,-442.881388,-0.648435,0.04663,0.8,-0.744654,5.067596,2017,2,2017.039531,2018,2017,10,12,7.775988,12,1,24,31,16.704246,71.622079,31,1,-0.196467,13,24,13.853587,31.817242,23,0,-0.523192,4,7,3.2694,6,0,34,43,31.979502,52,1,0,227,0.332357
C_ID_21117571cf,1,209,0.924779,226,53,4,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,705,30,66,-11,-1651,-7.30531,11.884149,-2,-13,0.009694,-160.10882,-0.708446,0.002659,-0.296112,-0.745405,3.925919,0.0,5.0,4.982301,20,2,33,14,-160.10882,-0.708446,0.002659,-0.296112,-0.745405,3.925919,2017,1,2017.0,2017,2017,3,12,6.69469,12,1,6,31,16.159292,66.996735,31,1,0.026309,17,18,14.119469,25.438997,23,0,-0.953562,3,7,2.783186,6,0,49,43,27.256637,51,1,0,31,0.137168
C_ID_2223b33279,1,60,0.923077,65,344,8,0,1,0.015385,0.0,1.0,0.015385,0.015385,1.0,0.0,0.0,0,0.015385,705,20,34,-4,-330,-5.076923,11.478365,0,-10,-0.058635,-44.193137,-0.679894,0.007746,-0.297014,-0.745405,2.711541,0.0,2.0,2.015385,18,6,33,14,-44.193137,-0.679894,0.007746,-0.297014,-0.745405,2.711541,2017,1,2017.0,2017,2017,7,11,5.923077,11,1,18,30,15.184615,80.184135,31,1,0.106993,11,17,13.384615,31.177885,22,0,-0.749624,6,7,3.030769,6,0,29,34,24.523077,52,1,0,20,0.307692
C_ID_23518d5fe7,1,361,0.932817,387,69,4,0,0,0.0,0.0,2.0,0.005168,0.005155,1.0,0.0,0.0,0,0.005168,437,41,81,-2,-2399,-6.198966,17.030258,0,-13,-0.150813,-230.180075,-0.594781,0.153356,3.776072,-0.743587,7.915193,0.0,1.0,1.0,9,1,15,22,-236.253254,-0.610474,0.06329,0.8,-0.743587,3.490433,2017,2,2017.152455,2018,2017,2,12,5.971576,12,1,15,31,15.043928,72.088739,31,1,0.091457,16,20,14.997416,20.655434,23,0,-0.740145,5,7,3.093023,6,0,8,50,23.963824,52,1,0,117,0.302326
C_ID_25236b70f6,1,528,0.96,550,17,7,0,0,0.0,0.0,6.0,0.010909,0.01081,1.0,0.0,0.0,0,0.010909,560,34,68,-7,-2599,-4.725455,3.831591,0,-13,0.538942,-353.502492,-0.642732,0.036889,2.933085,-0.74518,12.365108,0.0,4.0,3.836364,22,4,34,21,-355.635577,-0.64661,0.017375,0.8,-0.74518,4.24125,2017,2,2017.105455,2018,2017,7,8,8.009091,12,1,16,31,15.649091,67.907604,31,1,0.046905,19,23,14.86,25.107869,23,0,-0.841753,0,7,2.609091,6,0,29,31,32.949091,52,1,0,96,0.174545


#### new_history

In [63]:
aggs = {}
aggs['authorized_flag'] = [mode, 'sum', 'mean']
aggs['card_id'] = ['size']
aggs['city_id'] = [mode, 'nunique']
aggs['category_1'] = [mode, 'sum', 'mean']
aggs['installments'] = [mode, 'sum', 'mean', 'var', 'max', 'min', null_cnt]
aggs['category_3'] = [mode, 'mean']
aggs['merchant_category_id'] = [mode, 'nunique']
aggs['merchant_id'] = ['nunique']
aggs['month_lag'] = [mode, 'sum', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_amount'] = ['sum', 'mean', 'var', 'max', 'min', 'skew', over_550]
aggs['category_2'] = [mode, 'mean']
aggs['state_id'] = [mode, 'nunique']
aggs['subsector_id'] = [mode, 'nunique']
aggs['purchase_amount_trim'] = ['sum', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_year'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_month'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_day'] = [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_hour'] = [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew']
aggs['purchase_dayofweek'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_weekofyear'] = [mode, 'nunique', 'mean', 'max', 'min']
aggs['purchase_weekend'] = [mode, 'sum', 'mean']

# aggregation
new_history_agg = new_history.groupby('card_id').agg(aggs)

# change column name
new_history_agg.columns = pd.Index([e[0] + "_" + e[1] for e in new_history_agg.columns.tolist()])
new_history_agg.columns = ['new_'+ c for c in new_history_agg.columns]

# reduce memory usage
new_history_agg = reduce_mem_usage(new_history_agg)

Mem. usage decreased to  0.10 Mb (79.8% reduction)


## 피처 사이 엔지니어링

### history & new_history

In [80]:
for df in [history, new_history]:
    for col in ['city_id', 'authorized_flag', 'category_1', 'category_2', 'category_3']:
        df[col+'_mean'] = df.groupby([col])['purchase_amount'].transform('mean')
        df[col+'_min'] = df.groupby([col])['purchase_amount'].transform('min')
        df[col+'_max'] = df.groupby([col])['purchase_amount'].transform('max')
        df[col+'_sum'] = df.groupby([col])['purchase_amount'].transform('sum')

In [81]:
history

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchase_amount_trim,purchase_year,purchase_month,purchase_day,purchase_hour,purchase_dayofweek,purchase_weekofyear,purchase_weekend,category_2_mean,category_2_min,category_2_max,category_2_sum,category_3_mean,category_3_min,category_3_max,category_3_sum,city_id_mean,city_id_min,city_id_max,city_id_sum,authorized_flag_mean,authorized_flag_min,authorized_flag_max,authorized_flag_sum,category_1_mean,category_1_min,category_1_max,category_1_sum
0,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,-0.703331,2017,6,25,15,6,25,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
1,1,C_ID_4e6213e9bc,88,0,0.0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,-0.733128,2017,7,15,12,5,28,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
2,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,-0.720386,2017,8,9,22,2,32,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
3,1,C_ID_4e6213e9bc,88,0,0.0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,-0.735352,2017,9,2,10,5,35,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
4,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,-0.722865,2017,3,10,1,4,10,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
5,1,C_ID_4e6213e9bc,333,0,0.0,0,80,M_ID_50af771f8d,0,-0.734887,2018-02-24 08:45:05,1.0,9,37,-0.734887,2018,2,24,8,5,8,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.639567,-0.745405,1.397375,-58.200565,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
6,1,C_ID_4e6213e9bc,88,0,0.0,0,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,1.0,16,37,-0.716855,2017,3,21,0,1,12,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
7,1,C_ID_4e6213e9bc,3,0,0.0,0,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,1.0,16,37,-0.657049,2017,11,18,20,5,46,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.650099,-0.735638,-0.249230,-5.200796,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
8,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,1.0,16,37,-0.737967,2017,6,1,22,3,22,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
9,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,1.0,16,37,-0.715352,2017,3,16,15,3,11,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
