* 피처엔지니어링 통합본
* 생각한 피처와 참고한 커널 등 모든 피처에 대해 정리

# 사전작업

## 모듈 임포트

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc

In [2]:
import warnings

# Silence every warning category for the rest of the session so the notebook
# output stays readable. NOTE: this also hides deprecation warnings.
warnings.simplefilter('ignore')

In [3]:
# Let wide frames show up to 400 columns before pandas truncates the display.
pd.options.display.max_columns = 400

## 데이터 로드

In [4]:
# Directory prefix (with trailing slash) that is prepended to every CSV file name.
path = './data/'

In [5]:
# Read the train/test tables and both transaction logs from `path`, in the
# same order as the names on the left-hand side.
train, test, history, new_history = (
    pd.read_csv(path + file_name)
    for file_name in (
        'train.csv',
        'test.csv',
        'historical_transactions.csv',
        'new_merchant_transactions.csv',
    )
)
# merchant = pd.read_csv(path + 'merchants.csv')

In [6]:
# Debug switch: when True, only the first 10,000 rows of each transaction log
# are kept so the rest of the notebook runs quickly.
debug = True

if debug:
    # Take explicit copies. A bare row slice may be a view of the full frame,
    # so later column assignments would act on a slice (SettingWithCopyWarning)
    # instead of on an independent frame.
    history = history.iloc[:10000].copy()
    new_history = new_history.iloc[:10000].copy()

## 데이터 크기 줄이기

In [55]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column (object, datetime, bool, int8, ...) is
    left untouched.

    * Integer columns become the narrowest of int8/int16/int32/int64 whose
      range contains both the column minimum and maximum (bounds inclusive).
    * Float columns become float16 or float32 when min and max lie strictly
      inside that type's finite range, otherwise float64.
      NOTE: the float choice looks at range only, not precision; float16 keeps
      roughly three significant decimal digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        Print the resulting size in MB and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero when the frame reported no memory at all.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [110]:
# Downcast each frame. reduce_mem_usage reassigns the columns of the frame it
# receives and returns that same frame, so the result need not be rebound.
for frame in (train, test, history, new_history):
    reduce_mem_usage(frame)
    gc.collect()

Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)
Mem. usage decreased to  0.57 Mb (46.4% reduction)
Mem. usage decreased to  0.57 Mb (46.4% reduction)


# Feature Engineering

* 1) 단일 피처에서의 엔지니어링 (2월 3일)
* 2) 피처 사이의 엔지니어링 (2월 5일)

## 단일 피처 엔지니어링

### train & test

In [7]:
# Calendar features derived from first_active_month, for both train and test.
# NOTE(review): first_active_month is overwritten with the month number below,
# so this cell should not be run twice on the same frames.
for df in [train, test]:
    # first_active_month
    first_active = pd.to_datetime(df['first_active_month'])
    df['first_active'] = first_active
    df['first_active_year'] = first_active.dt.year
    df['first_active_month'] = first_active.dt.month
    df['first_active_quarter'] = first_active.dt.quarter

    # Series.dt.weekofyear was removed in pandas 2.0; use isocalendar() when
    # it exists and fall back to the old accessor on older pandas.
    if hasattr(first_active.dt, 'isocalendar'):
        week = first_active.dt.isocalendar().week
        # Keep plain numpy dtypes: int64 when complete, float64 (NaN) otherwise.
        week = week.astype('int64') if week.notna().all() else week.astype('float64')
    else:
        week = first_active.dt.weekofyear
    df['first_active_weekofyear'] = week

    df['first_active_dayofweek'] = first_active.dt.dayofweek
    # The last transaction date across all data is 2018-04-30 23:59:59.
    df['first_active_elapsed_time_from_trade'] = (datetime.datetime(2018, 4, 30, 23, 59, 59) - first_active).dt.days
    # NOTE: this value depends on the day the cell is executed.
    df['first_active_elapsed_time_from_today'] = (datetime.datetime.today() - first_active).dt.days

* feature_1, feature_2, feature_3 추가 피처엔지니어링 필요

### history & new_history

#### util functions

In [9]:
from scipy import stats
def mode(x):
    """Return the most frequent value in ``x`` (used as a groupby aggregation).

    ``x`` may be anything ``np.asarray`` accepts, e.g. a pandas Series.
    """
    most_common = stats.mode(np.asarray(x))[0]
    # Older SciPy returns a length-1 array here, SciPy >= 1.11 a scalar;
    # atleast_1d lets a single indexing expression serve both.
    return np.atleast_1d(most_common)[0]

In [10]:
def null_cnt(x):
    """Count the entries of ``x`` that ``x.isna()`` reports as missing."""
    missing_mask = x.isna()
    return missing_mask.sum()

In [11]:
def over_550(data):
    """Return how many entries of ``data`` are strictly greater than 550."""
    exceeds = data > 550
    return int(exceeds.sum())

#### 통합 전처리

In [8]:
# Shared cleaning and calendar features for both transaction logs.
# NOTE(review): the label-encoding step is not idempotent; re-running this cell
# on already-encoded frames maps every value to NaN.
for df in [history, new_history]:
    # fillna - category_3, category_2, installments
    # Assign the results back instead of calling inplace=True on df[col]: the
    # chained in-place form is not guaranteed to write through to the frame.
    df['category_2'] = df['category_2'].fillna(1.0)
    df['category_3'] = df['category_3'].fillna('A')
    # merchant_id: missing values stay NaN (the former fillna(np.nan) was a no-op).
    # installments: -1 and 999 are turned into NaN.
    df['installments'] = df['installments'].replace([-1, 999], np.nan)

    # label encoding
    df['authorized_flag'] = df['authorized_flag'].map({'Y': 1, 'N': 0})
    df['category_1'] = df['category_1'].map({'Y': 1, 'N': 0})
    df['category_3'] = df['category_3'].map({'A': 0, 'B': 1, 'C': 2})

    # trim purchase amount: cap at 0.8 (vectorised instead of a per-row apply)
    df['purchase_amount_trim'] = df['purchase_amount'].clip(upper=0.8)

    # purchase date
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt
    df['purchase_year'] = purchase_dt.year
    df['purchase_month'] = purchase_dt.month
    df['purchase_day'] = purchase_dt.day
    df['purchase_hour'] = purchase_dt.hour
    df['purchase_dayofweek'] = purchase_dt.dayofweek
    # Series.dt.weekofyear was removed in pandas 2.0; use isocalendar() when
    # it exists and fall back to the old accessor on older pandas.
    if hasattr(purchase_dt, 'isocalendar'):
        week = purchase_dt.isocalendar().week
        # Keep plain numpy dtypes: int64 when complete, float64 (NaN) otherwise.
        week = week.astype('int64') if week.notna().all() else week.astype('float64')
    else:
        week = purchase_dt.weekofyear
    df['purchase_weekofyear'] = week
    # 1 for Saturday/Sunday (weekday 5 or 6), else 0
    df['purchase_weekend'] = (purchase_dt.weekday >= 5).astype(int)

#### history

In [56]:
# Per-card aggregation spec for the historical transactions: each key is a
# column, each value the list of aggregations applied to it (pandas built-ins
# by name, plus the helper callables mode / null_cnt / over_550).
aggs = {
    'authorized_flag': [mode, 'sum', 'mean'],
    'card_id': ['size'],
    'city_id': [mode, 'nunique'],
    'category_1': [mode, 'sum', 'mean'],
    'installments': [mode, 'sum', 'mean', 'var', 'max', 'min', null_cnt],
    'category_3': [mode, 'mean'],
    'merchant_category_id': [mode, 'nunique'],
    'merchant_id': ['nunique'],
    'month_lag': [mode, 'sum', 'mean', 'var', 'max', 'min', 'skew'],
    'purchase_amount': ['sum', 'mean', 'var', 'max', 'min', 'skew', over_550],
    'category_2': [mode, 'mean'],
    'state_id': [mode, 'nunique'],
    'subsector_id': [mode, 'nunique'],
    'purchase_amount_trim': ['sum', 'mean', 'var', 'max', 'min', 'skew'],
    'purchase_year': [mode, 'nunique', 'mean', 'max', 'min'],
    'purchase_month': [mode, 'nunique', 'mean', 'max', 'min'],
    'purchase_day': [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
    'purchase_hour': [mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
    'purchase_dayofweek': [mode, 'nunique', 'mean', 'max', 'min'],
    'purchase_weekofyear': [mode, 'nunique', 'mean', 'max', 'min'],
    'purchase_weekend': [mode, 'sum', 'mean'],
}

# One row per card_id, with a two-level (column, statistic) column index.
history_agg = history.groupby('card_id').agg(aggs)

# Flatten the column index to 'hist_<column>_<statistic>'.
history_agg.columns = [
    'hist_' + column + '_' + stat for column, stat in history_agg.columns
]

# Downcast the aggregated frame to save memory.
history_agg = reduce_mem_usage(history_agg)

Mem. usage decreased to  0.01 Mb (78.6% reduction)


#### new_history

In [63]:
# Per-card aggregation spec for the new-merchant transactions: keyword name =
# column, value = list of aggregations (pandas built-ins by name, plus the
# helper callables mode / null_cnt / over_550).
aggs = dict(
    authorized_flag=[mode, 'sum', 'mean'],
    card_id=['size'],
    city_id=[mode, 'nunique'],
    category_1=[mode, 'sum', 'mean'],
    installments=[mode, 'sum', 'mean', 'var', 'max', 'min', null_cnt],
    category_3=[mode, 'mean'],
    merchant_category_id=[mode, 'nunique'],
    merchant_id=['nunique'],
    month_lag=[mode, 'sum', 'mean', 'var', 'max', 'min', 'skew'],
    purchase_amount=['sum', 'mean', 'var', 'max', 'min', 'skew', over_550],
    category_2=[mode, 'mean'],
    state_id=[mode, 'nunique'],
    subsector_id=[mode, 'nunique'],
    purchase_amount_trim=['sum', 'mean', 'var', 'max', 'min', 'skew'],
    purchase_year=[mode, 'nunique', 'mean', 'max', 'min'],
    purchase_month=[mode, 'nunique', 'mean', 'max', 'min'],
    purchase_day=[mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
    purchase_hour=[mode, 'nunique', 'mean', 'var', 'max', 'min', 'skew'],
    purchase_dayofweek=[mode, 'nunique', 'mean', 'max', 'min'],
    purchase_weekofyear=[mode, 'nunique', 'mean', 'max', 'min'],
    purchase_weekend=[mode, 'sum', 'mean'],
)

# One row per card_id, with a two-level (column, statistic) column index.
grouped_by_card = new_history.groupby('card_id')
new_history_agg = grouped_by_card.agg(aggs)

# Flatten the column index to 'new_<column>_<statistic>'.
flat_names = ['_'.join(pair) for pair in new_history_agg.columns.tolist()]
new_history_agg.columns = ['new_' + name for name in flat_names]

# Downcast the aggregated frame to save memory.
new_history_agg = reduce_mem_usage(new_history_agg)

Mem. usage decreased to  0.10 Mb (79.8% reduction)


## 피처 사이 엔지니어링

### history & new_history

In [80]:
# For each categorical column, attach to every row the mean/min/max/sum of
# purchase_amount over all rows sharing that row's category value.
for df in [history, new_history]:
    for col in ['city_id', 'authorized_flag', 'category_1', 'category_2', 'category_3']:
        # Group once per column and reuse it for all four statistics.
        amount_by_group = df.groupby([col])['purchase_amount']
        for stat in ('mean', 'min', 'max', 'sum'):
            df[col + '_' + stat] = amount_by_group.transform(stat)

In [81]:
# Display the historical transactions frame as the cell output for inspection.
history

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,purchase_amount_trim,purchase_year,purchase_month,purchase_day,purchase_hour,purchase_dayofweek,purchase_weekofyear,purchase_weekend,category_2_mean,category_2_min,category_2_max,category_2_sum,category_3_mean,category_3_min,category_3_max,category_3_sum,city_id_mean,city_id_min,city_id_max,city_id_sum,authorized_flag_mean,authorized_flag_min,authorized_flag_max,authorized_flag_sum,category_1_mean,category_1_min,category_1_max,category_1_sum
0,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,-0.703331,2017,6,25,15,6,25,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
1,1,C_ID_4e6213e9bc,88,0,0.0,0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,-0.733128,2017,7,15,12,5,28,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
2,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,-0.720386,2017,8,9,22,2,32,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
3,1,C_ID_4e6213e9bc,88,0,0.0,0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,-0.735352,2017,9,2,10,5,35,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
4,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,-0.722865,2017,3,10,1,4,10,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
5,1,C_ID_4e6213e9bc,333,0,0.0,0,80,M_ID_50af771f8d,0,-0.734887,2018-02-24 08:45:05,1.0,9,37,-0.734887,2018,2,24,8,5,8,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.639567,-0.745405,1.397375,-58.200565,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
6,1,C_ID_4e6213e9bc,88,0,0.0,0,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,1.0,16,37,-0.716855,2017,3,21,0,1,12,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
7,1,C_ID_4e6213e9bc,3,0,0.0,0,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,1.0,16,37,-0.657049,2017,11,18,20,5,46,1,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.650099,-0.735638,-0.249230,-5.200796,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
8,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,1.0,16,37,-0.737967,2017,6,1,22,3,22,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
9,1,C_ID_4e6213e9bc,88,0,0.0,0,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,1.0,16,37,-0.715352,2017,3,16,15,3,11,0,-0.619981,-0.746156,14.279604,-3957.336887,0.861164,-0.745405,11269.666518,6401.896332,-0.686702,-0.745405,-0.297615,-501.979139,-0.630085,-0.746156,5.490207,-5950.521587,0.540542,-0.745405,11269.666518,5201.099539
