# 사전 작업

## 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc
from tqdm import tqdm_notebook

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
pd.set_option('display.max_columns', 400)

# 통합 데이터

In [6]:
def drop_features(data, keyword, debug=False):
    if debug:
        data = data[:1000]
        
    logits = []
    for col in data.columns:
        if keyword in col:
            logits.append(col)
        else:
            pass
    
    return logits

## 데이터 로드

In [7]:
path = './data/'

In [8]:
# train = pd.read_csv(path + 'train_v4.csv')
# test = pd.read_csv(path + 'test_v4.csv')

In [9]:
# train.new_card_id_size.fillna(0, inplace=True)
# test.new_card_id_size.fillna(0, inplace=True)

In [10]:
raw_history = pd.read_csv(path + 'historical_transactions.csv')

In [11]:
history = raw_history.copy()

## Feature Engineering History

In [12]:
def null_cnt(x):
    return np.sum(x.isna() * 1)

In [13]:
def null_rate(x):
    return np.sum(x.isna() * 1) / len(x)

In [14]:
from scipy import stats
def mode(x):
    return stats.mode(x)[0][0]

In [15]:
def most_value_cnt(x):
    return x.value_counts().values[0]

In [16]:
history.head(3)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37


### 거래가 승인 되고, 오프라인 거래의 city_id

In [17]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'N']

In [18]:
temp = temp.groupby('card_id').agg({'city_id': [mode, 'nunique']})
temp.head(1)

Unnamed: 0_level_0,city_id,city_id
Unnamed: 0_level_1,mode,nunique
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2
C_ID_00007093c1,244,3


In [19]:
temp.columns = ['hist_city_id_mode_authorized_flag_category_1_Y', 'hist_city_id_nunique_authorized_flag_category_1_Y']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_city_id_mode_authorized_flag_category_1_Y,hist_city_id_nunique_authorized_flag_category_1_Y
0,C_ID_00007093c1,244,3


In [20]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [21]:
del temp
gc.collect()

28

### 거래가 승인된 category_1

In [22]:
temp = history[history.authorized_flag == 'Y']

In [23]:
temp.category_1 = temp.category_1.map({'Y':1, 'N':0})

In [24]:
temp = temp.groupby('card_id').agg({'category_1':['mean', 'sum', 'size']})
temp.head(1)

Unnamed: 0_level_0,category_1,category_1,category_1
Unnamed: 0_level_1,mean,sum,size
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
C_ID_00007093c1,0.210526,24,114


In [25]:
temp.columns = ['hist_category_1_authorized_flag_Y_mean', 'hist_category_1_authorized_flag_Y_sum', 'hist_category_1_authorized_flag_Y_size']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_category_1_authorized_flag_Y_mean,hist_category_1_authorized_flag_Y_sum,hist_category_1_authorized_flag_Y_size
0,C_ID_00007093c1,0.210526,24,114


In [26]:
temp.hist_category_1_authorized_flag_Y_mean = np.round(temp.hist_category_1_authorized_flag_Y_mean, 4)

In [27]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [28]:
del temp
gc.collect()

35

### 거래가 승인된 installments

In [17]:
def has_999(data):
    # 999가 없을 때
    if data[data == 999].__len__() == 0:
        return 0
    # 999가 있을 때
    else:
        return 1

In [18]:
def cnt_std(data):
    value = data.value_counts().std()
    if np.isnan(value):
        return 0
    else:
        return value

In [31]:
history.installments.replace(-1, np.nan, inplace=True)

In [32]:
temp = history[history.authorized_flag == 'Y']

In [33]:
temp = temp.groupby('card_id').agg({'installments':[mode, 'size', 'mean', 'max', 'var', 'min', null_cnt, null_rate, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,installments,installments,installments,installments,installments,installments,installments,installments,installments
Unnamed: 0_level_1,mode,size,mean,max,var,min,null_cnt,null_rate,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
C_ID_00007093c1,1.0,114,1.289474,6.0,0.632278,1.0,0.0,0.0,41.069453


In [34]:
temp.columns = ['hist_installments_authorized_flag_Y_mode', 'hist_installments_authorized_flag_Y_size', 'hist_installments_authorized_flag_Y_mean', 'hist_installments_authorized_flag_Y_max', 'hist_installments_authorized_flag_Y_var', 'hist_installments_authorized_flag_Y_min', 'hist_installments_authorized_flag_Y_null_cnt', 'hist_installments_authorized_flag_Y_null_rate', 'hist_installments_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_installments_authorized_flag_Y_mode,hist_installments_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mean,hist_installments_authorized_flag_Y_max,hist_installments_authorized_flag_Y_var,hist_installments_authorized_flag_Y_min,hist_installments_authorized_flag_Y_null_cnt,hist_installments_authorized_flag_Y_null_rate,hist_installments_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,1.0,114,1.289474,6.0,0.632278,1.0,0.0,0.0,41.069453


In [35]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [36]:
del temp
gc.collect()

42

### category_3

In [37]:
history.category_3 = history.category_3.fillna('D')

### 승인된 거래 중 merchant_category_id

In [38]:
temp = history[history.authorized_flag == 'Y']

In [39]:
temp = temp.groupby('card_id').agg({'merchant_category_id':[mode, 'nunique', null_cnt, cnt_std]})

In [40]:
temp.columns = ['hist_merchant_category_id_authorized_flag_Y_mode', 'hist_merchant_category_id_authorized_flag_Y_nunique', 'hist_merchant_category_id_authorized_flag_Y_null_cnt', 'hist_merchant_category_id_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_category_id_authorized_flag_Y_mode,hist_merchant_category_id_authorized_flag_Y_nunique,hist_merchant_category_id_authorized_flag_Y_null_cnt,hist_merchant_category_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,307,18,0,10.278476


In [41]:
temp.hist_merchant_category_id_authorized_flag_Y_cnt_std = np.round(temp.hist_merchant_category_id_authorized_flag_Y_cnt_std, 3)

In [42]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [43]:
del temp
gc.collect()

28

### merchant_id

In [44]:
le = LabelEncoder()
le.fit(history.merchant_id.fillna('NULL').values)
history.merchant_id = le.transform(history.merchant_id.fillna('NULL'))
history.loc[history.merchant_id == 326311, 'merchant_id'] = np.nan

In [45]:
temp = history[history.authorized_flag == 'Y']

In [46]:
temp = temp.groupby('card_id').agg({'merchant_id': [cnt_std]})
temp.head(1)

Unnamed: 0_level_0,merchant_id
Unnamed: 0_level_1,cnt_std
card_id,Unnamed: 1_level_2
C_ID_00007093c1,6.943841


In [47]:
temp.columns = ['hist_merchant_id_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,6.943841


In [48]:
temp.hist_merchant_id_authorized_flag_Y_cnt_std = np.round(temp.hist_merchant_id_authorized_flag_Y_cnt_std, 3)

In [49]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [50]:
del temp
gc.collect()

42

In [51]:
temp = history.groupby('card_id').agg({'merchant_id': [mode, 'nunique', null_cnt, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,merchant_id,merchant_id,merchant_id,merchant_id
Unnamed: 0_level_1,mode,nunique,null_cnt,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,188773.0,29,0.0,8.794171


In [52]:
temp.columns = ['hist_merchant_id_mode', 'hist_merchant_id_nunique2', 'hist_merchant_id_null_cnt', 'hist_merchant_id_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_id_mode,hist_merchant_id_nunique2,hist_merchant_id_null_cnt,hist_merchant_id_cnt_std
0,C_ID_00007093c1,188773.0,29,0.0,8.794171


In [53]:
temp.hist_merchant_id_cnt_std = np.round(temp.hist_merchant_id_cnt_std, 3)

In [54]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [55]:
del temp
gc.collect()

28

### month_lag

In [56]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'Y']
temp = history.groupby('card_id').agg({'month_lag': [mode, 'min', 'max', cnt_std]})
temp.head(1)

Unnamed: 0_level_0,month_lag,month_lag,month_lag,month_lag
Unnamed: 0_level_1,mode,min,max,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,-8,-12,0,4.701336


In [57]:
temp.columns = ['hist_month_lag_authorized_flag_Y_category_1_Y_mode', 'hist_month_lag_authorized_flag_Y_category_1_Y_min', 'hist_month_lag_authorized_flag_Y_category_1_Y_max', 'hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_lag_authorized_flag_Y_category_1_Y_mode,hist_month_lag_authorized_flag_Y_category_1_Y_min,hist_month_lag_authorized_flag_Y_category_1_Y_max,hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std
0,C_ID_00007093c1,-8,-12,0,4.701336


In [58]:
temp.hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std = np.round(temp.hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std, 3)

In [59]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [60]:
del temp
gc.collect()

28

In [61]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'N']
temp = history.groupby('card_id').agg({'month_lag': [mode, 'min', 'max', cnt_std]})
temp.head(1)

Unnamed: 0_level_0,month_lag,month_lag,month_lag,month_lag
Unnamed: 0_level_1,mode,min,max,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,-8,-12,0,4.701336


In [62]:
temp.columns = ['hist_month_lag_authorized_flag_Y_category_1_N_mode', 'hist_month_lag_authorized_flag_Y_category_1_N_min', 'hist_month_lag_authorized_flag_Y_category_1_N_max', 'hist_month_lag_authorized_flag_Y_category_1_N_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_lag_authorized_flag_Y_category_1_N_mode,hist_month_lag_authorized_flag_Y_category_1_N_min,hist_month_lag_authorized_flag_Y_category_1_N_max,hist_month_lag_authorized_flag_Y_category_1_N_cnt_std
0,C_ID_00007093c1,-8,-12,0,4.701336


In [63]:
temp.hist_month_lag_authorized_flag_Y_category_1_N_cnt_std = np.round(temp.hist_month_lag_authorized_flag_Y_category_1_N_cnt_std, 3)

In [64]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [65]:
del temp
gc.collect()

28

### category_2

In [66]:
temp = history[history.authorized_flag == 'Y']

In [67]:
temp = temp.groupby('card_id').agg({'category_2': [mode, 'nunique', null_cnt, cnt_std]})

In [68]:
temp.head(1)

Unnamed: 0_level_0,category_2,category_2,category_2,category_2
Unnamed: 0_level_1,mode,nunique,null_cnt,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,3.0,2,24.0,62.225397


In [69]:
temp.columns = ['hist_category_2_authorized_flag_Y_mode', 'hist_category_2_authorized_flag_Y_nunique', 'hist_category_2_authorized_flag_Y_null_cnt', 'hist_category_2_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_category_2_authorized_flag_Y_mode,hist_category_2_authorized_flag_Y_nunique,hist_category_2_authorized_flag_Y_null_cnt,hist_category_2_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,3.0,2,24.0,62.225397


In [70]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [71]:
del temp
gc.collect()

28

### subsector

In [72]:
# temp = history[history.authorized_flag == 'Y']
# temp = temp.groupby('card_id').agg({'subsector_id': [mode, 'nunique', null_cnt, cnt_std]})

In [73]:
# temp.columns = ['hist_subsector_id_authorized_flag_Y_mode', 'hist_subsector_id_authorized_flag_Y_nunique', 'hist_subsector_id_authorized_flag_Y_null_cnt', 'hist_subsector_id_authorized_flag_Y_cnt_std']
# temp.reset_index(inplace=True)
# temp.head(1)

In [74]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [75]:
# del temp
# gc.collect()

## logging

In [18]:
# train.to_csv('./data_feature_engineering/train_3-73218.csv', index=False)
# test.to_csv('./data_feature_engineering/test_3-73218.csv', index=False)
train = pd.read_csv('./data_feature_engineering/train_3-73218.csv')
test = pd.read_csv('./data_feature_engineering/test_3-73218.csv')

### purchase_date

In [33]:
for new_merchant_df in [history]:
    # purchase date
    new_merchant_df['purchase_year_month'] = new_merchant_df.purchase_date.str[:7]
    new_merchant_df['purchase_year_month_day'] = new_merchant_df.purchase_date.str[:10]
    new_merchant_df['purchase_date'] = pd.to_datetime(new_merchant_df['purchase_date'])
    new_merchant_df['purchase_year_month_day'] = pd.to_datetime(new_merchant_df['purchase_year_month_day'])
    print(1)
    new_merchant_df['purchase_year'] = new_merchant_df['purchase_date'].dt.year
    print(2)
    new_merchant_df['purchase_month'] = new_merchant_df['purchase_date'].dt.month
    print(3)
    new_merchant_df['purchase_day'] = new_merchant_df['purchase_date'].dt.day
    new_merchant_df['purchase_hour'] = new_merchant_df['purchase_date'].dt.hour
    new_merchant_df['purchase_dayofweek'] = new_merchant_df['purchase_date'].dt.dayofweek
    new_merchant_df['purchase_weekofyear'] = new_merchant_df['purchase_date'].dt.weekofyear
    new_merchant_df['purchase_weekend'] = (new_merchant_df['purchase_date'].dt.weekday >=5).astype(int)
    print(4)
    new_merchant_df['purchase_date_total_day'] = pd.to_timedelta(new_merchant_df['purchase_year_month_day']).dt.total_seconds() / (60 * 60 * 24)
    print(5)
    new_merchant_df['month_diff_from_last_trade'] = ((datetime.datetime(2018, 4, 30, 23, 59, 59) - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff_from_last_trade'] += new_merchant_df['month_lag']
    new_merchant_df['month_diff_from_first_trade'] = ((datetime.datetime(2017, 1, 1, 0, 0, 0) - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff_from_first_trade'] += new_merchant_df['month_lag']
    new_merchant_df['month_diff_from_today'] = ((datetime.datetime.today() - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff_from_today'] += new_merchant_df['month_lag']
    print(6)
    

1
2
3
4
5
6


#### purchase total day

In [40]:
temp = history.groupby('card_id').agg({'purchase_date_total_day': [mode, 'sum', 'mean', 'std', 'max', 'min', cnt_std]})

In [41]:
temp.columns = ['hist_purchase_date_total_day_mode', 'hist_purchase_date_total_day_sum', 'hist_purchase_date_total_day_mean', 'hist_purchase_date_total_day_std', 'hist_purchase_date_total_day_max', 'hist_purchase_date_total_day_min' ,'hist_purchase_date_total_day_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_purchase_date_total_day_mode,hist_purchase_date_total_day_sum,hist_purchase_date_total_day_mean,hist_purchase_date_total_day_std,hist_purchase_date_total_day_max,hist_purchase_date_total_day_min,hist_purchase_date_total_day_cnt_std
0,C_ID_00007093c1,17273.0,2591980.0,17395.838926,104.216256,17589.0,17211.0,1.288961


In [42]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

275

#### purchase year month

In [86]:
temp = history.groupby('card_id').agg({'purchase_year_month': [mode, cnt_std]})

In [87]:
temp.columns = ['hist_purchase_year_month_mode', 'hist_purchase_year_month_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_purchase_year_month_mode,hist_purchase_year_month_cnt_std
0,C_ID_00007093c1,2017-06,4.701336


In [88]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

167

#### month diff

In [110]:
temp = history.groupby('card_id').agg({'month_diff_from_last_trade': [mode, 'sum', 'max', 'min', 'std', 'var', cnt_std], 'month_diff_from_first_trade': [mode, 'sum', 'max', 'min', 'std', 'var', cnt_std], 'month_diff_from_today':[mode, 'sum', 'max', 'min', 'std', 'var', cnt_std]})

In [113]:
temp

Unnamed: 0_level_0,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today
Unnamed: 0_level_1,mode,sum,max,min,std,var,cnt_std,mode,sum,max,min,std,var,cnt_std,mode,sum,max,min,std,var,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C_ID_00007093c1,2,318,3,2,0.342047,0.116996,77.074639,-14,-2099,-14,-15,0.283150,0.080174,86.974134,12,1776,12,11,0.273040,0.074551,88.388348
C_ID_0001238066,2,247,3,2,0.090167,0.008130,85.559921,-14,-1731,-14,-15,0.261482,0.068373,74.246212,12,1468,12,11,0.247606,0.061309,75.660426
C_ID_0001506ef0,2,139,3,2,0.310275,0.096270,36.769553,-14,-926,-14,-15,0.172733,0.029837,43.840620,12,791,12,11,0.123091,0.015152,45.254834
C_ID_0001793786,6,1322,7,6,0.326150,0.106374,115.965512,-10,-2174,-10,-11,0.246771,0.060896,132.936075,16,3442,16,15,0.246771,0.060896,132.936075
C_ID_000183fdda,2,288,3,1,0.236525,0.055944,76.210236,-14,-2038,-14,-15,0.361029,0.130342,70.710678,12,1706,12,11,0.361029,0.130342,70.710678
C_ID_00024e244b,2,158,3,2,0.440215,0.193789,24.041631,-14,-984,-14,-15,0.233791,0.054658,43.840620,12,838,13,11,0.293476,0.086128,35.232561
C_ID_0002709b5a,2,149,3,2,0.199886,0.039954,47.376154,-14,-1024,-14,-15,0.164368,0.027017,48.790368,12,874,12,11,0.164368,0.027017,48.790368
C_ID_00027503e2,3,126,3,3,0.000000,0.000000,0.000000,-13,-549,-13,-14,0.260661,0.067944,25.455844,13,543,13,12,0.260661,0.067944,25.455844
C_ID_000298032a,3,90,3,3,0.000000,0.000000,0.000000,-13,-393,-13,-14,0.305129,0.093103,16.970563,13,387,13,12,0.305129,0.093103,16.970563
C_ID_0002ba3c2e,6,428,7,6,0.320455,0.102692,38.183766,-10,-703,-10,-11,0.203997,0.041615,45.254834,16,1117,16,15,0.203997,0.041615,45.254834


In [117]:
col = ['month_diff_from_last_trade', 'month_diff_from_first_trade', 'month_diff_from_today']
final_col = []
for c in col:
    final_col += ['hist_' + c + '_' + v for v in ['mode', 'sum', 'max', 'min', 'std', 'var', 'cnt_std']]

final_col
temp.columns = final_col
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_diff_from_last_trade_mode,hist_month_diff_from_last_trade_sum,hist_month_diff_from_last_trade_max,hist_month_diff_from_last_trade_min,hist_month_diff_from_last_trade_std,hist_month_diff_from_last_trade_var,hist_month_diff_from_last_trade_cnt_std,hist_month_diff_from_first_trade_mode,hist_month_diff_from_first_trade_sum,hist_month_diff_from_first_trade_max,hist_month_diff_from_first_trade_min,hist_month_diff_from_first_trade_std,hist_month_diff_from_first_trade_var,hist_month_diff_from_first_trade_cnt_std,hist_month_diff_from_today_mode,hist_month_diff_from_today_sum,hist_month_diff_from_today_max,hist_month_diff_from_today_min,hist_month_diff_from_today_std,hist_month_diff_from_today_var,hist_month_diff_from_today_cnt_std
0,C_ID_00007093c1,2,318,3,2,0.342047,0.116996,77.074639,-14,-2099,-14,-15,0.28315,0.080174,86.974134,12,1776,12,11,0.27304,0.074551,88.388348


In [118]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

91

In [153]:
temp = history.groupby('card_id').agg({'month_diff_from_last_trade': ['skew']})

In [155]:
temp.columns = ['hist_month_diff_from_last_trade_skew']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_diff_from_last_trade_skew
0,C_ID_00007093c1,2.16782


In [156]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

20

#### purchase_date

In [222]:
def duration(data):
    return (data.max() - data.min()).days

In [223]:
temp = history.groupby(['card_id']).agg({'purchase_year_month_day': ['max', 'min', duration]})

In [224]:
temp.columns = ['hist_purchase_year_month_day_max', 'hist_purchase_year_month_day_min', 'hist_purchase_year_month_day_duration']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_purchase_year_month_day_max,hist_purchase_year_month_day_min,hist_purchase_year_month_day_duration
0,C_ID_00007093c1,2018-02-27,2017-02-14,378


In [225]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

7

### purchase_amount

In [19]:
# history['purchase_amount_new'] = np.round(history['purchase_amount'] / 0.00150265118 + 497.06,2)

In [20]:
# temp = history[history.authorized_flag == 'Y']
# temp = temp.groupby(['card_id']).agg({'purchase_amount_new':[mode, 'sum', 'mean', 'var', 'max', 'min', 'skew', 'std']})

In [21]:
# temp.columns = ['hist_purchase_amount_authorized_flag_Y_new_mode', 'hist_purchase_amount_authorized_flag_Y_new_sum', 'hist_purchase_amount_authorized_flag_Y_new_mean', 'hist_purchase_amount_authorized_flag_Y_new_var', 'hist_purchase_amount_authorized_flag_Y_new_max', 'hist_purchase_amount_authorized_flag_Y_new_min', 'hist_purchase_amount_authorized_flag_Y_new_skew', 'hist_purchase_amount_authorized_flag_Y_new_std']
# temp.reset_index(inplace=True)
# temp.head(1)

In [22]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [23]:
# del temp
# gc.collect()

In [24]:
# temp = history[history.authorized_flag == 'N']
# temp = temp.groupby(['card_id']).agg({'purchase_amount_new':[mode, 'sum', 'mean', 'var', 'max', 'min', 'skew', 'std']})

In [25]:
# temp.columns = ['hist_purchase_amount_authorized_flag_N_new_mode', 'hist_purchase_amount_authorized_flag_N_new_sum', 'hist_purchase_amount_authorized_flag_N_new_mean', 'hist_purchase_amount_authorized_flag_N_new_var', 'hist_purchase_amount_authorized_flag_N_new_max', 'hist_purchase_amount_authorized_flag_N_new_min', 'hist_purchase_amount_authorized_flag_N_new_skew', 'hist_purchase_amount_authorized_flag_N_new_std']
# temp.reset_index(inplace=True)
# temp.head(1)

In [26]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [27]:
# del temp
# gc.collect()

In [28]:
# history.purchase_amount_new.describe()

## logging

In [29]:
train = pd.read_csv('./data_feature_engineering/train_3-70479.csv')
test = pd.read_csv('./data_feature_engineering/test_3-70479.csv')

## Feature Engineering New History

In [114]:
new_history = pd.read_csv('./data/new_merchant_transactions.csv')

### 거래가 승인 된 city_id

In [149]:
temp = new_history[new_history.authorized_flag == 'Y']
# temp = temp[temp.category_1 == 'N']

In [150]:
temp = temp.groupby('card_id').agg({'city_id': [mode, 'nunique']})
temp.head(1)

Unnamed: 0_level_0,city_id,city_id
Unnamed: 0_level_1,mode,nunique
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2
C_ID_00007093c1,69,2


In [151]:
temp.columns = ['new_city_id_mode_authorized_flag_Y', 'new_city_id_nunique_authorized_flag_Y']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,new_city_id_mode_authorized_flag_Y,new_city_id_nunique_authorized_flag_Y
0,C_ID_00007093c1,69,2


In [152]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [153]:
del temp
gc.collect()

14

### 거래가 승인된 category_1

In [171]:
temp = new_history[new_history.authorized_flag == 'Y']

In [172]:
temp.category_1 = temp.category_1.map({'Y':1, 'N':0})

In [173]:
temp = temp.groupby('card_id').agg({'category_1':['mean', 'sum', 'size']})
temp.head(1)

Unnamed: 0_level_0,category_1,category_1,category_1
Unnamed: 0_level_1,mean,sum,size
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
C_ID_00007093c1,0.0,0,2


In [174]:
temp.columns = ['new_category_1_authorized_flag_Y_mean', 'new_category_1_authorized_flag_Y_sum', 'new_category_1_authorized_flag_Y_size']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,new_category_1_authorized_flag_Y_mean,new_category_1_authorized_flag_Y_sum,new_category_1_authorized_flag_Y_size
0,C_ID_00007093c1,0.0,0,2


In [176]:
temp.new_category_1_authorized_flag_Y_mean = np.round(temp.new_category_1_authorized_flag_Y_mean, 4)

In [177]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [178]:
del temp
gc.collect()

104

### 거래가 승인된 installments

In [201]:
def has_999(data):
    # 999가 없을 때
    if data[data == 999].__len__() == 0:
        return 0
    # 999가 있을 때
    else:
        return 1

In [202]:
def cnt_std(data):
    value = data.value_counts().std()
    if np.isnan(value):
        return 0
    else:
        return value

In [203]:
new_history.installments.replace(-1, np.nan, inplace=True)

In [204]:
temp = new_history[new_history.authorized_flag == 'Y']

In [205]:
temp = temp.groupby('card_id').agg({'installments':[mode, 'size', 'mean', 'max', 'var', 'min', null_cnt, null_rate, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,installments,installments,installments,installments,installments,installments,installments,installments,installments
Unnamed: 0_level_1,mode,size,mean,max,var,min,null_cnt,null_rate,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
C_ID_00007093c1,1.0,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [206]:
temp.columns = ['new_installments_authorized_flag_Y_mode', 'new_installments_authorized_flag_Y_size', 'new_installments_authorized_flag_Y_mean', 'new_installments_authorized_flag_Y_max', 'new_installments_authorized_flag_Y_var', 'new_installments_authorized_flag_Y_min', 'new_installments_authorized_flag_Y_null_cnt', 'new_installments_authorized_flag_Y_null_rate', 'new_installments_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,new_installments_authorized_flag_Y_mode,new_installments_authorized_flag_Y_size,new_installments_authorized_flag_Y_mean,new_installments_authorized_flag_Y_max,new_installments_authorized_flag_Y_var,new_installments_authorized_flag_Y_min,new_installments_authorized_flag_Y_null_cnt,new_installments_authorized_flag_Y_null_rate,new_installments_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,1.0,2,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [207]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [208]:
del temp
gc.collect()

57

## history + new_history

In [305]:
result = history.append(new_history)

In [306]:
result['purchase_date_year_month'] = result.purchase_date.str[:7]

In [307]:
temp = result.groupby(['card_id', 'purchase_date_year_month']).size()

In [308]:
date_list = ['2017-01','2017-02','2017-03','2017-04','2017-05','2017-06','2017-07','2017-08','2017-09','2017-10','2017-11','2017-12','2018-01','2018-02','2018-03','2018-04']

In [310]:
meta = result.groupby('card_id').size().to_frame().reset_index().drop([0], axis=1)

In [312]:
for date in date_list:
    try:
        meta = meta.merge(temp.loc[:, date].to_frame().reset_index().rename(columns={0:date}), on='card_id', how='left')
        print(date)
    except:
        meta[date] = 0
        print(date)

2017-01
2017-02
2017-03
2017-04
2017-05
2017-06
2017-07
2017-08
2017-09
2017-10
2017-11
2017-12
2018-01
2018-02
2018-03
2018-04


In [316]:
train = train.merge(meta, on='card_id', how='left')

In [317]:
test = test.merge(meta, on='card_id', how='left')

In [None]:
result[:100].groupby(['card_id', 'purchase_date_year_month']).size()

In [441]:
hist2 = pd.read_csv('./data/newFE_0213.csv')

In [443]:
hist2.drop(list(set(train.columns).intersection(set(hist2.columns))),axis=1, inplace=True)

In [444]:
hist2.head()

Unnamed: 0,new_card_id,new_fromRefDate_sum,new_fromRefDate_min,new_fromRefDate_max,new_fromRefDate_mean,new_fromRefDate_median,new_fromRefDate_var,new_fromRefDate_quantileRange,new_month_lag_median,new_month_lag_quantileRange,new_purchase_date_total_day_max,new_purchase_date_total_day_min,new_purchase_date_total_day_mean,new_purchase_date_total_day_var,new_purchase_date_total_day_skew,new_month_diff_from_trade_max,new_month_diff_from_trade_min,new_month_diff_from_trade_mean,new_month_diff_from_trade_var,new_month_diff_from_trade_skew,new_month_diff_from_today_max,new_month_diff_from_today_min,new_month_diff_from_today_mean,new_month_diff_from_today_var,new_month_diff_from_today_skew,new_city_id_count,new_merchant_id_mode,new_merchant_visit_sum,new_merchant_visit_mean,new_merchant_visit_min,new_merchant_visit_max,new_merchant_visit_nunique,new_merchant_visit_size,new_merchant_visit_mode,new_merchant_try_mean,new_merchant_try_std,new_merchant_try_min,new_merchant_try_max,new_merchant_try_nunique,new_merchant_try_size,new_merchant_try_mode,new_installments_median,new_installments_quantileRange,new_purchase_amount_median,new_purchase_amount_quantileRange,new_category_1_nunique,new_category_1_var,new_category_3_nunique,new_category_3_var,new_category_2_var,new_category_2_nunique,new_category_2_min_mean,new_category_2_max_mean,new_category_2_sum_mean,new_category_3_min_mean,new_category_3_max_mean,new_category_3_sum_mean,new_merchant_group_id_mode,new_merchant_group_id_nunique,new_numerical_1_sum,new_numerical_1_max,new_numerical_1_min,new_numerical_1_mean,new_numerical_1_median,new_numerical_1_var,new_numerical_1_quantileRange,new_numerical_2_sum,new_numerical_2_max,new_numerical_2_min,new_numerical_2_mean,new_numerical_2_median,new_numerical_2_var,new_numerical_2_quantileRange,new_most_recent_sales_range_mode,new_most_recent_sales_range_nunique,new_most_recent_sales_range_mean,new_most_recent_sales_range_var,new_most_recent_purchases_range_mode,new_most_recent_purchases_range_nunique,new_most_recent_purchases_range_mean,new_most_recent_purchases_range_var,new_category_4_mode,new_category_4_nunique,new_category_4_mean,new_category_4_var,new_category_5_mode,new_category_5_nunique,new_category_5_mean,new_category_5_var,new_avg_sales_lag3_sum,new_avg_sales_lag3_max,new_avg_sales_lag3_min,new_avg_sales_lag3_mean,new_avg_sales_lag3_median,new_avg_sales_lag3_var,new_avg_sales_lag3_quantileRange,new_avg_purchases_lag3_sum,new_avg_purchases_lag3_max,new_avg_purchases_lag3_min,new_avg_purchases_lag3_mean,new_avg_purchases_lag3_median,new_avg_purchases_lag3_var,new_avg_purchases_lag3_quantileRange,new_active_months_lag3_sum,new_active_months_lag3_max,new_active_months_lag3_min,new_active_months_lag3_mean,new_active_months_lag3_median,new_active_months_lag3_var,new_active_months_lag3_quantileRange,new_avg_sales_lag6_sum,new_avg_sales_lag6_max,new_avg_sales_lag6_min,new_avg_sales_lag6_mean,new_avg_sales_lag6_median,new_avg_sales_lag6_var,new_avg_sales_lag6_quantileRange,new_avg_purchases_lag6_sum,new_avg_purchases_lag6_max,new_avg_purchases_lag6_min,new_avg_purchases_lag6_mean,new_avg_purchases_lag6_median,new_avg_purchases_lag6_var,new_avg_purchases_lag6_quantileRange,new_active_months_lag6_sum,new_active_months_lag6_max,new_active_months_lag6_min,new_active_months_lag6_mean,new_active_months_lag6_median,new_active_months_lag6_var,new_active_months_lag6_quantileRange,new_avg_sales_lag12_sum,new_avg_sales_lag12_max,new_avg_sales_lag12_min,new_avg_sales_lag12_mean,new_avg_sales_lag12_median,new_avg_sales_lag12_var,new_avg_sales_lag12_quantileRange,new_avg_purchases_lag12_sum,new_avg_purchases_lag12_max,new_avg_purchases_lag12_min,new_avg_purchases_lag12_mean,new_avg_purchases_lag12_median,new_avg_purchases_lag12_var,new_avg_purchases_lag12_quantileRange,new_active_months_lag12_sum,new_active_months_lag12_max,new_active_months_lag12_min,new_active_months_lag12_mean,new_active_months_lag12_median,new_active_months_lag12_var,new_active_months_lag12_quantileRange,new_city_ym_rate_mode,new_city_ym_rate_sum,new_city_ym_rate_mean,new_city_ym_rate_median,new_city_ym_rate_var,new_city_ym_rate_quantileRange
0,C_ID_00007093c1,1383,457,463,461.0,463.0,12.0,3.0,2.0,0.0,17630.683322,17624.467766,17628.61147,12.87771,-1.732051,2,2,2.0,0.0,0.0,12,12,12.0,0.0,0.0,3,65434.0,5.0,1.666667,1.0,2.0,2,3,2.0,1.666667,0.57735,1.0,2.0,2,3,2.0,1.0,0.0,-0.656749,0.007513,1,0.0,1,0.0,1.333333,2,-0.746893,197.72913,-463553.943885,-0.746893,72.452641,-514857.076705,35.0,2,-0.162497,-0.047556,-0.057471,-0.054166,-0.057471,3.3e-05,0.004957,-0.172412,-0.057471,-0.057471,-0.057471,-0.057471,0.0,0.0,0.0,3,2.333333,4.333333,4.0,2,2.666667,5.333333,1.0,1,1.0,0.0,3.0,1,3.0,,8.76,6.93,0.9,2.92,0.93,12.0603,3.015,14.597919,12.705128,0.943503,4.865973,0.949288,46.089272,5.880813,9.0,3.0,3.0,3.0,3.0,0.0,0.0,10.35,8.42,0.86,3.45,1.07,18.5367,3.78,17.749582,15.855769,0.871214,5.916527,1.022599,74.097125,7.492278,18.0,6.0,6.0,6.0,6.0,0.0,0.0,10.41,8.57,0.76,3.47,1.08,19.5331,3.905,18.002082,16.21978,0.752641,6.000694,1.029661,78.341478,7.73357,31.0,12.0,7.0,10.333333,12.0,8.333333,2.5,17.154857,35.040763,11.680254,17.154857,89.91382,8.211904
1,C_ID_0001238066,12204,424,484,452.0,447.0,303.076923,27.5,1.0,1.0,17651.831597,17591.700313,17619.630376,300.805069,0.343982,3,2,2.037037,0.037037,5.196152,12,11,11.481481,0.259259,0.078558,27,65434.0,28.0,1.076923,1.0,2.0,2,27,1.0,1.076923,0.271746,1.0,2.0,2,27,1.0,1.0,,-0.656749,0.18276,2,0.071225,2,0.135385,1.826087,2,-0.746893,239.561017,-556424.521579,-0.746791,101.791849,-434311.838228,35.0,19,194.840699,129.322116,-0.057471,7.493873,-0.047556,777.261099,,192.728824,128.548754,-0.057471,7.412647,-0.057471,765.313376,,0.0,5,1.692308,1.661538,0.0,5,1.769231,2.024615,0.0,2,0.384615,0.246154,1.0,2,1.6,2.147368,31.9,6.93,0.81,1.226923,0.995,1.363734,,38.471731,12.705128,0.873514,1.479682,1.054336,5.24891,,78.0,3.0,3.0,3.0,3.0,0.0,,34.2,8.42,0.73,1.315385,1.03,2.137522,,42.426994,15.855769,0.738174,1.631807,1.093606,8.450152,,156.0,6.0,6.0,6.0,6.0,0.0,,34.36,8.57,0.53,1.321538,1.08,2.262246,,42.76086,16.21978,0.540334,1.644648,1.115656,8.917858,,304.0,12.0,7.0,11.692308,12.0,1.101538,,18.618442,139.382138,5.162301,1.237045,53.876511,4.837929
2,C_ID_0001506ef0,884,439,445,442.0,442.0,18.0,3.0,1.0,0.0,17612.385069,17606.931921,17609.658495,14.868412,,2,2,2.0,0.0,,12,11,11.5,0.5,,2,214550.0,1.0,1.0,1.0,1.0,1,2,1.0,1.0,,1.0,1.0,1,2,1.0,0.0,0.0,-0.723677,0.008325,1,0.0,1,0.0,0.0,1,-0.746893,66.872395,-159509.34352,-0.746893,14.279604,-590147.161107,33224.0,1,-0.047556,-0.047556,-0.047556,-0.047556,-0.047556,,,-0.047556,-0.047556,-0.047556,-0.047556,-0.047556,,,2.0,1,2.0,,2.0,1,2.0,,1.0,1,1.0,,3.0,1,3.0,,1.04,1.04,1.04,1.04,1.04,,,1.054117,1.054117,1.054117,1.054117,1.054117,,,3.0,3.0,3.0,3.0,3.0,,,1.03,1.03,1.03,1.03,1.03,,,1.040961,1.040961,1.040961,1.040961,1.040961,,,6.0,6.0,6.0,6.0,6.0,,,0.98,0.98,0.98,0.98,0.98,,,1.003116,1.003116,1.003116,1.003116,1.003116,,,12.0,12.0,12.0,12.0,12.0,,,2.276536,4.553072,2.276536,2.276536,0.0,0.0
3,C_ID_0001793786,10425,318,364,336.290323,330.0,243.87957,20.0,1.0,1.0,17531.733287,17485.655787,17503.784133,246.256992,0.878699,6,6,6.0,0.0,0.0,16,15,15.290323,0.212903,0.971526,31,25426.0,31.0,1.0,1.0,1.0,1,31,1.0,1.0,0.0,1.0,1.0,1,31,1.0,0.0,0.0,-0.372748,0.72708,1,0.0,1,0.0,0.975369,4,-0.746893,159.377048,-360877.788646,-0.746893,14.279604,-590147.161107,35.0,21,197.805434,117.830742,-0.057471,6.38082,-0.047556,516.586187,0.084277,192.550535,116.888826,-0.057471,6.211308,-0.057471,509.639169,0.029745,3.0,5,1.935484,1.795699,3.0,5,2.096774,2.156989,1.0,2,0.709677,0.212903,1.0,3,1.740741,0.584046,37.38,4.0,0.61,1.205806,1.03,0.371565,0.15,42.354339,8.917189,0.549491,1.366269,1.04226,2.037576,0.222188,93.0,3.0,3.0,3.0,3.0,0.0,0.0,182.08,147.69,0.76,5.873548,1.06,692.936544,0.205,538.583543,504.322881,0.663121,17.373663,1.065146,8167.593063,0.187624,186.0,6.0,6.0,6.0,6.0,0.0,0.0,201.36,166.68,0.5,6.495484,1.08,884.011119,0.26,589.45811,554.397813,0.438104,19.014778,1.109964,9873.1874,0.217493,366.0,12.0,9.0,11.806452,12.0,0.427957,0.0,18.291304,286.320263,9.236138,2.413043,78.984504,17.78788
4,C_ID_000183fdda,4875,425,484,443.181818,433.0,475.163636,26.0,1.0,0.5,17651.624919,17592.518356,17610.852464,473.346871,1.157099,2,2,2.0,0.0,0.0,12,11,11.727273,0.218182,-1.189373,11,129607.0,11.0,1.0,1.0,1.0,1,11,1.0,1.0,0.0,1.0,1.0,1,11,1.0,1.0,,-0.665765,0.11901,1,0.0,2,0.266667,0.0,1,-0.746893,66.872395,-159509.34352,-0.746628,148.734584,-305439.456665,35.0,9,2.282805,1.162063,-0.057471,0.207528,-0.037641,0.217492,0.183426,1.836634,1.112488,-0.057471,0.166967,-0.057471,0.215215,0.069404,2.0,4,2.545455,1.272727,3.0,5,2.272727,1.418182,1.0,1,1.0,0.0,3.0,1,3.0,0.0,11.65,1.3,0.78,1.059091,1.06,0.023069,0.175,11.160314,1.258632,0.75,1.014574,1.033639,0.01924,0.15544,33.0,3.0,3.0,3.0,3.0,0.0,0.0,11.78,1.41,0.71,1.070909,1.03,0.054489,0.315,11.196193,1.377511,0.688131,1.017836,1.019983,0.035975,0.190131,66.0,6.0,6.0,6.0,6.0,0.0,0.0,11.95,1.42,0.82,1.086364,0.98,0.046925,0.285,11.494231,1.40062,0.779672,1.04493,0.996132,0.026847,0.184057,132.0,12.0,12.0,12.0,12.0,0.0,0.0,0.697292,8.446538,0.767867,0.697292,0.051481,0.007236


In [446]:
train = train.merge(hist2, left_on='card_id', right_on='new_card_id', how='left')

In [447]:
test = test.merge(hist2, left_on='card_id', right_on='new_card_id', how='left')

## Feature Selection

In [448]:
# le = LabelEncoder()
# le.fit(train.append(test).hist_purchase_year_month_mode.values)
# for df in [train, test]:
#     df['first_active'] = pd.to_datetime(df['first_active'])
#     df['temp'] = 1 - df.hist_category_1_label_mean\
#     df['hist_category_1_authorized_flag_Y_0_cnt'] = df.hist_category_1_authorized_flag_Y_size - df.hist_category_1_authorized_flag_Y_sum
#     df.hist_installments_authorized_flag_Y_cnt_std = np.around(df.hist_installments_authorized_flag_Y_cnt_std, 4)
#     df['temp'] = df.hist_month_lag_max - df.hist_month_lag_min
#     df.hist_purchase_year_month_mode = le.transform(df.hist_purchase_year_month_mode)
#     df['has_new_merchant'] = 1
#     df.loc[df.new_card_id_size == 0, 'has_new_merchant'] = 0
#     df['hist_purchase_year_max_first_active_year'] = (df.hist_purchase_year_month_day_max - df.first_active).dt.days
#     df['hist_purchase_year_min_first_active_year'] = (df.hist_purchase_year_month_day_min - df.first_active).dt.days

In [449]:
# train.to_csv('./data_feature_engineering/train_3-70479.csv', index=False)
# test.to_csv('./data_feature_engineering/test_3-70479.csv', index=False)

In [450]:
    'new_installments_authorized_flag_Y_mode', 'new_installments_authorized_flag_Y_size', 'new_installments_authorized_flag_Y_mean', 'new_installments_authorized_flag_Y_max', 'new_installments_authorized_flag_Y_var', 'new_installments_authorized_flag_Y_min', 'new_installments_authorized_flag_Y_null_cnt', 'new_installments_authorized_flag_Y_null_rate', 'new_installments_authorized_flag_Y_cnt_std'  


('new_installments_authorized_flag_Y_mode',
 'new_installments_authorized_flag_Y_size',
 'new_installments_authorized_flag_Y_mean',
 'new_installments_authorized_flag_Y_max',
 'new_installments_authorized_flag_Y_var',
 'new_installments_authorized_flag_Y_min',
 'new_installments_authorized_flag_Y_null_cnt',
 'new_installments_authorized_flag_Y_null_rate',
 'new_installments_authorized_flag_Y_cnt_std')

In [539]:
trainable_feature = [
    'feature_1', 'feature_2', 'feature_3', 
    'first_active_month', 'first_active_year', 'first_active_elapsed_time_from_trade', 'first_active_total_day',
    'hist_card_id_size', 'new_card_id_size',
    'hist_authorized_flag_label_mean', 'hist_authorized_flag_label_sum',
    'hist_city_id_mode_authorized_flag_category_1_Y', 'hist_city_id_nunique_authorized_flag_category_1_Y',
    'hist_category_1_authorized_flag_Y_mean',
    'hist_installments_authorized_flag_Y_max', 'hist_installments_authorized_flag_Y_min', 'hist_installments_authorized_flag_Y_null_cnt', 'hist_installments_authorized_flag_Y_var', 'hist_installments_authorized_flag_Y_cnt_std',
    'hist_merchant_category_id_authorized_flag_Y_cnt_std',
    'hist_merchant_id_mode', 'hist_merchant_id_nunique2', 'hist_merchant_id_null_cnt', 'hist_merchant_id_authorized_flag_Y_cnt_std',
    'hist_month_lag_min', 'hist_month_lag_max', 'hist_month_lag_mode',
    'hist_subsector_id_mode', 'hist_subsector_id_nunique',
    'hist_purchase_year_nunique', 'hist_purchase_year_max',
    'hist_purchase_month_nunique', 'hist_purchase_month_mode',
    'hist_purchase_hour_skew', 'hist_purchase_hour_var',
    'hist_purchase_weekofyear_max', 'hist_purchase_weekofyear_nunique',
    'hist_purchase_amount_trim_var',
    'hist_purchase_date_total_day_max',
    'hist_month_diff_from_last_trade_mode',
    'hist_purchase_year_month_day_duration', 'hist_purchase_year_max_first_active_year',
    'new_city_id_mode_authorized_flag_Y', 'new_city_id_nunique_authorized_flag_Y',
    'new_category_1_authorized_flag_Y_mean',
    'new_installments_mode', 'new_installments_sum', 'new_installments_mean', 'new_installments_var', 'new_installments_max', 'new_installments_min', 'new_installments_null_cnt',
    'new_month_lag_mode', 'new_month_lag_sum', 'new_month_lag_mean', 'new_month_lag_var', 'new_month_lag_max', 'new_month_lag_min', 'new_month_lag_skew',
    'new_purchase_amount_trim_sum', 'new_purchase_amount_trim_mean', 'new_purchase_amount_trim_var', 'new_purchase_amount_trim_max', 'new_purchase_amount_trim_min', 'new_purchase_amount_trim_skew',
    'hist_month_lag_median'
] + get_col('new_fromRefDate_sum', 'new_fromRefDate_quantileRange') + get_col('new_purchase_date_total_day_max', 'new_purchase_date_total_day_skew') + get_col('new_avg_sales_lag3_sum', 'new_city_ym_rate_quantileRange')

In [534]:
def get_col(start, end):
    result = []
    logit = 999
    for i, v in enumerate(train.columns.tolist()):
        if v == key:
            result.append(v)
            logit = i
        
        if logit < i:
            result.append(v)
        
        if v == end:
            break
    return result

In [535]:
train.head(3)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,first_active,first_active_year,first_active_elapsed_time_from_trade,first_active_total_day,hist_authorized_flag_label_mode,hist_authorized_flag_label_sum,hist_authorized_flag_label_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_label_mode,hist_category_1_label_sum,hist_category_1_label_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_label_mode,hist_category_3_label_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_date_max,hist_purchase_date_min,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_authorized_flag_mode,new_authorized_flag_sum,new_authorized_flag_mean,new_card_id_size,new_city_id_mode,new_city_id_nunique,new_category_1_mode,new_category_1_sum,new_category_1_mean,new_installments_mode,new_installments_sum,new_installments_mean,new_installments_var,new_installments_max,new_installments_min,new_installments_null_cnt,new_category_3_mode,new_category_3_mean,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_nunique,new_month_lag_mode,new_month_lag_sum,new_month_lag_mean,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_skew,new_purchase_date_max,new_purchase_date_min,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_skew,new_purchase_amount_over_550,new_category_2_mode,new_category_2_mean,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,...,hist_avg_sales_lag6_quantileRange,hist_avg_purchases_lag6_sum,hist_avg_purchases_lag6_max,hist_avg_purchases_lag6_min,hist_avg_purchases_lag6_mean,hist_avg_purchases_lag6_median,hist_avg_purchases_lag6_var,hist_avg_purchases_lag6_quantileRange,hist_active_months_lag6_sum,hist_active_months_lag6_max,hist_active_months_lag6_min,hist_active_months_lag6_mean,hist_active_months_lag6_median,hist_active_months_lag6_var,hist_active_months_lag6_quantileRange,hist_avg_sales_lag12_sum,hist_avg_sales_lag12_max,hist_avg_sales_lag12_min,hist_avg_sales_lag12_mean,hist_avg_sales_lag12_median,hist_avg_sales_lag12_var,hist_avg_sales_lag12_quantileRange,hist_avg_purchases_lag12_sum,hist_avg_purchases_lag12_max,hist_avg_purchases_lag12_min,hist_avg_purchases_lag12_mean,hist_avg_purchases_lag12_median,hist_avg_purchases_lag12_var,hist_avg_purchases_lag12_quantileRange,hist_active_months_lag12_sum,hist_active_months_lag12_max,hist_active_months_lag12_min,hist_active_months_lag12_mean,hist_active_months_lag12_median,hist_active_months_lag12_var,hist_active_months_lag12_quantileRange,hist_city_ym_rate_mode,hist_city_ym_rate_sum,hist_city_ym_rate_mean,hist_city_ym_rate_median,hist_city_ym_rate_var,hist_city_ym_rate_quantileRange,new_card_id,new_fromRefDate_sum,new_fromRefDate_min,new_fromRefDate_max,new_fromRefDate_mean,new_fromRefDate_median,new_fromRefDate_var,new_fromRefDate_quantileRange,new_month_lag_median,new_month_lag_quantileRange,new_purchase_date_total_day_max,new_purchase_date_total_day_min,new_purchase_date_total_day_mean,new_purchase_date_total_day_var,new_purchase_date_total_day_skew,new_month_diff_from_trade_max,new_month_diff_from_trade_min,new_month_diff_from_trade_mean,new_month_diff_from_trade_var,new_month_diff_from_trade_skew,new_month_diff_from_today_max,new_month_diff_from_today_min,new_month_diff_from_today_mean,new_month_diff_from_today_var,new_month_diff_from_today_skew,new_city_id_count,new_merchant_id_mode,new_merchant_visit_sum,new_merchant_visit_mean,new_merchant_visit_min,new_merchant_visit_max,new_merchant_visit_nunique,new_merchant_visit_size,new_merchant_visit_mode,new_merchant_try_mean,new_merchant_try_std,new_merchant_try_min,new_merchant_try_max,new_merchant_try_nunique,new_merchant_try_size,new_merchant_try_mode,new_installments_median,new_installments_quantileRange,new_purchase_amount_median,new_purchase_amount_quantileRange,new_category_1_nunique,new_category_1_var,new_category_3_nunique,new_category_3_var,new_category_2_var,new_category_2_nunique,new_category_2_min_mean,new_category_2_max_mean,new_category_2_sum_mean,new_category_3_min_mean,new_category_3_max_mean,new_category_3_sum_mean,new_merchant_group_id_mode,new_merchant_group_id_nunique,new_numerical_1_sum,new_numerical_1_max,new_numerical_1_min,new_numerical_1_mean,new_numerical_1_median,new_numerical_1_var,new_numerical_1_quantileRange,new_numerical_2_sum,new_numerical_2_max,new_numerical_2_min,new_numerical_2_mean,new_numerical_2_median,new_numerical_2_var,new_numerical_2_quantileRange,new_most_recent_sales_range_mode,new_most_recent_sales_range_nunique,new_most_recent_sales_range_mean,new_most_recent_sales_range_var,new_most_recent_purchases_range_mode,new_most_recent_purchases_range_nunique,new_most_recent_purchases_range_mean,new_most_recent_purchases_range_var,new_category_4_mode,new_category_4_nunique,new_category_4_mean,new_category_4_var,new_category_5_mode,new_category_5_nunique,new_category_5_mean,new_category_5_var,new_avg_sales_lag3_sum,new_avg_sales_lag3_max,new_avg_sales_lag3_min,new_avg_sales_lag3_mean,new_avg_sales_lag3_median,new_avg_sales_lag3_var,new_avg_sales_lag3_quantileRange,new_avg_purchases_lag3_sum,new_avg_purchases_lag3_max,new_avg_purchases_lag3_min,new_avg_purchases_lag3_mean,new_avg_purchases_lag3_median,new_avg_purchases_lag3_var,new_avg_purchases_lag3_quantileRange,new_active_months_lag3_sum,new_active_months_lag3_max,new_active_months_lag3_min,new_active_months_lag3_mean,new_active_months_lag3_median,new_active_months_lag3_var,new_active_months_lag3_quantileRange,new_avg_sales_lag6_sum,new_avg_sales_lag6_max,new_avg_sales_lag6_min,new_avg_sales_lag6_mean,new_avg_sales_lag6_median,new_avg_sales_lag6_var,new_avg_sales_lag6_quantileRange,new_avg_purchases_lag6_sum,new_avg_purchases_lag6_max,new_avg_purchases_lag6_min,new_avg_purchases_lag6_mean,new_avg_purchases_lag6_median,new_avg_purchases_lag6_var,new_avg_purchases_lag6_quantileRange,new_active_months_lag6_sum,new_active_months_lag6_max,new_active_months_lag6_min,new_active_months_lag6_mean,new_active_months_lag6_median,new_active_months_lag6_var,new_active_months_lag6_quantileRange,new_avg_sales_lag12_sum,new_avg_sales_lag12_max,new_avg_sales_lag12_min,new_avg_sales_lag12_mean,new_avg_sales_lag12_median,new_avg_sales_lag12_var,new_avg_sales_lag12_quantileRange,new_avg_purchases_lag12_sum,new_avg_purchases_lag12_max,new_avg_purchases_lag12_min,new_avg_purchases_lag12_mean,new_avg_purchases_lag12_median,new_avg_purchases_lag12_var,new_avg_purchases_lag12_quantileRange,new_active_months_lag12_sum,new_active_months_lag12_max,new_active_months_lag12_min,new_active_months_lag12_mean,new_active_months_lag12_median,new_active_months_lag12_var,new_active_months_lag12_quantileRange,new_city_ym_rate_mode,new_city_ym_rate_sum,new_city_ym_rate_mean,new_city_ym_rate_median,new_city_ym_rate_var,new_city_ym_rate_quantileRange
0,C_ID_92a2005557,5,2,1,6,0.0,-0.820283,2017-06-01,2017,333,17318.0,1,247.0,0.95,260,69,7,0,0.0,0.0,0.0,4.0,0.01538,0.015205,1.0,0.0,0.0,0,0.01538,560,41,94,-2,-1017.0,-3.912,5.75,0,-8,0.066,1e-09,1e-09,-165.96873,-0.638341,0.045003,2.258395,-0.7393,10.24,0.0,1.0,1.046,9,3,34,21,-167.4,-0.644,0.02057,0.8,-0.7393,5.133,2017,2,2017.0,2018,2017,12,9,8.055,12,1,11,31,15.51,76.9,31,1,0.10236,14,23,13.31,24.69,23,0,-0.887,5,7,3.21,6,0,50,35,33.06,52,1,0,90.0,0.3462,0.0,0.0,inf,-inf,0.0,3,2,2.072,0.068,3.299,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-1.325042,-2.201,4.516789,0.216691,7.734,-0.311,-0.3696,1.129197,0.011823,9.35,0.0725,-0.747,5942464.5,1309718.6,0.3467,-0.747,5920398.5,5429670.5,242,0.9307,346,589,1.0,23.0,1.0,23.0,69.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278.0,14.0,23.0,1.0,34.0,1.479,0.261,2.0,1.0,0.09326,1e-09,1e-09,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.895996,0.0,1.0,1.0,9.0,1.0,37.0,10.0,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.896,2018.0,1.0,2018.0,2018.0,2018.0,3.0,2.0,3.479,4.0,3.0,6.0,17.0,16.44,88.8,31.0,...,0.0,5458.81153,504.322881,0.320988,21.076492,0.980583,9451.139039,0.0,1554.0,6.0,6.0,6.0,6.0,0.0,0.0,2146.41,194.61,0.53,8.287297,0.97,1157.068144,0.0,6016.9902,554.397813,0.252963,23.231622,0.981699,11461.658777,0.0,3081.0,12.0,7.0,11.895753,12.0,0.48909,0.0,22.105127,5151.261826,19.438724,20.749006,20.015077,3.268274,C_ID_92a2005557,10448.0,428.0,483.0,454.26087,454.0,275.29249,25.5,1.0,1.0,17650.474363,17595.586528,17621.821734,274.954866,-0.035528,2.0,2.0,2.0,0.0,0.0,12.0,11.0,11.608696,0.249012,-0.477134,23.0,13708.0,23.0,1.0,1.0,1.0,1.0,23.0,1.0,1.0,0.0,1.0,1.0,1.0,23.0,1.0,0.0,0.0,-0.58118,0.195991,1.0,0.0,1.0,0.0,0.0,1.0,-0.746893,263.157498,-615576.244067,-0.746893,14.279604,-590147.161107,35.0,17.0,633.757544,183.735111,-0.057471,27.554676,0.002019,3156.826505,7.718753,626.083408,182.079322,-0.057471,27.221018,-0.037641,3083.435646,7.485753,0.0,5.0,1.826087,2.059289,0.0,5.0,1.913043,2.173913,1.0,2.0,0.826087,0.150198,1.0,1.0,1.0,0.0,25.31,2.99,0.68,1.100435,1.01,0.180577,0.095,25.009007,2.222222,0.812757,1.087348,1.01664,0.069379,0.133171,69.0,3.0,3.0,3.0,3.0,0.0,0.0,25.02,3.11,0.65,1.087826,1.03,0.2156,0.16,24.618138,2.333333,0.777778,1.070354,1.007515,0.092094,0.174524,137.0,6.0,5.0,5.956522,6.0,0.043478,0.0,28.15,6.48,0.63,1.223913,0.99,1.344107,0.215,26.103974,3.972222,0.676331,1.134955,0.999061,0.410256,0.231759,267.0,12.0,5.0,11.608696,12.0,2.249012,0.0,17.154857,348.327812,15.144687,17.154857,35.070408,1.463585
1,C_ID_3d0044924f,4,1,0,1,0.0,0.392913,2017-01-01,2017,484,17167.0,1,339.0,0.9688,350,69,9,0,31.0,0.08856,1.0,545.0,1.566,2.258,10.0,1.0,2.0,1,1.2,307,57,142,0,-1761.0,-5.03,14.48,0,-12,-0.258,1e-09,1e-09,-210.00633,-0.600018,0.1482,4.6303,-0.742,8.81,0.0,1.0,1.0,9,3,34,24,-215.4,-0.615,0.0586,0.8,-0.742,3.744,2017,2,2017.0,2018,2017,1,12,6.22,12,1,19,31,16.67,77.4,31,1,-0.2357,12,24,14.72,31.16,23,0,-0.8936,5,7,3.363,6,0,3,50,25.22,52,1,0,132.0,0.3772,-200.113283,-0.575038,2.31515,-0.7424,0.082442,4,2,3.092,0.1005,2.0,10.65,8.734,12.555,13.79,6.57,9.836,0.0,-1.853766,-2.951,13.890898,1.405803,8.25,-0.196,-0.3667,1.543433,0.016649,8.555,0.0746,-0.747,6010604.0,1329550.4,-0.2952,-0.747,156963.11,-3653208.0,390,1.114,370,761,1.0,6.0,1.0,6.0,69.0,1.0,0.0,0.0,0.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,307.0,5.0,6.0,1.0,9.0,1.5,0.3,2.0,1.0,0.0,1e-09,1e-09,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.255859,0.0,1.0,1.0,9.0,1.0,19.0,4.0,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.256,2018.0,1.0,2018.0,2018.0,2018.0,2.0,2.0,2.5,3.0,2.0,5.0,4.0,13.5,131.5,30.0,...,0.13,1540.508953,504.322881,0.307888,4.220572,1.081449,1387.226497,0.12301,2190.0,6.0,6.0,6.0,6.0,0.0,0.0,811.35,166.68,0.27,2.222877,1.07,152.084819,0.19,1673.471042,554.397813,0.209035,4.584852,1.110395,1677.400351,0.190098,4329.0,12.0,7.0,11.860274,12.0,0.631522,0.0,21.186055,6053.777617,16.585692,18.005138,37.725431,4.83495,C_ID_3d0044924f,2535.0,396.0,453.0,422.5,421.5,697.1,43.0,1.5,1.0,17620.283634,17563.713819,17589.9845,690.446756,0.0681,3.0,3.0,3.0,0.0,0.0,13.0,12.0,12.5,0.3,0.0,6.0,157429.0,6.0,1.0,1.0,1.0,1.0,6.0,1.0,1.0,0.0,1.0,1.0,1.0,6.0,1.0,1.0,0.0,-0.732633,0.014226,1.0,0.0,1.0,0.0,0.0,1.0,-0.746893,263.157498,-615576.244067,-0.746893,72.452641,-514857.076705,35.0,6.0,3.274116,3.392916,-0.057471,0.545686,-0.032683,1.94713,0.076841,3.224542,3.373086,-0.057471,0.537424,-0.032683,1.930599,0.054532,3.0,3.0,1.833333,1.766667,2.0,3.0,1.333333,0.666667,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,6.08,1.16,0.94,1.013333,0.98,0.006507,0.065,5.759112,1.051867,0.739749,0.959852,0.990524,0.012267,0.018838,18.0,3.0,3.0,3.0,3.0,0.0,0.0,6.33,1.34,0.81,1.055,1.0,0.04167,0.265,5.757135,1.325019,0.694037,0.959522,0.951439,0.043664,0.111878,36.0,6.0,6.0,6.0,6.0,0.0,0.0,6.1,1.34,0.84,1.016667,0.935,0.044347,0.3,5.562803,1.330626,0.643866,0.927134,0.884415,0.05298,0.148201,72.0,12.0,12.0,12.0,12.0,0.0,0.0,16.58189,105.600997,17.600166,17.600166,1.244263,2.036552
2,C_ID_d639edf6cd,2,2,0,8,0.0,0.688056,2016-08-01,2016,637,17014.0,1,41.0,0.9536,43,143,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,705,8,13,-10,-370.0,-8.6,14.766,0,-13,0.7256,1e-09,1e-09,-29.16739,-0.678311,0.007635,-0.145847,-0.73,5.625,0.0,5.0,4.63,5,2,33,7,-29.17,-0.678,0.007637,-0.1459,-0.73,5.62,2017,2,2017.0,2018,2017,1,10,4.56,12,1,21,19,19.33,62.3,30,2,-0.7563,19,14,17.9,12.375,23,8,-0.8867,4,7,3.303,6,0,4,22,18.38,49,2,0,11.0,0.2559,-inf,-inf,-inf,-inf,0.0,3,2,2.07,0.06647,3.5,9.67,17.0,8.91,7.51,17.66,8.91,1.721,-1.406243,-2.182,-0.291695,0.070888,0.3125,-0.3308,-0.365,-0.072924,0.002562,3.719,-0.0878,-0.747,654527.7,-229313.4,0.3584,-0.747,6010604.0,5588325.0,412,9.58,344,756,1.0,1.0,1.0,1.0,143.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,528.0,1.0,1.0,2.0,2.0,2.0,0.0,2.0,2.0,0.0,1e-09,1e-09,-0.7,-0.7,0.0,-0.7,-0.7,0.0,0.0,5.0,5.0,5.0,1.0,25.0,1.0,-0.7,-0.7,0.0,-0.7,-0.7,0.0,2018.0,1.0,2018.0,2018.0,2018.0,4.0,1.0,4.0,4.0,4.0,28.0,1.0,28.0,0.0,28.0,...,0.0,60.6816,15.855769,0.871214,1.379127,1.033525,4.991645,0.0,264.0,6.0,6.0,6.0,6.0,0.0,0.0,52.61,8.57,0.76,1.195682,1.02,1.299769,0.0,61.731488,16.21978,0.752641,1.402988,1.052752,5.233572,0.0,523.0,12.0,7.0,11.886364,12.0,0.568182,0.0,2.151798,120.601534,2.740944,2.151798,8.506266,0.266678,C_ID_d639edf6cd,482.0,482.0,482.0,482.0,482.0,,0.0,2.0,0.0,17649.738322,17649.738322,17649.738322,,,2.0,2.0,2.0,,,11.0,11.0,11.0,,,1.0,220897.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,0.0,0.0,-0.700326,0.0,1.0,,1.0,,,1.0,-0.746893,74.385651,-142362.464162,-0.746893,14.279604,-590147.161107,3124.0,1.0,-0.027726,-0.027726,-0.027726,-0.027726,-0.027726,,0.0,-0.037641,-0.037641,-0.037641,-0.037641,-0.037641,,0.0,2.0,1.0,2.0,,2.0,1.0,2.0,,1.0,1.0,1.0,,5.0,1.0,5.0,,1.33,1.33,1.33,1.33,1.33,,0.0,1.297775,1.297775,1.297775,1.297775,1.297775,,0.0,3.0,3.0,3.0,3.0,3.0,,0.0,1.27,1.27,1.27,1.27,1.27,,0.0,1.291121,1.291121,1.291121,1.291121,1.291121,,0.0,6.0,6.0,6.0,6.0,6.0,,0.0,1.08,1.08,1.08,1.08,1.08,,0.0,1.099875,1.099875,1.099875,1.099875,1.099875,,0.0,10.0,10.0,10.0,10.0,10.0,,0.0,2.755332,2.755332,2.755332,2.755332,,0.0


In [540]:
    'new_city_id_mode', 'new_city_id_nunique',
    'new_category_1_mode', 'new_category_1_sum', 'new_category_1_mean',
    'new_installments_mode', 'new_installments_sum', 'new_installments_mean', 'new_installments_var', 'new_installments_max', 'new_installments_min', 'new_installments_null_cnt',
    'new_month_lag_mode', 'new_month_lag_sum', 'new_month_lag_mean', 'new_month_lag_var', 'new_month_lag_max', 'new_month_lag_min', 'new_month_lag_skew',
    'new_purchase_amount_trim_sum', 'new_purchase_amount_trim_mean', 'new_purchase_amount_trim_var', 'new_purchase_amount_trim_max', 'new_purchase_amount_trim_min', 'new_purchase_amount_trim_skew',
    'new_month_diff_max', 'new_month_diff_min', 'new_month_diff_mean', 'new_month_diff_var', 'new_month_diff_skew',
    'card_id_total_size', 'card_id_size_ratio', 'purchase_amount_total', 'purchase_amount_mean', 'purchase_amount_max', 'purchase_amount_min', 'purchase_amount_ratio',
    'month_diff_mean', 'month_diff_ratio', 'month_lag_mean', 'month_lag_max', 'month_lag_min',
    'category_1_mean', 'installments_total', 'installments_mean', 'installments_max', 'installments_ratio',
    'price_total', 'price_mean', 'price_max', 'duration_mean', 'duration_min', 'duration_max',
    'amount_month_ratio_mean', 'amount_month_ratio_min', 'amount_month_ratio_max',
    'new_CLV', 'hist_CLV', 'CLV_ratio'

('new_CLV', 'hist_CLV', 'CLV_ratio')

In [541]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 1,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 4590}

In [542]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

# train_columns = [f for f in train.columns if f not in FEATS_EXCLUDED]
train_columns = trainable_feature

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=train.iloc[trn_idx]['target'])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=train.iloc[val_idx]['target'])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
print("CV score: {:<8.5f}".format(mean_squared_error(train.target.values, oof_lgb)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.65943	valid_1's rmse: 3.71304
[200]	training's rmse: 3.57708	valid_1's rmse: 3.68496
[300]	training's rmse: 3.52599	valid_1's rmse: 3.67374


KeyboardInterrupt: 

score : 3.70479

In [479]:
sub = pd.read_csv('./data/sample_submission.csv')

In [480]:
sub.target = predictions_lgb

In [481]:
sub.to_csv('./data/sub_3-65066.csv', index=False)