# 사전 작업

## 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc
from tqdm import tqdm_notebook

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
pd.set_option('display.max_columns', 400)

# 통합 데이터

In [6]:
def drop_features(data, keyword, debug=False):
    if debug:
        data = data[:1000]
        
    logits = []
    for col in data.columns:
        if keyword in col:
            logits.append(col)
        else:
            pass
    
    return logits

## 데이터 로드

In [7]:
path = './data/'

In [28]:
# train = pd.read_csv(path + 'train_v4.csv')
# test = pd.read_csv(path + 'test_v4.csv')

In [29]:
# train.new_card_id_size.fillna(0, inplace=True)
# test.new_card_id_size.fillna(0, inplace=True)

In [10]:
raw_history = pd.read_csv(path + 'historical_transactions.csv')

In [30]:
history = raw_history.copy()

## Feature Engineering

In [12]:
def null_cnt(x):
    return np.sum(x.isna() * 1)

In [13]:
def null_rate(x):
    return np.sum(x.isna() * 1) / len(x)

In [14]:
from scipy import stats
def mode(x):
    return stats.mode(x)[0][0]

In [15]:
def most_value_cnt(x):
    return x.value_counts().values[0]

In [16]:
history.head(3)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37


### 거래가 승인 되고, 오프라인 거래의 city_id

In [17]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'N']

In [18]:
temp = temp.groupby('card_id').agg({'city_id': [mode, 'nunique']})
temp.head(1)

Unnamed: 0_level_0,city_id,city_id
Unnamed: 0_level_1,mode,nunique
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2
C_ID_00007093c1,244,3


In [19]:
temp.columns = ['hist_city_id_mode_authorized_flag_category_1_Y', 'hist_city_id_nunique_authorized_flag_category_1_Y']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_city_id_mode_authorized_flag_category_1_Y,hist_city_id_nunique_authorized_flag_category_1_Y
0,C_ID_00007093c1,244,3


In [20]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [21]:
del temp
gc.collect()

28

### 거래가 승인된 category_1

In [22]:
temp = history[history.authorized_flag == 'Y']

In [23]:
temp.category_1 = temp.category_1.map({'Y':1, 'N':0})

In [24]:
temp = temp.groupby('card_id').agg({'category_1':['mean', 'sum', 'size']})
temp.head(1)

Unnamed: 0_level_0,category_1,category_1,category_1
Unnamed: 0_level_1,mean,sum,size
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
C_ID_00007093c1,0.210526,24,114


In [25]:
temp.columns = ['hist_category_1_authorized_flag_Y_mean', 'hist_category_1_authorized_flag_Y_sum', 'hist_category_1_authorized_flag_Y_size']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_category_1_authorized_flag_Y_mean,hist_category_1_authorized_flag_Y_sum,hist_category_1_authorized_flag_Y_size
0,C_ID_00007093c1,0.210526,24,114


In [26]:
temp.hist_category_1_authorized_flag_Y_mean = np.round(temp.hist_category_1_authorized_flag_Y_mean, 4)

In [27]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [28]:
del temp
gc.collect()

35

### 거래가 승인된 installments

In [36]:
def has_999(data):
    # 999가 없을 때
    if data[data == 999].__len__() == 0:
        return 0
    # 999가 있을 때
    else:
        return 1

In [37]:
def cnt_std(data):
    value = data.value_counts().std()
    if np.isnan(value):
        return 0
    else:
        return value

In [31]:
history.installments.replace(-1, np.nan, inplace=True)

In [32]:
temp = history[history.authorized_flag == 'Y']

In [33]:
temp = temp.groupby('card_id').agg({'installments':[mode, 'size', 'mean', 'max', 'var', 'min', null_cnt, null_rate, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,installments,installments,installments,installments,installments,installments,installments,installments,installments
Unnamed: 0_level_1,mode,size,mean,max,var,min,null_cnt,null_rate,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
C_ID_00007093c1,1.0,114,1.289474,6.0,0.632278,1.0,0.0,0.0,41.069453


In [34]:
temp.columns = ['hist_installments_authorized_flag_Y_mode', 'hist_installments_authorized_flag_Y_size', 'hist_installments_authorized_flag_Y_mean', 'hist_installments_authorized_flag_Y_max', 'hist_installments_authorized_flag_Y_var', 'hist_installments_authorized_flag_Y_min', 'hist_installments_authorized_flag_Y_null_cnt', 'hist_installments_authorized_flag_Y_null_rate', 'hist_installments_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_installments_authorized_flag_Y_mode,hist_installments_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mean,hist_installments_authorized_flag_Y_max,hist_installments_authorized_flag_Y_var,hist_installments_authorized_flag_Y_min,hist_installments_authorized_flag_Y_null_cnt,hist_installments_authorized_flag_Y_null_rate,hist_installments_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,1.0,114,1.289474,6.0,0.632278,1.0,0.0,0.0,41.069453


In [35]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [36]:
del temp
gc.collect()

42

### category_3

In [37]:
history.category_3 = history.category_3.fillna('D')

### 승인된 거래 중 merchant_category_id

In [38]:
temp = history[history.authorized_flag == 'Y']

In [39]:
temp = temp.groupby('card_id').agg({'merchant_category_id':[mode, 'nunique', null_cnt, cnt_std]})

In [40]:
temp.columns = ['hist_merchant_category_id_authorized_flag_Y_mode', 'hist_merchant_category_id_authorized_flag_Y_nunique', 'hist_merchant_category_id_authorized_flag_Y_null_cnt', 'hist_merchant_category_id_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_category_id_authorized_flag_Y_mode,hist_merchant_category_id_authorized_flag_Y_nunique,hist_merchant_category_id_authorized_flag_Y_null_cnt,hist_merchant_category_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,307,18,0,10.278476


In [41]:
temp.hist_merchant_category_id_authorized_flag_Y_cnt_std = np.round(temp.hist_merchant_category_id_authorized_flag_Y_cnt_std, 3)

In [42]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [43]:
del temp
gc.collect()

28

### merchant_id

In [44]:
le = LabelEncoder()
le.fit(history.merchant_id.fillna('NULL').values)
history.merchant_id = le.transform(history.merchant_id.fillna('NULL'))
history.loc[history.merchant_id == 326311, 'merchant_id'] = np.nan

In [45]:
temp = history[history.authorized_flag == 'Y']

In [46]:
temp = temp.groupby('card_id').agg({'merchant_id': [cnt_std]})
temp.head(1)

Unnamed: 0_level_0,merchant_id
Unnamed: 0_level_1,cnt_std
card_id,Unnamed: 1_level_2
C_ID_00007093c1,6.943841


In [47]:
temp.columns = ['hist_merchant_id_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,6.943841


In [48]:
temp.hist_merchant_id_authorized_flag_Y_cnt_std = np.round(temp.hist_merchant_id_authorized_flag_Y_cnt_std, 3)

In [49]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [50]:
del temp
gc.collect()

42

In [51]:
temp = history.groupby('card_id').agg({'merchant_id': [mode, 'nunique', null_cnt, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,merchant_id,merchant_id,merchant_id,merchant_id
Unnamed: 0_level_1,mode,nunique,null_cnt,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,188773.0,29,0.0,8.794171


In [52]:
temp.columns = ['hist_merchant_id_mode', 'hist_merchant_id_nunique2', 'hist_merchant_id_null_cnt', 'hist_merchant_id_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_id_mode,hist_merchant_id_nunique2,hist_merchant_id_null_cnt,hist_merchant_id_cnt_std
0,C_ID_00007093c1,188773.0,29,0.0,8.794171


In [53]:
temp.hist_merchant_id_cnt_std = np.round(temp.hist_merchant_id_cnt_std, 3)

In [54]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [55]:
del temp
gc.collect()

28

### month_lag

In [56]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'Y']
temp = history.groupby('card_id').agg({'month_lag': [mode, 'min', 'max', cnt_std]})
temp.head(1)

Unnamed: 0_level_0,month_lag,month_lag,month_lag,month_lag
Unnamed: 0_level_1,mode,min,max,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,-8,-12,0,4.701336


In [57]:
temp.columns = ['hist_month_lag_authorized_flag_Y_category_1_Y_mode', 'hist_month_lag_authorized_flag_Y_category_1_Y_min', 'hist_month_lag_authorized_flag_Y_category_1_Y_max', 'hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_lag_authorized_flag_Y_category_1_Y_mode,hist_month_lag_authorized_flag_Y_category_1_Y_min,hist_month_lag_authorized_flag_Y_category_1_Y_max,hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std
0,C_ID_00007093c1,-8,-12,0,4.701336


In [58]:
temp.hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std = np.round(temp.hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std, 3)

In [59]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [60]:
del temp
gc.collect()

28

In [61]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'N']
temp = history.groupby('card_id').agg({'month_lag': [mode, 'min', 'max', cnt_std]})
temp.head(1)

Unnamed: 0_level_0,month_lag,month_lag,month_lag,month_lag
Unnamed: 0_level_1,mode,min,max,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,-8,-12,0,4.701336


In [62]:
temp.columns = ['hist_month_lag_authorized_flag_Y_category_1_N_mode', 'hist_month_lag_authorized_flag_Y_category_1_N_min', 'hist_month_lag_authorized_flag_Y_category_1_N_max', 'hist_month_lag_authorized_flag_Y_category_1_N_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_lag_authorized_flag_Y_category_1_N_mode,hist_month_lag_authorized_flag_Y_category_1_N_min,hist_month_lag_authorized_flag_Y_category_1_N_max,hist_month_lag_authorized_flag_Y_category_1_N_cnt_std
0,C_ID_00007093c1,-8,-12,0,4.701336


In [63]:
temp.hist_month_lag_authorized_flag_Y_category_1_N_cnt_std = np.round(temp.hist_month_lag_authorized_flag_Y_category_1_N_cnt_std, 3)

In [64]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [65]:
del temp
gc.collect()

28

### category_2

In [66]:
temp = history[history.authorized_flag == 'Y']

In [67]:
temp = temp.groupby('card_id').agg({'category_2': [mode, 'nunique', null_cnt, cnt_std]})

In [68]:
temp.head(1)

Unnamed: 0_level_0,category_2,category_2,category_2,category_2
Unnamed: 0_level_1,mode,nunique,null_cnt,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,3.0,2,24.0,62.225397


In [69]:
temp.columns = ['hist_category_2_authorized_flag_Y_mode', 'hist_category_2_authorized_flag_Y_nunique', 'hist_category_2_authorized_flag_Y_null_cnt', 'hist_category_2_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_category_2_authorized_flag_Y_mode,hist_category_2_authorized_flag_Y_nunique,hist_category_2_authorized_flag_Y_null_cnt,hist_category_2_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,3.0,2,24.0,62.225397


In [70]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [71]:
del temp
gc.collect()

28

### subsector

In [72]:
# temp = history[history.authorized_flag == 'Y']
# temp = temp.groupby('card_id').agg({'subsector_id': [mode, 'nunique', null_cnt, cnt_std]})

In [73]:
# temp.columns = ['hist_subsector_id_authorized_flag_Y_mode', 'hist_subsector_id_authorized_flag_Y_nunique', 'hist_subsector_id_authorized_flag_Y_null_cnt', 'hist_subsector_id_authorized_flag_Y_cnt_std']
# temp.reset_index(inplace=True)
# temp.head(1)

In [74]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [75]:
# del temp
# gc.collect()

## logging

In [18]:
# train.to_csv('./data_feature_engineering/train_3-73218.csv', index=False)
# test.to_csv('./data_feature_engineering/test_3-73218.csv', index=False)
train = pd.read_csv('./data_feature_engineering/train_3-73218.csv')
test = pd.read_csv('./data_feature_engineering/test_3-73218.csv')

### purchase_date

In [33]:
for new_merchant_df in [history]:
    # purchase date
    new_merchant_df['purchase_year_month'] = new_merchant_df.purchase_date.str[:7]
    new_merchant_df['purchase_year_month_day'] = new_merchant_df.purchase_date.str[:10]
    new_merchant_df['purchase_date'] = pd.to_datetime(new_merchant_df['purchase_date'])
    new_merchant_df['purchase_year_month_day'] = pd.to_datetime(new_merchant_df['purchase_year_month_day'])
    print(1)
    new_merchant_df['purchase_year'] = new_merchant_df['purchase_date'].dt.year
    print(2)
    new_merchant_df['purchase_month'] = new_merchant_df['purchase_date'].dt.month
    print(3)
    new_merchant_df['purchase_day'] = new_merchant_df['purchase_date'].dt.day
    new_merchant_df['purchase_hour'] = new_merchant_df['purchase_date'].dt.hour
    new_merchant_df['purchase_dayofweek'] = new_merchant_df['purchase_date'].dt.dayofweek
    new_merchant_df['purchase_weekofyear'] = new_merchant_df['purchase_date'].dt.weekofyear
    new_merchant_df['purchase_weekend'] = (new_merchant_df['purchase_date'].dt.weekday >=5).astype(int)
    print(4)
    new_merchant_df['purchase_date_total_day'] = pd.to_timedelta(new_merchant_df['purchase_year_month_day']).dt.total_seconds() / (60 * 60 * 24)
    print(5)
    new_merchant_df['month_diff_from_last_trade'] = ((datetime.datetime(2018, 4, 30, 23, 59, 59) - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff_from_last_trade'] += new_merchant_df['month_lag']
    new_merchant_df['month_diff_from_first_trade'] = ((datetime.datetime(2017, 1, 1, 0, 0, 0) - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff_from_first_trade'] += new_merchant_df['month_lag']
    new_merchant_df['month_diff_from_today'] = ((datetime.datetime.today() - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff_from_today'] += new_merchant_df['month_lag']
    print(6)
    

1
2
3
4
5
6


#### purchase total day

In [40]:
temp = history.groupby('card_id').agg({'purchase_date_total_day': [mode, 'sum', 'mean', 'std', 'max', 'min', cnt_std]})

In [41]:
temp.columns = ['hist_purchase_date_total_day_mode', 'hist_purchase_date_total_day_sum', 'hist_purchase_date_total_day_mean', 'hist_purchase_date_total_day_std', 'hist_purchase_date_total_day_max', 'hist_purchase_date_total_day_min' ,'hist_purchase_date_total_day_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_purchase_date_total_day_mode,hist_purchase_date_total_day_sum,hist_purchase_date_total_day_mean,hist_purchase_date_total_day_std,hist_purchase_date_total_day_max,hist_purchase_date_total_day_min,hist_purchase_date_total_day_cnt_std
0,C_ID_00007093c1,17273.0,2591980.0,17395.838926,104.216256,17589.0,17211.0,1.288961


In [42]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

275

#### purchase year month

In [86]:
temp = history.groupby('card_id').agg({'purchase_year_month': [mode, cnt_std]})

In [87]:
temp.columns = ['hist_purchase_year_month_mode', 'hist_purchase_year_month_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_purchase_year_month_mode,hist_purchase_year_month_cnt_std
0,C_ID_00007093c1,2017-06,4.701336


In [88]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

167

#### month diff

In [110]:
temp = history.groupby('card_id').agg({'month_diff_from_last_trade': [mode, 'sum', 'max', 'min', 'std', 'var', cnt_std], 'month_diff_from_first_trade': [mode, 'sum', 'max', 'min', 'std', 'var', cnt_std], 'month_diff_from_today':[mode, 'sum', 'max', 'min', 'std', 'var', cnt_std]})

In [113]:
temp

Unnamed: 0_level_0,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_last_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_first_trade,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today,month_diff_from_today
Unnamed: 0_level_1,mode,sum,max,min,std,var,cnt_std,mode,sum,max,min,std,var,cnt_std,mode,sum,max,min,std,var,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C_ID_00007093c1,2,318,3,2,0.342047,0.116996,77.074639,-14,-2099,-14,-15,0.283150,0.080174,86.974134,12,1776,12,11,0.273040,0.074551,88.388348
C_ID_0001238066,2,247,3,2,0.090167,0.008130,85.559921,-14,-1731,-14,-15,0.261482,0.068373,74.246212,12,1468,12,11,0.247606,0.061309,75.660426
C_ID_0001506ef0,2,139,3,2,0.310275,0.096270,36.769553,-14,-926,-14,-15,0.172733,0.029837,43.840620,12,791,12,11,0.123091,0.015152,45.254834
C_ID_0001793786,6,1322,7,6,0.326150,0.106374,115.965512,-10,-2174,-10,-11,0.246771,0.060896,132.936075,16,3442,16,15,0.246771,0.060896,132.936075
C_ID_000183fdda,2,288,3,1,0.236525,0.055944,76.210236,-14,-2038,-14,-15,0.361029,0.130342,70.710678,12,1706,12,11,0.361029,0.130342,70.710678
C_ID_00024e244b,2,158,3,2,0.440215,0.193789,24.041631,-14,-984,-14,-15,0.233791,0.054658,43.840620,12,838,13,11,0.293476,0.086128,35.232561
C_ID_0002709b5a,2,149,3,2,0.199886,0.039954,47.376154,-14,-1024,-14,-15,0.164368,0.027017,48.790368,12,874,12,11,0.164368,0.027017,48.790368
C_ID_00027503e2,3,126,3,3,0.000000,0.000000,0.000000,-13,-549,-13,-14,0.260661,0.067944,25.455844,13,543,13,12,0.260661,0.067944,25.455844
C_ID_000298032a,3,90,3,3,0.000000,0.000000,0.000000,-13,-393,-13,-14,0.305129,0.093103,16.970563,13,387,13,12,0.305129,0.093103,16.970563
C_ID_0002ba3c2e,6,428,7,6,0.320455,0.102692,38.183766,-10,-703,-10,-11,0.203997,0.041615,45.254834,16,1117,16,15,0.203997,0.041615,45.254834


In [117]:
col = ['month_diff_from_last_trade', 'month_diff_from_first_trade', 'month_diff_from_today']
final_col = []
for c in col:
    final_col += ['hist_' + c + '_' + v for v in ['mode', 'sum', 'max', 'min', 'std', 'var', 'cnt_std']]

final_col
temp.columns = final_col
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_diff_from_last_trade_mode,hist_month_diff_from_last_trade_sum,hist_month_diff_from_last_trade_max,hist_month_diff_from_last_trade_min,hist_month_diff_from_last_trade_std,hist_month_diff_from_last_trade_var,hist_month_diff_from_last_trade_cnt_std,hist_month_diff_from_first_trade_mode,hist_month_diff_from_first_trade_sum,hist_month_diff_from_first_trade_max,hist_month_diff_from_first_trade_min,hist_month_diff_from_first_trade_std,hist_month_diff_from_first_trade_var,hist_month_diff_from_first_trade_cnt_std,hist_month_diff_from_today_mode,hist_month_diff_from_today_sum,hist_month_diff_from_today_max,hist_month_diff_from_today_min,hist_month_diff_from_today_std,hist_month_diff_from_today_var,hist_month_diff_from_today_cnt_std
0,C_ID_00007093c1,2,318,3,2,0.342047,0.116996,77.074639,-14,-2099,-14,-15,0.28315,0.080174,86.974134,12,1776,12,11,0.27304,0.074551,88.388348


In [118]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

91

In [153]:
temp = history.groupby('card_id').agg({'month_diff_from_last_trade': ['skew']})

In [155]:
temp.columns = ['hist_month_diff_from_last_trade_skew']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_diff_from_last_trade_skew
0,C_ID_00007093c1,2.16782


In [156]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

20

#### purchase_date

In [222]:
def duration(data):
    return (data.max() - data.min()).days

In [223]:
temp = history.groupby(['card_id']).agg({'purchase_year_month_day': ['max', 'min', duration]})

In [224]:
temp.columns = ['hist_purchase_year_month_day_max', 'hist_purchase_year_month_day_min', 'hist_purchase_year_month_day_duration']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_purchase_year_month_day_max,hist_purchase_year_month_day_min,hist_purchase_year_month_day_duration
0,C_ID_00007093c1,2018-02-27,2017-02-14,378


In [225]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

del temp
gc.collect()

7

### purchase_amount

In [226]:
# history['purchase_amount_new'] = np.round(history['purchase_amount'] / 0.00150265118 + 497.06,2)

In [227]:
# temp = history[history.authorized_flag == 'Y']
# temp = temp.groupby(['card_id']).agg({'purchase_amount_new':[mode, 'sum', 'mean', 'var', 'max', 'min', 'skew', 'std']})

In [228]:
# temp.columns = ['hist_purchase_amount_authorized_flag_Y_new_mode', 'hist_purchase_amount_authorized_flag_Y_new_sum', 'hist_purchase_amount_authorized_flag_Y_new_mean', 'hist_purchase_amount_authorized_flag_Y_new_var', 'hist_purchase_amount_authorized_flag_Y_new_max', 'hist_purchase_amount_authorized_flag_Y_new_min', 'hist_purchase_amount_authorized_flag_Y_new_skew', 'hist_purchase_amount_authorized_flag_Y_new_std']
# temp.reset_index(inplace=True)
# temp.head(1)

In [229]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [230]:
# del temp
# gc.collect()

In [231]:
# temp = history[history.authorized_flag == 'N']
# temp = temp.groupby(['card_id']).agg({'purchase_amount_new':[mode, 'sum', 'mean', 'var', 'max', 'min', 'skew', 'std']})

In [232]:
# temp.columns = ['hist_purchase_amount_authorized_flag_N_new_mode', 'hist_purchase_amount_authorized_flag_N_new_sum', 'hist_purchase_amount_authorized_flag_N_new_mean', 'hist_purchase_amount_authorized_flag_N_new_var', 'hist_purchase_amount_authorized_flag_N_new_max', 'hist_purchase_amount_authorized_flag_N_new_min', 'hist_purchase_amount_authorized_flag_N_new_skew', 'hist_purchase_amount_authorized_flag_N_new_std']
# temp.reset_index(inplace=True)
# temp.head(1)

In [233]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [234]:
# del temp
# gc.collect()

In [235]:
# history.purchase_amount_new.describe()

## Feature Selection

In [267]:
le = LabelEncoder()
le.fit(train.append(test).hist_purchase_year_month_mode.values)
for df in [train, test]:
    df['first_active'] = pd.to_datetime(df['first_active'])
    df['temp'] = 1 - df.hist_category_1_label_mean
    df['hist_category_1_authorized_flag_Y_0_cnt'] = df.hist_category_1_authorized_flag_Y_size - df.hist_category_1_authorized_flag_Y_sum
    df.hist_installments_authorized_flag_Y_cnt_std = np.around(df.hist_installments_authorized_flag_Y_cnt_std, 4)
    df['temp'] = df.hist_month_lag_max - df.hist_month_lag_min
    df.hist_purchase_year_month_mode = le.transform(df.hist_purchase_year_month_mode)
    df['has_new_merchant'] = 1
    df.loc[df.new_card_id_size == 0, 'has_new_merchant'] = 0
    df['hist_purchase_year_max_first_active_year'] = (df.hist_purchase_year_month_day_max - df.first_active).dt.days
    df['hist_purchase_year_min_first_active_year'] = (df.hist_purchase_year_month_day_min - df.first_active).dt.days

In [268]:
train.head(3)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,first_active,first_active_year,first_active_elapsed_time_from_trade,first_active_total_day,hist_authorized_flag_label_mode,hist_authorized_flag_label_sum,hist_authorized_flag_label_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_label_mode,hist_category_1_label_sum,hist_category_1_label_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_label_mode,hist_category_3_label_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_date_max,hist_purchase_date_min,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_authorized_flag_mode,new_authorized_flag_sum,new_authorized_flag_mean,new_card_id_size,new_city_id_mode,new_city_id_nunique,new_category_1_mode,new_category_1_sum,new_category_1_mean,new_installments_mode,new_installments_sum,new_installments_mean,new_installments_var,new_installments_max,new_installments_min,new_installments_null_cnt,new_category_3_mode,new_category_3_mean,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_nunique,new_month_lag_mode,new_month_lag_sum,new_month_lag_mean,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_skew,new_purchase_date_max,new_purchase_date_min,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_skew,new_purchase_amount_over_550,new_category_2_mode,new_category_2_mean,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,new_purchase_day_min,new_purchase_day_skew,new_purchase_hour_mode,new_purchase_hour_nunique,new_purchase_hour_mean,new_purchase_hour_var,new_purchase_hour_max,new_purchase_hour_min,new_purchase_hour_skew,new_purchase_dayofweek_mode,new_purchase_dayofweek_nunique,new_purchase_dayofweek_mean,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_weekofyear_mode,new_purchase_weekofyear_nunique,new_purchase_weekofyear_mean,new_purchase_weekofyear_max,new_purchase_weekofyear_min,new_purchase_weekend_mode,new_purchase_weekend_sum,new_purchase_weekend_mean,new_price_sum,new_price_mean,new_price_max,new_price_min,new_price_var,new_month_diff_max,new_month_diff_min,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,card_id_total_size,card_id_size_ratio,purchase_amount_total,purchase_amount_mean,purchase_amount_max,purchase_amount_min,purchase_amount_ratio,month_diff_mean,month_diff_ratio,month_lag_mean,month_lag_max,month_lag_min,category_1_mean,installments_total,installments_mean,installments_max,installments_ratio,price_total,price_mean,price_max,duration_mean,duration_min,duration_max,amount_month_ratio_mean,amount_month_ratio_min,amount_month_ratio_max,new_CLV,hist_CLV,CLV_ratio,hist_city_id_mode_authorized_flag_category_1_Y,hist_city_id_nunique_authorized_flag_category_1_Y,hist_category_1_authorized_flag_Y_mean,hist_category_1_authorized_flag_Y_sum,hist_category_1_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mode,hist_installments_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mean,hist_installments_authorized_flag_Y_max,hist_installments_authorized_flag_Y_var,hist_installments_authorized_flag_Y_min,hist_installments_authorized_flag_Y_null_cnt,hist_installments_authorized_flag_Y_null_rate,hist_installments_authorized_flag_Y_cnt_std,hist_merchant_category_id_authorized_flag_Y_mode,hist_merchant_category_id_authorized_flag_Y_nunique,hist_merchant_category_id_authorized_flag_Y_null_cnt,hist_merchant_category_id_authorized_flag_Y_cnt_std,hist_merchant_id_authorized_flag_Y_cnt_std,hist_merchant_id_mode,hist_merchant_id_nunique2,hist_merchant_id_null_cnt,hist_merchant_id_cnt_std,hist_month_lag_authorized_flag_Y_category_1_Y_mode,hist_month_lag_authorized_flag_Y_category_1_Y_min,hist_month_lag_authorized_flag_Y_category_1_Y_max,hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std,hist_month_lag_authorized_flag_Y_category_1_N_mode,hist_month_lag_authorized_flag_Y_category_1_N_min,hist_month_lag_authorized_flag_Y_category_1_N_max,hist_month_lag_authorized_flag_Y_category_1_N_cnt_std,hist_category_2_authorized_flag_Y_mode,hist_category_2_authorized_flag_Y_nunique,hist_category_2_authorized_flag_Y_null_cnt,hist_category_2_authorized_flag_Y_cnt_std,temp,hist_category_1_authorized_flag_Y_0_cnt,hist_purchase_date_total_day_mode,hist_purchase_date_total_day_sum,hist_purchase_date_total_day_mean,hist_purchase_date_total_day_std,hist_purchase_date_total_day_max,hist_purchase_date_total_day_min,hist_purchase_date_total_day_cnt_std,hist_purchase_year_month_mode,hist_purchase_year_month_cnt_std,hist_month_diff_from_last_trade_mode,hist_month_diff_from_last_trade_sum,hist_month_diff_from_last_trade_max,hist_month_diff_from_last_trade_min,hist_month_diff_from_last_trade_std,hist_month_diff_from_last_trade_var,hist_month_diff_from_last_trade_cnt_std,hist_month_diff_from_first_trade_mode,hist_month_diff_from_first_trade_sum,hist_month_diff_from_first_trade_max,hist_month_diff_from_first_trade_min,hist_month_diff_from_first_trade_std,hist_month_diff_from_first_trade_var,hist_month_diff_from_first_trade_cnt_std,hist_month_diff_from_today_mode,hist_month_diff_from_today_sum,hist_month_diff_from_today_max,hist_month_diff_from_today_min,hist_month_diff_from_today_std,hist_month_diff_from_today_var,hist_month_diff_from_today_cnt_std,hist_month_diff_from_last_trade_skew,hist_purchase_year_month_day_max,hist_purchase_year_month_day_min,hist_purchase_year_month_day_duration,has_new_merchant,hist_purchase_year_max_first_active_year,hist_purchase_year_min_first_active_year
0,C_ID_92a2005557,5,2,1,6,0.0,-0.820283,2017-06-01,2017,333,17318.0,1,247.0,0.95,260,69,7,0,0.0,0.0,0.0,4.0,0.01538,0.015205,1.0,0.0,0.0,0,0.01538,560,41,94,-2,-1017.0,-3.912,5.75,0,-8,0.066,1e-09,1e-09,-165.96873,-0.638341,0.045003,2.258395,-0.7393,10.24,0.0,1.0,1.046,9,3,34,21,-167.4,-0.644,0.02057,0.8,-0.7393,5.133,2017,2,2017.0,2018,2017,12,9,8.055,12,1,11,31,15.51,76.9,31,1,0.10236,14,23,13.31,24.69,23,0,-0.887,5,7,3.21,6,0,50,35,33.06,52,1,0,90.0,0.3462,,,inf,-inf,,3,2,2.072,0.068,3.299,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-1.325042,-2.201,4.516789,0.216691,7.734,-0.311,-0.3696,1.129197,0.011823,9.35,0.0725,-0.747,5942464.5,1309718.6,0.3467,-0.747,5920398.5,5429670.5,242,0.9307,346,589,1.0,23.0,1.0,23.0,69.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278.0,14.0,23.0,1.0,34.0,1.479,0.261,2.0,1.0,0.09326,1e-09,1e-09,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.895996,0.0,1.0,1.0,9.0,1.0,37.0,10.0,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.896,2018.0,1.0,2018.0,2018.0,2018.0,3.0,2.0,3.479,4.0,3.0,6.0,17.0,16.44,88.8,31.0,5.0,0.3389,13.0,8.0,12.87,4.21,16.0,8.0,-0.6035,4.0,7.0,3.13,6.0,0.0,13.0,7.0,13.305,17.0,10.0,0.0,6.0,0.261,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,41.75,-1.151,-1.449,-0.5923,0.0737,0.895996,-0.2878,-0.3623,-0.1481,0.00461,0.895996,-0.5503,-0.593,54.0,2.348,283.0,338.0,-17318,-17318,-17318,-17318,283.0,0.088462,-179.20873,-1.214041,1.962295,-1.4639,0.079774,4.072,0.965251,-2.433,2.0,-7.0,0.0,4.0,0.01538,1.0,0.0,-44.802183,-78.936365,1.962295,-2.476042,-3.65,3.924489,-0.5988,-0.7319,0.981097,-152.26,-20826.191988,0.007311,69.0,7.0,0.0,0,247,0.0,247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,560,41,0,11.006,4.419,33816.0,94,6.0,4.569,-2,-8,0,17.244,-2,-8,0,17.244,1.0,2,0.0,170.412734,8,247,17520.0,4538904.0,17457.323077,74.281861,17587.0,17344.0,0.939212,11,17.244162,2,539,3,2,0.260765,0.067998,156.977705,-14,-3676,-14,-15,0.34605,0.119751,132.936075,12,3092,12,11,0.310589,0.096466,144.249783,3.299776,2018-02-25,2017-06-27,243,1,269,26
1,C_ID_3d0044924f,4,1,0,1,0.0,0.392913,2017-01-01,2017,484,17167.0,1,339.0,0.9688,350,69,9,0,31.0,0.08856,1.0,545.0,1.566,2.258,10.0,1.0,2.0,1,1.2,307,57,142,0,-1761.0,-5.03,14.48,0,-12,-0.258,1e-09,1e-09,-210.00633,-0.600018,0.1482,4.6303,-0.742,8.81,0.0,1.0,1.0,9,3,34,24,-215.4,-0.615,0.0586,0.8,-0.742,3.744,2017,2,2017.0,2018,2017,1,12,6.22,12,1,19,31,16.67,77.4,31,1,-0.2357,12,24,14.72,31.16,23,0,-0.8936,5,7,3.363,6,0,3,50,25.22,52,1,0,132.0,0.3772,-200.113283,-0.575038,2.31515,-0.7424,0.082442,4,2,3.092,0.1005,2.0,10.65,8.734,12.555,13.79,6.57,9.836,0.0,-1.853766,-2.951,13.890898,1.405803,8.25,-0.196,-0.3667,1.543433,0.016649,8.555,0.0746,-0.747,6010604.0,1329550.4,-0.2952,-0.747,156963.11,-3653208.0,390,1.114,370,761,1.0,6.0,1.0,6.0,69.0,1.0,0.0,0.0,0.0,1.0,6.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,307.0,5.0,6.0,1.0,9.0,1.5,0.3,2.0,1.0,0.0,1e-09,1e-09,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.255859,0.0,1.0,1.0,9.0,1.0,19.0,4.0,-4.355,-0.726,0.000207,-0.7017,-0.7393,1.256,2018.0,1.0,2018.0,2018.0,2018.0,2.0,2.0,2.5,3.0,2.0,5.0,4.0,13.5,131.5,30.0,1.0,0.37,17.0,5.0,11.164,24.56,17.0,6.0,0.3833,0.0,4.0,1.5,4.0,0.0,6.0,4.0,9.0,13.0,5.0,0.0,0.0,0.0,-4.355735,-0.725956,-0.701858,-0.73941,0.000205,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,56.84,-2.178,-2.219,-2.105,0.001841,1.180664,-0.242,-0.2465,-0.234,2.3e-05,1.151367,-0.5503,-0.6064,56.0,9.336,313.0,370.0,-17167,-17167,-17167,-17167,356.0,0.017143,-214.36133,-1.326018,3.9286,-1.4813,0.020737,6.092,0.970246,-3.53,2.0,-11.0,0.08856,551.0,2.566,11.0,0.011009,-0.389041,-0.516765,0.357145,-4.031766,-5.17,11.785898,-0.438,-0.6132,1.309433,-8.71,-23771.738519,0.000366,69.0,8.0,0.0855,29,339,1.0,339,1.492582,10.0,1.798308,1.0,2.0,0.0059,93.3938,307,57,0,10.653,4.007,188839.0,142,0.0,4.085,0,-12,0,13.817,0,-12,0,13.817,1.0,1,29.0,0.0,12,310,17185.0,6087880.0,17393.942857,116.976167,17562.0,17172.0,1.054344,12,13.817028,3,1082,4,2,0.317016,0.100499,169.918608,-13,-4590,-13,-14,0.318613,0.101515,190.918831,13,4514,14,12,0.322495,0.104003,168.396358,2.000087,2018-01-31,2017-01-06,390,1,395,5
2,C_ID_d639edf6cd,2,2,0,8,0.0,0.688056,2016-08-01,2016,637,17014.0,1,41.0,0.9536,43,143,5,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,705,8,13,-10,-370.0,-8.6,14.766,0,-13,0.7256,1e-09,1e-09,-29.16739,-0.678311,0.007635,-0.145847,-0.73,5.625,0.0,5.0,4.63,5,2,33,7,-29.17,-0.678,0.007637,-0.1459,-0.73,5.62,2017,2,2017.0,2018,2017,1,10,4.56,12,1,21,19,19.33,62.3,30,2,-0.7563,19,14,17.9,12.375,23,8,-0.8867,4,7,3.303,6,0,4,22,18.38,49,2,0,11.0,0.2559,-inf,-inf,-inf,-inf,,3,2,2.07,0.06647,3.5,9.67,17.0,8.91,7.51,17.66,8.91,1.721,-1.406243,-2.182,-0.291695,0.070888,0.3125,-0.3308,-0.365,-0.072924,0.002562,3.719,-0.0878,-0.747,654527.7,-229313.4,0.3584,-0.747,6010604.0,5588325.0,412,9.58,344,756,1.0,1.0,1.0,1.0,143.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,528.0,1.0,1.0,2.0,2.0,2.0,,2.0,2.0,,1e-09,1e-09,-0.7,-0.7,,-0.7,-0.7,,0.0,5.0,5.0,5.0,1.0,25.0,1.0,-0.7,-0.7,,-0.7,-0.7,,2018.0,1.0,2018.0,2018.0,2018.0,4.0,1.0,4.0,4.0,4.0,28.0,1.0,28.0,,28.0,28.0,,17.0,1.0,17.0,,17.0,17.0,,5.0,1.0,5.0,5.0,5.0,17.0,1.0,17.0,17.0,17.0,1.0,1.0,1.0,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,,,0.0,0.0,0.0,14.0,-1.4,-1.4,-1.4,,,-0.35,-0.35,-0.35,,,-0.549,-0.593,0.0,0.0,284.0,284.0,-17014,-17014,-17014,-17014,44.0,0.023256,-29.86739,-1.378311,-0.845847,-1.43,0.023999,4.07,0.966184,-6.6,2.0,-11.0,0.0,0.0,0.0,0.0,,-inf,-inf,-inf,-2.806243,-3.582,-1.691695,-0.6808,-0.715,-0.422924,-0.35,-605.892643,0.000578,143.0,5.0,0.0,0,41,0.0,41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,705,8,0,10.092,6.89,109672.0,13,0.0,7.443,-10,-13,0,2.314,-10,-13,0,2.314,5.0,2,0.0,23.334524,13,41,17277.0,744687.0,17318.302326,117.06338,17589.0,17177.0,0.511019,3,2.314316,2,89,3,2,0.25777,0.066445,26.162951,-14,-607,-14,-15,0.324353,0.105205,23.334524,12,511,12,11,0.324353,0.105205,23.334524,3.500952,2018-02-27,2017-01-11,412,1,575,163


In [None]:
train.to_csv('./data_feature_engineering/train_3-70479.csv', index=False)
test.to_csv('./data_feature_engineering/test_3-70479.csv', index=False)

In [239]:
train.columns[-21:-14]

Index(['hist_month_diff_from_last_trade_std',
       'hist_month_diff_from_last_trade_var',
       'hist_month_diff_from_last_trade_cnt_std',
       'hist_month_diff_from_first_trade_mode',
       'hist_month_diff_from_first_trade_sum',
       'hist_month_diff_from_first_trade_max',
       'hist_month_diff_from_first_trade_min'],
      dtype='object')

In [275]:
trainable_feature = [
    'feature_1', 'feature_2', 'feature_3', 
    'first_active_month', 'first_active_year', 'first_active_elapsed_time_from_trade', 'first_active_total_day',
    'hist_card_id_size', 'new_card_id_size',
    'hist_authorized_flag_label_mean', 'hist_authorized_flag_label_sum',
    'hist_city_id_mode_authorized_flag_category_1_Y', 'hist_city_id_nunique_authorized_flag_category_1_Y',
    'hist_category_1_authorized_flag_Y_mean',
    'hist_installments_authorized_flag_Y_max', 'hist_installments_authorized_flag_Y_min', 'hist_installments_authorized_flag_Y_null_cnt', 'hist_installments_authorized_flag_Y_var', 'hist_installments_authorized_flag_Y_cnt_std',
    'hist_merchant_category_id_authorized_flag_Y_cnt_std',
    'hist_merchant_id_mode', 'hist_merchant_id_nunique2', 'hist_merchant_id_null_cnt', 'hist_merchant_id_authorized_flag_Y_cnt_std',
    'hist_month_lag_min', 'hist_month_lag_max', 'hist_month_lag_mode',
    'hist_subsector_id_mode', 'hist_subsector_id_nunique',
    'hist_purchase_year_nunique', 'hist_purchase_year_max',
    'hist_purchase_month_nunique', 'hist_purchase_month_mode',
    'hist_purchase_hour_skew', 'hist_purchase_hour_var',
    'hist_purchase_weekofyear_max', 'hist_purchase_weekofyear_nunique',
    'hist_purchase_amount_trim_var',
    'hist_purchase_date_total_day_max',
    'hist_month_diff_from_last_trade_mode',
    'hist_purchase_year_month_day_duration', 'hist_purchase_year_max_first_active_year'
]

In [276]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 1,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 4590}

In [277]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

# train_columns = [f for f in train.columns if f not in FEATS_EXCLUDED]
train_columns = trainable_feature

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=train.iloc[trn_idx]['target'])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=train.iloc[val_idx]['target'])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
print("CV score: {:<8.5f}".format(mean_squared_error(train.target.values, oof_lgb)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.69357	valid_1's rmse: 3.73897
[200]	training's rmse: 3.63089	valid_1's rmse: 3.71624
[300]	training's rmse: 3.59219	valid_1's rmse: 3.71007
[400]	training's rmse: 3.56172	valid_1's rmse: 3.70692
[500]	training's rmse: 3.53897	valid_1's rmse: 3.70444
[600]	training's rmse: 3.51897	valid_1's rmse: 3.70307
[700]	training's rmse: 3.50041	valid_1's rmse: 3.70209
[800]	training's rmse: 3.48347	valid_1's rmse: 3.70214
[900]	training's rmse: 3.46778	valid_1's rmse: 3.70195
[1000]	training's rmse: 3.45323	valid_1's rmse: 3.70186
[1100]	training's rmse: 3.43922	valid_1's rmse: 3.70165
[1200]	training's rmse: 3.42492	valid_1's rmse: 3.70129
[1300]	training's rmse: 3.41149	valid_1's rmse: 3.7015
Early stopping, best iteration is:
[1191]	training's rmse: 3.42633	valid_1's rmse: 3.70125
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.69287	valid_1's rmse: 

score : 3.70479

In [256]:
sub = pd.read_csv('./data/sample_submission.csv')

In [257]:
sub.target = predictions_lgb

In [258]:
sub.to_csv('./data/sub_3-70589.csv', index=False)