# 사전 작업

## 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc
from tqdm import tqdm_notebook

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
pd.set_option('display.max_columns', 400)

# 통합 데이터

In [6]:
def drop_features(data, keyword, debug=False):
    if debug:
        data = data[:1000]
        
    logits = []
    for col in data.columns:
        if keyword in col:
            logits.append(col)
        else:
            pass
    
    return logits

## 데이터 로드

In [7]:
path = './data/'

In [8]:
train = pd.read_csv(path + 'train_v4.csv')
test = pd.read_csv(path + 'test_v4.csv')

In [9]:
train.new_card_id_size.fillna(0, inplace=True)
test.new_card_id_size.fillna(0, inplace=True)

In [10]:
raw_history = pd.read_csv(path + 'historical_transactions.csv')

In [11]:
history = raw_history.copy()

## Feature Engineering

In [12]:
def null_cnt(x):
    return np.sum(x.isna() * 1)

In [13]:
def null_rate(x):
    return np.sum(x.isna() * 1) / len(x)

In [14]:
from scipy import stats
def mode(x):
    return stats.mode(x)[0][0]

In [15]:
def most_value_cnt(x):
    return x.value_counts().values[0]

In [16]:
history.head(3)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37


### 거래가 승인 되고, 오프라인 거래의 city_id

In [17]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'N']

In [18]:
temp = temp.groupby('card_id').agg({'city_id': [mode, 'nunique']})
temp.head(1)

Unnamed: 0_level_0,city_id,city_id
Unnamed: 0_level_1,mode,nunique
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2
C_ID_00007093c1,244,3


In [19]:
temp.columns = ['hist_city_id_mode_authorized_flag_category_1_Y', 'hist_city_id_nunique_authorized_flag_category_1_Y']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_city_id_mode_authorized_flag_category_1_Y,hist_city_id_nunique_authorized_flag_category_1_Y
0,C_ID_00007093c1,244,3


In [20]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [21]:
del temp
gc.collect()

35

### 거래가 승인된 category_1

In [22]:
temp = history[history.authorized_flag == 'Y']

In [23]:
temp.category_1 = temp.category_1.map({'Y':1, 'N':0})

In [24]:
temp = temp.groupby('card_id').agg({'category_1':['mean', 'sum', 'size']})
temp.head(1)

Unnamed: 0_level_0,category_1,category_1,category_1
Unnamed: 0_level_1,mean,sum,size
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
C_ID_00007093c1,0.210526,24,114


In [25]:
temp.columns = ['hist_category_1_authorized_flag_Y_mean', 'hist_category_1_authorized_flag_Y_sum', 'hist_category_1_authorized_flag_Y_size']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_category_1_authorized_flag_Y_mean,hist_category_1_authorized_flag_Y_sum,hist_category_1_authorized_flag_Y_size
0,C_ID_00007093c1,0.210526,24,114


In [26]:
temp.hist_category_1_authorized_flag_Y_mean = np.round(temp.hist_category_1_authorized_flag_Y_mean, 4)

In [27]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [28]:
del temp
gc.collect()

35

### 거래가 승인된 installments

In [29]:
def has_999(data):
    # 999가 없을 때
    if data[data == 999].__len__() == 0:
        return 0
    # 999가 있을 때
    else:
        return 1

In [30]:
def cnt_std(data):
    value = data.value_counts().std()
    if np.isnan(value):
        return 0
    else:
        return value

In [31]:
history.installments.replace(-1, np.nan, inplace=True)

In [32]:
temp = history[history.authorized_flag == 'Y']

In [33]:
temp = temp.groupby('card_id').agg({'installments':[mode, 'size', 'mean', 'max', 'var', 'min', null_cnt, null_rate, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,installments,installments,installments,installments,installments,installments,installments,installments,installments
Unnamed: 0_level_1,mode,size,mean,max,var,min,null_cnt,null_rate,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
C_ID_00007093c1,1.0,114,1.289474,6.0,0.632278,1.0,0.0,0.0,41.069453


In [34]:
temp.columns = ['hist_installments_authorized_flag_Y_mode', 'hist_installments_authorized_flag_Y_size', 'hist_installments_authorized_flag_Y_mean', 'hist_installments_authorized_flag_Y_max', 'hist_installments_authorized_flag_Y_var', 'hist_installments_authorized_flag_Y_min', 'hist_installments_authorized_flag_Y_null_cnt', 'hist_installments_authorized_flag_Y_null_rate', 'hist_installments_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_installments_authorized_flag_Y_mode,hist_installments_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mean,hist_installments_authorized_flag_Y_max,hist_installments_authorized_flag_Y_var,hist_installments_authorized_flag_Y_min,hist_installments_authorized_flag_Y_null_cnt,hist_installments_authorized_flag_Y_null_rate,hist_installments_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,1.0,114,1.289474,6.0,0.632278,1.0,0.0,0.0,41.069453


In [35]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [36]:
del temp
gc.collect()

97

### category_3

In [37]:
history.category_3 = history.category_3.fillna('D')

### 승인된 거래 중 merchant_category_id

In [38]:
temp = history[history.authorized_flag == 'Y']

In [39]:
temp = temp.groupby('card_id').agg({'merchant_category_id':[mode, 'nunique', null_cnt, cnt_std]})

In [40]:
temp.columns = ['hist_merchant_category_id_authorized_flag_Y_mode', 'hist_merchant_category_id_authorized_flag_Y_nunique', 'hist_merchant_category_id_authorized_flag_Y_null_cnt', 'hist_merchant_category_id_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_category_id_authorized_flag_Y_mode,hist_merchant_category_id_authorized_flag_Y_nunique,hist_merchant_category_id_authorized_flag_Y_null_cnt,hist_merchant_category_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,307,18,0,10.278476


In [41]:
temp.hist_merchant_category_id_authorized_flag_Y_cnt_std = np.round(temp.hist_merchant_category_id_authorized_flag_Y_cnt_std, 3)

In [42]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [43]:
del temp
gc.collect()

28

### merchant_id

In [44]:
le = LabelEncoder()
le.fit(history.merchant_id.fillna('NULL').values)
history.merchant_id = le.transform(history.merchant_id.fillna('NULL'))
history.loc[history.merchant_id == 326311, 'merchant_id'] = np.nan

In [45]:
temp = history[history.authorized_flag == 'Y']

In [46]:
temp = temp.groupby('card_id').agg({'merchant_id': [cnt_std]})
temp.head(1)

Unnamed: 0_level_0,merchant_id
Unnamed: 0_level_1,cnt_std
card_id,Unnamed: 1_level_2
C_ID_00007093c1,6.943841


In [47]:
temp.columns = ['hist_merchant_id_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,6.943841


In [48]:
temp.hist_merchant_id_authorized_flag_Y_cnt_std = np.round(temp.hist_merchant_id_authorized_flag_Y_cnt_std, 3)

In [49]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [50]:
del temp
gc.collect()

42

In [51]:
temp = history.groupby('card_id').agg({'merchant_id': [mode, 'nunique', null_cnt, cnt_std]})
temp.head(1)

Unnamed: 0_level_0,merchant_id,merchant_id,merchant_id,merchant_id
Unnamed: 0_level_1,mode,nunique,null_cnt,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,188773.0,29,0.0,8.794171


In [52]:
temp.columns = ['hist_merchant_id_mode', 'hist_merchant_id_nunique2', 'hist_merchant_id_null_cnt', 'hist_merchant_id_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_merchant_id_mode,hist_merchant_id_nunique2,hist_merchant_id_null_cnt,hist_merchant_id_cnt_std
0,C_ID_00007093c1,188773.0,29,0.0,8.794171


In [53]:
temp.hist_merchant_id_cnt_std = np.round(temp.hist_merchant_id_cnt_std, 3)

In [54]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [55]:
del temp
gc.collect()

28

### month_lag

In [135]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'Y']
temp = history.groupby('card_id').agg({'month_lag': [mode, 'min', 'max', cnt_std]})
temp.head(1)

Unnamed: 0_level_0,month_lag,month_lag,month_lag,month_lag
Unnamed: 0_level_1,mode,min,max,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,-8,-12,0,4.701336


In [136]:
temp.columns = ['hist_month_lag_authorized_flag_Y_category_1_Y_mode', 'hist_month_lag_authorized_flag_Y_category_1_Y_min', 'hist_month_lag_authorized_flag_Y_category_1_Y_max', 'hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_lag_authorized_flag_Y_category_1_Y_mode,hist_month_lag_authorized_flag_Y_category_1_Y_min,hist_month_lag_authorized_flag_Y_category_1_Y_max,hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std
0,C_ID_00007093c1,-8,-12,0,4.701336


In [138]:
temp.hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std = np.round(temp.hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std, 3)

In [139]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [140]:
del temp
gc.collect()

900

In [141]:
temp = history[history.authorized_flag == 'Y']
temp = temp[temp.category_1 == 'N']
temp = history.groupby('card_id').agg({'month_lag': [mode, 'min', 'max', cnt_std]})
temp.head(1)

Unnamed: 0_level_0,month_lag,month_lag,month_lag,month_lag
Unnamed: 0_level_1,mode,min,max,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,-8,-12,0,4.701336


In [142]:
temp.columns = ['hist_month_lag_authorized_flag_Y_category_1_N_mode', 'hist_month_lag_authorized_flag_Y_category_1_N_min', 'hist_month_lag_authorized_flag_Y_category_1_N_max', 'hist_month_lag_authorized_flag_Y_category_1_N_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_month_lag_authorized_flag_Y_category_1_N_mode,hist_month_lag_authorized_flag_Y_category_1_N_min,hist_month_lag_authorized_flag_Y_category_1_N_max,hist_month_lag_authorized_flag_Y_category_1_N_cnt_std
0,C_ID_00007093c1,-8,-12,0,4.701336


In [143]:
temp.hist_month_lag_authorized_flag_Y_category_1_N_cnt_std = np.round(temp.hist_month_lag_authorized_flag_Y_category_1_N_cnt_std, 3)

In [144]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [145]:
del temp
gc.collect()

28

### category_2

In [171]:
temp = history[history.authorized_flag == 'Y']

In [172]:
temp = temp.groupby('card_id').agg({'category_2': [mode, 'nunique', null_cnt, cnt_std]})

In [173]:
temp.head(1)

Unnamed: 0_level_0,category_2,category_2,category_2,category_2
Unnamed: 0_level_1,mode,nunique,null_cnt,cnt_std
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
C_ID_00007093c1,3.0,2,24.0,62.225397


In [174]:
temp.columns = ['hist_category_2_authorized_flag_Y_mode', 'hist_category_2_authorized_flag_Y_nunique', 'hist_category_2_authorized_flag_Y_null_cnt', 'hist_category_2_authorized_flag_Y_cnt_std']
temp.reset_index(inplace=True)
temp.head(1)

Unnamed: 0,card_id,hist_category_2_authorized_flag_Y_mode,hist_category_2_authorized_flag_Y_nunique,hist_category_2_authorized_flag_Y_null_cnt,hist_category_2_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,3.0,2,24.0,62.225397


In [177]:
train = train.merge(temp, on='card_id', how='left')
test = test.merge(temp, on='card_id', how='left')

In [178]:
del temp
gc.collect()

80

### subsector

In [233]:
# temp = history[history.authorized_flag == 'Y']
# temp = temp.groupby('card_id').agg({'subsector_id': [mode, 'nunique', null_cnt, cnt_std]})

In [234]:
# temp.columns = ['hist_subsector_id_authorized_flag_Y_mode', 'hist_subsector_id_authorized_flag_Y_nunique', 'hist_subsector_id_authorized_flag_Y_null_cnt', 'hist_subsector_id_authorized_flag_Y_cnt_std']
# temp.reset_index(inplace=True)
# temp.head(1)

Unnamed: 0,card_id,hist_subsector_id_authorized_flag_Y_mode,hist_subsector_id_authorized_flag_Y_nunique,hist_subsector_id_authorized_flag_Y_null_cnt,hist_subsector_id_authorized_flag_Y_cnt_std
0,C_ID_00007093c1,19,13,0,11.380933


In [235]:
# train = train.merge(temp, on='card_id', how='left')
# test = test.merge(temp, on='card_id', how='left')

In [236]:
# del temp
# gc.collect()

91

## Feature Selection

In [237]:
for df in [train, test]:
    df['temp'] = 1 - df.hist_category_1_label_mean
    df['hist_category_1_authorized_flag_Y_0_cnt'] = df.hist_category_1_authorized_flag_Y_size - df.hist_category_1_authorized_flag_Y_sum
    df.hist_installments_authorized_flag_Y_cnt_std = np.around(df.hist_installments_authorized_flag_Y_cnt_std, 4)
    df['temp'] = df.hist_month_lag_max - df.hist_month_lag_min

In [238]:
train.head(1)

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,first_active,first_active_year,first_active_elapsed_time_from_trade,first_active_total_day,hist_authorized_flag_label_mode,hist_authorized_flag_label_sum,hist_authorized_flag_label_mean,hist_card_id_size,hist_city_id_mode,hist_city_id_nunique,hist_category_1_label_mode,hist_category_1_label_sum,hist_category_1_label_mean,hist_installments_mode,hist_installments_sum,hist_installments_mean,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_null_cnt,hist_category_3_label_mode,hist_category_3_label_mean,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_month_lag_mode,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_skew,hist_purchase_date_max,hist_purchase_date_min,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_category_2_mode,hist_category_2_mean,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode_x,hist_subsector_id_nunique_x,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_dayofweek_mode,hist_purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_price_sum,hist_price_mean,hist_price_max,hist_price_min,hist_price_var,hist_month_diff_max,hist_month_diff_min,hist_month_diff_mean,hist_month_diff_var,hist_month_diff_skew,hist_Christmas_Day_2017_mean,hist_Mothers_Day_2017_mean,hist_fathers_day_2017_mean,hist_Children_day_2017_mean,hist_Valentine_Day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_duration_mean,hist_duration_min,hist_duration_max,hist_duration_var,hist_duration_skew,hist_amount_month_ratio_mean,hist_amount_month_ratio_min,hist_amount_month_ratio_max,hist_amount_month_ratio_var,hist_amount_month_ratio_skew,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,hist_purchase_date_uptomin,new_authorized_flag_mode,new_authorized_flag_sum,new_authorized_flag_mean,new_card_id_size,new_city_id_mode,new_city_id_nunique,new_category_1_mode,new_category_1_sum,new_category_1_mean,new_installments_mode,new_installments_sum,new_installments_mean,new_installments_var,new_installments_max,new_installments_min,new_installments_null_cnt,new_category_3_mode,new_category_3_mean,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_nunique,new_month_lag_mode,new_month_lag_sum,new_month_lag_mean,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_skew,new_purchase_date_max,new_purchase_date_min,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_var,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_skew,new_purchase_amount_over_550,new_category_2_mode,new_category_2_mean,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,new_purchase_day_min,new_purchase_day_skew,new_purchase_hour_mode,new_purchase_hour_nunique,new_purchase_hour_mean,new_purchase_hour_var,new_purchase_hour_max,new_purchase_hour_min,new_purchase_hour_skew,new_purchase_dayofweek_mode,new_purchase_dayofweek_nunique,new_purchase_dayofweek_mean,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_weekofyear_mode,new_purchase_weekofyear_nunique,new_purchase_weekofyear_mean,new_purchase_weekofyear_max,new_purchase_weekofyear_min,new_purchase_weekend_mode,new_purchase_weekend_sum,new_purchase_weekend_mean,new_price_sum,new_price_mean,new_price_max,new_price_min,new_price_var,new_month_diff_max,new_month_diff_min,new_month_diff_mean,new_month_diff_var,new_month_diff_skew,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_duration_mean,new_duration_min,new_duration_max,new_duration_var,new_duration_skew,new_amount_month_ratio_mean,new_amount_month_ratio_min,new_amount_month_ratio_max,new_amount_month_ratio_var,new_amount_month_ratio_skew,new_category_2_mean_mean,new_category_3_mean_mean,new_purchase_date_diff,new_purchase_date_average,new_purchase_date_uptonow,new_purchase_date_uptomin,hist_first_buy,hist_last_buy,new_first_buy,new_last_buy,card_id_total_size,card_id_size_ratio,purchase_amount_total,purchase_amount_mean,purchase_amount_max,purchase_amount_min,purchase_amount_ratio,month_diff_mean,month_diff_ratio,month_lag_mean,month_lag_max,month_lag_min,category_1_mean,installments_total,installments_mean,installments_max,installments_ratio,price_total,price_mean,price_max,duration_mean,duration_min,duration_max,amount_month_ratio_mean,amount_month_ratio_min,amount_month_ratio_max,new_CLV,hist_CLV,CLV_ratio,hist_city_id_mode_authorized_flag_category_1_Y,hist_city_id_nunique_authorized_flag_category_1_Y,hist_category_1_authorized_flag_Y_mean,hist_category_1_authorized_flag_Y_sum,hist_category_1_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mode,hist_installments_authorized_flag_Y_size,hist_installments_authorized_flag_Y_mean,hist_installments_authorized_flag_Y_max,hist_installments_authorized_flag_Y_var,hist_installments_authorized_flag_Y_min,hist_installments_authorized_flag_Y_null_cnt,hist_installments_authorized_flag_Y_null_rate,hist_installments_authorized_flag_Y_cnt_std,hist_merchant_category_id_authorized_flag_Y_mode,hist_merchant_category_id_authorized_flag_Y_nunique,hist_merchant_category_id_authorized_flag_Y_null_cnt,hist_merchant_category_id_authorized_flag_Y_cnt_std,hist_merchant_id_authorized_flag_Y_cnt_std,hist_merchant_id_mode,hist_merchant_id_nunique2,hist_merchant_id_null_cnt,hist_merchant_id_cnt_std,temp,hist_category_1_authorized_flag_Y_0_cnt,hist_month_lag_std,hist_month_lag_authorized_flag_Y_mode,hist_month_lag_authorized_flag_Y_min,hist_month_lag_authorized_flag_Y_max,hist_month_lag_authorized_flag_Y_cnt_std,hist_month_lag_authorized_flag_Y_category_1_Y_mode,hist_month_lag_authorized_flag_Y_category_1_Y_min,hist_month_lag_authorized_flag_Y_category_1_Y_max,hist_month_lag_authorized_flag_Y_category_1_Y_cnt_std,hist_month_lag_authorized_flag_Y_category_1_N_mode,hist_month_lag_authorized_flag_Y_category_1_N_min,hist_month_lag_authorized_flag_Y_category_1_N_max,hist_month_lag_authorized_flag_Y_category_1_N_cnt_std,hist_category_2_authorized_flag_Y_mode,hist_category_2_authorized_flag_Y_nunique,hist_category_2_authorized_flag_Y_null_cnt,hist_category_2_authorized_flag_Y_cnt_std,hist_subsector_id_mode_y,hist_subsector_id_nunique_y,hist_subsector_id_null_cnt,hist_subsector_id_cnt_std,hist_subsector_id_authorized_flag_Y_mode,hist_subsector_id_authorized_flag_Y_nunique,hist_subsector_id_authorized_flag_Y_null_cnt,hist_subsector_id_authorized_flag_Y_cnt_std
0,C_ID_92a2005557,5,2,1,6,0.0,-0.820283,2017-06-01,2017,333,17318.0,1,247.0,0.95,260,69,7,0,0.0,0.0,0.0,4.0,0.01538,0.015205,1.0,0.0,0.0,0,0.01538,560,41,94,-2,-1017.0,-3.912,5.75,0,-8,0.066,1e-09,1e-09,-165.96873,-0.638341,0.045003,2.258395,-0.7393,10.24,0.0,1.0,1.046,9,3,34,21,-167.4,-0.644,0.02057,0.8,-0.7393,5.133,2017,2,2017.0,2018,2017,12,9,8.055,12,1,11,31,15.51,76.9,31,1,0.10236,14,23,13.31,24.69,23,0,-0.887,5,7,3.21,6,0,50,35,33.06,52,1,0,90.0,0.3462,,,inf,-inf,,3,2,2.072,0.068,3.299,13.125,0.0,6.266,26.77,0.0,16.47,7.754,-1.325042,-2.201,4.516789,0.216691,7.734,-0.311,-0.3696,1.129197,0.011823,9.35,0.0725,-0.747,5942464.5,1309718.6,0.3467,-0.747,5920398.5,5429670.5,242,0.9307,346,589,1.0,23.0,1.0,23.0,69.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,278.0,14.0,23.0,1.0,34.0,1.479,0.261,2.0,1.0,0.09326,1e-09,1e-09,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.895996,0.0,1.0,1.0,9.0,1.0,37.0,10.0,-13.24,-0.5757,0.01843,-0.2961,-0.7246,0.896,2018.0,1.0,2018.0,2018.0,2018.0,3.0,2.0,3.479,4.0,3.0,6.0,17.0,16.44,88.8,31.0,5.0,0.3389,13.0,8.0,12.87,4.21,16.0,8.0,-0.6035,4.0,7.0,3.13,6.0,0.0,13.0,7.0,13.305,17.0,10.0,0.0,6.0,0.261,-inf,-inf,-inf,-inf,,2.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,41.75,-1.151,-1.449,-0.5923,0.0737,0.895996,-0.2878,-0.3623,-0.1481,0.00461,0.895996,-0.5503,-0.593,54.0,2.348,283.0,338.0,-17318,-17318,-17318,-17318,283.0,0.088462,-179.20873,-1.214041,1.962295,-1.4639,0.079774,4.072,0.965251,-2.433,2.0,-7.0,0.0,4.0,0.01538,1.0,0.0,-44.802183,-78.936365,1.962295,-2.476042,-3.65,3.924489,-0.5988,-0.7319,0.981097,-152.26,-20826.191988,0.007311,69.0,7.0,0.0,0,247,0.0,247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,560,41,0,11.006,4.419,33816.0,94,6.0,4.569,8,247,17.244,-2,-8,0,17.244,-2,-8,0,17.244,-2,-8,0,17.244,1.0,2,0.0,170.412734,34,21,0,23.410417,34,21,0,22.380583


In [None]:
    'hist_subsector_id_mode', 'hist_subsector_id_nunique', 'hist_subsector_id_null_cnt', 'hist_subsector_id_cnt_std'


In [256]:
trainable_feature = [
    'feature_1', 'feature_2', 'feature_3', 
    'first_active_month', 'first_active_year', 'first_active_elapsed_time_from_trade', 'first_active_total_day',
    'hist_card_id_size', 'new_card_id_size',
    'hist_authorized_flag_label_mean', 'hist_authorized_flag_label_sum',
    'hist_city_id_mode_authorized_flag_category_1_Y', 'hist_city_id_nunique_authorized_flag_category_1_Y',
    'hist_category_1_authorized_flag_Y_mean',
    'hist_installments_authorized_flag_Y_max', 'hist_installments_authorized_flag_Y_min', 'hist_installments_authorized_flag_Y_null_cnt', 'hist_installments_authorized_flag_Y_var', 'hist_installments_authorized_flag_Y_cnt_std',
    'hist_merchant_category_id_authorized_flag_Y_cnt_std',
    'hist_merchant_id_mode', 'hist_merchant_id_nunique2', 'hist_merchant_id_null_cnt', 'hist_merchant_id_authorized_flag_Y_cnt_std',
    'hist_month_lag_min', 'hist_month_lag_max', 'hist_month_lag_mode',
    'hist_subsector_id_mode', 'hist_subsector_id_nunique'
]

In [257]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 1,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 4590}

In [258]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

# train_columns = [f for f in train.columns if f not in FEATS_EXCLUDED]
train_columns = trainable_feature

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=train.iloc[trn_idx]['target'])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=train.iloc[val_idx]['target'])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
print("CV score: {:<8.5f}".format(mean_squared_error(train.target.values, oof_lgb)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.74471	valid_1's rmse: 3.78033
[200]	training's rmse: 3.69275	valid_1's rmse: 3.75913
[300]	training's rmse: 3.65825	valid_1's rmse: 3.74871
[400]	training's rmse: 3.63324	valid_1's rmse: 3.74325
[500]	training's rmse: 3.61204	valid_1's rmse: 3.73955
[600]	training's rmse: 3.59389	valid_1's rmse: 3.73737
[700]	training's rmse: 3.57818	valid_1's rmse: 3.73589
[800]	training's rmse: 3.56458	valid_1's rmse: 3.73567
[900]	training's rmse: 3.55126	valid_1's rmse: 3.73495
[1000]	training's rmse: 3.53845	valid_1's rmse: 3.73461
[1100]	training's rmse: 3.52718	valid_1's rmse: 3.73518
Early stopping, best iteration is:
[999]	training's rmse: 3.53855	valid_1's rmse: 3.7346
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.74269	valid_1's rmse: 3.78062
[200]	training's rmse: 3.69135	valid_1's rmse: 3.7577
[300]	training's rmse: 3.65662	valid_1's rmse: 3.74

score : 3.73218 

In [259]:
sub = pd.read_csv('./data/sample_submission.csv')

In [260]:
sub.target = predictions_lgb

In [261]:
sub.to_csv('./data/sub_3-73218.csv', index=False)