# 사전 작업

## 모듈 로드

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import seaborn as sns
import gc

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold, StratifiedKFold

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
pd.set_option('display.max_columns', 600)

# 통합 데이터

In [5]:
def drop_features(data, keyword, debug=False):
    if debug:
        data = data[:1000]
        
    logits = []
    for col in data.columns:
        if keyword in col:
            logits.append(col)
        else:
            pass
    
    return logits

## 데이터 로드

In [6]:
path = './data/'

In [7]:
train = pd.read_csv(path + 'trainFE_0213.csv')
hist = pd.read_csv(path + 'histFE_0213.csv')
new = pd.read_csv(path + 'newFE_0213.csv')
test = pd.read_csv('./data_feature_engineering/train_test_v2.csv')
test = test[test.target.isnull()]

In [8]:
train = train.merge(hist, left_on='card_id', right_on='hist_card_id', how='left').merge(new, left_on='card_id', right_on='new_card_id', how='left')
train.drop(['hist_card_id', 'new_card_id'], axis = 1, inplace=True)

In [9]:
test = test.merge(hist, left_on='card_id', right_on='hist_card_id', how='left').merge(new, left_on='card_id', right_on='new_card_id', how='left')
test.drop(['hist_card_id', 'new_card_id'], axis = 1, inplace=True)

In [10]:
train.head(3)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,first_active,first_active_year,first_active_quarter,first_active_weekofyear,first_active_dayofweek,first_active_elapsed_time_from_trade,first_active_elapsed_time_from_today,first_active_total_day,days_feature1_trade,days_feature2_trade,days_feature3_trade,days_feature1_trade_ratio,days_feature2_trade_ratio,days_feature3_trade_ratio,outliers,feature_1_outlier,feature_2_outlier,feature_3_outlier,first_active_year_outlier,first_active_month_outlier,first_active_total_day_outlier,feature_sum_outlier,feature_mean_outlier,feature_max_outlier,feature_min_outlier,feature_var_outlier,feature_sum,feature_mean,feature_max,feature_min,feature_var,hist_fromRefDate_sum,hist_fromRefDate_min,hist_fromRefDate_max,hist_fromRefDate_mean,hist_fromRefDate_median,hist_fromRefDate_var,hist_fromRefDate_quantileRange,hist_purchase_year_mode,hist_purchase_year_nunique,hist_purchase_year_mean,hist_purchase_year_max,hist_purchase_year_min,hist_purchase_month_mode,hist_purchase_month_nunique,hist_purchase_month_mean,hist_purchase_month_max,hist_purchase_month_min,hist_purchase_day_mode,hist_purchase_day_nunique,hist_purchase_day_mean,hist_purchase_day_var,hist_purchase_day_max,hist_purchase_day_min,hist_purchase_day_skew,hist_purchase_hour_mode,hist_purchase_hour_nunique,hist_purchase_hour_mean,hist_purchase_hour_var,hist_purchase_hour_max,hist_purchase_hour_min,hist_purchase_hour_skew,hist_purchase_date_max,hist_purchase_date_min,hist_month_lag_sum,hist_month_lag_mean,hist_month_lag_median,hist_month_lag_var,hist_month_lag_max,hist_month_lag_min,hist_month_lag_quantileRange,hist_month_lag_skew,hist_purchase_dayofweek_mode,hist_purchase_dayofweek_nunique,hist_purchase_dayofweek_mean,hist_purchase_dayofweek_max,hist_purchase_dayofweek_min,hist_purchase_weekofyear_mode,hist_purchase_weekofyear_nunique,hist_purchase_weekofyear_mean,hist_purchase_weekofyear_max,hist_purchase_weekofyear_min,hist_purchase_weekend_mode,hist_purchase_weekend_sum,hist_purchase_weekend_mean,hist_Christmas_Day_2017_mean,hist_Children_day_2017_mean,hist_Black_Friday_2017_mean,hist_Mothers_Day_2018_mean,hist_purchase_date_total_day_max,hist_purchase_date_total_day_min,hist_purchase_date_total_day_mean,hist_purchase_date_total_day_var,hist_purchase_date_total_day_skew,hist_month_diff_from_trade_max,hist_month_diff_from_trade_min,hist_month_diff_from_trade_mean,hist_month_diff_from_trade_var,hist_month_diff_from_trade_skew,hist_month_diff_from_today_max,hist_month_diff_from_today_min,hist_month_diff_from_today_mean,hist_month_diff_from_today_var,hist_month_diff_from_today_skew,hist_city_id_count,hist_city_id_mode,hist_city_id_nunique,hist_state_id_mode,hist_state_id_nunique,hist_subsector_id_mode,hist_subsector_id_nunique,hist_merchant_category_id_mode,hist_merchant_category_id_nunique,hist_merchant_id_mode,hist_merchant_id_nunique,hist_merchant_visit_sum,hist_merchant_visit_mean,hist_merchant_visit_min,hist_merchant_visit_max,hist_merchant_visit_nunique,hist_merchant_visit_size,hist_merchant_visit_mode,hist_merchant_try_mean,hist_merchant_try_std,hist_merchant_try_min,hist_merchant_try_max,hist_merchant_try_nunique,hist_merchant_try_size,hist_merchant_try_mode,hist_installments_sum,hist_installments_mean,hist_installments_median,hist_installments_var,hist_installments_max,hist_installments_min,hist_installments_quantileRange,hist_installments_null_cnt,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_median,hist_purchase_amount_var,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_quantileRange,hist_purchase_amount_skew,hist_purchase_amount_over_550,hist_purchase_amount_trim_sum,hist_purchase_amount_trim_mean,hist_purchase_amount_trim_var,hist_purchase_amount_trim_max,hist_purchase_amount_trim_min,hist_purchase_amount_trim_skew,hist_authorized_flag_mean,hist_authorized_flag_sum,hist_category_1_mode,hist_category_1_nunique,hist_category_1_mean,hist_category_1_var,hist_category_1_sum,hist_category_3_mode,hist_category_3_nunique,hist_category_3_mean,hist_category_3_var,hist_category_2_mode,hist_category_2_mean,hist_category_2_var,hist_category_2_nunique,hist_category_2_mean_mean,hist_category_2_min_mean,hist_category_2_max_mean,hist_category_2_sum_mean,hist_category_3_mean_mean,hist_category_3_min_mean,hist_category_3_max_mean,hist_category_3_sum_mean,hist_merchant_group_id_mode,hist_merchant_group_id_nunique,hist_numerical_1_sum,hist_numerical_1_max,hist_numerical_1_min,hist_numerical_1_mean,hist_numerical_1_median,hist_numerical_1_var,hist_numerical_1_quantileRange,hist_numerical_2_sum,hist_numerical_2_max,hist_numerical_2_min,hist_numerical_2_mean,hist_numerical_2_median,hist_numerical_2_var,hist_numerical_2_quantileRange,hist_most_recent_sales_range_mode,hist_most_recent_sales_range_nunique,hist_most_recent_sales_range_mean,hist_most_recent_sales_range_var,hist_most_recent_purchases_range_mode,hist_most_recent_purchases_range_nunique,hist_most_recent_purchases_range_mean,hist_most_recent_purchases_range_var,hist_category_4_mode,hist_category_4_nunique,hist_category_4_mean,hist_category_4_var,hist_category_5_mode,hist_category_5_nunique,hist_category_5_mean,hist_category_5_var,hist_avg_sales_lag3_sum,hist_avg_sales_lag3_max,hist_avg_sales_lag3_min,hist_avg_sales_lag3_mean,hist_avg_sales_lag3_median,hist_avg_sales_lag3_var,hist_avg_sales_lag3_quantileRange,hist_avg_purchases_lag3_sum,hist_avg_purchases_lag3_max,hist_avg_purchases_lag3_min,hist_avg_purchases_lag3_mean,hist_avg_purchases_lag3_median,hist_avg_purchases_lag3_var,hist_avg_purchases_lag3_quantileRange,hist_active_months_lag3_sum,hist_active_months_lag3_max,hist_active_months_lag3_min,hist_active_months_lag3_mean,hist_active_months_lag3_median,hist_active_months_lag3_var,hist_active_months_lag3_quantileRange,hist_avg_sales_lag6_sum,hist_avg_sales_lag6_max,hist_avg_sales_lag6_min,hist_avg_sales_lag6_mean,hist_avg_sales_lag6_median,hist_avg_sales_lag6_var,hist_avg_sales_lag6_quantileRange,hist_avg_purchases_lag6_sum,hist_avg_purchases_lag6_max,hist_avg_purchases_lag6_min,hist_avg_purchases_lag6_mean,hist_avg_purchases_lag6_median,hist_avg_purchases_lag6_var,hist_avg_purchases_lag6_quantileRange,hist_active_months_lag6_sum,hist_active_months_lag6_max,hist_active_months_lag6_min,hist_active_months_lag6_mean,hist_active_months_lag6_median,hist_active_months_lag6_var,hist_active_months_lag6_quantileRange,hist_avg_sales_lag12_sum,hist_avg_sales_lag12_max,hist_avg_sales_lag12_min,hist_avg_sales_lag12_mean,hist_avg_sales_lag12_median,hist_avg_sales_lag12_var,hist_avg_sales_lag12_quantileRange,hist_avg_purchases_lag12_sum,hist_avg_purchases_lag12_max,hist_avg_purchases_lag12_min,hist_avg_purchases_lag12_mean,hist_avg_purchases_lag12_median,hist_avg_purchases_lag12_var,hist_avg_purchases_lag12_quantileRange,hist_active_months_lag12_sum,hist_active_months_lag12_max,hist_active_months_lag12_min,hist_active_months_lag12_mean,hist_active_months_lag12_median,hist_active_months_lag12_var,hist_active_months_lag12_quantileRange,hist_city_ym_rate_mode,hist_city_ym_rate_sum,hist_city_ym_rate_mean,hist_city_ym_rate_median,hist_city_ym_rate_var,hist_city_ym_rate_quantileRange,new_fromRefDate_sum,new_fromRefDate_min,new_fromRefDate_max,new_fromRefDate_mean,new_fromRefDate_median,new_fromRefDate_var,new_fromRefDate_quantileRange,new_purchase_year_mode,new_purchase_year_nunique,new_purchase_year_mean,new_purchase_year_max,new_purchase_year_min,new_purchase_month_mode,new_purchase_month_nunique,new_purchase_month_mean,new_purchase_month_max,new_purchase_month_min,new_purchase_day_mode,new_purchase_day_nunique,new_purchase_day_mean,new_purchase_day_var,new_purchase_day_max,new_purchase_day_min,new_purchase_day_skew,new_purchase_hour_mode,new_purchase_hour_nunique,new_purchase_hour_mean,new_purchase_hour_var,new_purchase_hour_max,new_purchase_hour_min,new_purchase_hour_skew,new_purchase_date_max,new_purchase_date_min,new_month_lag_sum,new_month_lag_mean,new_month_lag_median,new_month_lag_var,new_month_lag_max,new_month_lag_min,new_month_lag_quantileRange,new_month_lag_skew,new_purchase_dayofweek_mode,new_purchase_dayofweek_nunique,new_purchase_dayofweek_mean,new_purchase_dayofweek_max,new_purchase_dayofweek_min,new_purchase_weekofyear_mode,new_purchase_weekofyear_nunique,new_purchase_weekofyear_mean,new_purchase_weekofyear_max,new_purchase_weekofyear_min,new_purchase_weekend_mode,new_purchase_weekend_sum,new_purchase_weekend_mean,new_Christmas_Day_2017_mean,new_Children_day_2017_mean,new_Black_Friday_2017_mean,new_Mothers_Day_2018_mean,new_purchase_date_total_day_max,new_purchase_date_total_day_min,new_purchase_date_total_day_mean,new_purchase_date_total_day_var,new_purchase_date_total_day_skew,new_month_diff_from_trade_max,new_month_diff_from_trade_min,new_month_diff_from_trade_mean,new_month_diff_from_trade_var,new_month_diff_from_trade_skew,new_month_diff_from_today_max,new_month_diff_from_today_min,new_month_diff_from_today_mean,new_month_diff_from_today_var,new_month_diff_from_today_skew,new_city_id_count,new_city_id_mode,new_city_id_nunique,new_state_id_mode,new_state_id_nunique,new_subsector_id_mode,new_subsector_id_nunique,new_merchant_category_id_mode,new_merchant_category_id_nunique,new_merchant_id_mode,new_merchant_id_nunique,new_merchant_visit_sum,new_merchant_visit_mean,new_merchant_visit_min,new_merchant_visit_max,new_merchant_visit_nunique,new_merchant_visit_size,new_merchant_visit_mode,new_merchant_try_mean,new_merchant_try_std,new_merchant_try_min,new_merchant_try_max,new_merchant_try_nunique,new_merchant_try_size,new_merchant_try_mode,new_installments_sum,new_installments_mean,new_installments_median,new_installments_var,new_installments_max,new_installments_min,new_installments_quantileRange,new_installments_null_cnt,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_median,new_purchase_amount_var,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_quantileRange,new_purchase_amount_skew,new_purchase_amount_over_550,new_purchase_amount_trim_sum,new_purchase_amount_trim_mean,new_purchase_amount_trim_var,new_purchase_amount_trim_max,new_purchase_amount_trim_min,new_purchase_amount_trim_skew,new_authorized_flag_mean,new_authorized_flag_sum,new_category_1_mode,new_category_1_nunique,new_category_1_mean,new_category_1_var,new_category_1_sum,new_category_3_mode,new_category_3_nunique,new_category_3_mean,new_category_3_var,new_category_2_mode,new_category_2_mean,new_category_2_var,new_category_2_nunique,new_category_2_mean_mean,new_category_2_min_mean,new_category_2_max_mean,new_category_2_sum_mean,new_category_3_mean_mean,new_category_3_min_mean,new_category_3_max_mean,new_category_3_sum_mean,new_merchant_group_id_mode,new_merchant_group_id_nunique,new_numerical_1_sum,new_numerical_1_max,new_numerical_1_min,new_numerical_1_mean,new_numerical_1_median,new_numerical_1_var,new_numerical_1_quantileRange,new_numerical_2_sum,new_numerical_2_max,new_numerical_2_min,new_numerical_2_mean,new_numerical_2_median,new_numerical_2_var,new_numerical_2_quantileRange,new_most_recent_sales_range_mode,new_most_recent_sales_range_nunique,new_most_recent_sales_range_mean,new_most_recent_sales_range_var,new_most_recent_purchases_range_mode,new_most_recent_purchases_range_nunique,new_most_recent_purchases_range_mean,new_most_recent_purchases_range_var,new_category_4_mode,new_category_4_nunique,new_category_4_mean,new_category_4_var,new_category_5_mode,new_category_5_nunique,new_category_5_mean,new_category_5_var,new_avg_sales_lag3_sum,new_avg_sales_lag3_max,new_avg_sales_lag3_min,new_avg_sales_lag3_mean,new_avg_sales_lag3_median,new_avg_sales_lag3_var,new_avg_sales_lag3_quantileRange,new_avg_purchases_lag3_sum,new_avg_purchases_lag3_max,new_avg_purchases_lag3_min,new_avg_purchases_lag3_mean,new_avg_purchases_lag3_median,new_avg_purchases_lag3_var,new_avg_purchases_lag3_quantileRange,new_active_months_lag3_sum,new_active_months_lag3_max,new_active_months_lag3_min,new_active_months_lag3_mean,new_active_months_lag3_median,new_active_months_lag3_var,new_active_months_lag3_quantileRange,new_avg_sales_lag6_sum,new_avg_sales_lag6_max,new_avg_sales_lag6_min,new_avg_sales_lag6_mean,new_avg_sales_lag6_median,new_avg_sales_lag6_var,new_avg_sales_lag6_quantileRange,new_avg_purchases_lag6_sum,new_avg_purchases_lag6_max,new_avg_purchases_lag6_min,new_avg_purchases_lag6_mean,new_avg_purchases_lag6_median,new_avg_purchases_lag6_var,new_avg_purchases_lag6_quantileRange,new_active_months_lag6_sum,new_active_months_lag6_max,new_active_months_lag6_min,new_active_months_lag6_mean,new_active_months_lag6_median,new_active_months_lag6_var,new_active_months_lag6_quantileRange,new_avg_sales_lag12_sum,new_avg_sales_lag12_max,new_avg_sales_lag12_min,new_avg_sales_lag12_mean,new_avg_sales_lag12_median,new_avg_sales_lag12_var,new_avg_sales_lag12_quantileRange,new_avg_purchases_lag12_sum,new_avg_purchases_lag12_max,new_avg_purchases_lag12_min,new_avg_purchases_lag12_mean,new_avg_purchases_lag12_median,new_avg_purchases_lag12_var,new_avg_purchases_lag12_quantileRange,new_active_months_lag12_sum,new_active_months_lag12_max,new_active_months_lag12_min,new_active_months_lag12_mean,new_active_months_lag12_median,new_active_months_lag12_var,new_active_months_lag12_quantileRange,new_city_ym_rate_mode,new_city_ym_rate_sum,new_city_ym_rate_mean,new_city_ym_rate_median,new_city_ym_rate_var,new_city_ym_rate_quantileRange
0,6,C_ID_92a2005557,5,2,1,-0.820283,2017-06-01,2017,2,22,3,333,621,17318.0,1665,666,333,0.015015,0.006006,0.003003,0,0.013145,0.008752,0.011428,0.007631,0.012902,0.010373,0.033324,0.011108,0.013145,0.008752,0.002214,8,2.666667,5,1,2.081666,77283,177,420,291.633962,293.0,5534.596569,132.0,2017,2,2017.173585,2018,2017,12,9,8.049057,12,1,11,31,15.483019,76.947627,31,1,0.108949,14,23,13.283019,24.544597,23,0,-0.859026,2018-02-25 09:31:15,2017-06-27 14:18:08,-1025,-3.867925,-4.0,5.774157,0,-8,4.0,0.04467,5,7,3.237736,6,0,50,35,33.030189,52,1,0,94,0.354717,13.215094,26.260377,16.2,8.264151,17587.396701,17344.595926,17459.20726,5529.039832,0.030782,3,2,2.071698,0.06681,3.339263,12,11,11.641509,0.230846,-0.593532,265,69,7,9,3,34,21,560,41,168840.0,94,2703.0,10.436293,1.0,32.0,12,265,1.0,1.138996,0.346612,1.0,2.0,2,265,1.0,4.0,0.015094,0.0,0.014923,1.0,0.0,0.0,0.0,-169.408819,-0.639279,-0.698613,0.044225,2.258395,-0.739395,0.100618,10.326659,0.0,-170.867213,-0.644782,0.020237,0.8,-0.739395,5.175106,0.950943,252,0,1,0.0,0.0,0,0.0,2,0.015094,0.014923,0.0,0.045283,0.17976,2,0.096102,-0.746908,5943750.0,1579047.0,0.306257,-0.746908,5922101.0,4962103.0,35.0,73,4054.082784,172.719653,-0.057471,15.652829,-0.017811,1922.392402,,3905.022112,170.736672,-0.057471,15.077305,-0.027726,1881.478646,,2.0,5,1.648649,1.60088,0.0,5,1.413127,1.599982,1.0,2,0.953668,0.044357,1.0,2,1.051064,0.20251,326.47,7.73,0.71,1.260502,1.0,1.148262,,402.586644,12.705128,0.466667,1.554389,1.008886,4.785537,,777.0,3.0,3.0,3.0,3.0,0.0,,1878.67,147.69,0.58,7.253552,0.98,848.193339,,5458.81153,504.322881,0.320988,21.076492,0.980583,9451.139039,,1554.0,6.0,6.0,6.0,6.0,0.0,,2146.41,194.61,0.53,8.287297,0.97,1157.068144,,6016.9902,554.397813,0.252963,23.231622,0.981699,11461.658777,,3081.0,12.0,7.0,11.895753,12.0,0.48909,,22.105127,5151.261826,19.438724,20.749006,20.015077,3.268274,10448.0,428.0,483.0,454.26087,454.0,275.29249,25.5,2018.0,1.0,2018.0,2018.0,2018.0,3.0,2.0,3.478261,4.0,3.0,6.0,17.0,16.434783,88.802372,31.0,5.0,0.33897,13.0,8.0,12.869565,4.209486,16.0,8.0,-0.603578,2018-04-29 11:23:05,2018-03-05 14:04:36,34.0,1.478261,1.0,0.26087,2.0,1.0,1.0,0.093233,4.0,7.0,3.130435,6.0,0.0,13.0,7.0,13.304348,17.0,10.0,0.0,6.0,0.26087,0.0,0.0,0.0,41.73913,17650.474363,17595.586528,17621.821734,274.954866,-0.035528,2.0,2.0,2.0,0.0,0.0,12.0,11.0,11.608696,0.249012,-0.477134,23.0,69.0,3.0,9.0,1.0,37.0,10.0,278.0,14.0,13708.0,23.0,23.0,1.0,1.0,1.0,1.0,23.0,1.0,1.0,0.0,1.0,1.0,1.0,23.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.244202,-0.575835,-0.58118,0.018445,-0.296112,-0.724368,0.195991,0.899623,0.0,-13.244202,-0.575835,0.018445,-0.296112,-0.724368,0.899623,1.0,23.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.566793,-0.746893,263.157498,-615576.244067,-0.630998,-0.746893,14.279604,-590147.161107,35.0,17.0,633.757544,183.735111,-0.057471,27.554676,0.002019,3156.826505,7.718753,626.083408,182.079322,-0.057471,27.221018,-0.037641,3083.435646,7.485753,0.0,5.0,1.826087,2.059289,0.0,5.0,1.913043,2.173913,1.0,2.0,0.826087,0.150198,1.0,1.0,1.0,0.0,25.31,2.99,0.68,1.100435,1.01,0.180577,0.095,25.009007,2.222222,0.812757,1.087348,1.01664,0.069379,0.133171,69.0,3.0,3.0,3.0,3.0,0.0,0.0,25.02,3.11,0.65,1.087826,1.03,0.2156,0.16,24.618138,2.333333,0.777778,1.070354,1.007515,0.092094,0.174524,137.0,6.0,5.0,5.956522,6.0,0.043478,0.0,28.15,6.48,0.63,1.223913,0.99,1.344107,0.215,26.103974,3.972222,0.676331,1.134955,0.999061,0.410256,0.231759,267.0,12.0,5.0,11.608696,12.0,2.249012,0.0,17.154857,348.327812,15.144687,17.154857,35.070408,1.463585
1,1,C_ID_3d0044924f,4,1,0,0.392913,2017-01-01,2017,1,52,6,484,772,17167.0,1936,484,0,0.008264,0.002066,0.0,0,0.010712,0.011385,0.010283,0.007631,0.017925,0.017699,0.032379,0.010793,0.011385,0.010283,0.000555,5,1.666667,4,0,2.081666,84098,5,395,230.405479,230.0,13630.71975,188.0,2017,2,2017.153425,2018,2017,1,12,6.241096,12,1,27,31,16.660274,78.071082,31,1,-0.250317,12,24,14.816438,30.903026,23,0,-0.883112,2018-01-31 22:31:09,2017-01-06 16:29:42,-1795,-4.917808,-5.0,14.399819,0,-12,6.0,-0.296477,5,7,3.391781,6,0,4,50,25.309589,52,1,0,143,0.391781,11.153425,13.457534,10.142466,0.0,17562.938299,17172.687292,17398.043242,13629.279905,-0.260091,4,2,3.093151,0.101189,2.009191,13,12,12.605479,0.23953,-0.433415,365,69,9,9,3,34,24,307,57,169273.0,142,3489.0,9.558904,1.0,27.0,13,365,1.0,1.246575,0.632908,1.0,4.0,4,365,1.0,592.0,1.630854,1.0,2.763907,10.0,1.0,,2.0,-217.777537,-0.596651,-0.707839,0.145586,4.630299,-0.7424,0.131467,8.695999,0.0,-223.166196,-0.611414,0.059785,0.8,-0.7424,3.612116,0.969863,354,0,2,0.09589,0.086934,35,1.0,2,1.217631,0.170738,0.0,0.0,0.0,1,0.098398,-0.746908,6010604.0,1601584.0,-0.296347,-0.746893,122079.5,-3846021.0,35.0,103,9955.548697,172.719653,-0.057471,27.275476,0.130913,2553.123128,40.532129,9747.395195,170.736672,-0.057471,26.705192,-0.027726,2507.096742,39.838086,0.0,5,1.073973,1.854403,0.0,5,1.065753,1.88028,1.0,2,0.920548,0.07334,1.0,1,1.0,0.0,446.58,10.75,0.48,1.223507,1.06,1.152525,0.11,512.366554,12.705128,0.523977,1.403744,1.065682,3.670223,0.080428,1095.0,3.0,3.0,3.0,3.0,0.0,0.0,754.88,147.69,0.36,2.068164,1.07,119.111735,0.13,1540.508953,504.322881,0.307888,4.220572,1.081449,1387.226497,0.12301,2190.0,6.0,6.0,6.0,6.0,0.0,0.0,811.35,166.68,0.27,2.222877,1.07,152.084819,0.19,1673.471042,554.397813,0.209035,4.584852,1.110395,1677.400351,0.190098,4329.0,12.0,7.0,11.860274,12.0,0.631522,0.0,21.186055,6053.777617,16.585692,18.005138,37.725431,4.83495,2535.0,396.0,453.0,422.5,421.5,697.1,43.0,2018.0,1.0,2018.0,2018.0,2018.0,2.0,2.0,2.5,3.0,2.0,5.0,4.0,13.5,131.5,30.0,1.0,0.370037,17.0,5.0,11.166667,24.566667,17.0,6.0,0.383255,2018-03-30 06:48:26,2018-02-01 17:07:54,9.0,1.5,1.5,0.3,2.0,1.0,1.0,0.0,0.0,4.0,1.5,4.0,0.0,6.0,4.0,9.0,13.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,56.833333,17620.283634,17563.713819,17589.9845,690.446756,0.0681,3.0,3.0,3.0,0.0,0.0,13.0,12.0,12.5,0.3,0.0,6.0,69.0,1.0,9.0,1.0,19.0,4.0,307.0,5.0,157429.0,6.0,6.0,1.0,1.0,1.0,1.0,6.0,1.0,1.0,0.0,1.0,1.0,1.0,6.0,1.0,6.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,-4.355735,-0.725956,-0.732633,0.000205,-0.701858,-0.73941,0.014226,1.187487,0.0,-4.355735,-0.725956,0.000205,-0.701858,-0.73941,1.187487,1.0,6.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.566793,-0.746893,263.157498,-615576.244067,-0.60609,-0.746893,72.452641,-514857.076705,35.0,6.0,3.274116,3.392916,-0.057471,0.545686,-0.032683,1.94713,0.076841,3.224542,3.373086,-0.057471,0.537424,-0.032683,1.930599,0.054532,3.0,3.0,1.833333,1.766667,2.0,3.0,1.333333,0.666667,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,6.08,1.16,0.94,1.013333,0.98,0.006507,0.065,5.759112,1.051867,0.739749,0.959852,0.990524,0.012267,0.018838,18.0,3.0,3.0,3.0,3.0,0.0,0.0,6.33,1.34,0.81,1.055,1.0,0.04167,0.265,5.757135,1.325019,0.694037,0.959522,0.951439,0.043664,0.111878,36.0,6.0,6.0,6.0,6.0,0.0,0.0,6.1,1.34,0.84,1.016667,0.935,0.044347,0.3,5.562803,1.330626,0.643866,0.927134,0.884415,0.05298,0.148201,72.0,12.0,12.0,12.0,12.0,0.0,0.0,16.58189,105.600997,17.600166,17.600166,1.244263,2.036552
2,8,C_ID_d639edf6cd,2,2,0,0.688056,2016-08-01,2016,3,31,0,637,925,17014.0,1274,1274,0,0.00314,0.00314,0.0,0,0.01061,0.008752,0.010283,0.015914,0.009683,0.013462,0.029645,0.009882,0.01061,0.008752,0.000992,4,1.333333,2,0,1.154701,6551,10,422,148.886364,111.0,13641.963531,186.5,2017,2,2017.068182,2018,2017,1,10,4.5,12,1,21,19,19.227273,61.295983,30,2,-0.720982,19,14,17.977273,12.301797,23,8,-0.918653,2018-02-27 19:08:25,2017-01-11 08:21:22,-382,-8.681818,-10.0,14.687104,0,-13,6.25,0.759962,4,7,3.272727,6,0,4,22,18.113636,49,2,0,11,0.25,9.454545,7.340909,8.704545,1.681818,17589.797512,17177.348171,17316.653358,13642.998605,0.789515,3,2,2.068182,0.065011,3.54848,12,11,11.590909,0.247357,-0.382982,44,143,5,5,2,33,7,705,8,221934.0,13,810.0,18.409091,1.0,28.0,4,44,28.0,1.363636,0.61345,1.0,3.0,3,44,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-29.87822,-0.67905,-0.699552,0.007482,-0.145847,-0.730138,0.037758,5.684661,0.0,-29.87822,-0.67905,0.007482,-0.145847,-0.730138,5.684661,0.954545,42,0,1,0.0,0.0,0,0.0,1,0.0,0.0,4.0,3.545455,1.649049,2,-0.081399,-0.746894,776256.7,-162973.4,0.317262,-0.746908,6010604.0,5114473.0,17894.0,9,53.787948,21.834638,-0.057471,1.222453,0.150742,20.526165,0.0,46.093982,21.59668,-0.057471,1.047591,-0.047556,20.527412,0.0,0.0,4,0.363636,0.887949,0.0,5,0.363636,0.980973,1.0,2,0.931818,0.065011,5.0,2,4.714286,1.087108,50.03,6.93,0.93,1.137045,1.0,0.799631,0.0,56.67282,12.705128,0.949288,1.288019,1.007511,3.103588,0.008295,132.0,3.0,3.0,3.0,3.0,0.0,0.0,51.9,8.42,0.86,1.179545,1.01,1.249967,0.0,60.6816,15.855769,0.871214,1.379127,1.033525,4.991645,0.0,264.0,6.0,6.0,6.0,6.0,0.0,0.0,52.61,8.57,0.76,1.195682,1.02,1.299769,0.0,61.731488,16.21978,0.752641,1.402988,1.052752,5.233572,0.0,523.0,12.0,7.0,11.886364,12.0,0.568182,0.0,2.151798,120.601534,2.740944,2.151798,8.506266,0.266678,482.0,482.0,482.0,482.0,482.0,,0.0,2018.0,1.0,2018.0,2018.0,2018.0,4.0,1.0,4.0,4.0,4.0,28.0,1.0,28.0,,28.0,28.0,,17.0,1.0,17.0,,17.0,17.0,,2018-04-28 17:43:11,2018-04-28 17:43:11,2.0,2.0,2.0,,2.0,2.0,0.0,,5.0,1.0,5.0,5.0,5.0,17.0,1.0,17.0,17.0,17.0,1.0,1.0,1.0,0.0,0.0,0.0,14.0,17649.738322,17649.738322,17649.738322,,,2.0,2.0,2.0,,,11.0,11.0,11.0,,,1.0,143.0,1.0,5.0,1.0,25.0,1.0,528.0,1.0,220897.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,-0.700326,-0.700326,-0.700326,,-0.700326,-0.700326,0.0,,0.0,-0.700326,-0.700326,,-0.700326,-0.700326,,1.0,1.0,0.0,1.0,0.0,,0.0,0.0,1.0,0.0,,4.0,4.0,,1.0,-0.54899,-0.746893,74.385651,-142362.464162,-0.630998,-0.746893,14.279604,-590147.161107,3124.0,1.0,-0.027726,-0.027726,-0.027726,-0.027726,-0.027726,,0.0,-0.037641,-0.037641,-0.037641,-0.037641,-0.037641,,0.0,2.0,1.0,2.0,,2.0,1.0,2.0,,1.0,1.0,1.0,,5.0,1.0,5.0,,1.33,1.33,1.33,1.33,1.33,,0.0,1.297775,1.297775,1.297775,1.297775,1.297775,,0.0,3.0,3.0,3.0,3.0,3.0,,0.0,1.27,1.27,1.27,1.27,1.27,,0.0,1.291121,1.291121,1.291121,1.291121,1.291121,,0.0,6.0,6.0,6.0,6.0,6.0,,0.0,1.08,1.08,1.08,1.08,1.08,,0.0,1.099875,1.099875,1.099875,1.099875,1.099875,,0.0,10.0,10.0,10.0,10.0,10.0,,0.0,2.755332,2.755332,2.755332,2.755332,,0.0


In [87]:
test.shape

(123623, 37)

In [88]:
train.shape

(201917, 282)

In [89]:
train.dtypes[train.dtypes == 'object']

card_id                   object
first_active              object
hist_purchase_date_max    object
hist_purchase_date_min    object
dtype: object

## 제거할 피처

In [90]:
FEATS_EXCLUDED = ['first_active', 'card_id', 'target', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 
                  'new_purchase_date_max', 'new_purchase_date_min']

# FEATS_EXCLUDED += drop_features(train, 'hist_duration')
# FEATS_EXCLUDED += drop_features(train, 'hist_amount_month_ratio')
# FEATS_EXCLUDED += drop_features(train, 'new_duration')
# FEATS_EXCLUDED += drop_features(train, 'new_amount_month_ratio')
# FEATS_EXCLUDED += drop_features(train, 'hist_price')
# FEATS_EXCLUDED += drop_features(train, 'new_price')
# FEATS_EXCLUDED += drop_features(train, 'days_')
# FEATS_EXCLUDED += drop_features(train, 'feature_1_outlier')

In [91]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

In [92]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=15)

oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance = pd.DataFrame()

train_columns = [f for f in train.columns if f not in FEATS_EXCLUDED]

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['outliers'].values)):    
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train.iloc[trn_idx][train_columns], label=train.iloc[trn_idx]['target'])
    val_data = lgb.Dataset(train.iloc[val_idx][train_columns], label=train.iloc[val_idx]['target'])

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx][train_columns], num_iteration=clf.best_iteration)
    
    predictions_lgb += clf.predict(test[train_columns], num_iteration=clf.best_iteration) / folds.n_splits
    
    fold_importance = pd.DataFrame()
    fold_importance["Feature"] = train_columns
    fold_importance["importance"] = clf.feature_importance()
    fold_importance["fold"] = fold_ + 1
    feature_importance = pd.concat([feature_importance, fold_importance], axis=0)
    
print("CV score: {:<8.5f}".format(mean_squared_error(train.target.values, oof_lgb)**0.5))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 3.68945	valid_1's rmse: 3.75943
[200]	training's rmse: 3.6156	valid_1's rmse: 3.74535
[300]	training's rmse: 3.56733	valid_1's rmse: 3.74161
[400]	training's rmse: 3.53406	valid_1's rmse: 3.74018
[500]	training's rmse: 3.50888	valid_1's rmse: 3.74003
[600]	training's rmse: 3.48589	valid_1's rmse: 3.73981
Early stopping, best iteration is:
[413]	training's rmse: 3.53055	valid_1's rmse: 3.73975


KeyError: "['hist_fromRefDate_sum' 'hist_fromRefDate_min' 'hist_fromRefDate_max'\n 'hist_fromRefDate_mean' 'hist_fromRefDate_median' 'hist_fromRefDate_var'\n 'hist_fromRefDate_quantileRange' 'hist_purchase_year_mode'\n 'hist_purchase_year_nunique' 'hist_purchase_year_mean'\n 'hist_purchase_year_max' 'hist_purchase_year_min'\n 'hist_purchase_month_mode' 'hist_purchase_month_nunique'\n 'hist_purchase_month_mean' 'hist_purchase_month_max'\n 'hist_purchase_month_min' 'hist_purchase_day_mode'\n 'hist_purchase_day_nunique' 'hist_purchase_day_mean'\n 'hist_purchase_day_var' 'hist_purchase_day_max' 'hist_purchase_day_min'\n 'hist_purchase_day_skew' 'hist_purchase_hour_mode'\n 'hist_purchase_hour_nunique' 'hist_purchase_hour_mean'\n 'hist_purchase_hour_var' 'hist_purchase_hour_max'\n 'hist_purchase_hour_min' 'hist_purchase_hour_skew' 'hist_month_lag_sum'\n 'hist_month_lag_mean' 'hist_month_lag_median' 'hist_month_lag_var'\n 'hist_month_lag_max' 'hist_month_lag_min' 'hist_month_lag_quantileRange'\n 'hist_month_lag_skew' 'hist_purchase_dayofweek_mode'\n 'hist_purchase_dayofweek_nunique' 'hist_purchase_dayofweek_mean'\n 'hist_purchase_dayofweek_max' 'hist_purchase_dayofweek_min'\n 'hist_purchase_weekofyear_mode' 'hist_purchase_weekofyear_nunique'\n 'hist_purchase_weekofyear_mean' 'hist_purchase_weekofyear_max'\n 'hist_purchase_weekofyear_min' 'hist_purchase_weekend_mode'\n 'hist_purchase_weekend_sum' 'hist_purchase_weekend_mean'\n 'hist_Christmas_Day_2017_mean' 'hist_Children_day_2017_mean'\n 'hist_Black_Friday_2017_mean' 'hist_Mothers_Day_2018_mean'\n 'hist_purchase_date_total_day_max' 'hist_purchase_date_total_day_min'\n 'hist_purchase_date_total_day_mean' 'hist_purchase_date_total_day_var'\n 'hist_purchase_date_total_day_skew' 'hist_month_diff_from_trade_max'\n 'hist_month_diff_from_trade_min' 'hist_month_diff_from_trade_mean'\n 'hist_month_diff_from_trade_var' 'hist_month_diff_from_trade_skew'\n 'hist_month_diff_from_today_max' 'hist_month_diff_from_today_min'\n 'hist_month_diff_from_today_mean' 'hist_month_diff_from_today_var'\n 'hist_month_diff_from_today_skew' 'hist_city_id_count'\n 'hist_city_id_mode' 'hist_city_id_nunique' 'hist_state_id_mode'\n 'hist_state_id_nunique' 'hist_subsector_id_mode'\n 'hist_subsector_id_nunique' 'hist_merchant_category_id_mode'\n 'hist_merchant_category_id_nunique' 'hist_merchant_id_mode'\n 'hist_merchant_id_nunique' 'hist_merchant_visit_sum'\n 'hist_merchant_visit_mean' 'hist_merchant_visit_min'\n 'hist_merchant_visit_max' 'hist_merchant_visit_nunique'\n 'hist_merchant_visit_size' 'hist_merchant_visit_mode'\n 'hist_merchant_try_mean' 'hist_merchant_try_std' 'hist_merchant_try_min'\n 'hist_merchant_try_max' 'hist_merchant_try_nunique'\n 'hist_merchant_try_size' 'hist_merchant_try_mode' 'hist_installments_sum'\n 'hist_installments_mean' 'hist_installments_median'\n 'hist_installments_var' 'hist_installments_max' 'hist_installments_min'\n 'hist_installments_quantileRange' 'hist_installments_null_cnt'\n 'hist_purchase_amount_sum' 'hist_purchase_amount_mean'\n 'hist_purchase_amount_median' 'hist_purchase_amount_var'\n 'hist_purchase_amount_max' 'hist_purchase_amount_min'\n 'hist_purchase_amount_quantileRange' 'hist_purchase_amount_skew'\n 'hist_purchase_amount_over_550' 'hist_purchase_amount_trim_sum'\n 'hist_purchase_amount_trim_mean' 'hist_purchase_amount_trim_var'\n 'hist_purchase_amount_trim_max' 'hist_purchase_amount_trim_min'\n 'hist_purchase_amount_trim_skew' 'hist_authorized_flag_mean'\n 'hist_authorized_flag_sum' 'hist_category_1_mode'\n 'hist_category_1_nunique' 'hist_category_1_mean' 'hist_category_1_var'\n 'hist_category_1_sum' 'hist_category_3_mode' 'hist_category_3_nunique'\n 'hist_category_3_mean' 'hist_category_3_var' 'hist_category_2_mode'\n 'hist_category_2_mean' 'hist_category_2_var' 'hist_category_2_nunique'\n 'hist_category_2_mean_mean' 'hist_category_2_min_mean'\n 'hist_category_2_max_mean' 'hist_category_2_sum_mean'\n 'hist_category_3_mean_mean' 'hist_category_3_min_mean'\n 'hist_category_3_max_mean' 'hist_category_3_sum_mean'\n 'hist_merchant_group_id_mode' 'hist_merchant_group_id_nunique'\n 'hist_numerical_1_sum' 'hist_numerical_1_max' 'hist_numerical_1_min'\n 'hist_numerical_1_mean' 'hist_numerical_1_median' 'hist_numerical_1_var'\n 'hist_numerical_1_quantileRange' 'hist_numerical_2_sum'\n 'hist_numerical_2_max' 'hist_numerical_2_min' 'hist_numerical_2_mean'\n 'hist_numerical_2_median' 'hist_numerical_2_var'\n 'hist_numerical_2_quantileRange' 'hist_most_recent_sales_range_mode'\n 'hist_most_recent_sales_range_nunique'\n 'hist_most_recent_sales_range_mean' 'hist_most_recent_sales_range_var'\n 'hist_most_recent_purchases_range_mode'\n 'hist_most_recent_purchases_range_nunique'\n 'hist_most_recent_purchases_range_mean'\n 'hist_most_recent_purchases_range_var' 'hist_category_4_mode'\n 'hist_category_4_nunique' 'hist_category_4_mean' 'hist_category_4_var'\n 'hist_category_5_mode' 'hist_category_5_nunique' 'hist_category_5_mean'\n 'hist_category_5_var' 'hist_avg_sales_lag3_sum' 'hist_avg_sales_lag3_max'\n 'hist_avg_sales_lag3_min' 'hist_avg_sales_lag3_mean'\n 'hist_avg_sales_lag3_median' 'hist_avg_sales_lag3_var'\n 'hist_avg_sales_lag3_quantileRange' 'hist_avg_purchases_lag3_sum'\n 'hist_avg_purchases_lag3_max' 'hist_avg_purchases_lag3_min'\n 'hist_avg_purchases_lag3_mean' 'hist_avg_purchases_lag3_median'\n 'hist_avg_purchases_lag3_var' 'hist_avg_purchases_lag3_quantileRange'\n 'hist_active_months_lag3_sum' 'hist_active_months_lag3_max'\n 'hist_active_months_lag3_min' 'hist_active_months_lag3_mean'\n 'hist_active_months_lag3_median' 'hist_active_months_lag3_var'\n 'hist_active_months_lag3_quantileRange' 'hist_avg_sales_lag6_sum'\n 'hist_avg_sales_lag6_max' 'hist_avg_sales_lag6_min'\n 'hist_avg_sales_lag6_mean' 'hist_avg_sales_lag6_median'\n 'hist_avg_sales_lag6_var' 'hist_avg_sales_lag6_quantileRange'\n 'hist_avg_purchases_lag6_sum' 'hist_avg_purchases_lag6_max'\n 'hist_avg_purchases_lag6_min' 'hist_avg_purchases_lag6_mean'\n 'hist_avg_purchases_lag6_median' 'hist_avg_purchases_lag6_var'\n 'hist_avg_purchases_lag6_quantileRange' 'hist_active_months_lag6_sum'\n 'hist_active_months_lag6_max' 'hist_active_months_lag6_min'\n 'hist_active_months_lag6_mean' 'hist_active_months_lag6_median'\n 'hist_active_months_lag6_var' 'hist_active_months_lag6_quantileRange'\n 'hist_avg_sales_lag12_sum' 'hist_avg_sales_lag12_max'\n 'hist_avg_sales_lag12_min' 'hist_avg_sales_lag12_mean'\n 'hist_avg_sales_lag12_median' 'hist_avg_sales_lag12_var'\n 'hist_avg_sales_lag12_quantileRange' 'hist_avg_purchases_lag12_sum'\n 'hist_avg_purchases_lag12_max' 'hist_avg_purchases_lag12_min'\n 'hist_avg_purchases_lag12_mean' 'hist_avg_purchases_lag12_median'\n 'hist_avg_purchases_lag12_var' 'hist_avg_purchases_lag12_quantileRange'\n 'hist_active_months_lag12_sum' 'hist_active_months_lag12_max'\n 'hist_active_months_lag12_min' 'hist_active_months_lag12_mean'\n 'hist_active_months_lag12_median' 'hist_active_months_lag12_var'\n 'hist_active_months_lag12_quantileRange' 'hist_city_ym_rate_mode'\n 'hist_city_ym_rate_sum' 'hist_city_ym_rate_mean'\n 'hist_city_ym_rate_median' 'hist_city_ym_rate_var'\n 'hist_city_ym_rate_quantileRange'] not in index"

In [70]:
len(train.card_id)

201917

In [71]:
len(oof_lgb)

201917

In [66]:
print("CV score: {:<8.5f}".format(mean_squared_error(train.target.values, oof_lgb)**0.5))

CV score: 3.65209 


In [74]:
sub = pd.read_csv('./data/sample_submission.csv')

In [76]:
sub.target = predictions_lgb

In [78]:
sub.to_csv('./data/sub_new_fe.csv', index=False)

In [72]:
predictions_lgb

array([-2.50495246, -0.32532403, -0.81959737, ...,  0.8803522 ,
       -3.04385696, -0.03560506])