In [1]:
import pandas as pd
import numpy as np
import gc
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [2]:
# Master list of engineered feature column names. The loading cell slices this
# list (FEATURES[600:]) and reads only those columns, plus the ID / label /
# timestamp columns, from the train_FE012 / test_FE012 parquet files.
# NOTE(review): the order of this list matters for that positional slice, so
# do not re-sort it without also revisiting the slice offset.
FEATURES = ['V85',
         'bank_type_TransactionAmt_mean',
         'D5_fq_enc',
         'V12',
         'V81',
         'V282',
         'bank_type_D7_std',
         'id_15',
         'V13',
         'C12_fq_enc',
         'anomaly',
         'D7_DT_D_std_score',
         'D3_DT_D_min_max',
         'card4_count_full',
         'D14_DT_D_min_max',
         'card1_count_full',
         'V169',
         'D3_DT_M_min_max',
         'V279',
         'V91',
         'bank_type_D10_std',
         'D14',
         'D6_DT_M_std_score',
         'D4_DT_W_min_max',
         'V152',
         'V56',
         'D3_intercept_bin0',
         'D14_intercept_bin0',
         'V220',
         'V277',
         'D12_intercept',
         'ProductCD_W_00cents',
         'D13_intercept_bin0',
         'V291',
         'V189',
         'D15_DT_M_min_max',
         'C5_fq_enc',
         'D3_fq_enc',
         'card5_fq_enc',
         'addr1_count_full',
         'V266',
         'D11_intercept_bin2',
         'V23',
         'D4_intercept_bin3',
         'bank_type_D10_mean',
         'D2_intercept_bin3',
         'V306',
         'DeviceType',
         'V285',
         'D5_DT_W_std_score',
         'V131',
         'V37',
         'V296',
         'bank_type_D1_mean',
         'V75',
         'D3_DT_W_std_score',
         'D10_DT_M_min_max',
         'id_33_0',
         'V67',
         'D4_intercept_bin4',
         'V256',
         'V143',
         'uid5_D6_std',
         'ProductCD_target_mean',
         'mxC3',
         'V129',
         'D13_DT_M_std_score',
         'V24',
         'D3_DT_M_std_score',
         'mxC4',
         'D9',
         'id_30_version_fq_enc',
         'D5_DT_D_std_score',
         'D11_DT_M_std_score',
         'uid5_D6_mean',
         'D14_DT_M_std_score',
         'card5_TransactionAmt_std',
         'V20',
         'C8_fq_enc',
         'V70',
         'V127',
         'D6_intercept',
         'D15_DT_W_min_max',
         'sum_Cxx_binary_higher_than_q95',
         'V156',
         'uid4_D12_mean',
         'C5',
         'uid4_D12_std',
         'id_30_fq_enc',
         'V61',
         'id_33',
         'D15_to_std_addr1',
         'bank_type_D9_mean',
         'D5_intercept',
         'D10_DT_W_min_max',
         'V130',
         'bank_type_D9_std',
         'uid5_D7_std',
         'bank_type_D14_mean',
         'bank_type_D3_std',
         'bank_type_D5_mean',
         'ProductCD',
         'M8',
         'V44',
         'D6_fq_enc',
         'D15_DT_D_min_max',
         'D11_intercept_bin0',
         'V257',
         'bank_type_D7_mean',
         'V76',
         'D15',
         'V38',
         'V55',
         'V261',
         'V149',
         'D4',
         'D8_intercept_bin0',
         'M2',
         'bank_type_D6_std',
         'id_30_version',
         'D4_intercept_bin1',
         'D15_to_mean_card4',
         'V82',
         'D3_DT_D_std_score',
         'D10_intercept_bin3',
         'bank_type_D2_std',
         'V77',
         'M7',
         'D11',
         'D4_intercept_bin2',
         'email_check',
         'V294',
         'V317',
         'V308',
         'id_33_fq_enc',
         'bank_type_D5_std',
         'D8_intercept',
         'V62',
         'V187',
         'card5_TransactionAmt_mean',
         'bank_type_D12_mean',
         'id_33_count_dist',
         'D2_intercept_bin2',
         'C10',
         'V86',
         'D8_DT_M_min_max',
         'D15_intercept_bin4',
         'D6_DT_W_std_score',
         'uid5_D7_mean',
         'C9_fq_enc',
         'mxC10',
         'D14_DT_W_std_score',
         'card2_count_full',
         'V258',
         'bank_type_D14_std',
         'D10_intercept_bin4',
         'V83',
         'bank_type_D13_std',
         'D8_DT_W_min_max',
         'TransactionAmt',
         'V312',
         'D14_intercept',
         'id_33_1',
         'D15_intercept_bin2',
         'D12_DT_W_std_score',
         'V78',
         'D8_D9_decimal_dist',
         'M9',
         'V281',
         'bank_type_D12_std',
         'V54',
         'C9',
         'M4_target_mean',
         'sum_Cxx_binary_higher_than_q90',
         'D10_DT_D_min_max',
         'bank_type_D3_mean',
         'bank_type_D8_mean',
         'R_emaildomain_prefix',
         'bank_type_D6_mean',
         'V314',
         'D11_DT_W_std_score',
         'D10',
         'D4_DT_D_min_max',
         'V283',
         'D10_intercept_bin2',
         'D13_intercept',
         'D8_DT_D_min_max',
         'C2_fq_enc',
         'V165',
         'D1_intercept_bin4',
         'bank_type_D13_mean',
         'D3_intercept',
         'TransactionAmt_2Dec',
         'card3_div_Mean_D9_DOY',
         'C12',
         'D4_DT_M_std_score',
         'D2_intercept_bin1',
         'mxC8',
         'D2_fq_enc',
         'addr1_third_digit',
         'D4_fq_enc',
         'D1_fq_enc',
         'mxC12',
         'D8',
         'D10_intercept_bin1',
         'id_01',
         'id_09',
         'id_03',
         'addr1_second_digit',
         'D15_to_mean_addr1',
         'sum_Cxx_binary_higher_than_q80',
         'V53',
         'TransactionAmt_decimal',
         'card3_div_Mean_D6_DOY',
         'D15_intercept_bin3',
         'V45',
         'id_02_to_std_card4',
         'addr2_div_Mean_D10_DOY_productCD',
         'DeviceInfo_version',
         'DeviceInfo_device',
         'D1_intercept_bin3',
         'D11_intercept',
         'DeviceInfo_version_fq_enc',
         'C6',
         'uid5_D13_std',
         'TransactionAmt_DT_M_min_max',
         'dist2',
         'C8',
         'D15_intercept_bin1',
         'M3',
         'R_emaildomain_fq_enc',
         'DeviceInfo_device_fq_enc',
         'D6_DT_D_std_score',
         'sum_Cxx_binary_higher_than_q60',
         'D11__DeviceInfo',
         'TranAmt_div_Mean_D12_DOY_productCD',
         'D10_DT_M_std_score',
         'uid5_D13_mean',
         'mxC5',
         'id_30',
         'addr2_div_Mean_D4_DOY',
         'uid2_D12_std',
         'C11_fq_enc',
         'id_06',
         'uid2_D12_mean',
         'sum_Cxx_binary_higher_than_q70',
         'V310',
         'V307',
         'C6_fq_enc',
         'D8_fq_enc',
         'dist2_fq_enc',
         'D2_intercept_bin0',
         'addr1_div_Mean_D10_DOY_productCD',
         'addr1_div_Mean_D10_DOY',
         'addr1_div_Mean_D11_DOY',
         'uid2_D8_std',
         'id_02__id_20',
         'V313',
         'D4_intercept_bin0',
         'D11_DT_D_std_score',
         'Transaction_day_of_week',
         'card6_div_Mean_D3_DOY',
         'uid2_D1_std',
         'uid5_D11_mean',
         'uid_fq_enc',
         'D14_DT_D_std_score',
         'D12_DT_D_std_score',
         'id_02_to_mean_card4',
         'uid4_D13_std',
         'D1_intercept_bin1',
         'id_02_to_std_card1',
         'uid5_D11_std',
         'P_emaildomain_prefix',
         'DT_day',
         'D8_DT_M_std_score',
         'uid2_D1_mean',
         'TransactionAmt_to_mean_card4',
         'card5_div_Mean_D11_DOY',
         'D15_DT_M_std_score',
         'V87',
         'uid_D12_std',
         'id_31_device_fq_enc',
         'uid2_D11_mean',
         'card3_DT_W_week_day_dist_best',
         'uid5_D14_std',
         'uid2_D15_mean',
         'sum_Cxx_binary_higher_than_q50',
         'id_13',
         'card3_div_Mean_D11_DOY',
         'C11',
         'bank_type_DT_W_week_day_dist_best',
         'card4_div_Mean_D11_DOY',
         'addr1_div_Mean_D1_DOY',
         'uid2_D4_mean',
         'card2_div_Mean_D11_DOY',
         'C13_fq_enc',
         'uid4_D13_mean',
         'card5_DT_W_week_day_dist_best',
         'id_02',
         'uid5_D14_mean',
         'uid2_D10_mean',
         'id_01_count_dist',
         'D13_DT_W_std_score',
         'C2',
         'C14',
         'addr2_div_Mean_D10_DOY',
         'uid2_D11_std',
         'addr1_div_Mean_D1_DOY_productCD',
         'id_02_to_mean_card1',
         'dist1_fq_enc',
         'card1_div_Mean_D11_DOY',
         'D15_to_std_card1',
         'TransactionAmt_DT_M_std_score',
         'uid2_D6_std',
         'TransactionAmt_to_std_card4',
         'uid2_D15_std',
         'uid3_D8_std',
         'card6_div_Mean_D11_DOY',
         'TranAmt_div_Mean_D14_DOY',
         'card3_div_Mean_D14_DOY',
         'D2',
         'D1',
         'uid_D15_mean',
         'uid4_D6_std',
         'uid_D15_std',
         'D10_intercept_bin0',
         'DeviceInfo_fq_enc',
         'uid2_D13_std',
         'uid_D12_mean',
         'uid4_D6_mean',
         'uid_D1_std',
         'D1_intercept_bin2',
         'uid_D10_mean',
         'card2__id_20',
         'uid4_D7_std',
         'uid3_D13_std',
         'C14_fq_enc',
         'uid_D8_std',
         'uid3_D13_mean',
         'uid2_D4_std',
         'addr1_div_Mean_D4_DOY',
         'uid_D4_mean',
         'D4_DT_W_std_score',
         'addr2_div_Mean_D1_DOY_productCD',
         'uid_D11_mean',
         'D15_intercept_bin0',
         'uid2_D10_std',
         'uid_D13_std',
         'uid2_fq_enc',
         'uid2_D13_mean',
         'uid2_D2_mean',
         'D2_intercept',
         'uid_D11_std',
         'card2',
         'uid4_D14_std',
         'C_sum_after_clip75',
         'R_emaildomain',
         'dist1',
         'id_05',
         'uid_TransactionAmt_mean',
         'uid_D1_mean',
         'uid3_D1_std',
         'uid5_D8_std',
         'uid3_D6_std',
         'Transaction_hour_of_day',
         'uid4_D14_mean',
         'uid5_D10_std',
         'uid3_D10_std',
         'uid5_D1_std',
         'uid5_D15_std',
         'uid2_D7_mean',
         'uid3_D11_std',
         'uid4_D8_std',
         'D13_DT_D_std_score',
         'uid3_D11_mean',
         'uid2_D14_std',
         'uid2_D7_std',
         'uid2_D14_mean',
         'uid_D13_mean',
         'uid_D10_std',
         'uid2_D3_std',
         'uid_D6_std',
         'uid3_D15_std',
         'addr1_fq_enc',
         'id_31',
         'uid_TransactionAmt_std',
         'card1_div_Mean_D4_DOY_productCD',
         'uid2_TransactionAmt_mean',
         'C_sum_after_clip90',
         'uid2_TransactionAmt_std',
         'uid4_D7_mean',
         'uid2_D6_mean',
         'uid3_D15_mean',
         'D15_to_mean_card1',
         'uid5_D15_mean',
         'M4',
         'uid3_D7_std',
         'card2_div_Mean_D4_DOY',
         'card5_div_Mean_D4_DOY_productCD',
         'card5_div_Mean_D4_DOY',
         'D4_intercept',
         'uid_D4_std',
         'card6_div_Mean_D4_DOY_productCD',
         'card5__P_emaildomain',
         'card1_fq_enc',
         'uid5_D10_mean',
         'card1_div_Mean_D4_DOY',
         'C1',
         'M6',
         'uid2_D2_std',
         'P_emaildomain_fq_enc',
         'card1_TransactionAmt_mean',
         'uid3_D10_mean',
         'TransactionAmt_DT_W_min_max',
         'uid5_D4_std',
         'card1_div_Mean_D10_DOY_productCD',
         'uid3_D1_mean',
         'card1_div_Mean_D10_DOY',
         'uid_D14_mean',
         'mxC9',
         'TranAmt_div_Mean_D4_DOY_productCD',
         'D15_DT_W_std_score',
         'DeviceInfo__P_emaildomain',
         'uid3_D14_mean',
         'bank_type_DT_M',
         'mxC11',
         'uid5_D1_mean',
         'uid_D2_mean',
         'D10_DT_W_std_score',
         'card3_DT_M_month_day_dist_best',
         'uid3_D2_std',
         'TranAmt_div_Mean_D4_DOY',
         'card1_TransactionAmt_std',
         'card3_div_Mean_D4_DOY_productCD',
         'D1_intercept_bin0',
         'uid3_D4_std',
         'card2_div_Mean_D10_DOY',
         'uid_D2_std',
         'uid3_D14_std',
         'uid3_D4_mean',
         'uid_D7_mean',
         'uid5_D2_std',
         'card4_div_Mean_D4_DOY_productCD',
         'card6_div_Mean_D4_DOY',
         'TranAmt_div_Mean_D10_DOY',
         'uid2_D9_std',
         'TransactionAmt_DT_W_std_score',
         'C1_fq_enc',
         'card1_div_Mean_D1_DOY',
         'uid5_D4_mean',
         'uid3_D6_mean',
         'mxC14',
         'uid5_D2_mean',
         'card4_div_Mean_D4_DOY',
         'card3_div_Mean_D4_DOY',
         'uid_D14_std',
         'M5',
         'C13',
         'mxC6',
         'card5_div_Mean_D10_DOY_productCD',
         'card3_DT_M_month_day_dist',
         'card2_div_Mean_D10_DOY_productCD',
         'uid_D7_std',
         'card2_div_Mean_D4_DOY_productCD',
         'bank_type_DT_M_month_day_dist',
         'uid3_D7_mean',
         'uid_D3_std',
         'uid5_fq_enc',
         'uid3_fq_enc',
         'uid_D3_mean',
         'D4_DT_D_std_score',
         'uid3_D2_mean',
         'uid4_D1_std',
         'uid2_D5_std',
         'uid4_D10_std',
         'bank_type_DT_D_hour_dist_best',
         'uid2_D8_mean',
         'card6_div_Mean_D10_DOY_productCD',
         'card1_div_Mean_D1_DOY_productCD',
         'uid5_D9_std',
         'card4_div_Mean_D10_DOY_productCD',
         'uid2_D3_mean',
         'uid_D6_mean',
         'card2_div_Mean_D1_DOY',
         'card5_div_Mean_D10_DOY',
         'mxC2',
         'card2_TransactionAmt_std',
         'bank_type_DT_W_week_day_dist',
         'card2_TransactionAmt_mean',
         'uid4_D10_mean',
         'id_31_count_dist',
         'TranAmt_div_Mean_D1_DOY',
         'uid3_D3_std',
         'uid4_D15_std',
         'card5_div_Mean_D1_DOY_productCD',
         'card4_div_Mean_D10_DOY',
         'card5_DT_D_hour_dist_best',
         'uid4_D4_std',
         'card5_DT_M_month_day_dist',
         'bank_type_DT_W',
         'addr1__card1',
         'bank_type_DT_M_month_day_dist_best',
         'card2_div_Mean_D1_DOY_productCD',
         'card6_div_Mean_D10_DOY',
         'uid2_D5_mean',
         'uid_DT_M',
         'card2__dist1',
         'uid2_D9_mean',
         'card5_DT_M_month_day_dist_best',
         'TranAmt_div_Mean_D10_DOY_productCD',
         'uid4_D11_std',
         'uid_D5_mean',
         'uid5_D3_std',
         'TransactionAmt_DT_D_std_score',
         'D8_DT_W_std_score',
         'card5_DT_W_week_day_dist',
         'uid5_D5_std',
         'card3_DT_W_week_day_dist',
         'uid4_D9_std',
         'D10_intercept',
         'uid3_D3_mean',
         'uid4_D5_std',
         'uid_D5_std',
         'card5_div_Mean_D1_DOY',
         'uid5_D3_mean',
         'bank_type_DT_D',
         'uid4_D1_mean',
         'uid_D8_mean',
         'uid3_D5_mean',
         'D15_intercept',
         'uid5_TransactionAmt_std',
         'uid3_D5_std',
         'uid4_D4_mean',
         'uid4_D15_mean',
         'uid5_D8_mean',
         'uid5_D9_mean',
         'uid_D9_std',
         'uid_D9_mean',
         'uid5_D5_mean',
         'mtransamt',
         'bank_type_DT_D_hour_dist',
         'uid4_D11_mean',
         'D15_DT_D_std_score',
         'TransactionAmt_DT_D_min_max',
         'uid4_D2_mean',
         'ntrans',
         'addr2_div_Mean_D1_DOY',
         'uid5_TransactionAmt_mean',
         'uid3_D9_std',
         'TransactionAmt_Dec',
         'uid3_TransactionAmt_std',
         'card5_DT_D_hour_dist',
         'card1',
         'card4_div_Mean_D1_DOY_productCD',
         'P_emaildomain__C2',
         'card3_div_Mean_D10_DOY',
         'uid4_D3_std',
         'card3_DT_D_hour_dist_best',
         'uid4_D8_mean',
         'uid4_D2_std',
         'card6_div_Mean_D1_DOY_productCD',
         'uid_DT_W',
         'Sum_TransAmt_Day',
         'uid4_D5_mean',
         'card4_div_Mean_D1_DOY',
         'card3_div_Mean_D10_DOY_productCD',
         'uid3_D8_mean',
         'TransactionAmt_userid_median',
         'uid4_fq_enc',
         'uid3_TransactionAmt_mean',
         'uid3_D9_mean',
         'card6_div_Mean_D1_DOY',
         'Trans_Count_Day',
         'mxC1',
         'D10_DT_D_std_score',
         'card3_div_Mean_D1_DOY',
         'TransactionAmt_to_mean_card1',
         'card2_fq_enc',
         'product_type',
         'card3_div_Mean_D1_DOY_productCD',
         'TransactionAmt_to_std_card1',
         'uid_DT_D',
         'uid4_D9_mean',
         'D1_intercept',
         'card3_DT_D_hour_dist',
         'TranAmt_div_Mean_D1_DOY_productCD',
         'product_type_DT_M',
         'uid4_D3_mean',
         'uid4_TransactionAmt_mean',
         'uid4_TransactionAmt_std',
         'D8_DT_D_std_score',
         'Mean_TransAmt_Day',
         'minDT',
         'product_type_DT_W',
         'mintransamt',
         'maxtransamt',
         'TransactionAmt_userid_std',
         'P_emaildomain',
         'card1__card5',
         'product_type_DT_D',
         'mxC13',
         'maxDT',
         'id_19',
         'DeviceInfo',
         'id_20',
         'addr1',
         'userid_min_C1',
         'userid_max_C1',
         'userid_max_minus_min_C1',
         'userid_unique_C1',
         'userid_mean_C1',
         'userid_min_C2',
         'userid_max_C2',
         'userid_max_minus_min_C2',
         'userid_unique_C2',
         'userid_mean_C2',
         'userid_min_C3',
         'userid_max_C3',
         'userid_max_minus_min_C3',
         'userid_unique_C3',
         'userid_mean_C3',
         'userid_min_C4',
         'userid_max_C4',
         'userid_max_minus_min_C4',
         'userid_unique_C4',
         'userid_mean_C4',
         'userid_min_C5',
         'userid_max_C5',
         'userid_max_minus_min_C5',
         'userid_unique_C5',
         'userid_mean_C5',
         'userid_min_C6',
         'userid_max_C6',
         'userid_max_minus_min_C6',
         'userid_unique_C6',
         'userid_mean_C6',
         'userid_min_C7',
         'userid_max_C7',
         'userid_max_minus_min_C7',
         'userid_unique_C7',
         'userid_mean_C7',
         'userid_min_C8',
         'userid_max_C8',
         'userid_max_minus_min_C8',
         'userid_unique_C8',
         'userid_mean_C8',
         'userid_min_C9',
         'userid_max_C9',
         'userid_max_minus_min_C9',
         'userid_unique_C9',
         'userid_mean_C9',
         'userid_min_C10',
         'userid_max_C10',
         'userid_max_minus_min_C10',
         'userid_unique_C10',
         'userid_mean_C10',
         'userid_min_C11',
         'userid_max_C11',
         'userid_max_minus_min_C11',
         'userid_unique_C11',
         'userid_mean_C11',
         'userid_min_C12',
         'userid_max_C12',
         'userid_max_minus_min_C12',
         'userid_unique_C12',
         'userid_mean_C12',
         'userid_min_C13',
         'userid_max_C13',
         'userid_max_minus_min_C13',
         'userid_unique_C13',
         'userid_mean_C13',
         'userid_min_C14',
         'userid_max_C14',
         'userid_max_minus_min_C14',
         'userid_unique_C14',
         'userid_mean_C14',
            'hour',
         'hour_sin',
         'week',
         'week_sin',
         'week_cos',
         'month',
         'life_of_customer',
         'addr1_broad_area',
         'uid6_TransactionAmt_mean',
         'uid6_TransactionAmt_std',
         'hour_TransactionAmt_mean',
         'hour_TransactionAmt_std',
         'week_TransactionAmt_mean',
         'week_TransactionAmt_std',
         'D1_diff',
         'D10_diff',
         'D15_diff',
         'new_identity_M5_mean',
         'new_identity_M6_mean',
         'new_identity_V315_mean',
         'new_identity_D1_diff_mean',
         'new_identity_D3_mean',
         'new_identity_D10_diff_mean',
         'new_identity_D15_diff_mean',
         'addr1_addr2_new_identity_M5_mean_mean',
         'addr1_addr2_new_identity_M5_mean_std',
         'addr1_addr2_new_identity_M6_mean_mean',
         'addr1_addr2_new_identity_M6_mean_std',
         'addr1_addr2_new_identity_V315_mean_mean',
         'addr1_addr2_new_identity_V315_mean_std',
         'addr1_addr2_new_identity_D1_diff_mean_mean',
         'addr1_addr2_new_identity_D1_diff_mean_std',
         'addr1_addr2_new_identity_D10_diff_mean_mean',
         'addr1_addr2_new_identity_D10_diff_mean_std',
         'addr1_addr2_new_identity_D15_diff_mean_mean',
         'addr1_addr2_new_identity_D15_diff_mean_std',
         'new_identity_ProductCD_TransactionAmt_mean',
         'uid6_C1_mean',
         'uid6_C1_std',
         'uid6_V54_mean',
         'uid6_V54_std',
         'uid6_V281_mean',
         'uid6_V281_std',
         'uid6_C11_mean',
         'uid6_C11_std',
         'uid6_D4_mean',
         'uid6_D4_std',
         'uid6_V67_mean',
         'uid6_V67_std',
         'uid6_V320_mean',
         'uid6_V320_std',
         'uid6_M5_mean',
         'uid6_M5_std',
         'uid6_M6_mean',
         'uid6_M6_std',
         'uid3_V67_mean',
         'uid3_V67_std',
         'uid3_V83_mean',
         'uid3_V83_std',
         'uid6_fq_enc',
         'card4_fq_enc',
         'card6_fq_enc',
         'ProductCD_fq_enc',
         'M4_fq_enc',
         'addr_fq_enc',
         'R_emaildomain_V118_mean',
         'R_emaildomain_V118_std',
         'R_emaildomain_V119_mean',
         'R_emaildomain_V119_std',
         'card1_V20_mean',
         'card1_V20_std',
         'card1_V151_mean',
         'card1_V151_std',
         'card1_V67_mean',
         'card1_V67_std',
         'hour_V116_mean',
         'hour_V116_std']

In [3]:
# Half of the feature count, shown as the cell output.
# NOTE(review): presumably a scratch calculation for deciding how to split
# FEATURES into chunks (the loading cell uses FEATURES[600:]) — confirm.
len(FEATURES)/2

376.5

In [4]:
# This run evaluates only the tail of FEATURES, starting at this offset.
# Keeping the offset in one named constant stops the train and test reads
# from drifting apart when the chunk boundary is changed.
FEATURE_CHUNK_START = 600
chunk_features = FEATURES[FEATURE_CHUNK_START:]
assert chunk_features, 'FEATURE_CHUNK_START is past the end of FEATURES'

# Train carries the label column (isFraud); test does not.
tr = pd.read_parquet(
    '../../data/train_FE012.parquet',
    columns=['TransactionID', 'isFraud', 'TransactionDT'] + chunk_features,
)
te = pd.read_parquet(
    '../../data/test_FE012.parquet',
    columns=['TransactionID', 'TransactionDT'] + chunk_features,
)

In [5]:
# Binned "intercept" feature columns that are removed from both frames
# before the per-feature scoring. Written as whitespace-separated names;
# .split() turns them into a list in the order they appear here.
COLS_TO_DROP = """
    D3_intercept_bin0   D14_intercept_bin0  D13_intercept_bin0
    D11_intercept_bin2  D4_intercept_bin3   D2_intercept_bin3
    D4_intercept_bin4   D11_intercept_bin0  D8_intercept_bin0
    D4_intercept_bin1   D10_intercept_bin3  D4_intercept_bin2
    D2_intercept_bin2   D15_intercept_bin4  D10_intercept_bin4
    D15_intercept_bin2  D10_intercept_bin2  D1_intercept_bin4
    D2_intercept_bin1   D10_intercept_bin1  D15_intercept_bin3
    D1_intercept_bin3   D15_intercept_bin1  D2_intercept_bin0
    D4_intercept_bin0   D1_intercept_bin1   D10_intercept_bin0
    D1_intercept_bin2   D15_intercept_bin0  D1_intercept_bin0
""".split()

In [6]:
# Remove, in place, whichever COLS_TO_DROP columns are present in each frame.
# errors='ignore' skips names that were never loaded instead of raising.
tr.drop(columns=COLS_TO_DROP, errors='ignore', inplace=True)
te.drop(columns=COLS_TO_DROP, errors='ignore', inplace=True)

In [7]:
# Preview the first rows of the training frame to check the loaded columns.
tr.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,mxC13,maxDT,id_19,DeviceInfo,id_20,addr1,userid_min_C1,...,R_emaildomain_V119_mean,R_emaildomain_V119_std,card1_V20_mean,card1_V20_std,card1_V151_mean,card1_V151_std,card1_V67_mean,card1_V67_std,hour_V116_mean,hour_V116_std
0,2987000,0.0,86400,1.0,0.0,,2564,,315.0,1.0,...,1.000681,0.03321,1.108696,0.73721,7.913043,18.123097,1.232558,0.570597,1.016607,0.135997
1,2987001,0.0,86401,1.0,6.359409e-08,,2564,,325.0,1.0,...,1.000681,0.03321,0.790845,0.540298,7.608696,16.414957,1.001586,0.186864,1.016607,0.135997
2,2987002,0.0,86469,5.0,0.928167,,2564,,330.0,1.0,...,1.000681,0.03321,0.920404,0.40378,13.4,20.435264,0.993834,0.20772,1.016607,0.135997
3,2987003,0.0,86499,47.0,0.3340526,,2564,,476.0,2.0,...,1.000681,0.03321,0.852033,0.482982,3.253165,9.750411,1.010475,0.194549,1.016607,0.135997
4,2987004,0.0,86506,1.0,6.740974e-06,542.0,1663,144.0,420.0,1.0,...,1.000681,0.03321,1.0,0.565685,17.888889,24.624401,1.230769,0.429669,1.016607,0.135997


# Keep only the private part of the test set

In [8]:
# Test rows with a TransactionID above this value are kept (exclusive bound).
# NOTE(review): per the section header this is the "private" part of the test
# set; where the threshold comes from is not shown in this notebook.
PRIVATE_TEST_MIN_ID_EXCLUSIVE = 3764887

# .copy() makes the filtered frame independent of the original. Without it,
# the column assignments in the following cells (te['i_am_train'] = ...,
# te['isFraud'] = ...) are assignments on a slice and raise
# SettingWithCopyWarning.
te = te.loc[te['TransactionID'] > PRIVATE_TEST_MIN_ID_EXCLUSIVE].copy()
gc.collect()

3176

# Prepare for adversarial validation

In [9]:
# Mark where each row came from: 1 = train frame, 0 = test frame.
# This flag is the target the per-feature classifiers are fitted on.
tr['i_am_train'] = 1
te['i_am_train'] = 0

In [10]:
# The test frame is loaded without the isFraud label; fill it with a -1
# placeholder so both frames have the same columns when they are stacked.
te['isFraud'] = -1

In [11]:
# Sanity check before stacking: both frames should report the same column count.
tr.shape, te.shape

((590540, 157), (405352, 157))

In [12]:
# Stack train on top of test (columns sorted by name) and renumber the rows
# 0..n-1 so positional and label indexing agree.
full_df = (
    pd.concat(objs=[tr, te], axis=0, sort=True)
    .reset_index(drop=True)
)

In [13]:
# Display the stacked frame (pandas shows a truncated view of rows and columns).
full_df

Unnamed: 0,D10_diff,D15_diff,D1_diff,DeviceInfo,M4_fq_enc,ProductCD_fq_enc,R_emaildomain_V118_mean,R_emaildomain_V118_std,R_emaildomain_V119_mean,R_emaildomain_V119_std,...,userid_unique_C5,userid_unique_C6,userid_unique_C7,userid_unique_C8,userid_unique_C9,week,week_TransactionAmt_mean,week_TransactionAmt_std,week_cos,week_sin
0,,,,2564,122947.0,800657,1.001712,0.043146,1.000681,0.033210,...,1.0,1.0,1.0,1.0,1.0,48,148.753721,263.413862,0.885456,-0.464723
1,,,,2564,357789.0,800657,1.001712,0.043146,1.000681,0.033210,...,1.0,1.0,1.0,1.0,1.0,48,148.753721,263.413862,0.885456,-0.464723
2,,,,2564,357789.0,800657,1.001712,0.043146,1.000681,0.033210,...,1.0,2.0,1.0,1.0,1.0,48,148.753721,263.413862,0.885456,-0.464723
3,,,,2564,357789.0,800657,1.001712,0.043146,1.000681,0.033210,...,1.0,1.0,1.0,1.0,1.0,48,148.753721,263.413862,0.885456,-0.464723
4,,,,1663,,62397,1.001712,0.043146,1.000681,0.033210,...,1.0,1.0,1.0,1.0,1.0,48,148.753721,263.413862,0.885456,-0.464723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995887,,,,2564,122947.0,137785,1.002987,0.055336,1.002928,0.054804,...,1.0,1.0,1.0,1.0,1.0,1,126.419484,195.179737,0.992709,0.120537
995888,0.0,5.0,5.0,152,122947.0,137785,1.002352,0.048439,1.002352,0.048439,...,1.0,1.0,1.0,1.0,1.0,1,126.419484,195.179737,0.992709,0.120537
995889,,,,2564,357789.0,800657,1.001712,0.043146,1.000681,0.033210,...,1.0,1.0,1.0,1.0,1.0,1,126.419484,195.179737,0.992709,0.120537
995890,,,,2564,357789.0,800657,1.001712,0.043146,1.000681,0.033210,...,1.0,1.0,1.0,1.0,1.0,1,126.419484,195.179737,0.992709,0.120537


In [14]:
# The separate frames are no longer needed once full_df exists; drop the
# names and ask the garbage collector to reclaim the memory.
del tr
del te
gc.collect()

44

In [15]:
# v_shifts = pd.read_parquet('./full_trans_vcols_shift_diff.parquet.gzip')
# del v_shifts['userid']

In [16]:
# Every column is scored as a candidate feature except the bookkeeping ones.
features = [
    column
    for column in full_df.columns
    if column not in {
        'i_am_train',     # the actual target for adversarial validation
        'userid',         # an ID
        'isFraud',        # the competition target
        'TransactionID',
        'TransactionDT',
    }
]

# Shuffled, stratified 5-fold CV with a fixed seed so the folds are reproducible.
folds = StratifiedKFold(5, shuffle=True, random_state=1337)

In [17]:
# LightGBM settings shared by every single-feature model.
# Keyword order matches the original dict, so iteration order is unchanged.
lgb_params = dict(
    task='train',
    max_depth=10,
    boosting_type='gbdt',
    objective='binary',
    # num_leaves is intentionally left unset.
    learning_rate=0.05,
    feature_fraction=0.50,
    bagging_fraction=0.81,
    bagging_freq=1,
    lambda_l1=3,
    lambda_l2=3,
    verbose=-99,
    boost_from_average='true',
    seed=99,
    nthreads=16,
)

In [None]:
# Per-feature adversarial validation: for each candidate feature, fit a
# one-column LightGBM classifier that predicts whether a row comes from train
# or test, and record its out-of-fold AUC. list_of_dicts collects one record
# per feature with the keys 'feature', 'cv', 'best_iter' and 'best_iter_mean'.
print('Training the Model:')
list_of_dicts = []

# --- Loop-invariant work, hoisted out of the per-feature loop ---------------
adv_target = full_df['i_am_train'].values

# StratifiedKFold only uses X for its length, so a placeholder array replaces
# full_df.values (which converted the whole mixed-dtype frame on every pass).
# `folds` has a fixed integer random_state, so split() yields the same folds
# on every call; computing them once gives the same folds as before.
fold_indices = list(folds.split(np.zeros(len(adv_target)), adv_target))

for f in tqdm(features):
    oof_preds = np.zeros(full_df.shape[0])
    fold_best_iters = []
    print('Fitting to feature',f)

    # Select the single column first; slicing rows of the full frame and then
    # picking the column copied every column for every fold.
    feature_frame = full_df[[f]]

    for trn_idx, val_idx in fold_indices:
        X_train, y_train = feature_frame.iloc[trn_idx], adv_target[trn_idx]
        X_valid, y_valid = feature_frame.iloc[val_idx], adv_target[val_idx]

        trn_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_valid, label=y_valid)

        # NOTE(review): `verbose_eval` / `early_stopping_rounds` are keyword
        # arguments of older LightGBM releases; newer releases expect
        # callbacks instead — confirm against the pinned version.
        clf = lgb.train(lgb_params, trn_data, 800, valid_sets=[trn_data, val_data],
                        verbose_eval=0, early_stopping_rounds=100)

        oof_preds[val_idx] = clf.predict(X_valid, num_iteration=clf.best_iteration)
        fold_best_iters.append(clf.best_iteration)

    list_of_dicts.append({
        'feature': f,
        # AUC of train-vs-test separability over all out-of-fold predictions.
        'cv': roc_auc_score(adv_target, oof_preds),
        # Kept as before: this is the best iteration of the LAST fold only.
        'best_iter': clf.best_iteration,
        # Added: average best iteration across all folds.
        'best_iter_mean': float(np.mean(fold_best_iters)),
    })

  0%|          | 0/153 [00:00<?, ?it/s]

Training the Model:
Fitting to feature D10_diff


  1%|          | 1/153 [00:18<47:45, 18.85s/it]

Fitting to feature D15_diff


  1%|▏         | 2/153 [00:48<55:17, 21.97s/it]

Fitting to feature D1_diff


  2%|▏         | 3/153 [01:11<56:22, 22.55s/it]

Fitting to feature DeviceInfo


  3%|▎         | 4/153 [02:15<1:26:45, 34.94s/it]

Fitting to feature M4_fq_enc


  3%|▎         | 5/153 [02:33<1:13:04, 29.63s/it]

Fitting to feature ProductCD_fq_enc


  4%|▍         | 6/153 [02:50<1:03:49, 26.05s/it]

Fitting to feature R_emaildomain_V118_mean


  5%|▍         | 7/153 [03:10<58:45, 24.15s/it]  

Fitting to feature R_emaildomain_V118_std


  5%|▌         | 8/153 [03:29<54:22, 22.50s/it]

Fitting to feature R_emaildomain_V119_mean


  6%|▌         | 9/153 [03:47<50:52, 21.20s/it]

Fitting to feature R_emaildomain_V119_std


  7%|▋         | 10/153 [04:05<48:23, 20.30s/it]

Fitting to feature addr1


  7%|▋         | 11/153 [05:09<1:19:08, 33.44s/it]

Fitting to feature addr1_addr2_new_identity_D10_diff_mean_mean


  8%|▊         | 12/153 [05:45<1:20:24, 34.21s/it]

Fitting to feature addr1_addr2_new_identity_D10_diff_mean_std


  8%|▊         | 13/153 [06:24<1:23:11, 35.66s/it]

Fitting to feature addr1_addr2_new_identity_D15_diff_mean_mean


  9%|▉         | 14/153 [07:09<1:28:43, 38.30s/it]

Fitting to feature addr1_addr2_new_identity_D15_diff_mean_std


 10%|▉         | 15/153 [07:47<1:28:23, 38.43s/it]

Fitting to feature addr1_addr2_new_identity_D1_diff_mean_mean


 10%|█         | 16/153 [08:39<1:36:31, 42.28s/it]

Fitting to feature addr1_addr2_new_identity_D1_diff_mean_std


 11%|█         | 17/153 [09:28<1:40:41, 44.42s/it]

Fitting to feature addr1_addr2_new_identity_M5_mean_mean


 12%|█▏        | 18/153 [09:51<1:25:23, 37.95s/it]

Fitting to feature addr1_addr2_new_identity_M5_mean_std


 12%|█▏        | 19/153 [10:09<1:11:18, 31.93s/it]

Fitting to feature addr1_addr2_new_identity_M6_mean_mean


 13%|█▎        | 20/153 [10:29<1:02:55, 28.39s/it]

Fitting to feature addr1_addr2_new_identity_M6_mean_std


 14%|█▎        | 21/153 [10:49<56:56, 25.89s/it]  

Fitting to feature addr1_addr2_new_identity_V315_mean_mean


 14%|█▍        | 22/153 [11:39<1:12:14, 33.09s/it]

Fitting to feature addr1_addr2_new_identity_V315_mean_std


 15%|█▌        | 23/153 [12:26<1:20:42, 37.25s/it]

Fitting to feature addr1_broad_area


 16%|█▌        | 24/153 [12:41<1:06:05, 30.74s/it]

Fitting to feature addr_fq_enc


 16%|█▋        | 25/153 [13:21<1:11:12, 33.38s/it]

Fitting to feature card1_V151_mean


 17%|█▋        | 26/153 [13:56<1:11:53, 33.97s/it]

Fitting to feature card1_V151_std


 18%|█▊        | 27/153 [14:27<1:09:31, 33.11s/it]

Fitting to feature card1_V20_mean


 18%|█▊        | 28/153 [15:10<1:15:10, 36.09s/it]

Fitting to feature card1_V20_std


 19%|█▉        | 29/153 [15:59<1:22:15, 39.80s/it]

Fitting to feature card1_V67_mean


 20%|█▉        | 30/153 [16:43<1:24:18, 41.13s/it]

Fitting to feature card1_V67_std


 20%|██        | 31/153 [17:25<1:23:59, 41.30s/it]

Fitting to feature card4_fq_enc


 21%|██        | 32/153 [17:42<1:08:37, 34.03s/it]

Fitting to feature card6_fq_enc


 22%|██▏       | 33/153 [18:09<1:04:05, 32.05s/it]

Fitting to feature hour


 22%|██▏       | 34/153 [18:24<53:13, 26.84s/it]  

Fitting to feature hour_TransactionAmt_mean


 23%|██▎       | 35/153 [18:38<45:14, 23.00s/it]

Fitting to feature hour_TransactionAmt_std


 24%|██▎       | 36/153 [18:53<40:12, 20.62s/it]

Fitting to feature hour_V116_mean


 24%|██▍       | 37/153 [19:08<36:26, 18.85s/it]

Fitting to feature hour_V116_std


 25%|██▍       | 38/153 [19:23<33:52, 17.68s/it]

Fitting to feature hour_sin


 25%|██▌       | 39/153 [19:38<32:10, 16.93s/it]

Fitting to feature id_19


 26%|██▌       | 40/153 [20:35<54:31, 28.95s/it]

Fitting to feature id_20


 27%|██▋       | 41/153 [21:14<59:28, 31.86s/it]

Fitting to feature life_of_customer


 27%|██▋       | 42/153 [21:52<1:02:33, 33.81s/it]

Fitting to feature maxDT


 28%|██▊       | 43/153 [22:47<1:13:55, 40.32s/it]

Fitting to feature month


 29%|██▉       | 44/153 [23:27<1:13:02, 40.21s/it]

Fitting to feature mxC13


 29%|██▉       | 45/153 [24:11<1:14:09, 41.20s/it]

Fitting to feature new_identity_D10_diff_mean


 30%|███       | 46/153 [24:48<1:11:17, 39.98s/it]

Fitting to feature new_identity_D15_diff_mean


 31%|███       | 47/153 [25:21<1:06:45, 37.78s/it]

Fitting to feature new_identity_D1_diff_mean


 31%|███▏      | 48/153 [25:46<59:33, 34.03s/it]  

Fitting to feature new_identity_D3_mean


 32%|███▏      | 49/153 [26:18<57:51, 33.38s/it]

Fitting to feature new_identity_M5_mean


 33%|███▎      | 50/153 [27:00<1:01:51, 36.04s/it]

Fitting to feature new_identity_M6_mean


 33%|███▎      | 51/153 [27:55<1:11:07, 41.84s/it]

Fitting to feature new_identity_ProductCD_TransactionAmt_mean


 34%|███▍      | 52/153 [28:34<1:08:36, 40.76s/it]

Fitting to feature new_identity_V315_mean


 35%|███▍      | 53/153 [29:05<1:03:05, 37.86s/it]

Fitting to feature uid3_V67_mean


 35%|███▌      | 54/153 [29:47<1:04:41, 39.21s/it]

Fitting to feature uid3_V67_std


 36%|███▌      | 55/153 [30:28<1:05:02, 39.82s/it]

Fitting to feature uid3_V83_mean


 37%|███▋      | 56/153 [31:08<1:04:30, 39.90s/it]

Fitting to feature uid3_V83_std


 37%|███▋      | 57/153 [31:40<59:59, 37.50s/it]  

Fitting to feature uid6_C11_mean


 38%|███▊      | 58/153 [32:16<58:20, 36.84s/it]

Fitting to feature uid6_C11_std


 39%|███▊      | 59/153 [32:46<54:37, 34.87s/it]

Fitting to feature uid6_C1_mean


 39%|███▉      | 60/153 [33:18<52:34, 33.92s/it]

Fitting to feature uid6_C1_std


 40%|███▉      | 61/153 [33:52<52:06, 33.98s/it]

Fitting to feature uid6_D4_mean


 41%|████      | 62/153 [34:26<51:29, 33.95s/it]

Fitting to feature uid6_D4_std


 41%|████      | 63/153 [35:04<52:53, 35.27s/it]

Fitting to feature uid6_M5_mean


 42%|████▏     | 64/153 [35:52<57:59, 39.10s/it]

Fitting to feature uid6_M5_std


 42%|████▏     | 65/153 [37:16<1:16:55, 52.44s/it]

Fitting to feature uid6_M6_mean


 43%|████▎     | 66/153 [38:44<1:31:54, 63.38s/it]

Fitting to feature uid6_M6_std


 44%|████▍     | 67/153 [39:55<1:34:00, 65.59s/it]

Fitting to feature uid6_TransactionAmt_mean


 44%|████▍     | 68/153 [41:29<1:44:45, 73.95s/it]

Fitting to feature uid6_TransactionAmt_std


 45%|████▌     | 69/153 [42:58<1:49:57, 78.54s/it]

Fitting to feature uid6_V281_mean


 46%|████▌     | 70/153 [44:33<1:55:21, 83.39s/it]

Fitting to feature uid6_V281_std


 46%|████▋     | 71/153 [46:16<2:02:16, 89.48s/it]

Fitting to feature uid6_V320_mean


 47%|████▋     | 72/153 [47:43<1:59:41, 88.66s/it]

Fitting to feature uid6_V320_std


 48%|████▊     | 73/153 [49:26<2:03:47, 92.85s/it]

Fitting to feature uid6_V54_mean


 48%|████▊     | 74/153 [50:51<1:59:26, 90.72s/it]

Fitting to feature uid6_V54_std


 49%|████▉     | 75/153 [52:13<1:54:14, 87.88s/it]

Fitting to feature uid6_V67_mean


 50%|████▉     | 76/153 [53:55<1:58:20, 92.22s/it]

Fitting to feature uid6_V67_std


 51%|█████     | 78/153 [57:16<2:00:04, 96.06s/it]

Fitting to feature userid_max_C1


 52%|█████▏    | 79/153 [59:06<2:03:28, 100.12s/it]

Fitting to feature userid_max_C10


 52%|█████▏    | 80/153 [1:01:41<2:22:02, 116.74s/it]

Fitting to feature userid_max_C11


 53%|█████▎    | 81/153 [1:03:54<2:25:46, 121.48s/it]

Fitting to feature userid_max_C12


 54%|█████▎    | 82/153 [1:06:17<2:31:20, 127.89s/it]

Fitting to feature userid_max_C13


 54%|█████▍    | 83/153 [1:08:12<2:24:49, 124.14s/it]

Fitting to feature userid_max_C14


 55%|█████▍    | 84/153 [1:10:09<2:20:27, 122.13s/it]

Fitting to feature userid_max_C2


 56%|█████▌    | 85/153 [1:12:02<2:15:02, 119.15s/it]

Fitting to feature userid_max_C3


 56%|█████▌    | 86/153 [1:12:46<1:47:57, 96.68s/it] 

Fitting to feature userid_max_C4


 57%|█████▋    | 87/153 [1:15:08<2:01:17, 110.26s/it]

Fitting to feature userid_max_C5


 58%|█████▊    | 88/153 [1:16:37<1:52:32, 103.89s/it]

Fitting to feature userid_max_C6


 58%|█████▊    | 89/153 [1:18:32<1:54:17, 107.15s/it]

Fitting to feature userid_max_C7


 59%|█████▉    | 90/153 [1:21:24<2:13:05, 126.76s/it]

Fitting to feature userid_max_C8


 59%|█████▉    | 91/153 [1:23:32<2:11:25, 127.19s/it]

Fitting to feature userid_max_C9


 60%|██████    | 92/153 [1:25:21<2:03:40, 121.64s/it]

Fitting to feature userid_max_minus_min_C1


 61%|██████    | 93/153 [1:26:52<1:52:32, 112.55s/it]

Fitting to feature userid_max_minus_min_C10


 61%|██████▏   | 94/153 [1:27:36<1:30:17, 91.82s/it] 

Fitting to feature userid_max_minus_min_C11


 62%|██████▏   | 95/153 [1:28:56<1:25:15, 88.20s/it]

Fitting to feature userid_max_minus_min_C12


 63%|██████▎   | 96/153 [1:30:34<1:26:41, 91.25s/it]

Fitting to feature userid_max_minus_min_C13


 63%|██████▎   | 97/153 [1:32:42<1:35:37, 102.45s/it]

Fitting to feature userid_max_minus_min_C14


 64%|██████▍   | 98/153 [1:33:56<1:26:03, 93.88s/it] 

Fitting to feature userid_max_minus_min_C2


 65%|██████▍   | 99/153 [1:35:34<1:25:30, 95.01s/it]

Fitting to feature userid_max_minus_min_C3


 65%|██████▌   | 100/153 [1:35:58<1:04:59, 73.57s/it]

Fitting to feature userid_max_minus_min_C4


 66%|██████▌   | 101/153 [1:37:14<1:04:35, 74.53s/it]

Fitting to feature userid_max_minus_min_C5


 67%|██████▋   | 102/153 [1:39:26<1:18:00, 91.78s/it]

Fitting to feature userid_max_minus_min_C6


 67%|██████▋   | 103/153 [1:41:20<1:21:55, 98.30s/it]

Fitting to feature userid_max_minus_min_C7


 68%|██████▊   | 104/153 [1:42:28<1:12:47, 89.14s/it]

Fitting to feature userid_max_minus_min_C8


 69%|██████▊   | 105/153 [1:43:36<1:06:25, 83.03s/it]

Fitting to feature userid_max_minus_min_C9


 69%|██████▉   | 106/153 [1:45:25<1:10:58, 90.61s/it]

Fitting to feature userid_mean_C1


 70%|██████▉   | 107/153 [1:46:54<1:09:10, 90.23s/it]

Fitting to feature userid_mean_C10


 71%|███████   | 108/153 [1:49:10<1:17:55, 103.89s/it]

Fitting to feature userid_mean_C11


 71%|███████   | 109/153 [1:50:47<1:14:43, 101.89s/it]

Fitting to feature userid_mean_C12


 72%|███████▏  | 110/153 [1:53:02<1:20:13, 111.93s/it]

Fitting to feature userid_mean_C13


 73%|███████▎  | 111/153 [1:54:32<1:13:45, 105.37s/it]

Fitting to feature userid_mean_C14


 73%|███████▎  | 112/153 [1:56:13<1:10:59, 103.89s/it]

Fitting to feature userid_mean_C2


 74%|███████▍  | 113/153 [1:57:39<1:05:46, 98.66s/it] 

Fitting to feature userid_mean_C3


 75%|███████▍  | 114/153 [1:58:55<59:37, 91.74s/it]  

Fitting to feature userid_mean_C4


 75%|███████▌  | 115/153 [2:01:20<1:08:10, 107.65s/it]

Fitting to feature userid_mean_C5


 76%|███████▌  | 116/153 [2:02:52<1:03:35, 103.13s/it]

Fitting to feature userid_mean_C6


 76%|███████▋  | 117/153 [2:04:10<57:14, 95.40s/it]   

Fitting to feature userid_mean_C7


 77%|███████▋  | 118/153 [2:06:25<1:02:36, 107.32s/it]

Fitting to feature userid_mean_C8


In [None]:
# Collect the per-feature result dicts gathered in `list_of_dicts` into one
# table (one row per dict) so they can be sorted and filtered below.
study = pd.DataFrame(data=list_of_dicts)

In [None]:
# Display the study table ordered by its 'cv' column, lowest value first.
study.sort_values(by='cv', ascending=True)

In [None]:
# Display the names of the features whose 'cv' value is at least 0.60.
study.loc[study['cv'].ge(0.60), 'feature'].values

In [None]:

# Compare the train and test distributions of every feature whose 'cv' value
# in `study` is at least 0.60 (presumably the per-feature adversarial-validation
# score -- confirm against the cell that fills `list_of_dicts`).
# My recommendation is to not use the following columns:
bad_cols = study.loc[study['cv']>=0.60, 'feature'].values

# The row selectors do not depend on the column being inspected, so build them
# once here instead of re-evaluating the comparison six times per column.
adv_train_rows = full_df['i_am_train']==1
adv_test_rows = full_df['i_am_train']==0

for col in bad_cols:
    print('\nLooking at column',col)
    # Slice each partition once and reuse it for all three statistics.
    train_values = full_df.loc[adv_train_rows, col]
    test_values = full_df.loc[adv_test_rows, col]
    print('Training mean is',train_values.mean(),'while Testing mean is',test_values.mean())
    print('Training median is',train_values.median(),'while Testing median is',test_values.median())
    print('Training std is',train_values.std(),'while Testing std is',test_values.std())

In [None]:
# Persist the study table to disk. Default `to_csv` options are used, so the
# DataFrame index is written as the first (unnamed) column of the file.
study.to_csv(path_or_buf='fe012_adv_600.csv')