In [1]:
import pandas as pd
import numpy as np
import gc
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [2]:
# Master list of candidate feature column names. Later cells read a slice of
# this list (together with the ID/target/time columns) from the FE012 parquet
# files, so the order of the entries determines which columns a slice selects.
FEATURES = ['V85',
         'bank_type_TransactionAmt_mean',
         'D5_fq_enc',
         'V12',
         'V81',
         'V282',
         'bank_type_D7_std',
         'id_15',
         'V13',
         'C12_fq_enc',
         'anomaly',
         'D7_DT_D_std_score',
         'D3_DT_D_min_max',
         'card4_count_full',
         'D14_DT_D_min_max',
         'card1_count_full',
         'V169',
         'D3_DT_M_min_max',
         'V279',
         'V91',
         'bank_type_D10_std',
         'D14',
         'D6_DT_M_std_score',
         'D4_DT_W_min_max',
         'V152',
         'V56',
         'D3_intercept_bin0',
         'D14_intercept_bin0',
         'V220',
         'V277',
         'D12_intercept',
         'ProductCD_W_00cents',
         'D13_intercept_bin0',
         'V291',
         'V189',
         'D15_DT_M_min_max',
         'C5_fq_enc',
         'D3_fq_enc',
         'card5_fq_enc',
         'addr1_count_full',
         'V266',
         'D11_intercept_bin2',
         'V23',
         'D4_intercept_bin3',
         'bank_type_D10_mean',
         'D2_intercept_bin3',
         'V306',
         'DeviceType',
         'V285',
         'D5_DT_W_std_score',
         'V131',
         'V37',
         'V296',
         'bank_type_D1_mean',
         'V75',
         'D3_DT_W_std_score',
         'D10_DT_M_min_max',
         'id_33_0',
         'V67',
         'D4_intercept_bin4',
         'V256',
         'V143',
         'uid5_D6_std',
         'ProductCD_target_mean',
         'mxC3',
         'V129',
         'D13_DT_M_std_score',
         'V24',
         'D3_DT_M_std_score',
         'mxC4',
         'D9',
         'id_30_version_fq_enc',
         'D5_DT_D_std_score',
         'D11_DT_M_std_score',
         'uid5_D6_mean',
         'D14_DT_M_std_score',
         'card5_TransactionAmt_std',
         'V20',
         'C8_fq_enc',
         'V70',
         'V127',
         'D6_intercept',
         'D15_DT_W_min_max',
         'sum_Cxx_binary_higher_than_q95',
         'V156',
         'uid4_D12_mean',
         'C5',
         'uid4_D12_std',
         'id_30_fq_enc',
         'V61',
         'id_33',
         'D15_to_std_addr1',
         'bank_type_D9_mean',
         'D5_intercept',
         'D10_DT_W_min_max',
         'V130',
         'bank_type_D9_std',
         'uid5_D7_std',
         'bank_type_D14_mean',
         'bank_type_D3_std',
         'bank_type_D5_mean',
         'ProductCD',
         'M8',
         'V44',
         'D6_fq_enc',
         'D15_DT_D_min_max',
         'D11_intercept_bin0',
         'V257',
         'bank_type_D7_mean',
         'V76',
         'D15',
         'V38',
         'V55',
         'V261',
         'V149',
         'D4',
         'D8_intercept_bin0',
         'M2',
         'bank_type_D6_std',
         'id_30_version',
         'D4_intercept_bin1',
         'D15_to_mean_card4',
         'V82',
         'D3_DT_D_std_score',
         'D10_intercept_bin3',
         'bank_type_D2_std',
         'V77',
         'M7',
         'D11',
         'D4_intercept_bin2',
         'email_check',
         'V294',
         'V317',
         'V308',
         'id_33_fq_enc',
         'bank_type_D5_std',
         'D8_intercept',
         'V62',
         'V187',
         'card5_TransactionAmt_mean',
         'bank_type_D12_mean',
         'id_33_count_dist',
         'D2_intercept_bin2',
         'C10',
         'V86',
         'D8_DT_M_min_max',
         'D15_intercept_bin4',
         'D6_DT_W_std_score',
         'uid5_D7_mean',
         'C9_fq_enc',
         'mxC10',
         'D14_DT_W_std_score',
         'card2_count_full',
         'V258',
         'bank_type_D14_std',
         'D10_intercept_bin4',
         'V83',
         'bank_type_D13_std',
         'D8_DT_W_min_max',
         'TransactionAmt',
         'V312',
         'D14_intercept',
         'id_33_1',
         'D15_intercept_bin2',
         'D12_DT_W_std_score',
         'V78',
         'D8_D9_decimal_dist',
         'M9',
         'V281',
         'bank_type_D12_std',
         'V54',
         'C9',
         'M4_target_mean',
         'sum_Cxx_binary_higher_than_q90',
         'D10_DT_D_min_max',
         'bank_type_D3_mean',
         'bank_type_D8_mean',
         'R_emaildomain_prefix',
         'bank_type_D6_mean',
         'V314',
         'D11_DT_W_std_score',
         'D10',
         'D4_DT_D_min_max',
         'V283',
         'D10_intercept_bin2',
         'D13_intercept',
         'D8_DT_D_min_max',
         'C2_fq_enc',
         'V165',
         'D1_intercept_bin4',
         'bank_type_D13_mean',
         'D3_intercept',
         'TransactionAmt_2Dec',
         'card3_div_Mean_D9_DOY',
         'C12',
         'D4_DT_M_std_score',
         'D2_intercept_bin1',
         'mxC8',
         'D2_fq_enc',
         'addr1_third_digit',
         'D4_fq_enc',
         'D1_fq_enc',
         'mxC12',
         'D8',
         'D10_intercept_bin1',
         'id_01',
         'id_09',
         'id_03',
         'addr1_second_digit',
         'D15_to_mean_addr1',
         'sum_Cxx_binary_higher_than_q80',
         'V53',
         'TransactionAmt_decimal',
         'card3_div_Mean_D6_DOY',
         'D15_intercept_bin3',
         'V45',
         'id_02_to_std_card4',
         'addr2_div_Mean_D10_DOY_productCD',
         'DeviceInfo_version',
         'DeviceInfo_device',
         'D1_intercept_bin3',
         'D11_intercept',
         'DeviceInfo_version_fq_enc',
         'C6',
         'uid5_D13_std',
         'TransactionAmt_DT_M_min_max',
         'dist2',
         'C8',
         'D15_intercept_bin1',
         'M3',
         'R_emaildomain_fq_enc',
         'DeviceInfo_device_fq_enc',
         'D6_DT_D_std_score',
         'sum_Cxx_binary_higher_than_q60',
         'D11__DeviceInfo',
         'TranAmt_div_Mean_D12_DOY_productCD',
         'D10_DT_M_std_score',
         'uid5_D13_mean',
         'mxC5',
         'id_30',
         'addr2_div_Mean_D4_DOY',
         'uid2_D12_std',
         'C11_fq_enc',
         'id_06',
         'uid2_D12_mean',
         'sum_Cxx_binary_higher_than_q70',
         'V310',
         'V307',
         'C6_fq_enc',
         'D8_fq_enc',
         'dist2_fq_enc',
         'D2_intercept_bin0',
         'addr1_div_Mean_D10_DOY_productCD',
         'addr1_div_Mean_D10_DOY',
         'addr1_div_Mean_D11_DOY',
         'uid2_D8_std',
         'id_02__id_20',
         'V313',
         'D4_intercept_bin0',
         'D11_DT_D_std_score',
         'Transaction_day_of_week',
         'card6_div_Mean_D3_DOY',
         'uid2_D1_std',
         'uid5_D11_mean',
         'uid_fq_enc',
         'D14_DT_D_std_score',
         'D12_DT_D_std_score',
         'id_02_to_mean_card4',
         'uid4_D13_std',
         'D1_intercept_bin1',
         'id_02_to_std_card1',
         'uid5_D11_std',
         'P_emaildomain_prefix',
         'DT_day',
         'D8_DT_M_std_score',
         'uid2_D1_mean',
         'TransactionAmt_to_mean_card4',
         'card5_div_Mean_D11_DOY',
         'D15_DT_M_std_score',
         'V87',
         'uid_D12_std',
         'id_31_device_fq_enc',
         'uid2_D11_mean',
         'card3_DT_W_week_day_dist_best',
         'uid5_D14_std',
         'uid2_D15_mean',
         'sum_Cxx_binary_higher_than_q50',
         'id_13',
         'card3_div_Mean_D11_DOY',
         'C11',
         'bank_type_DT_W_week_day_dist_best',
         'card4_div_Mean_D11_DOY',
         'addr1_div_Mean_D1_DOY',
         'uid2_D4_mean',
         'card2_div_Mean_D11_DOY',
         'C13_fq_enc',
         'uid4_D13_mean',
         'card5_DT_W_week_day_dist_best',
         'id_02',
         'uid5_D14_mean',
         'uid2_D10_mean',
         'id_01_count_dist',
         'D13_DT_W_std_score',
         'C2',
         'C14',
         'addr2_div_Mean_D10_DOY',
         'uid2_D11_std',
         'addr1_div_Mean_D1_DOY_productCD',
         'id_02_to_mean_card1',
         'dist1_fq_enc',
         'card1_div_Mean_D11_DOY',
         'D15_to_std_card1',
         'TransactionAmt_DT_M_std_score',
         'uid2_D6_std',
         'TransactionAmt_to_std_card4',
         'uid2_D15_std',
         'uid3_D8_std',
         'card6_div_Mean_D11_DOY',
         'TranAmt_div_Mean_D14_DOY',
         'card3_div_Mean_D14_DOY',
         'D2',
         'D1',
         'uid_D15_mean',
         'uid4_D6_std',
         'uid_D15_std',
         'D10_intercept_bin0',
         'DeviceInfo_fq_enc',
         'uid2_D13_std',
         'uid_D12_mean',
         'uid4_D6_mean',
         'uid_D1_std',
         'D1_intercept_bin2',
         'uid_D10_mean',
         'card2__id_20',
         'uid4_D7_std',
         'uid3_D13_std',
         'C14_fq_enc',
         'uid_D8_std',
         'uid3_D13_mean',
         'uid2_D4_std',
         'addr1_div_Mean_D4_DOY',
         'uid_D4_mean',
         'D4_DT_W_std_score',
         'addr2_div_Mean_D1_DOY_productCD',
         'uid_D11_mean',
         'D15_intercept_bin0',
         'uid2_D10_std',
         'uid_D13_std',
         'uid2_fq_enc',
         'uid2_D13_mean',
         'uid2_D2_mean',
         'D2_intercept',
         'uid_D11_std',
         'card2',
         'uid4_D14_std',
         'C_sum_after_clip75',
         'R_emaildomain',
         'dist1',
         'id_05',
         'uid_TransactionAmt_mean',
         'uid_D1_mean',
         'uid3_D1_std',
         'uid5_D8_std',
         'uid3_D6_std',
         'Transaction_hour_of_day',
         'uid4_D14_mean',
         'uid5_D10_std',
         'uid3_D10_std',
         'uid5_D1_std',
         'uid5_D15_std',
         'uid2_D7_mean',
         'uid3_D11_std',
         'uid4_D8_std',
         'D13_DT_D_std_score',
         'uid3_D11_mean',
         'uid2_D14_std',
         'uid2_D7_std',
         'uid2_D14_mean',
         'uid_D13_mean',
         'uid_D10_std',
         'uid2_D3_std',
         'uid_D6_std',
         'uid3_D15_std',
         'addr1_fq_enc',
         'id_31',
         'uid_TransactionAmt_std',
         'card1_div_Mean_D4_DOY_productCD',
         'uid2_TransactionAmt_mean',
         'C_sum_after_clip90',
         'uid2_TransactionAmt_std',
         'uid4_D7_mean',
         'uid2_D6_mean',
         'uid3_D15_mean',
         'D15_to_mean_card1',
         'uid5_D15_mean',
         'M4',
         'uid3_D7_std',
         'card2_div_Mean_D4_DOY',
         'card5_div_Mean_D4_DOY_productCD',
         'card5_div_Mean_D4_DOY',
         'D4_intercept',
         'uid_D4_std',
         'card6_div_Mean_D4_DOY_productCD',
         'card5__P_emaildomain',
         'card1_fq_enc',
         'uid5_D10_mean',
         'card1_div_Mean_D4_DOY',
         'C1',
         'M6',
         'uid2_D2_std',
         'P_emaildomain_fq_enc',
         'card1_TransactionAmt_mean',
         'uid3_D10_mean',
         'TransactionAmt_DT_W_min_max',
         'uid5_D4_std',
         'card1_div_Mean_D10_DOY_productCD',
         'uid3_D1_mean',
         'card1_div_Mean_D10_DOY',
         'uid_D14_mean',
         'mxC9',
         'TranAmt_div_Mean_D4_DOY_productCD',
         'D15_DT_W_std_score',
         'DeviceInfo__P_emaildomain',
         'uid3_D14_mean',
         'bank_type_DT_M',
         'mxC11',
         'uid5_D1_mean',
         'uid_D2_mean',
         'D10_DT_W_std_score',
         'card3_DT_M_month_day_dist_best',
         'uid3_D2_std',
         'TranAmt_div_Mean_D4_DOY',
         'card1_TransactionAmt_std',
         'card3_div_Mean_D4_DOY_productCD',
         'D1_intercept_bin0',
         'uid3_D4_std',
         'card2_div_Mean_D10_DOY',
         'uid_D2_std',
         'uid3_D14_std',
         'uid3_D4_mean',
         'uid_D7_mean',
         'uid5_D2_std',
         'card4_div_Mean_D4_DOY_productCD',
         'card6_div_Mean_D4_DOY',
         'TranAmt_div_Mean_D10_DOY',
         'uid2_D9_std',
         'TransactionAmt_DT_W_std_score',
         'C1_fq_enc',
         'card1_div_Mean_D1_DOY',
         'uid5_D4_mean',
         'uid3_D6_mean',
         'mxC14',
         'uid5_D2_mean',
         'card4_div_Mean_D4_DOY',
         'card3_div_Mean_D4_DOY',
         'uid_D14_std',
         'M5',
         'C13',
         'mxC6',
         'card5_div_Mean_D10_DOY_productCD',
         'card3_DT_M_month_day_dist',
         'card2_div_Mean_D10_DOY_productCD',
         'uid_D7_std',
         'card2_div_Mean_D4_DOY_productCD',
         'bank_type_DT_M_month_day_dist',
         'uid3_D7_mean',
         'uid_D3_std',
         'uid5_fq_enc',
         'uid3_fq_enc',
         'uid_D3_mean',
         'D4_DT_D_std_score',
         'uid3_D2_mean',
         'uid4_D1_std',
         'uid2_D5_std',
         'uid4_D10_std',
         'bank_type_DT_D_hour_dist_best',
         'uid2_D8_mean',
         'card6_div_Mean_D10_DOY_productCD',
         'card1_div_Mean_D1_DOY_productCD',
         'uid5_D9_std',
         'card4_div_Mean_D10_DOY_productCD',
         'uid2_D3_mean',
         'uid_D6_mean',
         'card2_div_Mean_D1_DOY',
         'card5_div_Mean_D10_DOY',
         'mxC2',
         'card2_TransactionAmt_std',
         'bank_type_DT_W_week_day_dist',
         'card2_TransactionAmt_mean',
         'uid4_D10_mean',
         'id_31_count_dist',
         'TranAmt_div_Mean_D1_DOY',
         'uid3_D3_std',
         'uid4_D15_std',
         'card5_div_Mean_D1_DOY_productCD',
         'card4_div_Mean_D10_DOY',
         'card5_DT_D_hour_dist_best',
         'uid4_D4_std',
         'card5_DT_M_month_day_dist',
         'bank_type_DT_W',
         'addr1__card1',
         'bank_type_DT_M_month_day_dist_best',
         'card2_div_Mean_D1_DOY_productCD',
         'card6_div_Mean_D10_DOY',
         'uid2_D5_mean',
         'uid_DT_M',
         'card2__dist1',
         'uid2_D9_mean',
         'card5_DT_M_month_day_dist_best',
         'TranAmt_div_Mean_D10_DOY_productCD',
         'uid4_D11_std',
         'uid_D5_mean',
         'uid5_D3_std',
         'TransactionAmt_DT_D_std_score',
         'D8_DT_W_std_score',
         'card5_DT_W_week_day_dist',
         'uid5_D5_std',
         'card3_DT_W_week_day_dist',
         'uid4_D9_std',
         'D10_intercept',
         'uid3_D3_mean',
         'uid4_D5_std',
         'uid_D5_std',
         'card5_div_Mean_D1_DOY',
         'uid5_D3_mean',
         'bank_type_DT_D',
         'uid4_D1_mean',
         'uid_D8_mean',
         'uid3_D5_mean',
         'D15_intercept',
         'uid5_TransactionAmt_std',
         'uid3_D5_std',
         'uid4_D4_mean',
         'uid4_D15_mean',
         'uid5_D8_mean',
         'uid5_D9_mean',
         'uid_D9_std',
         'uid_D9_mean',
         'uid5_D5_mean',
         'mtransamt',
         'bank_type_DT_D_hour_dist',
         'uid4_D11_mean',
         'D15_DT_D_std_score',
         'TransactionAmt_DT_D_min_max',
         'uid4_D2_mean',
         'ntrans',
         'addr2_div_Mean_D1_DOY',
         'uid5_TransactionAmt_mean',
         'uid3_D9_std',
         'TransactionAmt_Dec',
         'uid3_TransactionAmt_std',
         'card5_DT_D_hour_dist',
         'card1',
         'card4_div_Mean_D1_DOY_productCD',
         'P_emaildomain__C2',
         'card3_div_Mean_D10_DOY',
         'uid4_D3_std',
         'card3_DT_D_hour_dist_best',
         'uid4_D8_mean',
         'uid4_D2_std',
         'card6_div_Mean_D1_DOY_productCD',
         'uid_DT_W',
         'Sum_TransAmt_Day',
         'uid4_D5_mean',
         'card4_div_Mean_D1_DOY',
         'card3_div_Mean_D10_DOY_productCD',
         'uid3_D8_mean',
         'TransactionAmt_userid_median',
         'uid4_fq_enc',
         'uid3_TransactionAmt_mean',
         'uid3_D9_mean',
         'card6_div_Mean_D1_DOY',
         'Trans_Count_Day',
         'mxC1',
         'D10_DT_D_std_score',
         'card3_div_Mean_D1_DOY',
         'TransactionAmt_to_mean_card1',
         'card2_fq_enc',
         'product_type',
         'card3_div_Mean_D1_DOY_productCD',
         'TransactionAmt_to_std_card1',
         'uid_DT_D',
         'uid4_D9_mean',
         'D1_intercept',
         'card3_DT_D_hour_dist',
         'TranAmt_div_Mean_D1_DOY_productCD',
         'product_type_DT_M',
         'uid4_D3_mean',
         'uid4_TransactionAmt_mean',
         'uid4_TransactionAmt_std',
         'D8_DT_D_std_score',
         'Mean_TransAmt_Day',
         'minDT',
         'product_type_DT_W',
         'mintransamt',
         'maxtransamt',
         'TransactionAmt_userid_std',
         'P_emaildomain',
         'card1__card5',
         'product_type_DT_D',
         'mxC13',
         'maxDT',
         'id_19',
         'DeviceInfo',
         'id_20',
         'addr1',
         'userid_min_C1',
         'userid_max_C1',
         'userid_max_minus_min_C1',
         'userid_unique_C1',
         'userid_mean_C1',
         'userid_min_C2',
         'userid_max_C2',
         'userid_max_minus_min_C2',
         'userid_unique_C2',
         'userid_mean_C2',
         'userid_min_C3',
         'userid_max_C3',
         'userid_max_minus_min_C3',
         'userid_unique_C3',
         'userid_mean_C3',
         'userid_min_C4',
         'userid_max_C4',
         'userid_max_minus_min_C4',
         'userid_unique_C4',
         'userid_mean_C4',
         'userid_min_C5',
         'userid_max_C5',
         'userid_max_minus_min_C5',
         'userid_unique_C5',
         'userid_mean_C5',
         'userid_min_C6',
         'userid_max_C6',
         'userid_max_minus_min_C6',
         'userid_unique_C6',
         'userid_mean_C6',
         'userid_min_C7',
         'userid_max_C7',
         'userid_max_minus_min_C7',
         'userid_unique_C7',
         'userid_mean_C7',
         'userid_min_C8',
         'userid_max_C8',
         'userid_max_minus_min_C8',
         'userid_unique_C8',
         'userid_mean_C8',
         'userid_min_C9',
         'userid_max_C9',
         'userid_max_minus_min_C9',
         'userid_unique_C9',
         'userid_mean_C9',
         'userid_min_C10',
         'userid_max_C10',
         'userid_max_minus_min_C10',
         'userid_unique_C10',
         'userid_mean_C10',
         'userid_min_C11',
         'userid_max_C11',
         'userid_max_minus_min_C11',
         'userid_unique_C11',
         'userid_mean_C11',
         'userid_min_C12',
         'userid_max_C12',
         'userid_max_minus_min_C12',
         'userid_unique_C12',
         'userid_mean_C12',
         'userid_min_C13',
         'userid_max_C13',
         'userid_max_minus_min_C13',
         'userid_unique_C13',
         'userid_mean_C13',
         'userid_min_C14',
         'userid_max_C14',
         'userid_max_minus_min_C14',
         'userid_unique_C14',
         'userid_mean_C14',
            'hour',
         'hour_sin',
         'week',
         'week_sin',
         'week_cos',
         'month',
         'life_of_customer',
         'addr1_broad_area',
         'uid6_TransactionAmt_mean',
         'uid6_TransactionAmt_std',
         'hour_TransactionAmt_mean',
         'hour_TransactionAmt_std',
         'week_TransactionAmt_mean',
         'week_TransactionAmt_std',
         'D1_diff',
         'D10_diff',
         'D15_diff',
         'new_identity_M5_mean',
         'new_identity_M6_mean',
         'new_identity_V315_mean',
         'new_identity_D1_diff_mean',
         'new_identity_D3_mean',
         'new_identity_D10_diff_mean',
         'new_identity_D15_diff_mean',
         'addr1_addr2_new_identity_M5_mean_mean',
         'addr1_addr2_new_identity_M5_mean_std',
         'addr1_addr2_new_identity_M6_mean_mean',
         'addr1_addr2_new_identity_M6_mean_std',
         'addr1_addr2_new_identity_V315_mean_mean',
         'addr1_addr2_new_identity_V315_mean_std',
         'addr1_addr2_new_identity_D1_diff_mean_mean',
         'addr1_addr2_new_identity_D1_diff_mean_std',
         'addr1_addr2_new_identity_D10_diff_mean_mean',
         'addr1_addr2_new_identity_D10_diff_mean_std',
         'addr1_addr2_new_identity_D15_diff_mean_mean',
         'addr1_addr2_new_identity_D15_diff_mean_std',
         'new_identity_ProductCD_TransactionAmt_mean',
         'uid6_C1_mean',
         'uid6_C1_std',
         'uid6_V54_mean',
         'uid6_V54_std',
         'uid6_V281_mean',
         'uid6_V281_std',
         'uid6_C11_mean',
         'uid6_C11_std',
         'uid6_D4_mean',
         'uid6_D4_std',
         'uid6_V67_mean',
         'uid6_V67_std',
         'uid6_V320_mean',
         'uid6_V320_std',
         'uid6_M5_mean',
         'uid6_M5_std',
         'uid6_M6_mean',
         'uid6_M6_std',
         'uid3_V67_mean',
         'uid3_V67_std',
         'uid3_V83_mean',
         'uid3_V83_std',
         'uid6_fq_enc',
         'card4_fq_enc',
         'card6_fq_enc',
         'ProductCD_fq_enc',
         'M4_fq_enc',
         'addr_fq_enc',
         'R_emaildomain_V118_mean',
         'R_emaildomain_V118_std',
         'R_emaildomain_V119_mean',
         'R_emaildomain_V119_std',
         'card1_V20_mean',
         'card1_V20_std',
         'card1_V151_mean',
         'card1_V151_std',
         'card1_V67_mean',
         'card1_V67_std',
         'hour_V116_mean',
         'hour_V116_std']

In [3]:
# Half the number of entries in FEATURES (displayed for reference only).
len(FEATURES)/2

376.5

In [4]:
# Load only what this run needs: the ID / target / time columns plus one chunk
# of FEATURES. The chunk bounds are defined once so that the train and test
# reads can never end up selecting different feature columns.
CHUNK_START, CHUNK_END = 100, 200
chunk_features = FEATURES[CHUNK_START:CHUNK_END]

# The training file carries the isFraud target; the test file is read without it.
tr = pd.read_parquet('../../data/train_FE012.parquet',
                     columns=['TransactionID', 'isFraud', 'TransactionDT'] + chunk_features)
te = pd.read_parquet('../../data/test_FE012.parquet',
                     columns=['TransactionID', 'TransactionDT'] + chunk_features)

In [5]:
# Columns removed from both frames right after loading. Every entry is a
# `<D-column>_intercept_bin<k>` name; the order matches the original listing.
COLS_TO_DROP = [
    'D3_intercept_bin0', 'D14_intercept_bin0', 'D13_intercept_bin0',
    'D11_intercept_bin2', 'D4_intercept_bin3', 'D2_intercept_bin3',
    'D4_intercept_bin4', 'D11_intercept_bin0', 'D8_intercept_bin0',
    'D4_intercept_bin1', 'D10_intercept_bin3', 'D4_intercept_bin2',
    'D2_intercept_bin2', 'D15_intercept_bin4', 'D10_intercept_bin4',
    'D15_intercept_bin2', 'D10_intercept_bin2', 'D1_intercept_bin4',
    'D2_intercept_bin1', 'D10_intercept_bin1', 'D15_intercept_bin3',
    'D1_intercept_bin3', 'D15_intercept_bin1', 'D2_intercept_bin0',
    'D4_intercept_bin0', 'D1_intercept_bin1', 'D10_intercept_bin0',
    'D1_intercept_bin2', 'D15_intercept_bin0', 'D1_intercept_bin0',
]

In [6]:
# Drop, in place, whichever COLS_TO_DROP entries are actually present in each
# frame; names that were not loaded are simply skipped.
for frame in (tr, te):
    present = [name for name in frame.columns if name in COLS_TO_DROP]
    frame.drop(columns=present, inplace=True)

In [7]:
# Preview the first rows of the training frame after the column drop.
tr.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,bank_type_D5_mean,ProductCD,M8,V44,D6_fq_enc,D15_DT_D_min_max,V257,...,V165,bank_type_D13_mean,D3_intercept,TransactionAmt_2Dec,card3_div_Mean_D9_DOY,C12,D4_DT_M_std_score,mxC8,D2_fq_enc,addr1_third_digit
0,2987000,0.0,86400,41.90625,4,2,,899261,0.0,,...,,0.0,13.0,0.5,,0.0,,0.0,515566,5
1,2987001,0.0,86401,48.25,4,2,1.0,899261,0.0,,...,,23.984375,,0.0,,0.0,-0.737244,0.0,515566,5
2,2987002,0.0,86469,46.75,4,0,1.0,899261,0.453237,,...,,15.335938,,0.0,,0.0,-0.737244,0.0,515566,0
3,2987003,0.0,86499,43.8125,4,2,1.0,899261,0.159712,,...,,28.890625,-0.001146,0.0,,0.0,-0.187538,0.0,1374,6
4,2987004,0.0,86506,48.25,1,2,,899261,,1.0,...,5155.0,23.984375,,0.0,,0.0,,1.0,515566,0


# Keep only the private portion of the test set

In [8]:
# Only test rows whose TransactionID is strictly greater than this cutoff are kept.
PRIVATE_ID_CUTOFF = 3764887

# `.copy()` detaches the filtered rows from the original frame, so the column
# assignments made on `te` in later cells do not raise SettingWithCopyWarning.
te = te.loc[te['TransactionID'] > PRIVATE_ID_CUTOFF].copy()
gc.collect()

3176

# Prepare for adversarial validation

In [9]:
# Tag every row with its origin: 1 for the training frame, 0 for the test frame.
for frame, origin_flag in ((tr, 1), (te, 0)):
    frame['i_am_train'] = origin_flag

In [10]:
# The test frame was loaded without isFraud; fill it with -1 so that both
# frames carry the column.
te['isFraud'] = -1

In [11]:
# (rows, columns) of the training and test frames.
tr.shape, te.shape

((590540, 92), (405352, 92))

In [12]:
# Stack train on top of test (columns sorted by name) and renumber the rows
# 0..n-1 so that positional and label indexing agree.
full_df = (
    pd.concat(objs=[tr, te], axis=0, sort=True)
    .reset_index(drop=True)
)

In [13]:
# Display the combined frame (the notebook renders a truncated view).
full_df

Unnamed: 0,C10,C12,C2_fq_enc,C9,C9_fq_enc,D10,D10_DT_D_min_max,D11,D11_DT_W_std_score,D12_DT_W_std_score,...,i_am_train,id_30_version,id_33_1,id_33_count_dist,id_33_fq_enc,isFraud,mxC10,mxC8,sum_Cxx_binary_higher_than_q90,uid5_D7_mean
0,0.0,0.0,581696,1.0,420354,13.0,0.018705,13.0,-0.894949,,...,1,0,,517251,,0.0,0.0,0.0,0,
1,0.0,0.0,581696,0.0,341561,0.0,0.000000,,,,...,1,0,,517251,,0.0,0.0,0.0,0,
2,0.0,0.0,581696,1.0,420354,0.0,0.000000,315.0,0.925020,,...,1,0,,517251,,0.0,0.0,0.0,0,
3,0.0,0.0,31682,1.0,420354,84.0,0.120863,,,,...,1,0,,517251,,0.0,0.0,0.0,0,
4,1.0,0.0,581696,0.0,341561,,,,,,...,1,67,1080.0,544,1430.0,0.0,1.0,1.0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995887,0.0,1.0,581696,0.0,341561,0.0,0.000000,,,-0.416525,...,0,0,,436020,,-1.0,0.0,0.0,0,56.125000
995888,2.0,1.0,96037,0.0,341561,0.0,0.000000,,,-0.330303,...,0,0,0.0,436020,,-1.0,2.0,2.0,2,67.125000
995889,0.0,0.0,581696,1.0,420354,0.0,0.000000,0.0,-0.877079,,...,0,0,,436020,,-1.0,0.0,0.0,0,
995890,0.0,0.0,581696,1.0,420354,0.0,0.000000,0.0,-0.877079,,...,0,0,,436020,,-1.0,0.0,0.0,0,


In [14]:
# The separate frames are no longer needed once full_df exists; release them.
del tr
del te
gc.collect()

44

In [15]:
# v_shifts = pd.read_parquet('./full_trans_vcols_shift_diff.parquet.gzip')
# del v_shifts['userid']

In [16]:
# Columns that are never used as a model input.
non_feature_cols = [
    'i_am_train',     # the actual target for adversarial validation
    'userid',         # an ID
    'isFraud',        # the competition target
    'TransactionID',
    'TransactionDT',
]
features = [name for name in full_df.columns if name not in non_feature_cols]

# Five shuffled, stratified folds with a fixed seed.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1337)

In [18]:
# LightGBM parameters handed to lgb.train in the training cell.
lgb_params = dict(
    task='train',
    max_depth=10,
    boosting_type='gbdt',
    objective='binary',
    # num_leaves=3,  (disabled, kept for reference)
    learning_rate=0.05,
    feature_fraction=0.50,
    bagging_fraction=0.81,
    bagging_freq=1,
    lambda_l1=3,
    lambda_l2=3,
    verbose=-99,
    boost_from_average='true',
    seed=99,
    nthreads=16,
)

In [None]:
# Adversarial validation, one feature at a time.
#
# For each candidate feature a LightGBM classifier is trained on that single
# column to predict `i_am_train` (1 = training row, 0 = test row). The
# out-of-fold ROC AUC per feature is collected in `list_of_dicts`.
print('Training the Model:')

# The fold assignment depends only on the target and the fixed random_state of
# `folds`, so it is computed once up front. StratifiedKFold.split only needs the
# number of samples from X, so a zero placeholder replaces `full_df.values`,
# which would otherwise build a full array of the whole frame for every feature.
adv_target = full_df['i_am_train'].values
fold_indices = list(folds.split(np.zeros(len(adv_target)), adv_target))

list_of_dicts = []
for f in tqdm(features):
    print('Fitting to feature', f)
    oof_preds = np.zeros(len(adv_target))

    # Select the column first and slice rows afterwards: this avoids copying
    # every column of full_df for each fold just to keep a single one.
    feature_frame = full_df[[f]]

    for trn_idx, val_idx in fold_indices:
        X_train, y_train = feature_frame.iloc[trn_idx], adv_target[trn_idx]
        X_valid, y_valid = feature_frame.iloc[val_idx], adv_target[val_idx]

        trn_data = lgb.Dataset(X_train, label=y_train)
        val_data = lgb.Dataset(X_valid, label=y_valid)

        # At most 800 rounds, with early stopping after 100 rounds without improvement.
        clf = lgb.train(lgb_params, trn_data, 800, valid_sets=[trn_data, val_data],
                        verbose_eval=0, early_stopping_rounds=100)

        oof_preds[val_idx] = clf.predict(X_valid, num_iteration=clf.best_iteration)

    list_of_dicts.append({
        'feature': f,
        'cv': roc_auc_score(adv_target, oof_preds),
        # NOTE: this is the best iteration of the LAST fold only, not an average.
        'best_iter': clf.best_iteration,
    })

  0%|          | 0/88 [00:00<?, ?it/s]

Training the Model:
Fitting to feature C10


  1%|          | 1/88 [00:50<1:13:54, 50.97s/it]

Fitting to feature C12


  2%|▏         | 2/88 [01:42<1:13:05, 50.99s/it]

Fitting to feature C2_fq_enc


  3%|▎         | 3/88 [02:03<59:43, 42.16s/it]  

Fitting to feature C9


  5%|▍         | 4/88 [02:22<49:24, 35.29s/it]

Fitting to feature C9_fq_enc


  6%|▌         | 5/88 [03:11<54:17, 39.25s/it]

Fitting to feature D10


  7%|▋         | 6/88 [03:41<50:00, 36.59s/it]

Fitting to feature D10_DT_D_min_max


  8%|▊         | 7/88 [04:24<51:43, 38.31s/it]

Fitting to feature D11


  9%|▉         | 8/88 [04:54<47:58, 35.98s/it]

Fitting to feature D11_DT_W_std_score


 10%|█         | 9/88 [06:26<1:09:18, 52.64s/it]

Fitting to feature D12_DT_W_std_score


 11%|█▏        | 10/88 [07:44<1:18:17, 60.23s/it]

Fitting to feature D13_intercept


 12%|█▎        | 11/88 [08:28<1:11:07, 55.42s/it]

Fitting to feature D14_DT_W_std_score


 14%|█▎        | 12/88 [09:48<1:19:45, 62.97s/it]

Fitting to feature D14_intercept


 15%|█▍        | 13/88 [10:23<1:08:08, 54.51s/it]

Fitting to feature D15


 16%|█▌        | 14/88 [10:52<57:40, 46.76s/it]  

Fitting to feature D15_DT_D_min_max


In [None]:
# One row per feature, with the 'feature', 'cv' and 'best_iter' entries
# collected by the training loop.
study = pd.DataFrame(list_of_dicts)

In [None]:
# Features ordered by ascending out-of-fold AUC.
study.sort_values(by='cv')

In [None]:
# Names of the features whose single-feature AUC is at least 0.60.
auc_at_least_060 = study['cv'] >= 0.60
study.loc[auc_at_least_060, 'feature'].values

In [None]:

# Recommendation: do not use the columns reported below. For each of them the
# train and test means, medians and standard deviations are printed side by side.
bad_cols = study.loc[study['cv']>=0.60, 'feature'].values

# Row masks for the two origins; they are the same for every column.
train_rows = full_df['i_am_train']==1
test_rows = full_df['i_am_train']==0

for col in bad_cols:
    train_vals = full_df.loc[train_rows, col]
    test_vals = full_df.loc[test_rows, col]

    print('\nLooking at column',col)
    print('Training mean is',train_vals.mean(),'while Testing mean is',test_vals.mean())
    print('Training median is',train_vals.median(),'while Testing median is',test_vals.median())
    print('Training std is',train_vals.std(),'while Testing std is',test_vals.std())