In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import gc,os,random
import time,datetime
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Pool as ProcessPool
from lightGBMWrapper import LightGBMWrapper 
from greedyFindBin import GreedyFindBin

# Column-name groups. Each is kept as a list (even the single-name ones) so
# they can be concatenated with `+` and passed directly as column selections.
customer_id_column = ['customer_ID']
date_column = ['S_2']
categorical_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# Every column that is not a plain numerical feature: id + date + categoricals
non_numerical_features = customer_id_column + date_column + categorical_features
label = ['target']

# Location of the training labels CSV
train_label_file = './data/amex-default-prediction/train_labels.csv'

# Locations of the pre-computed feature tables (feather format).
# NOTE(review): "l3"/"l6" presumably mean statistics over the last 3 / 6
# statements, going by the section titles further down -- confirm.
raw_feature_file = './data/raw_feature_statistic.feather'
rank_feature_file = './data/rank_feature.feather'
l3_feature_file = './data/l3_statistic.feather'
l6_feature_file = './data/l6_statistic.feather'

## 0 - Load and combine features

In [24]:
%%time
# Read the four pre-computed feature tables, in order:
# raw statistics, rank features, l3 statistics and l6 statistics.
raw_feature, rank_feature, l3_feature, l6_feature = (
    pd.read_feather(feather_path)
    for feather_path in (
        raw_feature_file,
        rank_feature_file,
        l3_feature_file,
        l6_feature_file,
    )
)

CPU times: user 21.8 s, sys: 22.1 s, total: 44 s
Wall time: 8.65 s


In [25]:
%%time
def pad_target(x, pad_length):
    """Left-pad a sequence of values with NaN up to a fixed length.

    Parameters
    ----------
    x : sequence of float (list, numpy array or pandas Series)
        Values to right-align in the output; their order is preserved.
    pad_length : int
        Length of the returned list.

    Returns
    -------
    list
        ``pad_length`` floats: NaN padding first, then the values of ``x``.
        If ``x`` holds more than ``pad_length`` values only the trailing
        ``pad_length`` are kept; an empty ``x`` yields padding only.
    """
    values = np.asarray(x, dtype=float)
    t = np.full(pad_length, np.nan)
    # Number of trailing values that actually fit into the output
    keep = min(len(values), pad_length)
    if keep:
        # Guard needed: with keep == 0 the slice t[-0:] would be the whole array
        t[-keep:] = values[-keep:]
    return list(t)

# Out-of-fold predictions of an earlier model; the code below relies on the
# CSV having a 'customer_ID' and a 'target' column.
prediction = pd.read_csv('./model/lightGBM_raw_feature_20221203_113723/oof.csv')

# One fixed-length (13) NaN-left-padded list of predictions per customer.
# sort=False keeps customers in order of first appearance in the CSV.
groupped_prediction = (
    prediction
    .groupby('customer_ID', sort=False)['target']
    .agg(lambda x: pad_target(x, 13))
)

# Spread each list over 13 columns named target1 .. target13
series_features = pd.DataFrame(
    data=groupped_prediction.tolist(),
    columns=['target%s' % i for i in range(1, 14)],
)

series_features.head()

CPU times: user 15.5 s, sys: 1.38 s, total: 16.9 s
Wall time: 17.4 s


Unnamed: 0,target1,target2,target3,target4,target5,target6,target7,target8,target9,target10,target11,target12,target13
0,0.000312,0.000846,0.000457,0.000908,0.000592,0.000363,0.000133,0.00078,0.000708,0.001256,0.000798,0.000787,0.00063
1,0.00151,0.000402,0.002015,0.000976,0.00122,0.004671,0.003702,0.00225,0.000331,0.001708,0.000436,0.001962,0.001019
2,0.006588,0.007621,0.006027,0.003541,0.001041,0.003713,0.004462,0.001837,0.006575,0.002741,0.001576,0.004659,0.002296
3,0.214364,0.139801,0.093994,0.108052,0.058178,0.008384,0.00716,0.015105,0.008999,0.006733,0.010411,0.006849,0.00837
4,0.000319,0.000257,0.000307,0.000473,0.001009,0.012346,0.001206,0.000825,0.005284,0.002881,0.001475,0.001747,0.001578


In [26]:
%%time
# combine features
# pd.concat(axis=1) glues the frames together row by row on the index, so every
# input must list the same customers in the same order.
assert raw_feature[customer_id_column].equals(rank_feature[customer_id_column]), \
    'rank_feature customers differ from raw_feature'
assert raw_feature[customer_id_column].equals(l3_feature[customer_id_column]), \
    'l3_feature customers differ from raw_feature'
assert raw_feature[customer_id_column].equals(l6_feature[customer_id_column]), \
    'l6_feature customers differ from raw_feature'

# series_features carries no customer_ID column, so validate it through the
# grouped predictions it was built from (their index holds the customer ids).
assert len(series_features) == len(raw_feature), \
    'series_features row count differs from raw_feature'
assert (groupped_prediction.index.values == raw_feature[customer_id_column[0]].values).all(), \
    'series_features customer order differs from raw_feature'

# combine all the feature
all_features = pd.concat([raw_feature,
                          rank_feature.drop(columns=customer_id_column),
                          l3_feature.drop(columns=customer_id_column),
                          l6_feature.drop(columns=customer_id_column),
                          series_features], axis=1)
# release memory: drop the references to the source frames
del raw_feature
del rank_feature
del l3_feature
del l6_feature
del series_features

all_features.head(2)

CPU times: user 6.02 s, sys: 15.2 s, total: 21.2 s
Wall time: 28.1 s


Unnamed: 0,customer_ID,one_hot_B_30_-1_mean,one_hot_B_30_-1_std,one_hot_B_30_-1_sum,one_hot_B_30_0_mean,one_hot_B_30_0_std,one_hot_B_30_0_sum,one_hot_B_30_1_mean,one_hot_B_30_1_std,one_hot_B_30_1_sum,...,target4,target5,target6,target7,target8,target9,target10,target11,target12,target13
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.0,0,1.0,0.0,13,0.0,0.0,0,...,0.000908,0.000592,0.000363,0.000133,0.00078,0.000708,0.001256,0.000798,0.000787,0.00063
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.0,0,1.0,0.0,13,0.0,0.0,0,...,0.000976,0.00122,0.004671,0.003702,0.00225,0.000331,0.001708,0.000436,0.001962,0.001019


In [48]:
# Replace every feature column by its bin index, scaled by the largest index
for col in tqdm(all_features.columns):
    # id / date / label columns are not features and stay untouched
    if col in (customer_id_column[0], date_column[0], label[0]):
        continue
    # distinct values (ascending) and their counts feed the bin search
    vc = all_features[col].value_counts().sort_index()
    bins = GreedyFindBin(vc.index.values, vc.values, len(vc), 255, vc.sum())
    codes = np.digitize(all_features[col], [-np.inf] + bins)
    # anything placed past the last edge is folded into bin 0
    # (np.digitize also puts NaN there, since NaN sorts after every edge)
    codes[codes == len(bins) + 1] = 0
    # single write-back of the normalised codes
    all_features[col] = codes / codes.max()

100%|██████████| 5490/5490 [48:15<00:00,  1.90it/s] 


In [49]:
# Summarise row count, column count, dtypes and memory footprint of the combined frame
all_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 5490 entries, customer_ID to target13
dtypes: float64(5489), object(1)
memory usage: 18.8+ GB


In [50]:
%%time
# Persist the discretised feature frame so the training section can reload it
all_features.to_feather('./data/all_feature.feather')

CPU times: user 44.8 s, sys: 1min 39s, total: 2min 24s
Wall time: 1min 48s


## 1 - LGB parameter explanation

https://lightgbm.readthedocs.io/en/latest/Features.html

- epoch -> num_boost_round: number of boosting iterations
- ----
- early_stopping -> early_stopping_rounds: will stop training if one metric of one validation data doesn’t improve in last early_stopping_round rounds
- ----
- verbose_eval -> verbose_eval: run validation every verbose_eval rounds during training
- ----
- boosting -> gbdt, dart, goss
  - gbdt, the default type of boosting; stable and reliable, but prone to over-specialization, time-consuming and memory-consuming
  - dart, adds dropout during tree building; better accuracy and less prone to overfitting, but brings more settings to tune
  - goss, provides a new sampling method for GBDT that separates out the instances with larger gradients; converges faster (20x faster) but tends to overfit when the dataset is small
    - Gradient One-Side Sampling / EFB: Exclusive Feature Bundling
- ----
- max_depth -> limit the max depth for tree model. This is used to deal with over-fitting when #data is small. Tree still grows leaf-wise, <= 0 means no limit
- ----
- num_leaves ->  default = 31, constraints: 1 < num_leaves <= 131072, max number of leaves in one tree, num_leaves <= 2^(max_depth)
- ----
- bagging_freq -> frequency for bagging
  - 0 means disable bagging; k means perform bagging at every k iteration. 
  - Every k-th iteration, LightGBM will randomly select bagging_fraction * 100 % of the data to use for the next k iterations
- ----
- bagging_fraction
  - default = 1.0, constraints: 0.0 < bagging_fraction <= 1.0, like feature_fraction, but this will randomly select part of data without resampling
  - can be used to speed up training
  - can be used to deal with over-fitting
- ----
- feature_fraction
  - default = 1.0, type = double, aliases: sub_feature, colsample_bytree, constraints: 0.0 < feature_fraction <= 1.0
  - LightGBM will randomly select a subset of features on each iteration (tree) if feature_fraction is smaller than 1.0. For example, if you set it to 0.8, LightGBM will select 80% of features before training each tree
  - can be used to speed up training
  - can be used to deal with over-fitting
- ----
- min_data_in_leaf
  - default = 20, constraints: min_data_in_leaf >= 0
  - minimal number of data in one leaf. Can be used to deal with over-fitting
  - Note: this is an approximation based on the Hessian, so occasionally you may observe splits which produce leaf nodes that have less than this many observations
- ----
- max_bin
  - default = 255, type = int, aliases: max_bins, constraints: max_bin > 1
  - max number of bins that feature values will be bucketed in
  - a small number of bins may reduce training accuracy but may improve generalization (helps deal with over-fitting)
  - LightGBM will auto compress memory according to max_bin. For example, LightGBM will use uint8_t for feature value if max_bin=255
- ----
- min_data_in_bin
  - default = 3, type = int, constraints: min_data_in_bin > 0
  - minimal number of data inside one bin
  - use this to avoid one-data-one-bin (potential over-fitting)
- ----
- tree_learner
  - default = serial, type = enum
  - serial, single machine tree learner
  - feature, feature parallel tree learner, aliases: feature_parallel
  - data, data parallel tree learner, aliases: data_parallel
  - voting, voting parallel tree learner, aliases: voting_parallel
- ----
- boost_from_average
  - default = true, type = bool
  - used only in regression, binary, multiclassova and cross-entropy applications
  - adjusts initial score to the mean of labels for faster convergence
- ----
- lambda_l1 -> l1_regularization, constraints: lambda_l1 >= 0.0
- ----
- lambda_l2 -> l2_regularization, constraints: lambda_l2 >= 0.0
- ----
- num_threads, number of threads for LightGBM, for the best speed, set this to the number of **real** CPU cores
- ----
- verbosity -> controls the level of LightGBM’s verbosity
  - '<0': Fatal
  - '=0': Error (Warning)
  - '=1': Info
  - '>1': Debug
  
  
## 2 - LGB Model - raw feature + rank feature + last 3 & 6 feature

In [2]:
# Reload the feature frame written to disk by the feature-preparation section
all_features = pd.read_feather('./data/all_feature.feather')

In [3]:
%%time
# Read the training labels from the path defined once in the setup cell
labels = pd.read_csv(train_label_file)

# The concat below is positional, so features and labels must list the same
# customers in the same order.
assert all_features[customer_id_column].equals(labels[customer_id_column]), \
    'customer_ID order differs between features and labels'

# combine with label (customer_ID is dropped from labels to avoid a duplicate column)
all_features = pd.concat([all_features, labels.drop(columns=customer_id_column)], axis=1)

CPU times: user 12.5 s, sys: 53.3 s, total: 1min 5s
Wall time: 1min 48s


In [4]:
# Every column except the identifier, date and label is a model feature
all_feature_column = [
    name
    for name in all_features.columns
    if name not in (customer_id_column[0], date_column[0], label[0])
]

# Same list without the prediction-series columns (those containing 'target')
all_feature_without_series_feature = list(
    filter(lambda name: 'target' not in name, all_feature_column)
)

print(f"All feature length {len(all_feature_column)}, feature without series feature {len(all_feature_without_series_feature)}")

All feature length 5489, feature without series feature 5476


In [5]:
# define lightGBM config
# Model 1: all features except the prediction-series (target1..target13) columns.
# The top-level keys are consumed by LightGBMWrapper (not shown in this notebook);
# 'lgb_hyper_params' holds the LightGBM parameters themselves.
lgb_config = {
    'name': 'lightGBM_raw_feature',
    # NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
    'root_path':'/Users/qucy/Kaggle/amex_default_prediction/model/',
    'seed':42,
    'epoch':4500,            # maps to num_boost_round
    'early_stopping':100,    # maps to early_stopping_rounds
    'verbose_eval':50,       # was listed twice in this literal; one entry kept
    'n_folds':3,
    'features': all_feature_without_series_feature,
    'label': label,
    'id': customer_id_column,
    'remark': 'lgb_with_raw_rank_last36_features',
    'lgb_hyper_params':{
                  'objective' : 'binary',
                  'metric' : 'binary_logloss',
                  'boosting': 'goss', # dart
                  'max_depth' : -1,
                  'num_leaves' : 64,
                  'learning_rate' : 0.035,
                  # bagging is left disabled for this GOSS run
                  #'bagging_freq': 5,
                  #'bagging_fraction' : 0.75,
                  'feature_fraction' : 0.05,
                  'min_data_in_leaf': 256,
                  'max_bin': 63,
                  'min_data_in_bin': 256,
                  'tree_learner': 'serial',
                  'boost_from_average': 'false',
                  'lambda_l1' : 0.1,
                  'lambda_l2' : 30,
                  'num_threads': 12,
                  'verbosity' : 1,
    }
}
# construct lightGBM model
lightGBMModel_1 = LightGBMWrapper(lgb_config)

In [1]:
# Train model 1 on the combined feature + label frame
# (the training procedure itself lives in LightGBMWrapper.train, not shown here)
lightGBMModel_1.train(all_features)

## 3 - LGB Model - raw feature + rank feature + last 3 & 6 feature + series feature

In [None]:
# define lightGBM config
# Model 2: every feature column, including the prediction-series
# (target1..target13) columns. The top-level keys are consumed by
# LightGBMWrapper (not shown in this notebook); 'lgb_hyper_params' holds the
# LightGBM parameters themselves.
lgb_config = {
    'name': 'lightGBM_raw_feature',
    # NOTE(review): absolute, machine-specific path -- adjust when running elsewhere
    'root_path':'/Users/qucy/Kaggle/amex_default_prediction/model/',
    'seed':42,
    'epoch':4500,            # maps to num_boost_round
    'early_stopping':100,    # maps to early_stopping_rounds
    'verbose_eval':50,       # was listed twice in this literal; one entry kept
    'n_folds':5,
    # `model_features` was never defined in this notebook (NameError); the full
    # feature list built alongside all_feature_without_series_feature is the
    # one that contains the series features this section is about.
    'features': all_feature_column,
    'label': label,
    'id': customer_id_column,
    'remark': 'lgb_with_raw_rank_last36_series_features',
    'lgb_hyper_params':{
                  'objective' : 'binary',
                  'metric' : 'binary_logloss',
                  'boosting': 'dart',
                  'max_depth' : -1,
                  'num_leaves' : 64,
                  'learning_rate' : 0.035,
                  'bagging_freq': 5,
                  'bagging_fraction' : 0.75,
                  'feature_fraction' : 0.05,
                  'min_data_in_leaf': 256,
                  'max_bin': 63,
                  'min_data_in_bin': 256,
                  'tree_learner': 'serial',
                  'boost_from_average': 'false',
                  'lambda_l1' : 0.1,
                  'lambda_l2' : 30,
                  'num_threads': 12,
                  'verbosity' : 1,
    }
}
# construct lightGBM model
lightGBMModel_2 = LightGBMWrapper(lgb_config)

### e2-standard-8 -> 8 cores & 32 GB memory 
#### /var/log/syslog

- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.925901] [   1475]  1002  1475     3644      279    65536        0             0 sshd
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.934126] [   1476]  1002  1476     1437       99    49152        0             0 bash
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.942505] [   1479]     0  1479     2174       94    57344        0             0 sudo
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.950743] [   1480]     0  1480     2060       92    49152        0             0 su
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.958790] [   1481]     0  1481     1437      133    57344        0             0 bash
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.967005] [   5466]     0  5466 12549494  8083447 65392640        0             0 python3
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.975496] [   5482]     0  5482     2479      139    65536        0             0 top
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.983620] Out of memory: Kill process 5466 (python3) score 983 or sacrifice child
- Dec  6 12:50:27 ml-training-2 kernel: [ 2069.991485] Killed process 5466 (python3) total-vm:50197976kB, anon-rss:32333788kB, file-rss:0kB, shmem-rss:0kB
- Dec  6 12:50:27 ml-training-2 kernel: [ 2070.160062] oom_reaper: reaped process 5466 (python3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

### e2-standard-16 -> 16 cores & 64 GB memory 
#### /var/log/syslog

- Dec  6 12:59:24 ml-training-2 kernel: [  161.931784] [    777]  1002   777     3644      273    77824        0             0 sshd
- Dec  6 12:59:24 ml-training-2 kernel: [  161.940080] [    778]  1002   778     1437      102    49152        0             0 bash
- Dec  6 12:59:24 ml-training-2 kernel: [  161.948308] [    785]     0   785     2174       94    53248        0             0 sudo
- Dec  6 12:59:24 ml-training-2 kernel: [  161.956569] [    786]     0   786     2060       92    53248        0             0 su
- Dec  6 12:59:24 ml-training-2 kernel: [  161.964742] [    787]     0   787     1437      130    53248        0             0 bash
- Dec  6 12:59:24 ml-training-2 kernel: [  161.973126] [    795]     0   795 30816715 16287220 133640192        0             0 python3
- Dec  6 12:59:24 ml-training-2 kernel: [  161.981696] [    848]     0   848     2511      187    65536        0             0 top
- Dec  6 12:59:24 ml-training-2 kernel: [  161.989964] Out of memory: Kill process 795 (python3) score 989 or sacrifice child
- Dec  6 12:59:24 ml-training-2 kernel: [  161.997837] Killed process 795 (python3) total-vm:123266860kB, anon-rss:65148880kB, file-rss:0kB, shmem-rss:0kB
- Dec  6 12:59:24 ml-training-2 kernel: [  162.174943] oom_reaper: reaped process 795 (python3), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

### e2-highmem-16 -> 16 cores & 128 GB memory
#### top
- top - 13:14:03 up 8 min,  1 user,  load average: 13.60, 6.06, 2.42
- Tasks: 172 total,   2 running, 170 sleeping,   0 stopped,   0 zombie
- %Cpu0  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu1  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu2  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu3  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu4  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu5  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu6  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu7  : 99.7 us,  0.3 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu8  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu9  :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu10 :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu11 :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu12 :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu13 : 99.7 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.3 st
- %Cpu14 :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- %Cpu15 :100.0 us,  0.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  0.0 hi,  0.0 si,  0.0 st
- MiB Mem : 128935.3 total,  68043.5 free,  57208.8 used,   3683.0 buff/cache
- MiB Swap:      0.0 total,      0.0 free,      0.0 used.  70671.7 avail Mem 

#### GOSS - 3 fold - training time around 20 minutes
#### ================================Fold 0 start================================
 - 850 round - train_metric: 0.172627 - valid_metric: 0.214018
 - 900 round - train_metric: 0.170284 - valid_metric: 0.213907
 - 950 round - train_metric: 0.168035 - valid_metric: 0.213871
 - 1000 round - train_metric: 0.165821 - valid_metric: 0.213815
 - 1050 round - train_metric: 0.163621 - valid_metric: 0.213813
 - 1100 round - train_metric: 0.161421 - valid_metric: 0.213775
 - 0 amex metric: 0.800661
#### ================================Fold 0 End================================
#### ================================Fold 1 start================================
 - 950 round - train_metric: 0.166408 - valid_metric: 0.217907
 - 1000 round - train_metric: 0.164171 - valid_metric: 0.217843
 - 1050 round - train_metric: 0.161952 - valid_metric: 0.217863
 - 1100 round - train_metric: 0.159757 - valid_metric: 0.217860
 - 1150 round - train_metric: 0.157664 - valid_metric: 0.217832
 - 1200 round - train_metric: 0.155581 - valid_metric: 0.217803
 - 1250 round - train_metric: 0.153543 - valid_metric: 0.217839
 - 1 amex metric: 0.792522
#### ================================Fold 1 End================================
#### ================================Fold 2 start================================
 - 750 round - train_metric: 0.176491 - valid_metric: 0.216337
 - 800 round - train_metric: 0.174059 - valid_metric: 0.216257
 - 850 round - train_metric: 0.171673 - valid_metric: 0.216152
 - 900 round - train_metric: 0.169309 - valid_metric: 0.216053
 - 950 round - train_metric: 0.166995 - valid_metric: 0.216014
 - 1000 round - train_metric: 0.164729 - valid_metric: 0.216006
 - 1050 round - train_metric: 0.162479 - valid_metric: 0.215959
 - 1100 round - train_metric: 0.160311 - valid_metric: 0.215972
 - 1150 round - train_metric: 0.158167 - valid_metric: 0.215907
 - 1200 round - train_metric: 0.156091 - valid_metric: 0.215881
 - 2 amex metric: 0.792113
#### ================================Fold 2 End================================

### dart - 3 fold - training around 1 hour 40 minutes

#### ================================Fold 0 Start================================
 - 4200 round - train_metric: 0.108382 - valid_metric: 0.162283
 - 4250 round - train_metric: 0.107759 - valid_metric: 0.162193
 - 4300 round - train_metric: 0.107392 - valid_metric: 0.162166
 - 4350 round - train_metric: 0.106771 - valid_metric: 0.162095
 - 4400 round - train_metric: 0.106392 - valid_metric: 0.162083
 - 4450 round - train_metric: 0.105948 - valid_metric: 0.162027
 - 0 amex metric: 0.873338
#### ================================Fold 0 End================================
#### ================================Fold 1 Start================================
 - 4150 round - train_metric: 0.107695 - valid_metric: 0.166027
 - 4200 round - train_metric: 0.107197 - valid_metric: 0.165962
 - 4250 round - train_metric: 0.106564 - valid_metric: 0.165871
 - 4300 round - train_metric: 0.106195 - valid_metric: 0.165804
 - 4350 round - train_metric: 0.105530 - valid_metric: 0.165641
 - 4400 round - train_metric: 0.105169 - valid_metric: 0.165630
 - 4450 round - train_metric: 0.104718 - valid_metric: 0.165578
 - 1 amex metric: 0.866628
#### ================================Fold 1 End================================
#### ================================Fold 2 Start================================
 - 4200 round - train_metric: 0.107762 - valid_metric: 0.165464
 - 4250 round - train_metric: 0.107101 - valid_metric: 0.165406
 - 4300 round - train_metric: 0.106763 - valid_metric: 0.165398
 - 4350 round - train_metric: 0.106107 - valid_metric: 0.165281
 - 4400 round - train_metric: 0.105713 - valid_metric: 0.165204
 - 4450 round - train_metric: 0.105255 - valid_metric: 0.165125
 - 2 amex metric: 0.866922
#### ================================Fold 2 End================================
 - All mean metric:0.868963, global metric:0.868536