# 1 -denoise data

In [4]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import gc,os,random
import time,datetime
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm
from multiprocessing import Pool as ProcessPool

from greedyFindBin import GreedyFindBin
from lightGBMWrapper import LightGBMWrapper
from featureEngineering import *

customer_id_column = ['customer_ID']
date_column = ['S_2']
categorical_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
non_numerical_features = customer_id_column + date_column + categorical_features
label = ['target']


resized_train_file = './data/train_x_resized.feather'
resized_test_file = './data/test_x_resized.feather'

denoised_train_file = './data/train_x_denoised.feather'
denoised_test_file = './data/test_x_denoised.feather'

train_label_file = './data/amex-default-prediction/train_labels.csv'

In [5]:
t = {x : np.int8 for x in categorical_features}

def denoise(df):
    df['D_63'] = df['D_63'].map({'CR':0, 'XZ':1, 'XM':2, 'CO':3, 'CL':4, 'XL':5})
    df['D_64'] = df['D_64'].map({None:-1, 'O':0, '-1':1, 'R':2, 'U':3})
    df[categorical_features] = df[categorical_features].fillna(-1)
    df[categorical_features] = df[categorical_features].astype(t)
    for col in tqdm(df.columns):
        if col not in non_numerical_features:
            # fill na
            if df[col].max() * 100 > np.finfo(np.float16).max:
                df[col] = df[col].fillna(0).astype(np.float32)
            else:
                df[col] = df[col].fillna(0)
            # remove noise
            df[col] = np.floor(df[col] * 100).astype(np.int16)
    return df

In [6]:
# denoise training data
if not os.path.isfile(denoised_train_file):
    train = pd.read_feather(resized_train_file)
    train = denoise(train)
    train.to_feather(denoised_train_file)
    del train

In [7]:
# denoise testing data
if not os.path.isfile(denoised_test_file):
    test = pd.read_feather(resized_test_file)
    test = denoise(test)
    test.to_feather(denoised_test_file)
    del test

In [8]:
%%time
# load denoised data
train_x = pd.read_feather(denoised_train_file)
# loading label file
train_y = pd.read_csv(train_label_file)
# join x with y
training = pd.merge(train_x, train_y, how='inner', on=['customer_ID'])
training.head(2)

CPU times: user 8.28 s, sys: 14.6 s, total: 22.9 s
Wall time: 16.7 s


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,93,0,0,100,0,12,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,93,0,0,100,0,12,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# numerical_features
numerical_features = [x for x in train_x.columns if x not in non_numerical_features]

# 2 - feature engineering

In [10]:
# normalization
training[numerical_features] = training[numerical_features] / 100

## 2.1 raw feature handling

In [104]:
# normal dataframe
original_features_one_hotted = one_hot_encoding(training, categorical_features, is_drop=False)
original_features_categorical_statistic = category_feature_statistic(original_features_one_hotted, categorical_features, calc_last=False)
original_features_numerical_statistic = numerical_feature_statistic(original_features_one_hotted, numerical_features, calc_last=False)
original_features_diff_statistic = diff_features(original_features_one_hotted, numerical_features, calc_last=False)

one hot encoding: B_30
one hot encoding: B_38
one hot encoding: D_114
one hot encoding: D_116
one hot encoding: D_117
one hot encoding: D_120
one hot encoding: D_126
one hot encoding: D_63
one hot encoding: D_64
one hot encoding: D_66
one hot encoding: D_68
one hot cols:['one_hot_B_30_-1', 'one_hot_B_30_0', 'one_hot_B_30_1', 'one_hot_B_30_2', 'one_hot_B_38_-1', 'one_hot_B_38_1', 'one_hot_B_38_2', 'one_hot_B_38_3', 'one_hot_B_38_4', 'one_hot_B_38_5', 'one_hot_B_38_6', 'one_hot_B_38_7', 'one_hot_D_114_-1', 'one_hot_D_114_0', 'one_hot_D_114_1', 'one_hot_D_116_-1', 'one_hot_D_116_0', 'one_hot_D_116_1', 'one_hot_D_117_-1', 'one_hot_D_117_1', 'one_hot_D_117_2', 'one_hot_D_117_3', 'one_hot_D_117_4', 'one_hot_D_117_5', 'one_hot_D_117_6', 'one_hot_D_120_-1', 'one_hot_D_120_0', 'one_hot_D_120_1', 'one_hot_D_126_-1', 'one_hot_D_126_0', 'one_hot_D_126_1', 'one_hot_D_63_0', 'one_hot_D_63_1', 'one_hot_D_63_2', 'one_hot_D_63_3', 'one_hot_D_63_4', 'one_hot_D_63_5', 'one_hot_D_64_-1', 'one_hot_D_64_0',

### 2.1.1 categorical feature statistic - mean, std, sum, last

In [106]:
original_features_categorical_statistic.head(2)

Unnamed: 0,customer_ID,one_hot_B_30_-1_mean,one_hot_B_30_-1_std,one_hot_B_30_-1_sum,one_hot_B_30_0_mean,one_hot_B_30_0_std,one_hot_B_30_0_sum,one_hot_B_30_1_mean,one_hot_B_30_1_std,one_hot_B_30_1_sum,...,D_114_nunique,D_116_nunique,D_117_nunique,D_120_nunique,D_126_nunique,D_63_nunique,D_64_nunique,D_66_nunique,D_68_nunique,S_2_count
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.0,0,1.0,0.0,13,0.0,0.0,0,...,1,1,1,1,1,1,1,1,1,13
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.0,0,1.0,0.0,13,0.0,0.0,0,...,1,1,1,2,1,1,1,1,1,13


### 2.1.2 numerical feature statistic - mean, std, max, min, sum, last

In [107]:
original_features_numerical_statistic.head(2)

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_sum,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_144_mean,D_144_std,D_144_min,D_144_max,D_144_sum,D_145_mean,D_145_std,D_145_min,D_145_max,D_145_sum
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.928462,0.026092,0.86,0.96,12.07,0.006923,0.024962,0.0,0.09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.895385,0.020662,0.86,0.92,11.64,0.210769,0.198472,0.0,0.56,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [133]:
original_features_numerical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 886 entries, customer_ID to D_145_sum
dtypes: float64(885), object(1)
memory usage: 3.0+ GB


In [134]:
# convert to float16 to save memory
for c in tqdm(original_features_numerical_statistic.columns[1:]):
    original_features_numerical_statistic[c] = original_features_numerical_statistic[c].astype(np.float16)

100%|██████████| 885/885 [15:55<00:00,  1.08s/it]


In [135]:
# after conversion
original_features_numerical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 886 entries, customer_ID to D_145_sum
dtypes: float16(885), object(1)
memory usage: 778.1+ MB


### 2.1.3 numerical feature - differ statistic - mean, std, min, max, sum, last

In [108]:
original_features_diff_statistic.head(2)

Unnamed: 0,customer_ID,diff_P_2_mean,diff_P_2_std,diff_P_2_min,diff_P_2_max,diff_P_2_sum,diff_P_2_last,diff_D_39_mean,diff_D_39_std,diff_D_39_min,...,diff_D_144_min,diff_D_144_max,diff_D_144_sum,diff_D_144_last,diff_D_145_mean,diff_D_145_std,diff_D_145_min,diff_D_145_max,diff_D_145_sum,diff_D_145_last
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.035675,-0.09,0.04,0.0,0.0,0.0,0.038376,-0.09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,-0.003333,0.016143,-0.04,0.02,-0.04,0.01,-0.0175,0.302388,-0.38,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
original_features_diff_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 1063 entries, customer_ID to diff_D_145_last
dtypes: float64(1062), object(1)
memory usage: 3.6+ GB


In [137]:
# convert to float16 to save memory
for c in tqdm(original_features_diff_statistic.columns[1:]):
    original_features_diff_statistic[c] = original_features_diff_statistic[c].astype(np.float16)

100%|██████████| 1062/1062 [22:26<00:00,  1.27s/it]


In [138]:
original_features_diff_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 1063 entries, customer_ID to diff_D_145_last
dtypes: float16(1062), object(1)
memory usage: 933.1+ MB


### 2.1.4 concat all the statistics calculate from raw feature

In [151]:
original_features_diff_statistic.columns

Index(['customer_ID', 'diff_P_2_mean', 'diff_P_2_std', 'diff_P_2_min',
       'diff_P_2_max', 'diff_P_2_sum', 'diff_P_2_last', 'diff_D_39_mean',
       'diff_D_39_std', 'diff_D_39_min',
       ...
       'diff_D_144_min', 'diff_D_144_max', 'diff_D_144_sum', 'diff_D_144_last',
       'diff_D_145_mean', 'diff_D_145_std', 'diff_D_145_min', 'diff_D_145_max',
       'diff_D_145_sum', 'diff_D_145_last'],
      dtype='object', length=1063)

In [152]:
assert original_features_categorical_statistic[customer_id_column].equals(original_features_numerical_statistic[customer_id_column]) == True
assert original_features_categorical_statistic[customer_id_column].equals(original_features_diff_statistic[customer_id_column]) == True

raw_feature_statistic = pd.concat([original_features_categorical_statistic, 
                                   original_features_numerical_statistic.drop(columns=customer_id_column), 
                                   original_features_diff_statistic.drop(columns=customer_id_column)], axis=1)

raw_feature_statistic.head(2)

Unnamed: 0,customer_ID,one_hot_B_30_-1_mean,one_hot_B_30_-1_std,one_hot_B_30_-1_sum,one_hot_B_30_0_mean,one_hot_B_30_0_std,one_hot_B_30_0_sum,one_hot_B_30_1_mean,one_hot_B_30_1_std,one_hot_B_30_1_sum,...,diff_D_144_min,diff_D_144_max,diff_D_144_sum,diff_D_144_last,diff_D_145_mean,diff_D_145_std,diff_D_145_min,diff_D_145_max,diff_D_145_sum,diff_D_145_last
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.0,0,1.0,0.0,13,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.0,0,1.0,0.0,13,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [153]:
raw_feature_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 2119 entries, customer_ID to diff_D_145_last
dtypes: float16(1947), float64(106), int64(12), object(1), uint8(53)
memory usage: 2.1+ GB


In [154]:
# write to disk
raw_feature_statistic.to_feather('./data/raw_feature_statistic.feather')

# release memory
del original_features_categorical_statistic
del original_features_numerical_statistic
del original_features_diff_statistic
del raw_feature_statistic

## 2.2 ranking feature handling

### 2.2.1 customer level rank - customer compare with his historical features

In [155]:
customer_ranking_features = customer_rankings_features(training, numerical_features)

customer_ranking_features.head(2)

Unnamed: 0,customer_ID,rank_P_2,rank_D_39,rank_B_1,rank_B_2,rank_R_1,rank_S_3,rank_D_41,rank_B_3,rank_D_42,...,rank_D_136,rank_D_137,rank_D_138,rank_D_139,rank_D_140,rank_D_141,rank_D_142,rank_D_143,rank_D_144,rank_D_145
12,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.423077,0.5,0.269231,0.5,0.538462,1.0,0.538462,0.538462,0.538462,...,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462
25,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.346154,0.461538,0.807692,0.576923,0.538462,1.0,0.538462,0.461538,0.538462,...,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462,0.538462,0.5,0.538462,0.538462


In [156]:
customer_ranking_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458913 entries, 12 to 5531450
Columns: 178 entries, customer_ID to rank_D_145
dtypes: float64(177), object(1)
memory usage: 626.7+ MB


In [163]:
%%time
c_ranking_desc = customer_ranking_features.describe()

c_ranking_desc.T

CPU times: user 2.19 s, sys: 183 ms, total: 2.37 s
Wall time: 2.38 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rank_P_2,458913.0,0.555308,0.316789,0.076923,0.269231,0.555556,0.846154,1.0
rank_D_39,458913.0,0.576260,0.246920,0.076923,0.384615,0.538462,0.769231,1.0
rank_B_1,458913.0,0.579572,0.280572,0.076923,0.384615,0.538462,0.846154,1.0
rank_B_2,458913.0,0.519477,0.243522,0.076923,0.384615,0.538462,0.653846,1.0
rank_R_1,458913.0,0.565179,0.156579,0.076923,0.500000,0.538462,0.538462,1.0
...,...,...,...,...,...,...,...,...
rank_D_141,458913.0,0.582584,0.123586,0.076923,0.538462,0.538462,0.545455,1.0
rank_D_142,458913.0,0.615039,0.166681,0.076923,0.538462,0.538462,0.555556,1.0
rank_D_143,458913.0,0.553040,0.087113,0.076923,0.538462,0.538462,0.538462,1.0
rank_D_144,458913.0,0.569257,0.117179,0.076923,0.538462,0.538462,0.538462,1.0


In [165]:
# convert to float16 to save memory
for c in tqdm(customer_ranking_features.columns[1:]):
    customer_ranking_features[c] = customer_ranking_features[c].astype(np.float16)

100%|██████████| 177/177 [00:35<00:00,  4.98it/s]


In [166]:
customer_ranking_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458913 entries, 12 to 5531450
Columns: 178 entries, customer_ID to rank_D_145
dtypes: float16(177), object(1)
memory usage: 161.9+ MB


### 2.2.2 year-month level rank - customer compare with the other customer at the same year-month

In [158]:
year_month_ranking_features = year_month_ranking_features(training, numerical_features)

year_month_ranking_features.head(2)

Unnamed: 0,customer_ID,ym_rank_P_2,ym_rank_D_39,ym_rank_B_1,ym_rank_B_2,ym_rank_R_1,ym_rank_S_3,ym_rank_D_41,ym_rank_B_3,ym_rank_D_42,...,ym_rank_D_136,ym_rank_D_137,ym_rank_D_138,ym_rank_D_139,ym_rank_D_140,ym_rank_D_141,ym_rank_D_142,ym_rank_D_143,ym_rank_D_144,ym_rank_D_145
12,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.888668,0.261363,0.143238,0.833268,0.421766,0.421881,0.41682,0.243286,0.440388,...,0.486787,0.499838,0.494942,0.41242,0.487954,0.412431,0.41408,0.412482,0.454187,0.412427
25,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.79371,0.699167,0.50785,0.833268,0.421766,0.593921,0.41682,0.243286,0.440388,...,0.486787,0.499838,0.494942,0.41242,0.487954,0.412431,0.41408,0.412482,0.454187,0.412427


In [167]:
year_month_ranking_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458913 entries, 12 to 5531450
Columns: 178 entries, customer_ID to ym_rank_D_145
dtypes: float64(177), object(1)
memory usage: 626.7+ MB


In [168]:
%%time
ym_ranking_desc = year_month_ranking_features.describe()

ym_ranking_desc.T

CPU times: user 2.19 s, sys: 197 ms, total: 2.38 s
Wall time: 2.39 s


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
ym_rank_P_2,458913.0,0.500001,0.288651,0.000002,0.250550,0.494909,0.755668,0.999097
ym_rank_D_39,458913.0,0.500001,0.267109,0.261363,0.261363,0.261363,0.751192,1.000000
ym_rank_B_1,458913.0,0.500001,0.285123,0.000002,0.143238,0.507850,0.751133,0.999192
ym_rank_B_2,458913.0,0.500001,0.282500,0.029639,0.255368,0.538009,0.833268,0.988865
ym_rank_R_1,458913.0,0.500001,0.182399,0.421766,0.421766,0.421766,0.421766,0.999999
...,...,...,...,...,...,...,...,...
ym_rank_D_141,458913.0,0.500001,0.191210,0.412431,0.412431,0.412431,0.412431,1.000000
ym_rank_D_142,458913.0,0.500001,0.190366,0.000010,0.414080,0.414080,0.414080,1.000000
ym_rank_D_143,458913.0,0.500001,0.190234,0.412482,0.412482,0.412482,0.412482,0.993807
ym_rank_D_144,458913.0,0.500001,0.144472,0.454187,0.454187,0.454187,0.454187,0.999696


In [169]:
# convert to float16 to save memory
for c in tqdm(year_month_ranking_features.columns[1:]):
    year_month_ranking_features[c] = year_month_ranking_features[c].astype(np.float16)

100%|██████████| 177/177 [00:36<00:00,  4.89it/s]


In [170]:
year_month_ranking_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 458913 entries, 12 to 5531450
Columns: 178 entries, customer_ID to ym_rank_D_145
dtypes: float16(177), object(1)
memory usage: 161.9+ MB


In [174]:
assert customer_ranking_features[customer_id_column].equals(year_month_ranking_features[customer_id_column]) == True

rank_feature = pd.concat([customer_ranking_features, year_month_ranking_features.drop(columns=customer_id_column)], axis=1).reset_index()

rank_feature.head(2)

Unnamed: 0,index,customer_ID,rank_P_2,rank_D_39,rank_B_1,rank_B_2,rank_R_1,rank_S_3,rank_D_41,rank_B_3,...,ym_rank_D_136,ym_rank_D_137,ym_rank_D_138,ym_rank_D_139,ym_rank_D_140,ym_rank_D_141,ym_rank_D_142,ym_rank_D_143,ym_rank_D_144,ym_rank_D_145
0,12,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.423096,0.5,0.269287,0.5,0.538574,1.0,0.538574,0.538574,...,0.486816,0.499756,0.494873,0.412354,0.488037,0.412354,0.414062,0.412598,0.454102,0.412354
1,25,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.346191,0.461426,0.807617,0.577148,0.538574,1.0,0.538574,0.461426,...,0.486816,0.499756,0.494873,0.412354,0.488037,0.412354,0.414062,0.412598,0.454102,0.412354


In [175]:
rank_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 356 entries, index to ym_rank_D_145
dtypes: float16(354), int64(1), object(1)
memory usage: 316.9+ MB


In [176]:
# write to disk
rank_feature.to_feather('./data/rank_feature.feather')

# release memory
del customer_ranking_features
del year_month_ranking_features
del rank_feature

## 2.3 latest 3 and latest 6 transaction related feature

### 2.3.1 latest 3 transaction feature

In [177]:
# retrieve latest 3 transactions for each customer
l3_txn = retrieve_last_n_transactions_per_customer(training, n=3)
# one hot
l3_txn_one_hotted = one_hot_encoding(l3_txn, categorical_features, is_drop=False)
# categorical feature statistic
l3_txn__features_categorical_statistic = category_feature_statistic(l3_txn_one_hotted, categorical_features, calc_last=False)
# numerical statisitc
l3_txn__features_numerical_statistic = numerical_feature_statistic(l3_txn, numerical_features, calc_last=False)
# diff statistic
l3_txn__features_diff_statistic = diff_features(l3_txn, numerical_features, calc_last=False)

one hot encoding: B_30
one hot encoding: B_38
one hot encoding: D_114
one hot encoding: D_116
one hot encoding: D_117
one hot encoding: D_120
one hot encoding: D_126
one hot encoding: D_63
one hot encoding: D_64
one hot encoding: D_66
one hot encoding: D_68
one hot cols:['one_hot_B_30_-1', 'one_hot_B_30_0', 'one_hot_B_30_1', 'one_hot_B_30_2', 'one_hot_B_38_-1', 'one_hot_B_38_1', 'one_hot_B_38_2', 'one_hot_B_38_3', 'one_hot_B_38_4', 'one_hot_B_38_5', 'one_hot_B_38_6', 'one_hot_B_38_7', 'one_hot_D_114_-1', 'one_hot_D_114_0', 'one_hot_D_114_1', 'one_hot_D_116_-1', 'one_hot_D_116_0', 'one_hot_D_116_1', 'one_hot_D_117_-1', 'one_hot_D_117_1', 'one_hot_D_117_2', 'one_hot_D_117_3', 'one_hot_D_117_4', 'one_hot_D_117_5', 'one_hot_D_117_6', 'one_hot_D_120_-1', 'one_hot_D_120_0', 'one_hot_D_120_1', 'one_hot_D_126_-1', 'one_hot_D_126_0', 'one_hot_D_126_1', 'one_hot_D_63_0', 'one_hot_D_63_1', 'one_hot_D_63_2', 'one_hot_D_63_3', 'one_hot_D_63_4', 'one_hot_D_63_5', 'one_hot_D_64_-1', 'one_hot_D_64_0',

### categorical statistic

In [178]:
l3_txn__features_categorical_statistic.head(2)

Unnamed: 0,customer_ID,one_hot_B_30_-1_mean,one_hot_B_30_-1_std,one_hot_B_30_-1_sum,one_hot_B_30_0_mean,one_hot_B_30_0_std,one_hot_B_30_0_sum,one_hot_B_30_1_mean,one_hot_B_30_1_std,one_hot_B_30_1_sum,...,D_114_nunique,D_116_nunique,D_117_nunique,D_120_nunique,D_126_nunique,D_63_nunique,D_64_nunique,D_66_nunique,D_68_nunique,S_2_count
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.0,0,1.0,0.0,3,0.0,0.0,0,...,1,1,1,1,1,1,1,1,1,3
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.0,0,1.0,0.0,3,0.0,0.0,0,...,1,1,1,2,1,1,1,1,1,3


In [186]:
l3_txn__features_categorical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 172 entries, customer_ID to S_2_count
dtypes: float64(106), int64(12), object(1), uint8(53)
memory usage: 439.8+ MB


In [190]:
# rename columns avoid conflict
renamed_columns = customer_id_column + ['l3_' + t for t in l3_txn__features_categorical_statistic.columns[1:]]

l3_txn__features_categorical_statistic.columns = renamed_columns

In [191]:
l3_txn__features_categorical_statistic.head(2)

Unnamed: 0,customer_ID,l3_one_hot_B_30_-1_mean,l3_one_hot_B_30_-1_std,l3_one_hot_B_30_-1_sum,l3_one_hot_B_30_0_mean,l3_one_hot_B_30_0_std,l3_one_hot_B_30_0_sum,l3_one_hot_B_30_1_mean,l3_one_hot_B_30_1_std,l3_one_hot_B_30_1_sum,...,l3_D_114_nunique,l3_D_116_nunique,l3_D_117_nunique,l3_D_120_nunique,l3_D_126_nunique,l3_D_63_nunique,l3_D_64_nunique,l3_D_66_nunique,l3_D_68_nunique,l3_S_2_count
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.0,0,1.0,0.0,3,0.0,0.0,0,...,1,1,1,1,1,1,1,1,1,3
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.0,0,1.0,0.0,3,0.0,0.0,0,...,1,1,1,2,1,1,1,1,1,3


### numerical statistic

In [192]:
l3_txn__features_numerical_statistic.head(2)

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_sum,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_144_mean,D_144_std,D_144_min,D_144_max,D_144_sum,D_145_mean,D_145_std,D_145_min,D_145_max,D_145_sum
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.92,0.017321,0.9,0.93,2.76,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.876667,0.005774,0.87,0.88,2.63,0.303333,0.197315,0.17,0.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [193]:
l3_txn__features_numerical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 886 entries, customer_ID to D_145_sum
dtypes: float64(885), object(1)
memory usage: 3.0+ GB


In [197]:
for c in tqdm(l3_txn__features_numerical_statistic.columns[1:]):
    l3_txn__features_numerical_statistic[c] = l3_txn__features_numerical_statistic[c].astype(np.float16)

100%|██████████| 885/885 [15:26<00:00,  1.05s/it]


In [198]:
# rename columns avoid conflict
renamed_columns = customer_id_column + ['l3_' + t for t in l3_txn__features_numerical_statistic.columns[1:]]

l3_txn__features_numerical_statistic.columns = renamed_columns

l3_txn__features_numerical_statistic.head(2)

Unnamed: 0,customer_ID,l3_P_2_mean,l3_P_2_std,l3_P_2_min,l3_P_2_max,l3_P_2_sum,l3_D_39_mean,l3_D_39_std,l3_D_39_min,l3_D_39_max,...,l3_D_144_mean,l3_D_144_std,l3_D_144_min,l3_D_144_max,l3_D_144_sum,l3_D_145_mean,l3_D_145_std,l3_D_145_min,l3_D_145_max,l3_D_145_sum
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.919922,0.017319,0.899902,0.930176,2.759766,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.876465,0.005772,0.870117,0.879883,2.630859,0.303223,0.197266,0.170044,0.529785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
l3_txn__features_numerical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 886 entries, customer_ID to l3_D_145_sum
dtypes: float16(885), object(1)
memory usage: 778.1+ MB


### diff statistic

In [202]:
l3_txn__features_diff_statistic.head(2)

Unnamed: 0,customer_ID,diff_P_2_mean,diff_P_2_std,diff_P_2_min,diff_P_2_max,diff_P_2_sum,diff_P_2_last,diff_D_39_mean,diff_D_39_std,diff_D_39_min,...,diff_D_144_min,diff_D_144_max,diff_D_144_sum,diff_D_144_last,diff_D_145_mean,diff_D_145_std,diff_D_145_min,diff_D_145_max,diff_D_145_sum,diff_D_145_last
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.015,0.021213,0.0,0.03,0.03,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.014142,-0.01,0.01,0.0,0.01,-0.02,0.480833,-0.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [203]:
l3_txn__features_diff_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 1063 entries, customer_ID to diff_D_145_last
dtypes: float64(1062), object(1)
memory usage: 3.6+ GB


In [204]:
for c in tqdm(l3_txn__features_diff_statistic.columns[1:]):
    l3_txn__features_diff_statistic[c] = l3_txn__features_diff_statistic[c].astype(np.float16)

100%|██████████| 1062/1062 [2:44:21<00:00,  9.29s/it]     


In [205]:
# rename columns avoid conflict
renamed_columns = customer_id_column + ['l3_' + t for t in l3_txn__features_diff_statistic.columns[1:]]

l3_txn__features_diff_statistic.columns = renamed_columns

l3_txn__features_diff_statistic.head(2)

Unnamed: 0,customer_ID,l3_diff_P_2_mean,l3_diff_P_2_std,l3_diff_P_2_min,l3_diff_P_2_max,l3_diff_P_2_sum,l3_diff_P_2_last,l3_diff_D_39_mean,l3_diff_D_39_std,l3_diff_D_39_min,...,l3_diff_D_144_min,l3_diff_D_144_max,l3_diff_D_144_sum,l3_diff_D_144_last,l3_diff_D_145_mean,l3_diff_D_145_std,l3_diff_D_145_min,l3_diff_D_145_max,l3_diff_D_145_sum,l3_diff_D_145_last
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.014999,0.02121,0.0,0.029999,0.029999,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.014145,-0.010002,0.010002,0.0,0.010002,-0.020004,0.480713,-0.360107,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [206]:
l3_txn__features_diff_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 1063 entries, customer_ID to l3_diff_D_145_last
dtypes: float16(1062), object(1)
memory usage: 933.1+ MB


### concat features and save to disk

In [207]:
assert l3_txn__features_categorical_statistic[customer_id_column].equals(l3_txn__features_numerical_statistic[customer_id_column]) == True
assert l3_txn__features_categorical_statistic[customer_id_column].equals(l3_txn__features_diff_statistic[customer_id_column]) == True

l3_feature_statistic = pd.concat([l3_txn__features_categorical_statistic, 
                                  l3_txn__features_numerical_statistic.drop(columns=customer_id_column), 
                                  l3_txn__features_diff_statistic.drop(columns=customer_id_column)], axis=1)

l3_feature_statistic.head(2)

Unnamed: 0,customer_ID,l3_one_hot_B_30_-1_mean,l3_one_hot_B_30_-1_std,l3_one_hot_B_30_-1_sum,l3_one_hot_B_30_0_mean,l3_one_hot_B_30_0_std,l3_one_hot_B_30_0_sum,l3_one_hot_B_30_1_mean,l3_one_hot_B_30_1_std,l3_one_hot_B_30_1_sum,...,l3_diff_D_144_min,l3_diff_D_144_max,l3_diff_D_144_sum,l3_diff_D_144_last,l3_diff_D_145_mean,l3_diff_D_145_std,l3_diff_D_145_min,l3_diff_D_145_max,l3_diff_D_145_sum,l3_diff_D_145_last
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.0,0.0,0,1.0,0.0,3,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.0,0.0,0,1.0,0.0,3,0.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [208]:
l3_feature_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 2119 entries, customer_ID to l3_diff_D_145_last
dtypes: float16(1947), float64(106), int64(12), object(1), uint8(53)
memory usage: 2.1+ GB


In [209]:
# write to disk
l3_feature_statistic.to_feather('./data/l3_statistic.feather')

# release memory
del l3_txn__features_categorical_statistic
del l3_txn__features_numerical_statistic
del l3_txn__features_diff_statistic
del l3_feature_statistic

### 2.3.2 latest 6 transaction feature

In [210]:
# latest 6 transactions features
l6_txn = retrieve_last_n_transactions_per_customer(training, n=6)
l6_txn_features_numerical_statistic = numerical_feature_statistic(l6_txn, numerical_features, calc_last=False)
l6_txn_features_numerical_statistic.head()

Unnamed: 0,customer_ID,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_sum,D_39_mean,D_39_std,D_39_min,D_39_max,...,D_144_mean,D_144_std,D_144_min,D_144_max,D_144_sum,D_145_mean,D_145_std,D_145_min,D_145_max,D_145_sum
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.913333,0.031411,0.86,0.95,5.48,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.878333,0.017224,0.86,0.91,5.27,0.206667,0.194182,0.0,0.53,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0.865,0.03937,0.79,0.9,5.19,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0.613333,0.008165,0.6,0.62,3.68,0.005,0.012247,0.0,0.03,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0.868333,0.007528,0.86,0.88,5.21,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [211]:
l6_txn_features_numerical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 886 entries, customer_ID to D_145_sum
dtypes: float64(885), object(1)
memory usage: 3.0+ GB


In [212]:
for c in tqdm(l6_txn_features_numerical_statistic.columns[1:]):
    l6_txn_features_numerical_statistic[c] = l6_txn_features_numerical_statistic[c].astype(np.float16)

100%|██████████| 885/885 [15:32<00:00,  1.05s/it]


In [213]:
l6_txn_features_numerical_statistic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 886 entries, customer_ID to D_145_sum
dtypes: float16(885), object(1)
memory usage: 778.1+ MB


In [214]:
# rename columns avoid conflict
renamed_columns = customer_id_column + ['l6_' + t for t in l6_txn_features_numerical_statistic.columns[1:]]

l6_txn_features_numerical_statistic.columns = renamed_columns

l6_txn_features_numerical_statistic.head(2)

Unnamed: 0,customer_ID,l6_P_2_mean,l6_P_2_std,l6_P_2_min,l6_P_2_max,l6_P_2_sum,l6_D_39_mean,l6_D_39_std,l6_D_39_min,l6_D_39_max,...,l6_D_144_mean,l6_D_144_std,l6_D_144_min,l6_D_144_max,l6_D_144_sum,l6_D_145_mean,l6_D_145_std,l6_D_145_min,l6_D_145_max,l6_D_145_sum
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0.913574,0.031403,0.859863,0.950195,5.480469,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0.878418,0.017227,0.859863,0.910156,5.269531,0.206665,0.194214,0.0,0.529785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [215]:
# write to disk
l6_txn_features_numerical_statistic.to_feather('./data/l6_statistic.feather')

# release memory
del l6_txn_features_numerical_statistic

## 2.4 Series feature

- Train a lightGBM with raw feature + onehot encoding
- Using prediction as series input for up coming models

In [265]:
# reload training
#del training
training = pd.merge(train_x, train_y, how='inner', on=['customer_ID'])
training.head(2)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,93,0,0,100,0,12,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,93,0,0,100,0,12,0,0,...,0,0,0,0,0,0,0,0,0,0


In [222]:
# normalization
training[numerical_features] = training[numerical_features] / 100

In [11]:
# convert to float16 to save memory
for c in tqdm(numerical_features):
    training[c] = training[c].astype(np.float16)

100%|██████████| 177/177 [00:16<00:00, 10.99it/s]


In [12]:
training_onehot = one_hot_encoding(training, categorical_features, is_drop=True)
training_onehot.head()

one hot encoding: B_30
one hot encoding: B_38
one hot encoding: D_114
one hot encoding: D_116
one hot encoding: D_117
one hot encoding: D_120
one hot encoding: D_126
one hot encoding: D_63
one hot encoding: D_64
one hot encoding: D_66
one hot encoding: D_68


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,one_hot_D_66_0,one_hot_D_66_1,one_hot_D_68_-1,one_hot_D_68_0,one_hot_D_68_1,one_hot_D_68_2,one_hot_D_68_3,one_hot_D_68_4,one_hot_D_68_5,one_hot_D_68_6
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.930176,0.0,0.0,1.0,0.0,0.119995,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.930176,0.0,0.0,1.0,0.0,0.119995,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.950195,0.090027,0.020004,1.009766,0.0,0.119995,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.959961,0.0,0.010002,1.0,0.0,0.109985,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.939941,0.0,0.010002,1.0,0.0,0.109985,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
training_onehot.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 233 entries, customer_ID to one_hot_D_68_6
dtypes: float16(177), int64(1), object(2), uint8(53)
memory usage: 2.3+ GB


In [14]:
model_features = [c for c in training_onehot.columns if c not in (customer_id_column + date_column + label)]

In [19]:
# define lightGBM config
lgb_config = {
    'name': 'lightGBM_raw_feature',
    'root_path':'/Users/qucy/Kaggle/amex_default_prediction/model/',
    'seed':42,
    'epoch':4500,
    'early_stopping':100,
    'verbose_eval':50,
    'n_folds':5,
    'features': model_features,
    'label': label,
    'id': customer_id_column,
    'verbose_eval': 50,
    'remark': 'lgb_with_numerical_feature',
    'lgb_hyper_params':{
                  'objective' : 'binary',
                  'metric' : 'binary_logloss',
                  'boosting': 'goss', # dart
                  'max_depth' : -1,
                  'num_leaves' : 64,
                  'learning_rate' : 0.035,
                  #'bagging_freq': 5,
                  #'bagging_fraction' : 0.7,
                  'feature_fraction' : 0.7,
                  'min_data_in_leaf': 256,
                  'max_bin': 63,
                  'min_data_in_bin': 256,
                  'tree_learner': 'serial',
                  'boost_from_average': 'false',
                  'lambda_l1' : 0.1,
                  'lambda_l2' : 30,
                  'num_threads': 6,
                  'verbosity' : 1,
    }
}
# construct lightGBM model
lightGBMModel = LightGBMWrapper(lgb_config)

In [20]:
lightGBMModel.train(training_onehot)


{'name': 'lightGBM_raw_feature', 'root_path': '/Users/qucy/Kaggle/amex_default_prediction/model/', 'seed': 42, 'epoch': 4500, 'early_stopping': 100, 'verbose_eval': 50, 'n_folds': 5, 'features': ['P_2', 'D_39', 'B_1', 'B_2', 'R_1', 'S_3', 'D_41', 'B_3', 'D_42', 'D_43', 'D_44', 'B_4', 'D_45', 'B_5', 'R_2', 'D_46', 'D_47', 'D_48', 'D_49', 'B_6', 'B_7', 'B_8', 'D_50', 'D_51', 'B_9', 'R_3', 'D_52', 'P_3', 'B_10', 'D_53', 'S_5', 'B_11', 'S_6', 'D_54', 'R_4', 'S_7', 'B_12', 'S_8', 'D_55', 'D_56', 'B_13', 'R_5', 'D_58', 'S_9', 'B_14', 'D_59', 'D_60', 'D_61', 'B_15', 'S_11', 'D_62', 'D_65', 'B_16', 'B_17', 'B_18', 'B_19', 'B_20', 'S_12', 'R_6', 'S_13', 'B_21', 'D_69', 'B_22', 'D_70', 'D_71', 'D_72', 'S_15', 'B_23', 'D_73', 'P_4', 'D_74', 'D_75', 'D_76', 'B_24', 'R_7', 'D_77', 'B_25', 'B_26', 'D_78', 'D_79', 'R_8', 'R_9', 'S_16', 'D_80', 'R_10', 'R_11', 'B_27', 'D_81', 'D_82', 'S_17', 'R_12', 'B_28', 'R_13', 'D_83', 'R_14', 'R_15', 'D_84', 'R_16', 'B_29', 'S_18', 'D_86', 'D_87', 'R_17', 'R_18'

[2950]	training's binary_logloss: 0.228188	valid_1's binary_logloss: 0.238814
[3000]	training's binary_logloss: 0.227712	valid_1's binary_logloss: 0.238474
[3050]	training's binary_logloss: 0.227247	valid_1's binary_logloss: 0.238138
[3100]	training's binary_logloss: 0.226773	valid_1's binary_logloss: 0.237791
[3150]	training's binary_logloss: 0.226313	valid_1's binary_logloss: 0.23747
[3200]	training's binary_logloss: 0.225871	valid_1's binary_logloss: 0.237164
[3250]	training's binary_logloss: 0.225434	valid_1's binary_logloss: 0.236865
[3300]	training's binary_logloss: 0.225002	valid_1's binary_logloss: 0.236568
[3350]	training's binary_logloss: 0.224566	valid_1's binary_logloss: 0.236254
[3400]	training's binary_logloss: 0.224117	valid_1's binary_logloss: 0.235937
[3450]	training's binary_logloss: 0.223657	valid_1's binary_logloss: 0.235607
[3500]	training's binary_logloss: 0.223232	valid_1's binary_logloss: 0.235311
[3550]	training's binary_logloss: 0.222784	valid_1's binary_loglo

[LightGBM] [Info] Number of positive: 1102296, number of negative: 3322865
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5664
[LightGBM] [Info] Number of data points in the train set: 4425161, number of used features: 230
[LightGBM] [Info] Using GOSS
Training until validation scores don't improve for 100 rounds
[50]	training's binary_logloss: 0.329988	valid_1's binary_logloss: 0.329545
[100]	training's binary_logloss: 0.28371	valid_1's binary_logloss: 0.283395
[150]	training's binary_logloss: 0.272908	valid_1's binary_logloss: 0.272855
[200]	training's binary_logloss: 0.268198	valid_1's binary_logloss: 0.268384
[250]	training's binary_logloss: 0.265238	valid_1's binary_logloss: 0.26564
[300]	training's binary_logloss: 0.263076	valid_1's binary_logloss: 0.263687
[350]	training's binary_logloss: 0.261368	valid_1's binary_logloss: 0.262173
[400]	training's binary_logloss: 0.259943	vali

[LightGBM] [Info] Number of positive: 1102295, number of negative: 3322866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5666
[LightGBM] [Info] Number of data points in the train set: 4425161, number of used features: 230
[LightGBM] [Info] Using GOSS
Training until validation scores don't improve for 100 rounds
[50]	training's binary_logloss: 0.329703	valid_1's binary_logloss: 0.330096
[100]	training's binary_logloss: 0.283479	valid_1's binary_logloss: 0.28421
[150]	training's binary_logloss: 0.272712	valid_1's binary_logloss: 0.273694
[200]	training's binary_logloss: 0.267908	valid_1's binary_logloss: 0.269171
[250]	training's binary_logloss: 0.264974	valid_1's binary_logloss: 0.266498
[300]	training's binary_logloss: 0.262802	valid_1's binary_logloss: 0.264551
[350]	training's binary_logloss: 0.26108	valid_1's binary_logloss: 0.263042
[400]	training's binary_logloss: 0.25965	valid

[LightGBM] [Info] Number of positive: 1102295, number of negative: 3322866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5665
[LightGBM] [Info] Number of data points in the train set: 4425161, number of used features: 230
[LightGBM] [Info] Using GOSS
Training until validation scores don't improve for 100 rounds
[50]	training's binary_logloss: 0.329948	valid_1's binary_logloss: 0.329799
[100]	training's binary_logloss: 0.283706	valid_1's binary_logloss: 0.28355
[150]	training's binary_logloss: 0.272929	valid_1's binary_logloss: 0.2729
[200]	training's binary_logloss: 0.268198	valid_1's binary_logloss: 0.268332
[250]	training's binary_logloss: 0.26527	valid_1's binary_logloss: 0.265579
[300]	training's binary_logloss: 0.263082	valid_1's binary_logloss: 0.263594
[350]	training's binary_logloss: 0.261357	valid_1's binary_logloss: 0.262072
[400]	training's binary_logloss: 0.259941	valid_

[LightGBM] [Info] Number of positive: 1102295, number of negative: 3322866
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5700
[LightGBM] [Info] Number of data points in the train set: 4425161, number of used features: 230
[LightGBM] [Info] Using GOSS
Training until validation scores don't improve for 100 rounds
[50]	training's binary_logloss: 0.329668	valid_1's binary_logloss: 0.330376
[100]	training's binary_logloss: 0.283378	valid_1's binary_logloss: 0.284452
[150]	training's binary_logloss: 0.272621	valid_1's binary_logloss: 0.273982
[200]	training's binary_logloss: 0.26793	valid_1's binary_logloss: 0.269561
[250]	training's binary_logloss: 0.26502	valid_1's binary_logloss: 0.26688
[300]	training's binary_logloss: 0.262813	valid_1's binary_logloss: 0.264884
[350]	training's binary_logloss: 0.261124	valid_1's binary_logloss: 0.263391
[400]	training's binary_logloss: 0.259644	valid

All mean metric:0.769329, global metric:0.769331


In [3]:
# release memory
del train_x
del train_y

# loading testing data
test = pd.read_feather('./data/test_x_denoised.feather')
test.head(2)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-02-19,0.629883,0.0,0.010002,0.810059,0.010002,0.170044,0.010002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010002,0.0
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,2019-03-25,0.589844,0.010002,0.010002,0.810059,0.0,0.23999,0.0,0.010002,...,0.0,0.0,0.0,0.0,0.0,0.010002,0.0,0.0,0.0,0.0


In [4]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11363762 entries, 0 to 11363761
Columns: 190 entries, customer_ID to D_145
dtypes: float16(176), int8(12), object(2)
memory usage: 4.0+ GB


In [7]:
# merge train and test before onehot
data = pd.concat([training, test], axis=0)

data.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,93.0,0.0,0.0,100.0,0.0,12.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,93.0,0.0,0.0,100.0,0.0,12.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,95.0,9.0,2.0,101.0,0.0,12.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,96.0,0.0,1.0,100.0,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,94.0,0.0,1.0,100.0,0.0,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
del training
del test

# onehot encoding
data_onehot = one_hot_encoding(data, categorical_features, is_drop=True)
data_onehot.head()

one hot encoding: B_30
one hot encoding: B_38
one hot encoding: D_114
one hot encoding: D_116
one hot encoding: D_117
one hot encoding: D_120
one hot encoding: D_126
one hot encoding: D_63
one hot encoding: D_64
one hot encoding: D_66
one hot encoding: D_68


Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,one_hot_D_66_0,one_hot_D_66_1,one_hot_D_68_-1,one_hot_D_68_0,one_hot_D_68_1,one_hot_D_68_2,one_hot_D_68_3,one_hot_D_68_4,one_hot_D_68_5,one_hot_D_68_6
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,93.0,0.0,0.0,100.0,0.0,12.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,93.0,0.0,0.0,100.0,0.0,12.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,95.0,9.0,2.0,101.0,0.0,12.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,96.0,0.0,1.0,100.0,0.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,94.0,0.0,1.0,100.0,0.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


In [16]:
# split test data out
testing = data_onehot[data_onehot['target'].isna()]

del data_onehot

In [17]:
testing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11363762 entries, 0 to 11363761
Columns: 233 entries, customer_ID to one_hot_D_68_6
dtypes: float32(176), float64(1), int16(1), object(2), uint8(53)
memory usage: 8.4+ GB


In [18]:
# fillna
for c in tqdm(numerical_features):
    testing[c] = testing[c].fillna(0)

100%|██████████| 178/178 [1:20:28<00:00, 27.13s/it]  


In [19]:
# normalization
for c in tqdm(numerical_features):
    testing[c] = testing[c] / 100

100%|██████████| 178/178 [00:17<00:00, 10.28it/s]


In [20]:
# convert to float16 to save memory
for c in tqdm(numerical_features):
    testing[c] = testing[c].astype(np.float16)

100%|██████████| 178/178 [00:15<00:00, 11.68it/s]


In [22]:
testing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11363762 entries, 0 to 11363761
Columns: 233 entries, customer_ID to one_hot_D_68_6
dtypes: float16(178), object(2), uint8(53)
memory usage: 4.6+ GB


In [None]:
# loading prediction
def predict(data, model_path=None):
    """ predict and generate submission file
    """
    # create submission dataframe
    submission = data[self.id]
    submission["prediction"] = 0
    # looping all the folds
    for fold in range(self.n_folds):
        # generate model file path
        model_path = self.output_path if model_path is None else model_path
        model_file = os.path.join(model_path, f"{fold}.ckpt")
        # loading model
        model = lgb.Booster(model_file=model_file)
        test_preds = model.predict(
            data[self.features], num_iteration=model.best_iteration
        )
        submission["prediction"] += test_preds / self.n_folds
    # save to local disk
    submission.to_csv(
        os.path.join(model_path, "submission.csv.zip"),
        compression="zip",
        index=False,
    )

In [24]:
submission = testing[customer_id_column]
submission["prediction"] = 0

submission.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0


In [28]:
model_features = [c for c in testing.columns if c not in (customer_id_column + date_column + label)]

In [31]:
model_path = './model/lightGBM_raw_feature_20221127_153356/'
model_file = os.path.join(model_path, "0.ckpt")
# loading model
model = lgb.Booster(model_file=model_file)
# prediction
test_preds = model.predict(
    testing[model_features], num_iteration=model.best_iteration
)

In [32]:
submission["prediction"] = test_preds
submission.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.319382
1,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.472452
2,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.40981
3,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.589523
4,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.52899


In [33]:
# save to local disk
submission.to_csv(
    os.path.join(model_path, "submission.csv.zip"),
    compression="zip",
    index=False,
)