This model is based on https://www.kaggle.com/code/kunheekimkr/amex-lgbm-gpu-starter-0-795.

In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os
from tqdm.auto import tqdm

# Record the installed cuDF version in the cell output so the run environment is traceable
print('RAPIDS version',cudf.__version__)

RAPIDS version 21.10.01


In [2]:
# VERSION NAME FOR SAVED MODEL FILES
# (interpolated into the pickle file names written by the training cell)
VER = 1

# TRAIN RANDOM SEED
# NOTE(review): no cell visible in this notebook reads SEED; lgbm_params() falls back to
# its own default random_state=1 -- confirm whether the seed was meant to be wired in.
SEED = 42

# FILL NAN VALUE
# (used by read_file and again after the label join to replace missing values)
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
FOLDS = 5

## Process and Feature Engineer Train Data

Based on the data-wrangling analysis, we remove the following features:

1) Missing-value analysis: 'B_29' and 'D_82'

2) Correlation analysis: 'D_75','D_74','D_119','D_77','D_104','D_143','D_141','S_7','S_24','B_33','B_23','B_15','B_37','B_11'

3) Distribution analysis of the non-categorical features: 'D_87','D_61','D_123','D_69','D_106','D_65','D_137','D_109','D_49','D_135','D_50','D_71','D_93','D_138','B_40','B_10','B_6','B_12','B_27','B_13','B_26','B_5','B_21','B_31','B_36'

In [3]:
def read_file(path = '', usecols = None):
    """Load a parquet file onto the GPU and apply the notebook's basic cleaning.

    Steps: read the file with cuDF, shrink ``customer_ID`` to an int64 built from
    the last 16 hex characters, parse ``S_2`` as a datetime, drop the features
    rejected by the data-wrangling analysis, and replace missing values with
    ``NAN_VALUE``. The resulting shape is printed.

    Parameters
    ----------
    path : str
        Location of the parquet file, forwarded to ``cudf.read_parquet``.
    usecols : list of str, optional
        Subset of columns to load. It must contain ``customer_ID`` and ``S_2``.
        ``None`` (default) loads every column.

    Returns
    -------
    cudf.DataFrame
        The cleaned statement-level frame.
    """
    # Features removed by the missing-value, correlation and distribution analyses.
    dropped_features = [
        'B_29','D_82','D_75','D_74','D_119','D_77','D_104','D_143','D_141','S_7','S_24','B_33',
        'B_23','B_15','B_37','B_11','D_87','D_61','D_123','D_69','D_106','D_65','D_137','D_109',
        'D_49','D_135','D_50','D_71','D_93','D_138','B_40','B_10','B_6','B_12','B_27','B_13',
        'B_26','B_5','B_21','B_31','B_36',
    ]

    # LOAD DATAFRAME (columns=None means "all columns", so one call covers both cases)
    df = cudf.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df['S_2'] = cudf.to_datetime(df['S_2'])

    # NOTE(review): rows are not re-sorted here. The 'last' / nth(-2) aggregations
    # applied downstream presumably rely on the parquet already being ordered by
    # customer_ID and S_2 -- confirm for any new input file.

    # Drop only the rejected features that were actually loaded, so that a
    # `usecols` subset which omits some of them no longer raises.
    loaded = set(df.columns)
    df = df.drop(columns=[col for col in dropped_features if col in loaded])

    # FILL NAN
    df = df.fillna(NAN_VALUE)

    print('shape of data:', df.shape)

    return df

In [4]:
def get_difference(data, num_features):
    """Per-customer change between the last and the second-to-last row.

    Groups ``data`` by ``customer_ID`` and, for each column in ``num_features``,
    subtracts the second-to-last value (``nth(-2)``) from the last one. Result
    columns are suffixed with ``_diff1`` and the index is turned back into a
    regular column before the frame is returned.
    """
    per_customer = data.groupby(['customer_ID'])[num_features]
    delta = per_customer.last() - per_customer.nth(-2)
    delta.columns = [name + '_diff1' for name in delta.columns]
    return delta.reset_index()


def process_and_feature_engineer(df):
    """Aggregate statement-level rows into one engineered feature row per customer.

    Every column except ``customer_ID`` and ``S_2`` is treated as a feature:

    * numeric features get ``mean``/``std``/``min``/``max``/``last`` per customer,
      plus ``<feature>_last_lag_sub`` (last - mean) and ``<feature>_last_lag_div``
      (last / (mean + 0.001));
    * the eleven categorical features get ``count``/``last``/``nunique``;
    * numeric features additionally get ``<feature>_diff1`` from ``get_difference``.

    Recipe adapted from
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    Parameters
    ----------
    df : DataFrame
        Statement-level frame with a ``customer_ID`` column (as produced by
        ``read_file``). NOTE(review): the 'last' aggregations presumably expect
        rows to be ordered by date within each customer -- confirm.

    Returns
    -------
    DataFrame
        One row per customer, indexed by ``customer_ID``. The resulting shape is
        printed as a side effect.
    """
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [col for col in df.columns
                    if col not in ('customer_ID', 'S_2') and col not in cat_features]

    test_num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
    test_num_agg = test_num_agg.reset_index()

    # "Lag" features: distance of the last value from the customer's mean.
    # Loop over the known feature names (not over the frame being extended) and
    # build the exact column names instead of substring-matching 'last'.
    for feature in num_features:
        last_col = feature + '_last'
        mean_col = feature + '_mean'
        test_num_agg[last_col + '_lag_sub'] = test_num_agg[last_col] - test_num_agg[mean_col]
        # +0.001 keeps the denominator away from zero for means at or near 0
        test_num_agg[last_col + '_lag_div'] = test_num_agg[last_col] / (test_num_agg[mean_col] + 0.001)

    test_cat_agg = df.groupby("customer_ID")[cat_features].agg(['count','last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
    test_cat_agg = test_cat_agg.reset_index()

    # Last-minus-second-to-last difference for every numeric feature
    df_diff = get_difference(df, num_features)

    df = (test_num_agg
          .merge(test_cat_agg, how = 'inner', on = 'customer_ID')
          .merge(df_diff, how = 'inner', on = 'customer_ID'))
    # Release the intermediates early; the merged frame is large
    del test_num_agg, test_cat_agg, df_diff

    df = df.set_index('customer_ID')
    print('shape after engineering', df.shape )

    return df

In [5]:
# Load the raw training statements through read_file, which also drops the
# rejected features and fills missing values.
print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 149)


In [6]:
# Collapse the statement-level rows into one engineered row per customer.
# `train` is rebound here, so the raw frame is no longer reachable afterwards;
# re-run the load cell first if this cell has to be executed again.
train = process_and_feature_engineer(train)

shape after engineering (458913, 1121)


In [7]:
# Load the labels and key them the same way read_file keys the features:
# an int64 built from the last 16 hex characters of customer_ID.
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')

print(type(train))
print(type(targets))
# Index-on-index join; `target` becomes the last column of `train`
train = train.join(targets).sort_index()

del targets
gc.collect()

# The join/aggregations can leave missing values (e.g. the diff features of
# customers with a single statement); fill them like the raw data.
train = train.fillna(NAN_VALUE)

# FEATURES
# customer_ID is the index, not column 0, so only the trailing `target`
# column has to be excluded; the old [1:-1] slice skipped the first feature.
FEATURES = train.columns[:-1]

<class 'cudf.core.dataframe.DataFrame'>
<class 'cudf.core.dataframe.DataFrame'>


In [8]:
# Label column as a bare array (`.values`).
# NOTE(review): the cells visible below read train["target"] directly rather than this name.
target = train.target.values

## Faster metric Implementation

Reference: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020

In [9]:
def amex_metric(y_true: np.array, y_pred: np.array) -> float:
    """Vectorised competition metric: 0.5 * (normalized weighted Gini + capture rate at 4%).

    Rows are ranked by descending ``y_pred``. Each row is weighted
    ``20 - 19 * label`` (20 for label 0, 1 for label 1).

    Parameters
    ----------
    y_true : array of 0/1 labels.
    y_pred : array of scores; only their ordering matters.

    Returns
    -------
    The mean of the normalized weighted Gini coefficient and the share of
    positives found in the top 4% of cumulative weight.
    """
    pos_count = y_true.sum()
    neg_count = y_true.shape[0] - pos_count

    # Labels ordered from the highest to the lowest prediction
    order = np.argsort(y_pred)[::-1]
    ranked = y_true[order]

    # Cumulative share of total weight along the ranking
    row_weight = 20.0 - ranked * 19.0
    cum_weight_share = (row_weight / row_weight.sum()).cumsum()

    # Share of all positives captured within the first 4% of weight
    in_top_4pct = cum_weight_share <= 0.04
    capture_rate = ranked[in_top_4pct].sum() / pos_count

    # Weighted Gini of this ranking, and its normalizer (the original's
    # "max weighted gini coefficient")
    cum_pos_share = (ranked / pos_count).cumsum()
    raw_gini = ((cum_pos_share - cum_weight_share) * row_weight).sum()
    best_gini = 10 * neg_count * (1 - 19 / (pos_count + 20 * neg_count))

    return 0.5 * (raw_gini / best_gini + capture_rate)

def lgb_amex_metric(y_true, y_pred):
    """Wrap ``amex_metric`` as a (name, value, flag) triple for LightGBM's ``eval_metric``.

    The metric is reported under the name 'Score'; the trailing ``True`` is the
    third element of the tuple (LightGBM's higher-is-better flag).
    """
    metric_name = 'Score'
    higher_is_better = True
    return metric_name, amex_metric(y_true, y_pred), higher_is_better

In [10]:
import datetime
import warnings
import gc
import pickle
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier, log_evaluation, early_stopping, record_evaluation
from copy import deepcopy

## Train LightGBM

In [11]:
# Model inputs: every column of `train` except the identifier and the label
features = [name for name in train.columns if name not in ('customer_ID', 'target')]
print("Number of Features :",len(features))
def lgbm_params(random_state=1, n_estimators=3150):
    """Return an unfitted ``LGBMClassifier`` with this notebook's fixed hyper-parameters.

    Only ``random_state`` and ``n_estimators`` are configurable; everything else
    is hard-coded below. Training runs on the GPU (``device='gpu'``).
    """
    hyper_params = dict(
        n_estimators=n_estimators,
        # boosting_type='dart' is intentionally left disabled
        learning_rate=0.03,
        reg_lambda=50,
        min_child_samples=2400,
        num_leaves=100,
        colsample_bytree=0.19,
        device='gpu',
        random_state=random_state,
        # presumably disables the built-in metric so only the custom eval_metric is tracked
        metric="None",
    )
    return LGBMClassifier(**hyper_params)

Number of Features : 1121


In [12]:
# Callback that checkpoints the best-scoring booster during training
# (written for dart boosting).
class _DartSaveBestModel:
    """LightGBM callback that pickles ``env.model`` whenever the validation score improves.

    The callback reads the newest ``['valid_0']['Score']`` entry of the
    ``eval_result`` dict it was constructed with (the dict filled by
    ``record_evaluation``). Two module-level names must be set by the caller
    before training starts:

    * ``best_score`` -- running maximum, updated here via ``global``;
    * ``file``       -- path the booster is pickled to.

    The running maximum is tracked from iteration 2 on, but the model is only
    written once ``env.iteration > 100``. Consequently, if the best score is
    reached at or before iteration 100, no file is written for it.
    """

    def __init__(self,eval_result):
        self.eval_result = eval_result
        # NOTE(review): LightGBM orders callbacks by their `order` attribute
        # (record_evaluation uses 20, early_stopping 30) and gives callbacks
        # without one a default that runs them first. Pinning 25 makes this run
        # after record_evaluation has appended the current round's score, so the
        # score compared below belongs to the booster being saved -- confirm
        # against the installed LightGBM version.
        self.order = 25

    def __call__(self,env):
        global best_score
        global file
        if env.iteration>1:
            # Use the dict handed to the constructor, not a same-named global
            eval_result_list = self.eval_result['valid_0']['Score']
            if eval_result_list[-1] > best_score:
                best_score = eval_result_list[-1]
                if env.iteration>100:
                    # Context manager so the handle is flushed and closed every time
                    with open(file, 'wb') as model_file:
                        pickle.dump(env.model, model_file)


def lgb_dart_callback(eval_result):
    """Build a ``_DartSaveBestModel`` bound to the given ``record_evaluation`` dict."""
    return _DartSaveBestModel(eval_result)

In [13]:
score_list = []
# Fold count comes from the config cell (FOLDS) instead of a repeated literal
kf = StratifiedKFold(n_splits=FOLDS)

# `file` and `best_score` are module-level names that _DartSaveBestModel reads
# and writes via `global`; they are (re)assigned before every fit below.
# NOTE(review): SEED from the config cell is not passed on -- lgbm_params()
# runs with its default random_state=1.
for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train.target.to_array())):
    print('#'*25)
    print('### Fold',fold)
    print('### Train size:',len(train_idx),', Validation size:',len(valid_idx))
    print('#'*25)
    # Drop references to the previous fold's arrays before building new ones
    X_tr, X_val, y_tr, y_val, model = None, None, None, None, None
    start_time = datetime.datetime.now()
    X_tr = train.iloc[train_idx][features].as_gpu_matrix()
    X_val = train.iloc[valid_idx][features].as_gpu_matrix()
    y_tr = cupy.asarray(train.iloc[train_idx]["target"])
    y_val = cupy.asarray(train.iloc[valid_idx]["target"])

    model = lgbm_params()
    eval_result = {}
    best_score = 0
    file = f'LGBM_v{VER}_fold{fold}.pkl'
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_tr, cupy.asnumpy(y_tr),
                  eval_set = [(X_val, cupy.asnumpy(y_val))],
                  eval_metric=[lgb_amex_metric],
                  callbacks=[log_evaluation(10),early_stopping(1500),record_evaluation(eval_result),lgb_dart_callback(eval_result)])

    # Score and save the estimator as returned by fit()
    print("printing using current round of model..")
    with open(f'LGBM_v{VER}_fold{fold}_current.pkl', 'wb') as current_file:
        pickle.dump(model, current_file)
    y_val_pred = model.predict_proba(X_val, raw_score=True)
    score = amex_metric(y_val, y_val_pred)
    # str(timedelta)[-12:-7] keeps the MM:SS part
    print(f"\n\n\nFold {fold} | Training Time: {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f" Score = {score:.5f}\n\n\n")

    # Score the checkpoint written by the callback; only this score is averaged.
    # The pickle was produced by this same run, so loading it is trusted.
    print("printing using best round of model..")
    with open(file, 'rb') as pickle_file:
        best_model = pickle.load(pickle_file)
    y_val_pred = best_model.predict(X_val, raw_score=True)
    score = amex_metric(y_val, y_val_pred)
    print(f"\n\n\nFold {fold} | Training Time: {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f" Score = {score:.5f}\n\n\n")
    score_list.append(score)

    del X_val, y_val, score, model, best_model, best_score
    gc.collect()

print(f"\n\n\nScore: {np.mean(cupy.asarray(score_list)):.5f}\n\n\n")

#########################
### Fold 0
### Train size: 367130 , Validation size: 91783
#########################
Training until validation scores don't improve for 1500 rounds
[10]	valid_0's Score: 0.732998
[20]	valid_0's Score: 0.738805
[30]	valid_0's Score: 0.743925
[40]	valid_0's Score: 0.749257
[50]	valid_0's Score: 0.75129
[60]	valid_0's Score: 0.754712
[70]	valid_0's Score: 0.757253
[80]	valid_0's Score: 0.759642
[90]	valid_0's Score: 0.762676
[100]	valid_0's Score: 0.763735
[110]	valid_0's Score: 0.765027
[120]	valid_0's Score: 0.767374
[130]	valid_0's Score: 0.769009
[140]	valid_0's Score: 0.770996
[150]	valid_0's Score: 0.771829
[160]	valid_0's Score: 0.773237
[170]	valid_0's Score: 0.774366
[180]	valid_0's Score: 0.775825
[190]	valid_0's Score: 0.777269
[200]	valid_0's Score: 0.777958
[210]	valid_0's Score: 0.778559
[220]	valid_0's Score: 0.779652
[230]	valid_0's Score: 0.780762
[240]	valid_0's Score: 0.781724
[250]	valid_0's Score: 0.782763
[260]	valid_0's Score: 0.782898
[270






Fold 0 | Training Time: 17:33 | Score = 0.79312



#########################
### Fold 1
### Train size: 367130 , Validation size: 91783
#########################
Training until validation scores don't improve for 1500 rounds
[10]	valid_0's Score: 0.733643
[20]	valid_0's Score: 0.74135
[30]	valid_0's Score: 0.746821
[40]	valid_0's Score: 0.750339
[50]	valid_0's Score: 0.753171
[60]	valid_0's Score: 0.756904
[70]	valid_0's Score: 0.759017
[80]	valid_0's Score: 0.761503
[90]	valid_0's Score: 0.763463
[100]	valid_0's Score: 0.764671
[110]	valid_0's Score: 0.766026
[120]	valid_0's Score: 0.767856
[130]	valid_0's Score: 0.769998
[140]	valid_0's Score: 0.77183
[150]	valid_0's Score: 0.773197
[160]	valid_0's Score: 0.774499
[170]	valid_0's Score: 0.775893
[180]	valid_0's Score: 0.777283
[190]	valid_0's Score: 0.777784
[200]	valid_0's Score: 0.778512
[210]	valid_0's Score: 0.778686
[220]	valid_0's Score: 0.779456
[230]	valid_0's Score: 0.780224
[240]	valid_0's Score: 0.781188
[250]	valid_0's






Fold 1 | Training Time: 16:20 | Score = 0.79199



#########################
### Fold 2
### Train size: 367130 , Validation size: 91783
#########################
Training until validation scores don't improve for 1500 rounds
[10]	valid_0's Score: 0.740334
[20]	valid_0's Score: 0.744798
[30]	valid_0's Score: 0.749217
[40]	valid_0's Score: 0.75308
[50]	valid_0's Score: 0.756572
[60]	valid_0's Score: 0.759468
[70]	valid_0's Score: 0.762167
[80]	valid_0's Score: 0.765061
[90]	valid_0's Score: 0.766934
[100]	valid_0's Score: 0.768995
[110]	valid_0's Score: 0.770544
[120]	valid_0's Score: 0.772296
[130]	valid_0's Score: 0.774195
[140]	valid_0's Score: 0.775727
[150]	valid_0's Score: 0.776959
[160]	valid_0's Score: 0.777976
[170]	valid_0's Score: 0.779473
[180]	valid_0's Score: 0.779866
[190]	valid_0's Score: 0.780699
[200]	valid_0's Score: 0.781364
[210]	valid_0's Score: 0.782672
[220]	valid_0's Score: 0.783818
[230]	valid_0's Score: 0.785069
[240]	valid_0's Score: 0.785476
[250]	valid_0'






Fold 2 | Training Time: 16:03 | Score = 0.79444



#########################
### Fold 3
### Train size: 367131 , Validation size: 91782
#########################
Training until validation scores don't improve for 1500 rounds
[10]	valid_0's Score: 0.734549
[20]	valid_0's Score: 0.741173
[30]	valid_0's Score: 0.747247
[40]	valid_0's Score: 0.750143
[50]	valid_0's Score: 0.752899
[60]	valid_0's Score: 0.756513
[70]	valid_0's Score: 0.758432
[80]	valid_0's Score: 0.760943
[90]	valid_0's Score: 0.762844
[100]	valid_0's Score: 0.76491
[110]	valid_0's Score: 0.766319
[120]	valid_0's Score: 0.768797
[130]	valid_0's Score: 0.771216
[140]	valid_0's Score: 0.771807
[150]	valid_0's Score: 0.773324
[160]	valid_0's Score: 0.77488
[170]	valid_0's Score: 0.776819
[180]	valid_0's Score: 0.778248
[190]	valid_0's Score: 0.779208
[200]	valid_0's Score: 0.779699
[210]	valid_0's Score: 0.780087
[220]	valid_0's Score: 0.780448
[230]	valid_0's Score: 0.780906
[240]	valid_0's Score: 0.781393
[250]	valid_0's






Fold 3 | Training Time: 13:20 | Score = 0.79316



#########################
### Fold 4
### Train size: 367131 , Validation size: 91782
#########################
Training until validation scores don't improve for 1500 rounds
[10]	valid_0's Score: 0.737989
[20]	valid_0's Score: 0.743564
[30]	valid_0's Score: 0.748691
[40]	valid_0's Score: 0.753463
[50]	valid_0's Score: 0.756248
[60]	valid_0's Score: 0.759408
[70]	valid_0's Score: 0.762572
[80]	valid_0's Score: 0.765062
[90]	valid_0's Score: 0.767883
[100]	valid_0's Score: 0.769977
[110]	valid_0's Score: 0.771957
[120]	valid_0's Score: 0.7742
[130]	valid_0's Score: 0.776123
[140]	valid_0's Score: 0.777537
[150]	valid_0's Score: 0.778568
[160]	valid_0's Score: 0.77998
[170]	valid_0's Score: 0.780969
[180]	valid_0's Score: 0.782339
[190]	valid_0's Score: 0.783382
[200]	valid_0's Score: 0.784626
[210]	valid_0's Score: 0.785584
[220]	valid_0's Score: 0.786315
[230]	valid_0's Score: 0.786979
[240]	valid_0's Score: 0.787289
[250]	valid_0's 






Fold 4 | Training Time: 14:33 | Score = 0.79737






Score: 0.79402



