In [1]:
import pandas as pd

import lightgbm as lgb
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Column names singled out as `cat_cols` (presumably the categorical
# features -- confirm against the dataset description). Order is preserved.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# The training frame is read from the processed directory, the test frame
# from the raw directory.
train = pd.read_parquet("../data/processed/train_withfeatures.parquet")
test = pd.read_parquet("../data/raw/test.parquet")

In [3]:
# Notebook-wide configuration constants.
VER = '01'          # version tag used in saved model file names
SEED = 42           # random seed for training
NAN_VALUE = -127    # fill value for NaNs; small enough to fit in int8
FOLDS = 5           # number of folds per model
TRAIN_PATH = "../data/processed/train.parquet"  # path to the processed training data

### Competition metric

In [4]:
def amex_metric(y_true, y_pred):
    """Compute the competition metric for binary labels and predicted scores.

    The score is the mean of two components:

    * the share of all positives found among the highest-scored rows that
      fit into 4% of the total weight, and
    * a normalized weighted Gini: the Gini obtained when ranking by
      ``y_pred`` divided by the Gini obtained when ranking by ``y_true``.

    Rows whose label is 0 carry a weight of 20, all other rows a weight of 1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 / 1).
    y_pred : array-like
        Predicted scores, same length as ``y_true``; higher means more
        likely positive.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + top_four_percent_capture)``.
    """
    # Two-column array: column 0 holds the label, column 1 the score.
    pairs = np.array([y_true, y_pred]).T

    def _weighted_gini(sort_col):
        # Rank rows by the requested column, largest first.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        row_w = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(row_w / np.sum(row_w))
        weighted_pos = ranked[:, 0] * row_w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * row_w)

    # Capture rate within the top 4% of cumulative weight, ranked by score.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    score_w = np.where(by_score[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(score_w))
    captured = by_score[np.cumsum(score_w) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    gini_by_score = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)

    return 0.5 * (gini_by_score / gini_by_label + top_four)
def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing ``amex_metric`` as a LightGBM custom eval function.

    Parameters
    ----------
    y_pred : array-like
        Predicted scores supplied by LightGBM.
    y_true : object with ``get_label()``
        Evaluation dataset; its labels are extracted via ``get_label()``.

    Returns
    -------
    tuple
        ``('amex_metric', value, True)`` -- metric name, metric value and a
        flag telling LightGBM that higher values are better.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True


### Model training

In [5]:
# FEATURES
# FEATURES
# Select the model inputs by excluding the identifier and label columns by
# name instead of by position (previously `train.columns[1:-1]`). The result
# is the same when those two columns sit at the ends of the frame, but a
# reordered frame can no longer silently leak `target` into the features.
# `Index.drop` raises KeyError if either column is missing.
FEATURES = train.columns.drop(['customer_ID', 'target'])
print(f'There are {len(FEATURES)} features!')

There are 918 features!


In [6]:
# Booster parameters. Despite the `xgb_` prefix, this dict is what gets passed
# as `params` to `lgb.train` in the training cell.
xgb_params = dict(
    # tree shape / binning
    num_leaves=10,
    max_bin=127,
    min_data_in_leaf=11,
    # learning and regularisation
    learning_rate=0.02,
    min_sum_hessian_in_leaf=0.00245,
    bagging_fraction=1.0,
    bagging_freq=5,
    feature_fraction=0.05,
    lambda_l1=4.972,
    lambda_l2=2.276,
    min_gain_to_split=0.65,
    max_depth=14,
    save_binary=True,
    # seeds
    seed=1337,
    feature_fraction_seed=1337,
    bagging_seed=1337,
    drop_seed=1337,
    data_random_seed=1337,
    # objective / boosting
    objective='binary',
    boosting_type='dart',
    verbose=1,
    is_unbalance=True,
    boost_from_average=False,
    # device selection
    device='gpu',
    gpu_platform_id=0,
    gpu_device_id=0,
)


### Version 02 - Kaggle Metric XGB - 0.7466903183087572
* Dropped features based on Spearman correlation

### Version 03 - Kaggle Metric XGB - 0.7399987922099773
* Dropped features based on WOE (weight of evidence) and IV (information value)
* Dropping features without knowing what they mean is a problem.

### Version 04 - Kaggle Metric LGBM - 
* Force features

In [7]:
# Cross-validated LightGBM training.
# For every fold: fit on the training split, save the booster to disk, score
# the held-out split with the competition metric and keep the out-of-fold
# (OOF) predictions. After the loop the per-fold frames are concatenated into
# `oof` (indexed by customer_ID) and the overall CV metric is printed.
importances = []
oof_parts = []  # one OOF prediction frame per fold; concatenated into `oof` below
TRAIN_SUBSAMPLE = 1.0

# NOTE(review): the splitter is called `skf` and receives the target, but it is
# a plain KFold, so folds are not stratified. If stratification was intended,
# StratifiedKFold would be needed -- confirm.
skf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# The splitter yields POSITIONAL row indices, so rows are selected with .iloc.
# (.loc is label-based and only coincides with positions on a default
# RangeIndex.) Feature columns are resolved to positions once, up front.
feature_pos = train.columns.get_indexer(FEATURES)
if (feature_pos < 0).any():
    raise KeyError('FEATURES contains columns that are not present in train')

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train.target)):

    if TRAIN_SUBSAMPLE < 1.0:
        # A dedicated RandomState draws the same sample as seeding the global
        # NumPy RNG with SEED, without clobbering global random state.
        subsample_rng = np.random.RandomState(SEED)
        train_idx = subsample_rng.choice(
            train_idx, int(len(train_idx) * TRAIN_SUBSAMPLE), replace=False
        )

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    X_train = train.iloc[train_idx, feature_pos]
    y_train = train['target'].iloc[train_idx]
    X_valid = train.iloc[valid_idx, feature_pos]
    y_valid = train['target'].iloc[valid_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)

    # NOTE(review): `xgb_params` requests boosting_type='dart'; LightGBM reports
    # that early stopping is not available in dart mode, so
    # `early_stopping_rounds` is presumably ignored and all rounds are trained
    # -- confirm against the training log.
    # NOTE(review): the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments were removed from `lgb.train` in LightGBM 4.0; switch to the
    # `callbacks=` equivalents when upgrading.
    model = lgb.train(
        params=xgb_params,
        train_set=dtrain,
        valid_sets=[dtrain, dvalid],
        num_boost_round=9999,
        early_stopping_rounds=100,
        verbose_eval=500,
        feval=lgb_amex_metric,
    )

    model.save_model(f'../models/LGB_V{VER}_fold{fold}.lgb')

    # Score the held-out split with the competition metric.
    oof_preds = model.predict(X_valid)
    acc = amex_metric(y_valid.values, oof_preds)
    print("Kaggle Metric=", acc,'\n')

    # Keep ID, label and prediction for the held-out rows of this fold.
    fold_oof = train[['customer_ID', 'target']].iloc[valid_idx].copy()
    fold_oof['oof_pred'] = oof_preds
    oof_parts.append(fold_oof)

    # Release per-fold objects before the next fold is built.
    del dtrain, X_train, y_train, fold_oof
    del X_valid, y_valid, dvalid, model

print('#'*25)
oof = pd.concat(oof_parts, axis=0, ignore_index=True).set_index('customer_ID')
acc = amex_metric(oof.target.values, oof.oof_pred.values)
print('OVERAL CV Kaggle Metric = ', acc)


#########################
### Fold 1
### Train size 367130 Valid size 91783
### Training with 100% fold data...
#########################




[LightGBM] [Info] Number of positive: 95285, number of negative: 271845
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 79515
[LightGBM] [Info] Number of data points in the train set: 367130, number of used features: 913
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 2060, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 482 dense feature groups (169.46 MB) transferred to GPU in 0.069656 secs. 1 sparse feature groups




[500]	training's auc: 0.950668	training's amex_metric: 0.744009	valid_1's auc: 0.949473	valid_1's amex_metric: 0.740578
[1000]	training's auc: 0.955238	training's amex_metric: 0.763613	valid_1's auc: 0.953793	valid_1's amex_metric: 0.760231
