This model is based on https://www.kaggle.com/code/kunheekimkr/amex-lgbm-gpu-starter-0-795.

# Load Libraries

In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os
from tqdm.auto import tqdm

print('RAPIDS version',cudf.__version__)

import datetime
import warnings
import gc
import pickle

RAPIDS version 21.10.01


In [2]:
# VERSION NAME FOR SAVED MODEL FILES
# (interpolated into the pickled model file names that the inference loop opens)
VER = 1

# TRAIN RANDOM SEED
# NOTE(review): not referenced in the visible part of this notebook -- presumably
# carried over from the training notebook; confirm before removing.
SEED = 42

# FILL NAN VALUE
# (sentinel written into missing cells by read_file and again after feature engineering)
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
# (number of per-fold model files whose test predictions are averaged)
FOLDS = 5

# Process and Feature Engineer Train Data
We will load @raddar's Kaggle dataset from [here][1], with discussion [here][2]. Then we will engineer the features suggested by @huseyincot in his notebooks [here][3] and [here][4]. We will use [RAPIDS][5] and the GPU to create the new features quickly.

[1]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
[2]: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
[3]: https://www.kaggle.com/code/huseyincot/amex-catboost-0-793
[4]: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
[5]: https://rapids.ai/

In [3]:
def read_file(path = '', usecols = None):
    """Read a parquet file into a cudf DataFrame and normalise it.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        When given, only these columns are read and nothing is dropped.
        When omitted, the whole file is read and a fixed list of feature
        columns is removed.

    Returns
    -------
    cudf.DataFrame
        Frame whose ``customer_ID`` holds the last 16 hex characters of the
        original ID converted to int64, whose ``S_2`` is a datetime column and
        whose missing values are replaced by ``NAN_VALUE``.
    """
    if usecols is None:
        # Full read: discard the feature columns this pipeline does not use.
        unused_features = [
            'B_29','D_82','D_75','D_74','D_119','D_77','D_104','D_143','D_141','S_7','S_24','B_33',
            'B_23','B_15','B_37','B_11','D_87','D_61','D_123','D_69','D_106','D_65','D_137','D_109',
            'D_49','D_135','D_50','D_71','D_93','D_138','B_40','B_10','B_6','B_12','B_27','B_13',
            'B_26','B_5','B_21','B_31','B_36',
        ]
        df = cudf.read_parquet(path).drop(columns=unused_features)
    else:
        df = cudf.read_parquet(path, columns=usecols)

    # Shrink the ID: keep the trailing 16 hex digits and store them as int64.
    id_tail = df['customer_ID'].str[-16:]
    df['customer_ID'] = id_tail.str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime(df.S_2)

    # NOTE(review): rows are NOT sorted by customer/date here; the 'last' and
    # nth(-2) aggregations downstream presumably rely on the file already being
    # stored in that order -- confirm against the dataset.
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

In [4]:
def get_difference(data, num_features):
    """Per customer, subtract the second-to-last observation from the last one.

    Parameters
    ----------
    data : DataFrame
        Must contain ``customer_ID`` and every column in ``num_features``.
        The row order inside each customer group decides what counts as
        "last" and "second to last".
    num_features : list of str
        Columns to difference.

    Returns
    -------
    DataFrame
        One ``<feature>_diff1`` column per input feature, with the index
        reset into a regular column.
    """
    per_customer = data.groupby(['customer_ID'])[num_features]
    previous_obs = per_customer.nth(-2)
    latest_obs = per_customer.last()

    delta = latest_obs - previous_obs
    delta.columns = [f'{name}_diff1' for name in delta.columns]
    delta = delta.reset_index()
    return delta


def process_and_feature_engineer(df):
    """Collapse per-statement rows into one feature row per customer.

    Feature engineering adapted from
    https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

    For every numeric column the mean, std, min, max and last value per
    customer are computed, plus "last minus mean" and "last divided by mean"
    features. For every categorical column the count, last value and number
    of unique values are computed. Finally the last-minus-second-to-last
    difference of every numeric column is joined in.

    Parameters
    ----------
    df : DataFrame
        Row-level data containing ``customer_ID``, ``S_2`` and the feature
        columns.

    Returns
    -------
    DataFrame
        Engineered features indexed by ``customer_ID``.
    """
    categorical = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    not_features = ['customer_ID','S_2']
    numeric = [
        name for name in list(df.columns)
        if name not in not_features and name not in categorical
    ]

    # Numeric aggregates, flattened from (feature, stat) to "feature_stat".
    num_agg = df.groupby("customer_ID")[numeric].agg(['mean', 'std', 'min', 'max', 'last'])
    num_agg.columns = ['_'.join(parts) for parts in num_agg.columns]
    num_agg = num_agg.reset_index()

    # Lag features: compare each "last" column with its matching "mean" column.
    # The candidate list is fixed before any new column is appended.
    last_columns = [name for name in num_agg if 'last' in name]
    for last_col in last_columns:
        mean_col = last_col.replace('last', 'mean')
        if mean_col not in num_agg:
            continue
        num_agg[last_col + '_lag_sub'] = num_agg[last_col] - num_agg[mean_col]
        # The small constant keeps the denominator away from exactly zero.
        num_agg[last_col + '_lag_div'] = num_agg[last_col] / (num_agg[mean_col] + 0.001)

    # Categorical aggregates, flattened the same way.
    cat_agg = df.groupby("customer_ID")[categorical].agg(['count','last', 'nunique'])
    cat_agg.columns = ['_'.join(parts) for parts in cat_agg.columns]
    cat_agg = cat_agg.reset_index()

    # Last-minus-second-to-last differences of the numeric columns.
    diff_agg = get_difference(df, numeric)

    # Join the three feature groups on the customer key.
    joined = num_agg.merge(cat_agg, how = 'inner', on = 'customer_ID')
    df = joined.merge(diff_agg, how = 'inner', on = 'customer_ID')
    del num_agg, cat_agg
    df = df.set_index('customer_ID')
    print('shape after engineering', df.shape )

    return df

In [5]:
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the competition metric: 0.5 * (normalized weighted Gini + capture rate at 4%).

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Binary labels (1 = positive, 0 = negative).
    y_pred : array-like of shape (n,)
        Scores; only their ordering matters, so raw scores and probabilities
        give the same result.

    Returns
    -------
    float
        Mean of the normalized weighted Gini coefficient and the fraction of
        positives found in the top 4% of cumulative weight.

    Notes
    -----
    Inputs are coerced with ``np.asarray`` so that lists and pandas Series
    (whatever their index) are handled positionally, exactly like ndarrays.
    When ``y_true`` contains no positives the divisions by ``n_pos`` are
    undefined and the result is not meaningful.
    """
    # Work on plain arrays: the fancy indexing below must be positional.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sort by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    # (each negative row weighs 20, each positive row weighs 1)
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """Wrap ``amex_metric`` in a ``(name, value, flag)`` triple.

    Returns the label ``'Score'``, the metric value and ``True``.
    NOTE(review): presumably the (eval_name, eval_result, is_higher_better)
    tuple that LightGBM expects from a custom evaluation function -- confirm
    against the training notebook.
    """
    metric_name = 'Score'
    higher_is_better = True
    score = amex_metric(y_true, y_pred)
    return (metric_name, score, higher_is_better)

# Make Predictions
Since the test data is large, predicting all the results at once leads to a memory error. Instead, we split the data into 4 parts, predict each part separately, and append the results.

In [6]:
# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    """Count how many rows of ``test`` belong to each chunk of customers.

    ``customers`` is cut into ``NUM_PARTS`` consecutive slices of
    ``len(customers) // NUM_PARTS`` entries; the final slice also takes the
    remainder.

    Parameters
    ----------
    customers : sequence
        Customer identifiers, in the order the parts should follow.
    test : DataFrame
        Frame with a ``customer_ID`` column.
    NUM_PARTS : int
        Number of slices.
    verbose : str
        Label used in the progress messages; an empty string silences them.

    Returns
    -------
    (list of int, int)
        Row count of every part, and the number of customers per part.
    """
    chunk = len(customers) // NUM_PARTS
    announce = verbose != ''
    if announce:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    rows = []
    for part in range(NUM_PARTS):
        start = part * chunk
        is_final = part == NUM_PARTS - 1
        # The final part runs to the end so no customer is left out.
        part_customers = customers[start:] if is_final else customers[start:start + chunk]
        in_part = test.customer_ID.isin(part_customers)
        rows.append(test.loc[in_part].shape[0])

    if announce:
        print( rows )
    return rows, chunk

# COMPUTE SIZE OF 4 PARTS FOR TEST DATA
# NUM_PARTS, TEST_PATH, customers, rows and num_cust are read again by the
# inference loop further down.
NUM_PARTS = 4
TEST_PATH =  '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

print(f'Reading test data...')
# Only the ID and date columns are loaded here, which keeps this pass light.
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
# Unique (hashed) customer IDs, ordered by the row index of their first kept occurrence.
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# rows[k] = number of statement rows in part k; num_cust = customers per part.
rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

Reading test data...
shape of data: (11363762, 2)
We will process test data as 4 separate parts.
There will be 231155 customers in each part (except the last part).
Below are number of rows in each part:
[2841209, 2839857, 2842105, 2840591]


In [7]:
# INFER TEST DATA IN PARTS
# skip_rows / skip_cust are running offsets into the row-level file and into
# `customers`; test_preds collects one prediction array per part.
skip_rows = 0
skip_cust = 0
test_preds = []

for k in range(NUM_PARTS):
    
    # READ PART OF TEST DATA
    # The whole file is read again on every iteration and then cut down by
    # position to the rows[k] rows of this part.
    # NOTE(review): positional slicing assumes each customer chunk occupies a
    # contiguous row range in the same order as `customers` -- confirm.
    print(f'\nReading test data...')
    test = read_file(path = TEST_PATH)
    test = test.iloc[skip_rows:skip_rows+rows[k]]
    skip_rows += rows[k]
    print(f'=> Test part {k+1} has shape', test.shape )
    
    # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
    test = process_and_feature_engineer(test)
    # Engineered features can still contain missing values; fill them with the same sentinel.
    test = test.fillna(NAN_VALUE)
    # Select this part's customers in the order given by `customers`
    # (the last part takes everything that remains).
    if k==NUM_PARTS-1: test = test.loc[customers[skip_cust:]]
    else: test = test.loc[customers[skip_cust:skip_cust+num_cust]]
    skip_cust += num_cust
    features = [f for f in test.columns if f != 'customer_ID' and f != 'target']
    dtest = test[features].as_gpu_matrix()
    # Release the feature frame before the models are loaded.
    del test 
    gc.collect()
    # To reduce memory further, `test = test[['P_2_mean']]` could be added here if needed.
    # INFER LGBM MODELS ON TEST DATA
    # Raw scores of the FOLDS per-fold models are summed and then averaged.
    # NOTE(review): pickle.load can execute arbitrary code -- only load model
    # files from a trusted source.
    with open(f'../input/iris-model-amex-gpu/LGBM_v{VER}_fold0.pkl', 'rb') as pickle_file:
        model = pickle.load(pickle_file)
        preds = model.predict(dtest,raw_score=True)
    for f in range(1,FOLDS):
        with open(f'../input/iris-model-amex-gpu/LGBM_v{VER}_fold{f}.pkl', 'rb') as pickle_file:
            model = pickle.load(pickle_file)
            preds += model.predict(dtest,raw_score=True)
    preds = preds / FOLDS
    test_preds.append(preds)

# CLEAN MEMORY
# Only the objects left over from the final iteration still exist at this point.
del dtest, model
_ = gc.collect()


Reading test data...
shape of data: (11363762, 149)
=> Test part 1 has shape (2841209, 149)
shape after engineering (231155, 1121)





Reading test data...
shape of data: (11363762, 149)
=> Test part 2 has shape (2839857, 149)
shape after engineering (231155, 1121)

Reading test data...
shape of data: (11363762, 149)
=> Test part 3 has shape (2842105, 149)
shape after engineering (231155, 1121)

Reading test data...
shape of data: (11363762, 149)
=> Test part 4 has shape (2840591, 149)
shape after engineering (231156, 1121)


# Make submission

In [8]:
# Stitch the per-part predictions together; their order follows `customers`.
test_predictions = np.concatenate(test_preds)
prediction_by_customer = pd.Series(test_predictions, index=cupy.asnumpy(customers))

submission = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

# Hash the submission IDs with the same transformation read_file() applies, so
# predictions are matched to customers by key instead of by row position.
submission_keys = (
    cudf.from_pandas(submission["customer_ID"])
    .str[-16:]
    .str.hex_to_int()
    .astype("int64")
    .to_pandas()
)
aligned_predictions = prediction_by_customer.reindex(submission_keys.values)

# Fail loudly instead of writing a misaligned or incomplete submission.
n_missing = int(aligned_predictions.isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} customers in sample_submission.csv have no prediction")

# The values are raw model scores (raw_score=True at inference time).
submission["prediction"] = aligned_predictions.values
submission.to_csv("submission.csv", index=False)

In [9]:
# Preview the first rows of the submission frame as a quick sanity check.
submission.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,-4.019906
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,-7.037694
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,-3.345754
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,-1.708149
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,1.978882


# Future Ideas

- Use Optuna for hyperparameter tuning
- Change Boosting method to DART: slower but better accuracy