In [1]:
%%time
%reset -f
import gc; gc.collect()

import cudf
from sklearn.model_selection import StratifiedKFold


def flatten_columns(df):
    """Collapse multi-part column labels into single underscore-joined names.

    Every label in ``df.columns`` must be an iterable of strings (for example
    the ``(column, aggregation)`` pairs produced by ``groupby().agg([...])``).
    The frame is relabelled in place and returned, so the function can be
    used with ``.pipe``.
    """
    flat_names = []
    for label_parts in df.columns:
        flat_names.append("_".join(label_parts))
    df.columns = flat_names
    return df


def preprocess(dataset):
    """Index the raw statement rows by ``(customer_ID, S_2)``.

    ``customer_ID`` is replaced by the int64 value of its last 16 hexadecimal
    characters, ``S_2`` is parsed into datetimes, and both columns become the
    row index. The input frame is modified in place and also returned.
    """
    # 16 hex digits are exactly 64 bits, so the suffix fits an int64 column.
    id_suffix = dataset['customer_ID'].str[-16:]
    dataset['customer_ID'] = id_suffix.str.hex_to_int().astype('int64')

    statement_dates = cudf.to_datetime(dataset['S_2'])
    dataset['S_2'] = statement_dates

    index_keys = ['customer_ID', 'S_2']
    dataset.set_index(index_keys, inplace=True)
    return dataset


def engineer(dataset, feature_set):
    """Aggregate statement-level rows into one feature row per customer.

    Parameters
    ----------
    dataset : cudf.DataFrame
        Statement rows whose index has a ``customer_ID`` level (as produced
        by ``preprocess``).
    feature_set : int
        Feature recipe to build:

        * 0 - last value of every column.
        * 1 - ``last`` of the categorical columns plus
          ``first/last/mean/std`` of every other column.
        * 2 - set 1 plus ``first/last/mean/std`` of the pairwise differences
          between {B_11, B_14, B_17, D_39, D_131, S_16, S_23} and {P_2, P_3}.
        * 3 - set 2 with ``count`` and ``nunique`` added for the categoricals.
        * 4 - set 3 with ``min``/``max`` added to the numeric and difference
          aggregations.
        * 5 - set 3 plus ``last - first`` (``_lag_sub``) and ``last / first``
          (``_lag_div``) for every aggregated column whose name contains
          ``last`` and has a matching ``first`` column.
        * 6 - set 3 plus a 2-decimal rounding (``_round2``) of every
          aggregated column whose name contains ``last``.
        * 7 - set 3 plus ``last - mean`` (``_sub_mean``) per numeric column.

    Returns
    -------
    cudf.DataFrame
        One row per ``customer_ID``. Columns are ordered categorical
        features, numeric features (derived columns last), then difference
        features.

    Raises
    ------
    ValueError
        If ``feature_set`` is not one of 0-7. Such values previously made the
        function fall through and return ``None``.
    """
    if feature_set not in range(8):
        raise ValueError(
            f"Unknown feature_set {feature_set!r}; expected an integer from 0 to 7."
        )

    def aggregate(frame, aggregations):
        # One row per customer with flat "<column>_<aggregation>" names.
        return frame.groupby(level='customer_ID').agg(aggregations).pipe(flatten_columns)

    if feature_set == 0:
        return dataset.groupby(level='customer_ID').last()

    cat_cols = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    # Sets 1 and 2 only keep the last categorical value; later sets add counts.
    cat_aggs = ['last'] if feature_set in (1, 2) else ['count', 'last', 'nunique']
    # The same aggregation list is applied to numeric and difference features.
    num_aggs = ['first', 'last', 'mean', 'std']
    if feature_set == 4:
        num_aggs += ['min', 'max']

    cat_feat = aggregate(dataset[cat_cols], cat_aggs)

    num_cols = [col for col in dataset.columns if col not in cat_cols + ['target']]
    num_feat = aggregate(dataset[num_cols], num_aggs)

    # Set-specific columns derived from the aggregated numeric features.
    # The column list is snapshotted so newly added columns are not revisited.
    if feature_set == 5:
        for col in list(num_feat.columns):
            first_col = col.replace('last', 'first')
            if 'last' in col and first_col in num_feat.columns:
                num_feat[col + '_lag_sub'] = num_feat[col] - num_feat[first_col]
                num_feat[col + '_lag_div'] = num_feat[col] / num_feat[first_col]
    elif feature_set == 6:
        for col in list(num_feat.columns):
            if 'last' in col:
                num_feat[col + '_round2'] = num_feat[col].round(2)
    elif feature_set == 7:
        for col in num_cols:
            num_feat[col + "_sub_mean"] = num_feat[col + "_last"] - num_feat[col + "_mean"]

    parts = [cat_feat, num_feat]

    if feature_set >= 2:
        diff_cols_a = [f"B_{i}" for i in [11, 14, 17]] + ["D_39", "D_131"] + [f"S_{i}" for i in [16, 23]]
        diff_cols_b = ["P_2", "P_3"]
        diff_feat = dataset[diff_cols_a + diff_cols_b]
        for a in diff_cols_a:
            for b in diff_cols_b:
                diff_feat[f"{a}-{b}"] = diff_feat[a] - diff_feat[b]
        # Keep only the pairwise differences before aggregating.
        diff_feat = diff_feat.drop(diff_cols_a + diff_cols_b, axis=1)
        parts.append(aggregate(diff_feat, num_aggs))

    return cudf.concat(parts, axis=1)
    
    
def add_train_labels(train):
    """Join the competition labels onto the per-customer feature table.

    Reads ``train_labels.csv``, converts its ``customer_ID`` to the int64
    value of the last 16 hex characters, indexes the labels by it, and
    inner-merges them with ``train`` on the index. The merged frame is
    returned sorted by index.
    """
    labels = cudf.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')

    hashed_ids = labels['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    labels['customer_ID'] = hashed_ids
    labels.set_index('customer_ID', inplace=True)

    labelled = cudf.merge(train, labels, how='inner', left_index=True, right_index=True)
    return labelled.sort_index()


def main(*, feature_set, num_rows):
    """Build the labelled training feature table and write it to ``train.pq``.

    Parameters
    ----------
    feature_set : int
        Feature recipe forwarded to ``engineer``.
    num_rows : int or None
        Forwarded to ``cudf.read_parquet`` as its ``num_rows`` argument.

    The table gets a ``fold`` column holding the index (0-4) of the
    stratified fold in which each row is a validation row. The feature-set
    id and the final table shape are printed.
    """
    # Local import: this cell resets the namespace and does not import numpy.
    import numpy as np

    train = cudf.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train.parquet", num_rows=num_rows)
    train = preprocess(train)
    train = engineer(train, feature_set)
    train = add_train_labels(train)

    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    targets = train['target'].to_array()

    # Build the fold ids on the host and assign the column once.
    # The former ``train['fold'].iloc[...] = ...`` wrote through a temporary
    # Series and only worked when that Series was a view of the frame.
    fold = np.full(len(targets), -1, dtype='int64')
    for fold_ix, (_, valid_ixs) in enumerate(kfold.split(train, targets)):
        fold[valid_ixs] = fold_ix
    train['fold'] = fold

    print(feature_set, train.shape)
    train.to_parquet('train.pq')


# Build feature set 7; num_rows=None is forwarded to cudf.read_parquet unchanged.
main(feature_set=7, num_rows=None)

7 (458913, 976)
CPU times: user 11.6 s, sys: 4.4 s, total: 16 s
Wall time: 38.6 s


In [2]:
%%time
%reset -f
import gc; gc.collect()
import cudf
from colorama import Style, Fore
import pandas as pd
import xgboost as xgb
import cupy as cp


def check_input(arr):
    """Reduce the input to a one-dimensional array.

    * A ``pd.DataFrame`` is replaced by its first column.
    * A ``pd.Series`` is replaced by its underlying ``.values`` array.
    * Anything that is then more than one-dimensional keeps only column 0.

    The type checks are exact (subclasses are not matched). Inputs that are
    already one-dimensional arrays are returned unchanged.
    """
    if type(arr) is pd.DataFrame:
        first_column = arr.columns[0]
        arr = arr[first_column]

    if type(arr) is pd.Series:
        arr = arr.values

    is_multi_dim = len(arr.shape) > 1
    return arr[:, 0] if is_multi_dim else arr


def gini(cs_0, cs_1, sum_0, sum_1):
    """Return ``2 * AUC - 1`` computed from per-bucket class totals.

    ``sum_0`` / ``sum_1`` are the per-bucket totals of the two classes and
    ``cs_0`` / ``cs_1`` their cumulative sums. Each bucket contributes its
    ``sum_1`` multiplied by the ``cs_0`` mass up to the bucket's midpoint;
    the total is normalised by the product of the two final cumulative sums.
    """
    negatives_to_midpoint = cs_0 - sum_0 / 2
    area = (negatives_to_midpoint * sum_1).sum()
    normaliser = cs_0[-1] * cs_1[-1]
    ratio = float(area / normaliser)
    return 2 * ratio - 1


def recall_at4(cs_0, cs_1, sum_1):
    """Share of the ``sum_1`` mass that lies in the top 4% of total weight.

    ``cs_0`` and ``cs_1`` are cumulative per-bucket totals of the two
    classes. Buckets whose combined cumulative total reaches at least 96% of
    the grand total are selected, and their ``sum_1`` total is divided by the
    overall ``cs_1`` total.
    """
    cumulative_weight = cs_0 + cs_1
    threshold = cumulative_weight[-1] * 0.96
    in_top_slice = cumulative_weight >= threshold
    captured = sum_1[in_top_slice].sum()
    return float(captured / cs_1[-1])


def amex_metric_cupy(y_true, y_pred):
    """GPU evaluation of the mean of ``gini`` and ``recall_at4``.

    Both inputs are flattened with ``check_input`` and moved to cupy arrays.
    Predictions are bucketed by distinct value in ascending order. Labels
    are summed per bucket for the positive class, and ``1 - label`` is
    summed per bucket and multiplied by 20 for the negative class.

    NOTE(review): presumably intended as a fast stand-in for ``amex_metric``;
    exact agreement at the 4% cutoff is not established here.
    """
    labels = cp.asarray(check_input(y_true))
    scores = cp.asarray(check_input(y_pred))

    # Position of each prediction among the sorted distinct prediction values.
    distinct_scores = cp.unique(scores)
    bucket = cp.searchsorted(distinct_scores, scores)

    positives = cp.zeros_like(distinct_scores, dtype=cp.float64)
    positives.scatter_add(bucket, labels)

    negatives = cp.zeros_like(distinct_scores, dtype=cp.float64)
    negatives.scatter_add(bucket, 1 - labels)
    negatives *= 20

    cum_negatives = negatives.cumsum()
    cum_positives = positives.cumsum()

    gini_part = gini(cum_negatives, cum_positives, negatives, positives)
    recall_part = recall_at4(cum_negatives, cum_positives, positives)

    return (gini_part + recall_part) / 2


def xgb_amex(y_pred, dmatrix):
    """Custom XGBoost evaluation metric.

    Returns the ``("amex", value)`` pair, where ``value`` is
    ``amex_metric_cupy`` evaluated on the DMatrix labels and ``y_pred``.
    """
    labels = dmatrix.get_label()
    score = amex_metric_cupy(labels, y_pred)
    return "amex", score


def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Mean of the normalised weighted Gini and the top-4% capture rate.

    ``y_true`` must have a ``target`` column and ``y_pred`` a ``prediction``
    column; the two frames are concatenated column-wise, so their indexes
    must line up. Rows with ``target == 0`` get weight 20, all others
    weight 1.
    """

    def _weighted_ranking(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Rows ordered from highest to lowest prediction, with class weights.
        ranked = pd.concat([truth, scores], axis="columns").sort_values(
            "prediction", ascending=False
        )
        ranked["weight"] = ranked["target"].apply(lambda label: 20 if label == 0 else 1)
        return ranked

    def top_four_percent_captured(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _weighted_ranking(truth, scores)
        cutoff = int(0.04 * ranked["weight"].sum())
        ranked["weight_cumsum"] = ranked["weight"].cumsum()
        within_cutoff = ranked.loc[ranked["weight_cumsum"] <= cutoff]
        captured = (within_cutoff["target"] == 1).sum()
        positives = (ranked["target"] == 1).sum()
        return captured / positives

    def weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _weighted_ranking(truth, scores)
        ranked["random"] = (ranked["weight"] / ranked["weight"].sum()).cumsum()
        weighted_target = ranked["target"] * ranked["weight"]
        total_pos = weighted_target.sum()
        ranked["cum_pos_found"] = weighted_target.cumsum()
        ranked["lorentz"] = ranked["cum_pos_found"] / total_pos
        ranked["gini"] = (ranked["lorentz"] - ranked["random"]) * ranked["weight"]
        return ranked["gini"].sum()

    def normalized_weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # The perfect model predicts the target itself.
        perfect = truth.rename(columns={"target": "prediction"})
        return weighted_gini(truth, scores) / weighted_gini(truth, perfect)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)


def main(*, xgb_parameters=None, num_rows=None):
    """Train one XGBoost model per fold on ``train.pq`` and report CV scores.

    Parameters
    ----------
    xgb_parameters : dict or None
        Parameters passed to ``xgb.train``. ``None`` is treated as an empty
        dict; previously it crashed on ``xgb_parameters['random_state']``
        after the first fold had already been trained.
    num_rows : int or None
        Forwarded to ``cudf.read_parquet`` as its ``num_rows`` argument.

    For each fold id 0-4 the model is trained on the other folds with the
    custom ``amex`` metric and early stopping, then saved as
    ``xgb_fold{fold}_seed{seed}.xgb``. The fold's ``amex_metric`` score and
    the score over all out-of-fold predictions are printed.
    """
    xgb_parameters = xgb_parameters or {}
    # Seed used in the model file names; 'seed' is accepted as a fallback key.
    seed = xgb_parameters.get('random_state', xgb_parameters.get('seed', 0))

    folds = cudf.read_parquet("train.pq", num_rows=num_rows)

    features = [col for col in folds.columns if col not in ['target', 'fold']]
    print(len(features))

    predictions = []

    for fold_ix in range(5):
        print(Fore.BLUE + "#" * 10, f"Fold {fold_ix}", "#" * 10 + Style.RESET_ALL)

        train = folds[folds.fold != fold_ix]
        valid = folds[folds.fold == fold_ix]

        dtrain = xgb.DMatrix(data=train[features], label=train['target'])
        dvalid = xgb.DMatrix(data=valid[features], label=valid['target'])

        model = xgb.train(
            xgb_parameters,
            dtrain=dtrain,
            num_boost_round=9999,

            evals=[(dtrain, "train"), (dvalid, "valid")],
            early_stopping_rounds=500,

            custom_metric=xgb_amex,
            maximize=True,

            verbose_eval=100
        )

        model.save_model(f"xgb_fold{fold_ix}_seed{seed}.xgb")

        # Score with the trees up to the best early-stopping iteration only.
        prediction = pd.DataFrame({
            "prediction": model.predict(dvalid, iteration_range=(0, model.best_iteration + 1)),
            "target": valid['target'].to_array()
        })

        print(f"Fold: {amex_metric(prediction[['target']], prediction[['prediction']]):.4f} CV")
        predictions.append(prediction)

        del dtrain, dvalid, model
        gc.collect()

        print(Fore.BLUE + "#" * 28, "\n" + Style.RESET_ALL)

    # Each fold frame has its own 0..n-1 index; renumber so labels are unique.
    prediction = pd.concat(predictions, ignore_index=True)
    print(Style.BRIGHT + f"Results: {amex_metric(prediction[['target']], prediction[['prediction']]):.4f} CV" + Style.RESET_ALL)

# Train the fold models on the full feature table.
main(
    xgb_parameters={
        # Tree depth and learning rate
        'max_depth': 7,
        'eta': 0.03,

        # Row and column subsampling
        'subsample': 0.88,
        'colsample_bytree': 0.5,

        # Loss
        'objective': 'binary:logistic',

        # GPU training and prediction
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',

        # Seed; also embedded in the saved model file names
        'random_state': 42,

        # Regularisation
        'gamma': 1.5,
        'min_child_weight': 8,
        'lambda': 70,
    },
    num_rows=None,
)

974
[34m########## Fold 0 ##########[0m
[0]	train-logloss:0.67366	train-amex:0.71546	valid-logloss:0.67377	valid-amex:0.70876
[100]	train-logloss:0.24071	train-amex:0.78390	valid-logloss:0.24700	valid-amex:0.76878
[200]	train-logloss:0.21649	train-amex:0.79988	valid-logloss:0.22693	valid-amex:0.77900
[300]	train-logloss:0.20786	train-amex:0.81188	valid-logloss:0.22237	valid-amex:0.78365
[400]	train-logloss:0.20226	train-amex:0.82020	valid-logloss:0.22019	valid-amex:0.78675
[500]	train-logloss:0.19752	train-amex:0.82805	valid-logloss:0.21893	valid-amex:0.78911
[600]	train-logloss:0.19323	train-amex:0.83533	valid-logloss:0.21807	valid-amex:0.78982
[700]	train-logloss:0.18945	train-amex:0.84174	valid-logloss:0.21750	valid-amex:0.79048
[800]	train-logloss:0.18564	train-amex:0.84828	valid-logloss:0.21701	valid-amex:0.79045
[900]	train-logloss:0.18213	train-amex:0.85435	valid-logloss:0.21667	valid-amex:0.79131
[1000]	train-logloss:0.17879	train-amex:0.86036	valid-logloss:0.21640	valid-amex

In [3]:
!rm train.pq

In [4]:
%%time
%reset -f
import gc; gc.collect()

CPU times: user 488 ms, sys: 11.1 ms, total: 499 ms
Wall time: 497 ms


0

In [5]:
%%time
%reset -f
import gc; gc.collect()
import cudf
import xgboost as xgb
import numpy as np


def flatten_columns(df):
    """Collapse multi-part column labels into single underscore-joined names.

    Every label in ``df.columns`` must be an iterable of strings (for example
    the ``(column, aggregation)`` pairs produced by ``groupby().agg([...])``).
    The frame is relabelled in place and returned, so the function can be
    used with ``.pipe``.
    """
    flat_names = []
    for label_parts in df.columns:
        flat_names.append("_".join(label_parts))
    df.columns = flat_names
    return df


def preprocess(dataset):
    """Index the raw statement rows by ``(customer_ID, S_2)``.

    ``customer_ID`` is replaced by the int64 value of its last 16 hexadecimal
    characters, ``S_2`` is parsed into datetimes, and both columns become the
    row index. The input frame is modified in place and also returned.
    """
    # 16 hex digits are exactly 64 bits, so the suffix fits an int64 column.
    id_suffix = dataset['customer_ID'].str[-16:]
    dataset['customer_ID'] = id_suffix.str.hex_to_int().astype('int64')

    statement_dates = cudf.to_datetime(dataset['S_2'])
    dataset['S_2'] = statement_dates

    index_keys = ['customer_ID', 'S_2']
    dataset.set_index(index_keys, inplace=True)
    return dataset


def engineer(dataset, feature_set):
    """Aggregate statement-level rows into one feature row per customer.

    Parameters
    ----------
    dataset : cudf.DataFrame
        Statement rows whose index has a ``customer_ID`` level (as produced
        by ``preprocess``).
    feature_set : int
        Feature recipe to build:

        * 0 - last value of every column.
        * 1 - ``last`` of the categorical columns plus
          ``first/last/mean/std`` of every other column.
        * 2 - set 1 plus ``first/last/mean/std`` of the pairwise differences
          between {B_11, B_14, B_17, D_39, D_131, S_16, S_23} and {P_2, P_3}.
        * 3 - set 2 with ``count`` and ``nunique`` added for the categoricals.
        * 4 - set 3 with ``min``/``max`` added to the numeric and difference
          aggregations.
        * 5 - set 3 plus ``last - first`` (``_lag_sub``) and ``last / first``
          (``_lag_div``) for every aggregated column whose name contains
          ``last`` and has a matching ``first`` column.
        * 6 - set 3 plus a 2-decimal rounding (``_round2``) of every
          aggregated column whose name contains ``last``.
        * 7 - set 3 plus ``last - mean`` (``_sub_mean``) per numeric column.

    Returns
    -------
    cudf.DataFrame
        One row per ``customer_ID``. Columns are ordered categorical
        features, numeric features (derived columns last), then difference
        features. The order matters because the resulting frame is fed to
        ``xgb.DMatrix`` without column selection.

    Raises
    ------
    ValueError
        If ``feature_set`` is not one of 0-7. Such values previously made the
        function fall through and return ``None``.
    """
    if feature_set not in range(8):
        raise ValueError(
            f"Unknown feature_set {feature_set!r}; expected an integer from 0 to 7."
        )

    def aggregate(frame, aggregations):
        # One row per customer with flat "<column>_<aggregation>" names.
        return frame.groupby(level='customer_ID').agg(aggregations).pipe(flatten_columns)

    if feature_set == 0:
        return dataset.groupby(level='customer_ID').last()

    cat_cols = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    # Sets 1 and 2 only keep the last categorical value; later sets add counts.
    cat_aggs = ['last'] if feature_set in (1, 2) else ['count', 'last', 'nunique']
    # The same aggregation list is applied to numeric and difference features.
    num_aggs = ['first', 'last', 'mean', 'std']
    if feature_set == 4:
        num_aggs += ['min', 'max']

    cat_feat = aggregate(dataset[cat_cols], cat_aggs)

    num_cols = [col for col in dataset.columns if col not in cat_cols + ['target']]
    num_feat = aggregate(dataset[num_cols], num_aggs)

    # Set-specific columns derived from the aggregated numeric features.
    # The column list is snapshotted so newly added columns are not revisited.
    if feature_set == 5:
        for col in list(num_feat.columns):
            first_col = col.replace('last', 'first')
            if 'last' in col and first_col in num_feat.columns:
                num_feat[col + '_lag_sub'] = num_feat[col] - num_feat[first_col]
                num_feat[col + '_lag_div'] = num_feat[col] / num_feat[first_col]
    elif feature_set == 6:
        for col in list(num_feat.columns):
            if 'last' in col:
                num_feat[col + '_round2'] = num_feat[col].round(2)
    elif feature_set == 7:
        for col in num_cols:
            num_feat[col + "_sub_mean"] = num_feat[col + "_last"] - num_feat[col + "_mean"]

    parts = [cat_feat, num_feat]

    if feature_set >= 2:
        diff_cols_a = [f"B_{i}" for i in [11, 14, 17]] + ["D_39", "D_131"] + [f"S_{i}" for i in [16, 23]]
        diff_cols_b = ["P_2", "P_3"]
        diff_feat = dataset[diff_cols_a + diff_cols_b]
        for a in diff_cols_a:
            for b in diff_cols_b:
                diff_feat[f"{a}-{b}"] = diff_feat[a] - diff_feat[b]
        # Keep only the pairwise differences before aggregating.
        diff_feat = diff_feat.drop(diff_cols_a + diff_cols_b, axis=1)
        parts.append(aggregate(diff_feat, num_aggs))

    return cudf.concat(parts, axis=1)


def predict(customers, rows, num_cust):
    """Score the test set chunk by chunk with the five saved fold models.

    Parameters
    ----------
    customers : array-like
        Customer ids in the order the predictions should be returned.
    rows : sequence of int
        Number of test-file rows to read for each chunk, in file order.
    num_cust : int
        Number of customers per chunk; the last chunk takes all remaining
        customers.

    Returns
    -------
    list
        One array of fold-averaged predictions per chunk, each ordered like
        the matching slice of ``customers``.
    """
    num_folds = 5
    skip_rows = 0
    skip_cust = 0
    test_preds = []

    for k, chunk_rows in enumerate(rows):
        print("#" * 25)
        print(f"### {k}")
        print("#" * 25)

        test = cudf.read_parquet(
            "../input/amex-data-integer-dtypes-parquet-format/test.parquet",
            skiprows=skip_rows, num_rows=chunk_rows
        )
        # Same id hashing / date parsing / indexing as used for training.
        test = preprocess(test)

        gc.collect()
        skip_rows += chunk_rows

        test = engineer(test, 7)

        # Reorder the rows to follow `customers`; the last chunk takes the rest.
        if k == len(rows) - 1:
            test = test.loc[customers[skip_cust:]]
        else:
            test = test.loc[customers[skip_cust : skip_cust + num_cust]]

        skip_cust += num_cust

        # Prepare data for inference
        dtest = xgb.DMatrix(data=test)
        gc.collect()

        # Average the fold models. Every fold, including fold 0, is truncated
        # at its best early-stopping iteration. Fold 0 was previously
        # predicted with all trees, unlike folds 1-4.
        preds = None
        for f in range(num_folds):
            model = xgb.Booster()
            model.load_model(f"xgb_fold{f}_seed42.xgb")
            fold_preds = model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
            preds = fold_preds if preds is None else preds + fold_preds
        preds /= num_folds
        test_preds.append(preds)

        # Cleanup
        del dtest, model
        _ = gc.collect()

    return test_preds


def main():
    """Predict the whole test set in 10 chunks and write ``submission_xgb.csv``.

    Customer ids are hashed to the int64 value of their last 16 hex
    characters. The distinct ids, kept in row-index order, are split into
    parts; ``predict`` scores each part, and the predictions are left-merged
    onto the ids of the sample submission before being written out.
    """
    test_customers = cudf.read_parquet(
        "../input/amex-data-integer-dtypes-parquet-format/test.parquet",
        columns=['customer_ID'],
    )
    hashed_ids = test_customers["customer_ID"].str[-16:].str.hex_to_int().astype("int64")
    test_customers["customer_ID"] = hashed_ids

    def get_rows(customers, test, num_parts):
        """Split ``customers`` into ``num_parts`` consecutive parts.

        Every part holds ``chunk = len(customers) // num_parts`` customers,
        except the last one, which takes all remaining customers.
        Returns ``(rows, chunk)`` where ``rows[k]`` is the number of rows of
        ``test`` whose ``customer_ID`` belongs to part ``k``.
        """
        chunk = len(customers) // num_parts
        rows = []

        for k in range(num_parts):
            start = k * chunk
            # An open-ended slice lets the last part absorb the remainder.
            stop = None if k == num_parts - 1 else start + chunk
            part_customers = customers[start:stop]
            rows.append(test.loc[test.customer_ID.isin(part_customers)].shape[0])

        return rows, chunk

    id_frame = test_customers[["customer_ID"]]
    customers = id_frame.drop_duplicates().sort_index().values.flatten()
    rows, num_cust = get_rows(customers, id_frame, num_parts=10)

    chunk_preds = predict(customers, rows, num_cust)

    all_preds = np.concatenate(chunk_preds)
    test = cudf.DataFrame(index=customers, data={"prediction": all_preds})

    sub = cudf.read_csv("../input/amex-default-prediction/sample_submission.csv")[
        ["customer_ID"]
    ]
    # Join on the hashed id, then drop it so only the original id remains.
    sub["customer_ID_hash"] = sub["customer_ID"].str[-16:].str.hex_to_int().astype("int64")
    sub = sub.set_index("customer_ID_hash")
    sub = sub.merge(test[["prediction"]], left_index=True, right_index=True, how="left")
    sub = sub.reset_index(drop=True)

    # Write the submission file and report its shape
    sub.to_csv("submission_xgb.csv", index=False)
    print("Submission file shape is", sub.shape)


# Run chunked inference with the saved fold models and write submission_xgb.csv.
main()

#########################
### 0
#########################
#########################
### 1
#########################
#########################
### 2
#########################
#########################
### 3
#########################
#########################
### 4
#########################
#########################
### 5
#########################
#########################
### 6
#########################
#########################
### 7
#########################
#########################
### 8
#########################
#########################
### 9
#########################
Submission file shape is (924621, 2)
CPU times: user 11min 37s, sys: 11.9 s, total: 11min 49s
Wall time: 7min 13s


In [6]:
!ls -al

total 142940
drwxr-xr-x 2 root root     4096 Jul 26 20:39 .
drwxr-xr-x 6 root root     4096 Jul 26 20:03 ..
---------- 1 root root    73921 Jul 26 20:39 __notebook__.ipynb
-rw-r--r-- 1 root root 71094059 Jul 26 20:39 submission_xgb.csv
-rw-r--r-- 1 root root 15577894 Jul 26 20:10 xgb_fold0_seed42.xgb
-rw-r--r-- 1 root root 16471022 Jul 26 20:16 xgb_fold1_seed42.xgb
-rw-r--r-- 1 root root 13484194 Jul 26 20:21 xgb_fold2_seed42.xgb
-rw-r--r-- 1 root root 14857390 Jul 26 20:26 xgb_fold3_seed42.xgb
-rw-r--r-- 1 root root 14785394 Jul 26 20:31 xgb_fold4_seed42.xgb
